# LDA

In [2]:
import pandas as pd
import numpy as np
import os

### Dataset: mentions - three responses concatenated

In [3]:
data = pd.read_json('~/thesis/data/processed_uscensus/political_mentions.jsonl', orient='index').copy()

In [4]:
data.head(5)

Unnamed: 0,mentions
200015,racim blacks whites false information
200022,coming together country
200039,severe political polarization allow compromise...
200046,pandemic covid 19 unemployment lot people loss...
200053,globalism fake covid law order blm public educ...


In [5]:
# Tokenising and removing the stopwords
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

data = data.mentions.values.tolist()
data_words = list(sent_to_words(data))

# remove stop words
data_words = remove_stopwords(data_words)
print(data_words[:1][0][:30])

[nltk_data] Downloading package stopwords to
[nltk_data]     /mnt/home/kim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['racim', 'blacks', 'whites', 'false', 'information']


In [6]:
data_words[:5]

[['racim', 'blacks', 'whites', 'false', 'information'],
 ['coming', 'together', 'country'],
 ['severe',
  'political',
  'polarization',
  'allow',
  'compromise',
  'growth',
  'affordable',
  'health',
  'care'],
 ['pandemic', 'covid', 'unemployment', 'lot', 'people', 'lossing', 'job'],
 ['globalism',
  'fake',
  'covid',
  'law',
  'order',
  'blm',
  'public',
  'education',
  'socialism',
  'businesses',
  'destroyed',
  'globalist',
  'governors',
  'globalism',
  'fake',
  'covid',
  'socialism',
  'public',
  'education',
  'eugenics']]

In [7]:
# Convert tokenised object into corpus and dictionary
# The produced corpus shown above is a mapping of (word_id, word_frequency).
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_words)

# Create Corpus
texts = data_words

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1][0][:30]) 

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]


In [8]:
# LDA model
from pprint import pprint

# number of topics
num_topics = 10 # based on the clustering result from the previous analysis

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,random_state=0,
                                        chunksize=100,
                                        passes=10,
                                        alpha=0.01,
                                        eta=0.9
                                      )
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.002*"none" + 0.001*"media" + 0.001*"covid" + 0.001*"bias" + '
  '0.001*"dishonesty" + 0.001*"news" + 0.001*"press" + 0.001*"imagration" + '
  '0.001*"lack" + 0.001*"fox"'),
 (1,
  '0.001*"economy" + 0.001*"covid" + 0.000*"election" + 0.000*"virus" + '
  '0.000*"racial" + 0.000*"division" + 0.000*"effecting" + 0.000*"socialism" + '
  '0.000*"currently" + 0.000*"cia"'),
 (2,
  '0.040*"covid" + 0.022*"economy" + 0.020*"pandemic" + 0.019*"racism" + '
  '0.014*"climate" + 0.013*"health" + 0.012*"change" + 0.011*"racial" + '
  '0.009*"healthcare" + 0.009*"care"'),
 (3,
  '0.003*"corona" + 0.003*"lack" + 0.002*"virus" + 0.001*"equity" + '
  '0.001*"information" + 0.001*"conspiracy" + 0.001*"empathy" + '
  '0.001*"education" + 0.001*"within" + 0.001*"truth"'),
 (4,
  '0.001*"human" + 0.001*"polar" + 0.001*"freedoms" + 0.000*"attack" + '
  '0.000*"woman" + 0.000*"melting" + 0.000*"journalism" + 0.000*"trafficing" + '
  '0.000*"covid" + 0.000*"dignity"'),
 (5,
  '0.007*"la" + 0.005*"de

In [9]:
from gensim.utils import simple_preprocess
from collections import Counter
from itertools import combinations
import pickle

In [10]:
# Extract top words for each topic
top_words_per_topic = []
for t in range(num_topics):
    top_words = [word for word, _ in lda_model.show_topic(t, topn=20)]
    top_words_per_topic.append(top_words)

In [11]:
top_words_per_topic

[['none',
  'media',
  'covid',
  'bias',
  'dishonesty',
  'news',
  'press',
  'imagration',
  'lack',
  'fox',
  'politics',
  'lord',
  'illeagle',
  'censorship',
  'publicly',
  'stupidity',
  'jobs',
  'greed',
  'cnn',
  'moment'],
 ['economy',
  'covid',
  'election',
  'virus',
  'racial',
  'division',
  'effecting',
  'socialism',
  'currently',
  'cia',
  'machine',
  'crashing',
  'animal',
  'possess',
  'government',
  'measures',
  'dual',
  'highest',
  'foreign',
  'policy'],
 ['covid',
  'economy',
  'pandemic',
  'racism',
  'climate',
  'health',
  'change',
  'racial',
  'healthcare',
  'care',
  'inequality',
  'economic',
  'education',
  'unemployment',
  'social',
  'systemic',
  'police',
  'global',
  'coronavirus',
  'injustice'],
 ['corona',
  'lack',
  'virus',
  'equity',
  'information',
  'conspiracy',
  'empathy',
  'education',
  'within',
  'truth',
  'store',
  'covid',
  'health',
  'problem',
  'facts',
  'description',
  'liquor',
  'robbed',
 

In [12]:
# Compute co-occurrence matrix
def compute_cooccurrence_matrix(texts):
    word_counts = Counter(word for text in texts for word in text)
    total_count = sum(word_counts.values())
    word_pairs = Counter()
    for text in texts:
        for i, j in combinations(set(text), 2):
            word_pairs[tuple(sorted([i, j]))] += 1
    return word_pairs, word_counts, total_count

word_pairs, word_counts, total_count = compute_cooccurrence_matrix(data_words)

# Compute NPMI
def compute_npmi(word_pairs, word_counts, total_count):
    npmi_matrix = {}
    for (w_i, w_j), cooccur_count in word_pairs.items():
        p_i = word_counts[w_i] / total_count
        p_j = word_counts[w_j] / total_count
        p_ij = cooccur_count / total_count
        if p_ij > 0:
            pmi = np.log(p_ij / (p_i * p_j))
            npmi = pmi / -np.log(p_ij)
            npmi_matrix[(w_i, w_j)] = npmi
    return npmi_matrix

npmi_matrix = compute_npmi(word_pairs, word_counts, total_count)

In [13]:
# Calculate average NPMI for each topic
def average_npmi_for_topics(top_words_per_topic, npmi_matrix):
    topic_npmis = []
    for top_words in top_words_per_topic:
        npmis = [npmi_matrix.get(tuple(sorted([w_i, w_j])), 0) for w_i, w_j in combinations(top_words, 2)]
        if npmis:
            topic_npmi = np.mean(npmis)
            topic_npmis.append(topic_npmi)
    return np.mean(topic_npmis) if topic_npmis else 0

average_npmi = average_npmi_for_topics(top_words_per_topic, npmi_matrix)
print("Average NPMI for LDA topics:", average_npmi)

Average NPMI for LDA topics: 0.28073560794997093


## Visualize the result of topic modelling

In [14]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('/mnt/home/kim/thesis/data/processed_data/ldavis_'+str(num_topics))

In [15]:
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, '/mnt/home/kim/thesis/data/processed_data/ldavis_'+ str(num_topics) +'.html')
LDAvis_prepared

### Dataset: responses stacked

In [34]:
data = pd.read_json('~/thesis/data/processed_uscensus/political_mentions_stack.jsonl', orient='records', lines = True)

In [35]:
data

Unnamed: 0,stack
0,racim blacks whites false information
1,coming together country
2,severe political polarization allow compromise...
3,pandemic covid 19
4,globalism fake covid law order blm public educ...
...,...
16510,black lives matter
16511,law reform
16512,get the vaccin for Covid distribute to all for...
16513,Environmental disaster. Global climate change ...


In [13]:
# Tokenising and removing the stopwords
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

[nltk_data] Downloading package stopwords to
[nltk_data]     /mnt/home/kim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
data = data['stack'].tolist()
data_words = list(sent_to_words(data))

# remove stop words
data_words = remove_stopwords(data_words)
print(data_words[:1][0][:30])

['racim', 'blacks', 'whites', 'false', 'information']


In [18]:
data_words[:5]

[['racim', 'blacks', 'whites', 'false', 'information'],
 ['coming', 'together', 'country'],
 ['severe', 'political', 'polarization', 'allow', 'compromise', 'growth'],
 ['pandemic', 'covid'],
 ['globalism',
  'fake',
  'covid',
  'law',
  'order',
  'blm',
  'public',
  'education',
  'socialism',
  'businesses',
  'destroyed',
  'globalist',
  'governors']]

In [19]:
# Convert tokenised object into corpus and dictionary
# The produced corpus shown above is a mapping of (word_id, word_frequency).
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_words)

# Create Corpus
texts = data_words

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1][0][:30]) 

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]


In [20]:
# LDA model
from pprint import pprint

# number of topics
num_topics = 10 # based on the clustering result from the previous analysis

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,random_state=0,
                                        chunksize=100,
                                        passes=10,
                                        alpha=0.01,
                                        eta=0.9
                                      )
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.014*"election" + 0.010*"fraud" + 0.007*"la" + 0.006*"voter" + '
  '0.006*"voting" + 0.005*"que" + 0.005*"de" + 0.004*"el" + 0.003*"los" + '
  '0.003*"electoral"'),
 (1,
  '0.073*"covid" + 0.037*"pandemic" + 0.024*"economy" + 0.019*"police" + '
  '0.017*"racial" + 0.014*"control" + 0.010*"brutality" + 0.009*"racism" + '
  '0.008*"virus" + 0.007*"lack"'),
 (2,
  '0.027*"media" + 0.018*"political" + 0.011*"news" + 0.009*"division" + '
  '0.007*"social" + 0.005*"parties" + 0.005*"divide" + 0.005*"polarization" + '
  '0.004*"party" + 0.003*"bias"'),
 (3,
  '0.005*"people" + 0.004*"wage" + 0.003*"minimum" + 0.003*"hungry" + '
  '0.002*"greed" + 0.002*"fuels" + 0.002*"fossil" + 0.002*"moment" + '
  '0.002*"many" + 0.002*"morals"'),
 (4,
  '0.006*"law" + 0.006*"security" + 0.005*"enforcement" + 0.002*"drug" + '
  '0.002*"national" + 0.002*"social" + 0.002*"funding" + 0.001*"rule" + '
  '0.001*"child" + 0.001*"affairs"'),
 (5,
  '0.022*"health" + 0.017*"care" + 0.015*"immigration" + 0

## Visualize the result of topic modelling

In [21]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('/mnt/home/kim/thesis/data/processed_data/ldavis_'+str(num_topics))

In [22]:
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, '/mnt/home/kim/thesis/data/processed_data/ldavis_'+ str(num_topics) +'.html')
LDAvis_prepared

In [39]:
from gensim.utils import simple_preprocess
from collections import Counter
from itertools import combinations
import pickle

In [36]:
# Extract top words for each topic
top_words_per_topic = []
for t in range(num_topics):
    top_words = [word for word, _ in lda_model.show_topic(t, topn=20)]
    top_words_per_topic.append(top_words)

In [37]:
top_words_per_topic

[['media',
  'faith',
  'dishonesty',
  'socialist',
  'covid',
  'bias',
  'loss',
  'stream',
  'news',
  'culture',
  'censorship',
  'socialistic',
  'fox',
  'america',
  'growing',
  'lack',
  'cia',
  'makers',
  'times',
  'main'],
 ['economy',
  'division',
  'racial',
  'imagration',
  'covid',
  'illeagle',
  'lord',
  'covered',
  'name',
  'currently',
  'sure',
  'election',
  'policy',
  'crashing',
  'hatred',
  'survival',
  'ehich',
  'rasism',
  'possess',
  'founded'],
 ['covid',
  'economy',
  'racism',
  'pandemic',
  'climate',
  'change',
  'health',
  'racial',
  'inequality',
  'healthcare',
  'care',
  'education',
  'economic',
  'social',
  'police',
  'systemic',
  'lack',
  'division',
  'unemployment',
  'control'],
 ['lack',
  'information',
  'health',
  'corona',
  'care',
  'empathy',
  'conspiracy',
  'education',
  'theories',
  'public',
  'virus',
  'truth',
  'covid',
  'idk',
  'falling',
  'facts',
  'news',
  'inability',
  'homelessness',
  

In [40]:
# Compute co-occurrence matrix
def compute_cooccurrence_matrix(texts):
    word_counts = Counter(word for text in texts for word in text)
    total_count = sum(word_counts.values())
    word_pairs = Counter()
    for text in texts:
        for i, j in combinations(set(text), 2):
            word_pairs[tuple(sorted([i, j]))] += 1
    return word_pairs, word_counts, total_count

word_pairs, word_counts, total_count = compute_cooccurrence_matrix(data_words)

# Compute NPMI
def compute_npmi(word_pairs, word_counts, total_count):
    npmi_matrix = {}
    for (w_i, w_j), cooccur_count in word_pairs.items():
        p_i = word_counts[w_i] / total_count
        p_j = word_counts[w_j] / total_count
        p_ij = cooccur_count / total_count
        if p_ij > 0:
            pmi = np.log(p_ij / (p_i * p_j))
            npmi = pmi / -np.log(p_ij)
            npmi_matrix[(w_i, w_j)] = npmi
    return npmi_matrix

npmi_matrix = compute_npmi(word_pairs, word_counts, total_count)

In [41]:
# Calculate average NPMI for each topic
def average_npmi_for_topics(top_words_per_topic, npmi_matrix):
    topic_npmis = []
    for top_words in top_words_per_topic:
        npmis = [npmi_matrix.get(tuple(sorted([w_i, w_j])), 0) for w_i, w_j in combinations(top_words, 2)]
        if npmis:
            topic_npmi = np.mean(npmis)
            topic_npmis.append(topic_npmi)
    return np.mean(topic_npmis) if topic_npmis else 0

average_npmi = average_npmi_for_topics(top_words_per_topic, npmi_matrix)
print("Average NPMI for LDA topics:", average_npmi)

Average NPMI for LDA topics: 0.27977812216897646


**Interpretation**

High NPMI (close to 1): Indicates strong semantic coherence between words, meaning the words are likely to appear together in similar contexts. This is generally considered good for topics generated by models like LDA.

NPMI around 0: Indicates that the words appear together about as frequently as expected by chance, suggesting neutral association.

Low NPMI (negative values): Indicates that the words are unlikely to appear together, suggesting poor coherence for the topic.