# LDA

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from sklearn.feature_extraction.text import CountVectorizer

## Participant-based analysis
### 1. Dataset: Aggregate the LDA results of each mention 1,2,3
### * with BOW

In [3]:
m1 = pd.read_json('~/thesis/data/processed_uscensus/political_mention1.jsonl', orient='index')
m2 = pd.read_json('~/thesis/data/processed_uscensus/political_mention2.jsonl', orient = 'index')
m3 = pd.read_json('~/thesis/data/processed_uscensus/political_mention3.jsonl', orient = 'index')

In [4]:
m1.head(5)

Unnamed: 0,mention1
200015,racim blacks whites false information
200022,coming together country
200039,severe political polarization allow compromise...
200046,pandemic covid 19
200053,globalism fake covid law order blm public educ...


In [5]:
# Step 1: Prepare the data for LDA without concatenating
texts1 = m1['mention1'].apply(lambda x: x.split())
texts2 = m2['mention2'].apply(lambda x: x.split())
texts3 = m3['mention3'].apply(lambda x: x.split())

In [6]:
# see what's inside the texts1
texts1[:5]

200015          [racim, blacks, whites, false, information]
200022                          [coming, together, country]
200039    [severe, political, polarization, allow, compr...
200046                                [pandemic, covid, 19]
200053    [globalism, fake, covid, law, order, blm, publ...
Name: mention1, dtype: object

In [7]:
# Create dictionaries and corpora for each column
dictionary1 = Dictionary(texts1)
dictionary2 = Dictionary(texts2)
dictionary3 = Dictionary(texts3)

corpus1 = [dictionary1.doc2bow(text) for text in texts1]
corpus2 = [dictionary2.doc2bow(text) for text in texts2]
corpus3 = [dictionary3.doc2bow(text) for text in texts3]

# Step 2: Combine corpora and dictionaries
# Note: In real application, ensure dictionaries are aligned or use a shared dictionary
combined_corpus = corpus1 + corpus2 + corpus3
combined_dictionary = Dictionary(list(texts1) + list(texts2) + list(texts3))

In [8]:
# Assume we have 10 topics
num_topics = 6

In [9]:
# Step 3: Apply LDA
lda_model = LdaModel(corpus=combined_corpus, num_topics= num_topics, id2word=combined_dictionary, passes=10)

In [10]:
# Step 4: Get topic distribution for each document
topic_distributions1 = [lda_model.get_document_topics(bow, minimum_probability=0.0) for bow in corpus1]
topic_distributions2 = [lda_model.get_document_topics(bow, minimum_probability=0.0) for bow in corpus2]
topic_distributions3 = [lda_model.get_document_topics(bow, minimum_probability=0.0) for bow in corpus3]

In [11]:
# Step 2: Concatenate the DataFrames horizontally
combined_df = pd.concat([m1, m2, m3], axis=1)

# Step 1: Handle NaN values by filling with an empty string
combined_df.fillna('', inplace=True)

# Rename the columns if needed
combined_df.columns = ['response1', 'response2', 'response3']

# Reset index if you want to keep 'participant_id' as a regular column
combined_df.reset_index(inplace=True)

# Now you have a single DataFrame with columns: 'participant_id', 'response1', 'response2', 'response3'
print(combined_df.head())

    index                                          response1  \
0  200015              racim blacks whites false information   
1  200022                            coming together country   
2  200039  severe political polarization allow compromise...   
3  200046                                  pandemic covid 19   
4  200053  globalism fake covid law order blm public educ...   

                                           response2  \
0                                                      
1                                                      
2                             affordable health care   
3                unemployment lot people lossing job   
4  globalism fake covid socialism public educatio...   

                    response3  
0                              
1                              
2             systemic racism  
3            health insurance  
4  soros gates ruling country  


In [12]:
import numpy as np

# num_topics
num_topics

# Step 5: Aggregate topic distributions for each participant
aggregated_topic_distributions = []

# Ensure all participants are accounted for by using a loop over the total number of participants
for i in range(len(combined_df)):
    # Get topic distributions for each response, fill with zeros if missing
    dist1 = topic_distributions1[i] if i < len(topic_distributions1) else [(topic_id, 0.0) for topic_id in range(num_topics)]
    dist2 = topic_distributions2[i] if i < len(topic_distributions2) else [(topic_id, 0.0) for topic_id in range(num_topics)]
    dist3 = topic_distributions3[i] if i < len(topic_distributions3) else [(topic_id, 0.0) for topic_id in range(num_topics)]
    
    # Initialize aggregation
    aggregated = np.zeros(num_topics)
    
    # Aggregate the topic probabilities from the three distributions
    for topic_id, prob in dist1:
        aggregated[topic_id] += prob
    for topic_id, prob in dist2:
        aggregated[topic_id] += prob
    for topic_id, prob in dist3:
        aggregated[topic_id] += prob
    
    aggregated_topic_distributions.append(aggregated)

# Convert to DataFrame
aggregated_topic_df = pd.DataFrame(aggregated_topic_distributions, columns=[f'Topic_{i}' for i in range(num_topics)])

In [13]:
len(aggregated_topic_df)

7300

In [14]:
# Ensure the length of the DataFrame matches the number of participants
assert len(aggregated_topic_df) == len(combined_df)

# Optionally, join this back to the combined_df for further analysis
combined_df = pd.concat([combined_df, aggregated_topic_df], axis=1)

In [15]:
combined_df.head()

Unnamed: 0,index,response1,response2,response3,Topic_0,Topic_1,Topic_2,Topic_3,Topic_4,Topic_5
0,200015,racim blacks whites false information,,,0.125048,0.125001,0.125001,2.374914,0.125036,0.125001
1,200022,coming together country,,,1.252974,0.125007,0.125028,1.243884,0.126378,0.126729
2,200039,severe political polarization allow compromise...,affordable health care,systemic racism,1.144104,0.717913,0.20299,0.777416,0.078618,0.07896
3,200046,pandemic covid 19,unemployment lot people lossing job,health insurance,0.199181,0.353539,1.670905,0.377958,0.310373,0.088044
4,200053,globalism fake covid law order blm public educ...,globalism fake covid socialism public educatio...,soros gates ruling country,0.077386,0.077652,0.747757,0.077692,1.942068,0.077445


In [16]:
# Extract top words for each topic
top_words_per_topic = []
for t in range(num_topics):
    top_words = [word for word, _ in lda_model.show_topic(t, topn=20)]
    top_words_per_topic.append(top_words)

### Visualize the result of topic modelling

In [17]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis
import os

# Step 8: Visualize with PyLDAvis
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('/mnt/home/kim/thesis/data/processed_data/ldavis_'+str(num_topics))

# Prepare and save the visualization
if not os.path.exists(LDAvis_data_filepath):
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, combined_corpus, combined_dictionary)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
else:
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

# Save the visualization as an HTML file
pyLDAvis.save_html(LDAvis_prepared, '/mnt/home/kim/thesis/data/processed_data/ldavis_'+ str(num_topics) +'.html')

# Display inline in Jupyter Notebook
LDAvis_prepared

* topic = 6 shows the best-humanly-readable clustering results from the LDA
* topic = 10 shows the funny results, one cluster consists of just spanish-speaking people. 

### 2. Dataset: Create the summed BOW after vectorizing each response
### * with BOW

In [62]:
# Step 1: Load Data
m1 = pd.read_json('~/thesis/data/processed_uscensus/political_mention1.jsonl', orient='index')
m2 = pd.read_json('~/thesis/data/processed_uscensus/political_mention2.jsonl', orient='index')
m3 = pd.read_json('~/thesis/data/processed_uscensus/political_mention3.jsonl', orient='index')

# Step 2: Align DataFrames by index
df = m1.join(m2, how='outer', lsuffix='_1').join(m3, how='outer', lsuffix='_2', rsuffix='_3')
df.fillna('', inplace=True)

# Rename columns if needed
df.columns = ['response1', 'response2', 'response3']

In [63]:
# Step 3: Vectorize each column using BoW
vectorizer = CountVectorizer(max_features=1000)

vectors1 = vectorizer.fit_transform(df['response1']).toarray()
vectors2 = vectorizer.fit_transform(df['response2']).toarray()
vectors3 = vectorizer.fit_transform(df['response3']).toarray()

In [64]:
# Step 4: Ensure all vectors have the same shape by padding missing vectors with zeros
max_len = max(vectors1.shape[0], vectors2.shape[0], vectors3.shape[0])

if vectors1.shape[0] < max_len:
    vectors1 = np.pad(vectors1, ((0, max_len - vectors1.shape[0]), (0, 0)), 'constant')

if vectors2.shape[0] < max_len:
    vectors2 = np.pad(vectors2, ((0, max_len - vectors2.shape[0]), (0, 0)), 'constant')

if vectors3.shape[0] < max_len:
    vectors3 = np.pad(vectors3, ((0, max_len - vectors3.shape[0]), (0, 0)), 'constant')

# Step 5: Sum the vectors element-wise
summed_vectors = vectors1 + vectors2 + vectors3


In [21]:
# Step 6: Create DataFrame from the summed vectors
final_df = pd.DataFrame(summed_vectors)

# Step 7: Convert final_df to a Gensim-compatible corpus format
corpus = [list(enumerate(doc)) for doc in final_df.values]

# Create a dummy Dictionary for visualization purposes
dummy_dict = Dictionary()
dummy_dict.token2id = {str(i): i for i in range(final_df.shape[1])}
dummy_dict.id2token = {i: str(i) for i in range(final_df.shape[1])}


In [22]:
# Step 8: Train the LDA model on the corpus
num_topics = 5  # Adjust based on your needs
lda_model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dummy_dict, passes=10)


In [23]:
# Step 9: Visualize the LDA Model using PyLDAvis
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('/mnt/home/kim/thesis/data/processed_data/ldavis_'+str(num_topics))

# Prepare the visualization
if not os.path.exists(LDAvis_data_filepath):
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, dummy_dict)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
else:
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

# Save the visualization as an HTML file
pyLDAvis.save_html(LDAvis_prepared, '/mnt/home/kim/thesis/data/processed_data/ldavis_'+ str(num_topics) +'.html')

# Display the visualization inline (in Jupyter Notebook)
LDAvis_prepared

### Linkage Matrix

In [75]:
from sklearn.metrics.pairwise import cosine_distances
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt

In [71]:
distance_matrix = cosine_distances(final_df)

In [73]:
linkage_matrix = sch.linkage(distance_matrix, method = 'ward')

  linkage_matrix = sch.linkage(distance_matrix, method = 'ward')


In [76]:
# Step 9: Plot the Dendrogram -> it takes too long to draw. 
plt.figure(figsize=(10, 7))
sch.dendrogram(linkage_matrix, labels=df.index.tolist())
plt.title('Dendrogram for BoW Data')
plt.xlabel('Documents')
plt.ylabel('Distance')
plt.show()

KeyboardInterrupt: 

### 3. Dataset: Create the summed tf-idf after vectorizing each response
### * with tf-idf Vectorizer

In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
import pyLDAvis.gensim
import pickle
import pyLDAvis
import os

In [25]:
# Step 1: Load each dataset
m1 = pd.read_json('~/thesis/data/processed_uscensus/political_mention1.jsonl', orient='index')
m2 = pd.read_json('~/thesis/data/processed_uscensus/political_mention2.jsonl', orient='index')
m3 = pd.read_json('~/thesis/data/processed_uscensus/political_mention3.jsonl', orient='index')

In [26]:
# Step 2: Combine the DataFrames
df = pd.concat([m1, m2, m3], axis=1)
df.columns = ['response1', 'response2', 'response3']

# Step 3: Handle NaN values (if any)
df.fillna('', inplace=True)

In [27]:
# Step 4: Vectorize each response separately using TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1000)

In [28]:
# Fit the vectorizer on all responses combined to build a common vocabulary
vectorizer.fit(pd.concat([df['response1'], df['response2'], df['response3']]))

# Transform each response
vectors1 = vectorizer.transform(df['response1']).toarray()
vectors2 = vectorizer.transform(df['response2']).toarray()
vectors3 = vectorizer.transform(df['response3']).toarray()

In [29]:
# Ensure all vectors have the same shape by padding missing vectors with zeros
max_len = max(vectors1.shape[0], vectors2.shape[0], vectors3.shape[0])

if vectors1.shape[0] < max_len:
    vectors1 = np.pad(vectors1, ((0, max_len - vectors1.shape[0]), (0, 0)), 'constant')

if vectors2.shape[0] < max_len:
    vectors2 = np.pad(vectors2, ((0, max_len - vectors2.shape[0]), (0, 0)), 'constant')

if vectors3.shape[0] < max_len:
    vectors3 = np.pad(vectors3, ((0, max_len - vectors3.shape[0]), (0, 0)), 'constant')

# Step 5: Sum the vectors element-wise
summed_vectors = vectors1 + vectors2 + vectors3

In [30]:
# Step 6: Create DataFrame from the summed vectors
final_df = pd.DataFrame(summed_vectors)

# Step 7: Convert final_df to a Gensim-compatible corpus format
corpus = [list(enumerate(doc)) for doc in final_df.values]

# Create a dummy Dictionary for visualization purposes
dummy_dict = Dictionary()
dummy_dict.token2id = {str(i): i for i in range(final_df.shape[1])}
dummy_dict.id2token = {i: str(i) for i in range(final_df.shape[1])}


In [31]:
# Step 8: Train the LDA model on the corpus
num_topics = 5  # Adjust based on your needs
lda_model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dummy_dict, passes=10)


In [32]:
# Step 9: Visualize the LDA Model using PyLDAvis
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('/mnt/home/kim/thesis/data/processed_data/ldavis_'+str(num_topics))

# Prepare the visualization
if not os.path.exists(LDAvis_data_filepath):
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, dummy_dict)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
else:
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

# Save the visualization as an HTML file
pyLDAvis.save_html(LDAvis_prepared, '/mnt/home/kim/thesis/data/processed_data/ldavis_'+ str(num_topics) +'.html')

# Display the visualization inline (in Jupyter Notebook)
LDAvis_prepared

### 4. Dataset: responses stacked
### * with BOW

In [33]:
data = pd.read_json('~/thesis/data/processed_uscensus/political_mentions_stack.jsonl', orient='records', lines = True)

In [34]:
# Tokenising and removing the stopwords
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]


[nltk_data] Downloading package stopwords to
[nltk_data]     /mnt/home/kim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
data = data['stack'].tolist()
data_words = list(sent_to_words(data))

# remove stop words
data_words = remove_stopwords(data_words)
print(data_words[:1][0][:30])

['racim', 'blacks', 'whites', 'false', 'information']


In [36]:
# Convert tokenised object into corpus and dictionary
# The produced corpus shown above is a mapping of (word_id, word_frequency).
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_words)

# Create Corpus
texts = data_words

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1][0][:30]) 

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]


In [37]:
# LDA model
from pprint import pprint


In [38]:
# number of topics
num_topics = 6 # based on the clustering result from the previous analysis

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,random_state=0,
                                        chunksize=100,
                                        passes=10,
                                        alpha=0.01,
                                        eta=0.9
                                      )

In [39]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.070*"covid" + 0.039*"economy" + 0.034*"pandemic" + 0.012*"unemployment" + '
  '0.010*"getting" + 0.009*"control" + 0.008*"virus" + 0.007*"people" + '
  '0.007*"businesses" + 0.007*"jobs"'),
 (1,
  '0.028*"climate" + 0.025*"change" + 0.021*"racial" + 0.014*"inequality" + '
  '0.010*"division" + 0.010*"racism" + 0.008*"social" + 0.007*"lack" + '
  '0.007*"injustice" + 0.006*"political"'),
 (2,
  '0.019*"country" + 0.015*"people" + 0.011*"media" + 0.007*"immigration" + '
  '0.007*"trump" + 0.007*"political" + 0.006*"government" + 0.006*"one" + '
  '0.006*"problem" + 0.006*"need"'),
 (3,
  '0.030*"people" + 0.015*"racism" + 0.011*"healthcare" + 0.011*"need" + '
  '0.009*"education" + 0.007*"covid" + 0.007*"many" + 0.006*"jobs" + '
  '0.006*"get" + 0.006*"country"'),
 (4,
  '0.030*"health" + 0.023*"care" + 0.021*"police" + 0.011*"national" + '
  '0.009*"law" + 0.009*"lack" + 0.009*"debt" + 0.007*"security" + '
  '0.007*"brutality" + 0.005*"enforcement"'),
 (5,
  '0.012*"corona" + 

In [40]:
### Visualize the result of topic modelling

import pyLDAvis.gensim
import pickle 
import pyLDAvis


In [41]:
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('/mnt/home/kim/thesis/data/processed_data/ldavis_'+str(num_topics))

In [42]:
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, '/mnt/home/kim/thesis/data/processed_data/ldavis_'+ str(num_topics) +'.html')

LDAvis_prepared

### 3. Dataset: responses stacked
### * with tf-idf

In [43]:
data = pd.read_json('~/thesis/data/processed_uscensus/political_mentions_stack.jsonl', orient='records', lines = True)

In [44]:
data.head()

Unnamed: 0,stack
0,racim blacks whites false information
1,coming together country
2,severe political polarization allow compromise...
3,pandemic covid 19
4,globalism fake covid law order blm public educ...


In [45]:
# Tokenising and removing the stopwords
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]

[nltk_data] Downloading package stopwords to
[nltk_data]     /mnt/home/kim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:
data = data['stack'].tolist()
data_words = list(sent_to_words(data))

# remove stop words
data_words = remove_stopwords(data_words)
print(data_words[:1][0][:30])

['racim', 'blacks', 'whites', 'false', 'information']


In [47]:
data_words[:1]

[['racim', 'blacks', 'whites', 'false', 'information']]

In [48]:
# Step 1: Flatten the tokenized words back into strings
data_words_strings = [' '.join(words) for words in data_words]


In [49]:
# Step 2: Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=1000)

# Step 3: Fit the vectorizer and transform the documents
X_tfidf = vectorizer.fit_transform(data_words_strings)

In [50]:
# Step 4: Get the feature names (i.e., words) and create a mapping for Gensim
feature_names = vectorizer.get_feature_names_out()
id2word = gensim.corpora.Dictionary([feature_names])

# Step 5: Convert the TF-IDF matrix to a Gensim-compatible corpus format
corpus = []
for doc in X_tfidf:
    doc_tuples = list(enumerate(doc.toarray()[0]))
    doc_tuples = [(i, val) for i, val in doc_tuples if val > 0]
    corpus.append(doc_tuples)

# View the first document's first 30 tokens
print(corpus[:1][0][:30])


[(94, 0.5983382896851441), (325, 0.581845018098146), (468, 0.5508608408763729)]


In [51]:
# Step 6: Build the LDA model using the TF-IDF-based corpus
num_topics = 6  # number of topics based on the clustering result from the previous analysis

lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=0,
                                       chunksize=100,
                                       passes=10,
                                       alpha=0.01,
                                       eta=0.9)

In [52]:
from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.056*"racism" + 0.023*"debt" + 0.021*"national" + 0.019*"systemic" + '
  '0.018*"socialism" + 0.017*"people" + 0.014*"rights" + 0.014*"security" + '
  '0.014*"abortion" + 0.014*"law"'),
 (1,
  '0.030*"climate" + 0.030*"inequality" + 0.030*"racial" + 0.027*"change" + '
  '0.027*"police" + 0.019*"education" + 0.018*"healthcare" + 0.017*"income" + '
  '0.016*"economic" + 0.014*"brutality"'),
 (2,
  '0.022*"media" + 0.017*"election" + 0.017*"political" + 0.017*"government" + '
  '0.014*"corruption" + 0.014*"division" + 0.012*"trump" + 0.010*"democrats" + '
  '0.009*"party" + 0.009*"president"'),
 (3,
  '0.036*"immigration" + 0.025*"country" + 0.021*"global" + 0.020*"warming" + '
  '0.016*"illegal" + 0.015*"divided" + 0.013*"problem" + 0.012*"environment" + '
  '0.010*"divide" + 0.009*"covid"'),
 (4,
  '0.023*"people" + 0.015*"need" + 0.012*"get" + 0.012*"think" + 0.011*"covid" '
  '+ 0.011*"country" + 0.011*"work" + 0.010*"back" + 0.009*"getting" + '
  '0.009*"many"'),
 (5,
  '0.0

### Visualize the result of topic modelling

In [53]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('/mnt/home/kim/thesis/data/processed_data/ldavis_'+str(num_topics))

In [61]:
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, '/mnt/home/kim/thesis/data/processed_data/ldavis_'+ str(num_topics) +'.html')
LDAvis_prepared

### Top words

In [55]:
from gensim.utils import simple_preprocess
from collections import Counter
from itertools import combinations
import pickle

In [56]:
# Extract top words for each topic
top_words_per_topic = []
for t in range(num_topics):
    top_words = [word for word, _ in lda_model.show_topic(t, topn=10)]
    top_words_per_topic.append(top_words)

In [57]:
top_words_per_topic[:5]

[['racism',
  'debt',
  'national',
  'systemic',
  'socialism',
  'people',
  'rights',
  'security',
  'abortion',
  'law'],
 ['climate',
  'inequality',
  'racial',
  'change',
  'police',
  'education',
  'healthcare',
  'income',
  'economic',
  'brutality'],
 ['media',
  'election',
  'political',
  'government',
  'corruption',
  'division',
  'trump',
  'democrats',
  'party',
  'president'],
 ['immigration',
  'country',
  'global',
  'warming',
  'illegal',
  'divided',
  'problem',
  'environment',
  'divide',
  'covid'],
 ['people',
  'need',
  'get',
  'think',
  'covid',
  'country',
  'work',
  'back',
  'getting',
  'many']]

In [58]:
# Compute co-occurrence matrix
def compute_cooccurrence_matrix(texts):
    word_counts = Counter(word for text in texts for word in text)
    total_count = sum(word_counts.values())
    word_pairs = Counter()
    for text in texts:
        for i, j in combinations(set(text), 2):
            word_pairs[tuple(sorted([i, j]))] += 1
    return word_pairs, word_counts, total_count

word_pairs, word_counts, total_count = compute_cooccurrence_matrix(data_words)

# Compute NPMI
def compute_npmi(word_pairs, word_counts, total_count):
    npmi_matrix = {}
    for (w_i, w_j), cooccur_count in word_pairs.items():
        p_i = word_counts[w_i] / total_count
        p_j = word_counts[w_j] / total_count
        p_ij = cooccur_count / total_count
        if p_ij > 0:
            pmi = np.log(p_ij / (p_i * p_j))
            npmi = pmi / -np.log(p_ij)
            npmi_matrix[(w_i, w_j)] = npmi
    return npmi_matrix

npmi_matrix = compute_npmi(word_pairs, word_counts, total_count)

In [59]:
# Calculate average NPMI for each topic
def average_npmi_for_topics(top_words_per_topic, npmi_matrix):
    topic_npmis = []
    for top_words in top_words_per_topic:
        npmis = [npmi_matrix.get(tuple(sorted([w_i, w_j])), 0) for w_i, w_j in combinations(top_words, 2)]
        if npmis:
            topic_npmi = np.mean(npmis)
            topic_npmis.append(topic_npmi)
    return np.mean(topic_npmis) if topic_npmis else 0

average_npmi = average_npmi_for_topics(top_words_per_topic, npmi_matrix)
print("Average NPMI for LDA topics:", average_npmi)

Average NPMI for LDA topics: 0.2754209123365574


**Interpretation**

High NPMI (close to 1): Indicates strong semantic coherence between words, meaning the words are likely to appear together in similar contexts. This is generally considered good for topics generated by models like LDA.

NPMI around 0: Indicates that the words appear together about as frequently as expected by chance, suggesting neutral association.

Low NPMI (negative values): Indicates that the words are unlikely to appear together, suggesting poor coherence for the topic.