# Topic Analysis with SpaCy & Gensim

In [1]:
# CONSTANTS ... so far best: 8, 0.5, False
# NOTE(review): the "best" triple above lists REM_SW=False, but REM_SW is set to True below - confirm which is intended.
NUM_TOPICS = 8   # number of LDA topics
NO_ABOVE = 0.5   # drop tokens that appear in more than this fraction of the documents (0.5 = 50%)
REM_SW = True    # True - remove stop words (see rem_stop_words) before tokenization; False - leave the text as-is

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import spacy
import en_core_web_sm
from wordcloud import WordCloud
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel
import dateutil.parser as dparser
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook() #Notebook visualisation enabled
import os
import re
import unicodedata
import en_core_web_md
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

2023-04-24 09:18:37.280811: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-24 09:18:37.435591: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-04-24 09:18:37.944867: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-04-24 09:18:37.944962: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

Read data and exploration

In [3]:
# Load up the files
#path = './DataUCSB/' # Smaller UCSB dataset
#path = './Data/' # larger American Rhetoric dataset
paths = ['./Data/', './NYT/', './WSJ/', './GWB/', './speeches/', './Top10/']

speeches = []

for path in paths:
    # Collect every .txt file below this corpus directory (recursively).
    list_of_files = []
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith('.txt'):
                list_of_files.append(os.path.join(root, file))

    for file in list_of_files:
        # The with-block closes the file on exit, so no explicit close() is needed.
        with open(file, encoding='utf-8') as f:
            speeches.append(f.read())

# Drop empty documents and normalise compatibility characters (NFKD).
speeches = [unicodedata.normalize("NFKD", speech) for speech in speeches if len(speech) > 0]
# Remove any remaining \xa0 (non-breaking space) characters. The result is now
# assigned back; previously the stripped copies were built and thrown away.
# NFKD normally maps U+00A0 to a plain space already, so this is a safety net.
speeches = [speech.replace(u'\xa0', '') for speech in speeches]
# remove [stuff] in between square brackets
def remove_bracket(text):
    r"""Strip square-bracketed annotations such as "[Applause] " from text.

    Every "[...]" group that is followed by a whitespace character is removed
    together with that one whitespace character; everything else is returned
    unchanged.

    The previous pattern used ``[^w]*`` between the brackets. That class means
    "any character except the letter w", so brackets containing a "w" were
    never removed, and a match could run from the first "[" to a much later
    "]", deleting the speech text in between. ``[^\]]*`` stops at the first
    closing bracket.
    """
    return re.sub(r'\[[^\]]*\]\s', '', text)
speeches = [remove_bracket(speech) for speech in speeches]
# Clean up whitespace: collapse every run of whitespace (spaces, tabs,
# newlines) into a single space. The old pattern r'[\s+]' was a character
# class, so it replaced each whitespace character one-for-one and also turned
# literal '+' signs into spaces.
speeches = [re.sub(r'\s+', ' ', speech) for speech in speeches]
print("Total documents:",len(speeches))

Total documents: 743


In [4]:
# Run this if you want to remove stop words 
def rem_stop_words(df):
    """Return a copy of a Series of texts with stop words removed.

    The stop list is the union of sklearn's ENGLISH_STOP_WORDS, the words in
    ./word_lists/kaggle_stopwords.txt (one per line) and a short hand-made
    list. Each text is split on whitespace; a word is dropped when its
    lower-cased form is in the stop list, and the remaining words are joined
    with single spaces.

    Parameters
    ----------
    df : pandas.Series of str
        One document per element (despite the name, a Series is expected).

    Returns
    -------
    pandas.Series
        A new Series with the same index. The input is not modified; callers
        must use the return value.
    """
    # The context manager guarantees the file is closed; the explicit encoding
    # avoids depending on the platform default.
    with open("./word_lists/kaggle_stopwords.txt", "r", encoding="utf-8") as kaggle_file:
        kaggle_list = kaggle_file.read().split('\n')
    my_list = ['thats', 'just', 'im', 'did', 'thing', 'mr', 'al', 'thank', 'okay', 'thanks',
               'question', 'joshua', 'president', 'obama', 'â', u'\x99s', u'\x99t', u'\x99ve', u'\x99m',u'\x99re', '\x99']
    # Build the lookup set once; membership tests are then O(1) per word.
    stop_words = ENGLISH_STOP_WORDS.union(kaggle_list, my_list)

    def _strip_stop_words(text):
        return ' '.join(word for word in text.split() if word.lower() not in stop_words)

    return df.apply(_strip_stop_words)

Step 3: Tokenization and text cleanup

In [5]:
# One speech per row; keep the single column (labelled 0) as a Series.
oba_scripts = pd.DataFrame(speeches)[0]
# Stop-word removal is optional and controlled by the REM_SW constant.
if REM_SW:
    oba_scripts = rem_stop_words(oba_scripts)

In [6]:
# Load the spaCy English pipeline (en_core_web_md).
nlp = en_core_web_md.load()
# Part-of-speech tags whose tokens are discarded.
extags = ['PRON','CCONJ','PUNCT','PART','DET','ADP','NUM','SYM','SPACE']
# Run the pipeline over every speech.
docs = oba_scripts.apply(nlp)
# For each speech keep the lower-cased lemma of every alphabetic token that is
# not a spaCy stop word and whose POS tag is not excluded.
tokens = [
    [tok.lemma_.lower()
     for tok in speech
     if tok.is_alpha and not tok.is_stop and tok.pos_ not in extags]
    for speech in docs
]
data = pd.DataFrame({'tokens': tokens})
data['tokens']


0      [governor, family, responder, community, newto...
1      [selamat, pagi, wonderful, university, indones...
2      [let, collins, introduction, incredible, leade...
3      [hello, right, seat, want, becky, patton, extr...
4      [morning, great, honor, today, course, truly, ...
                             ...                        
738    [hello, chicago, doubt, america, place, possib...
739    [chairman, dean, great, friend, dick, durbin, ...
740    [hello, america, hello, democrats, year, ago, ...
741    [majesty, royal, highnesses, distinguished, me...
742    [rare, honor, life, follow, hero, john, lewis,...
Name: tokens, Length: 743, dtype: object

## LDA Topic Analysis model and coherence scores

In [7]:
# Map every distinct token to an integer id.
token_dict = Dictionary(data['tokens'])
#Filter out tokens that appear in fewer than 5 speeches, and tokens that appear in more than NO_ABOVE (a fraction of all speeches, set in the constants cell) since they are too general. Keep at most the 1000 most frequent remaining tokens
token_dict.filter_extremes(no_below=5,no_above=NO_ABOVE,keep_n=1000)

#Convert token counts into bag of words (BoW) corpus
corpus = [token_dict.doc2bow(speech) for speech in data['tokens']]

In [8]:
# This grid search does not need to run every time; it is wrapped in a string literal (effectively commented out) to save time
'''
#Construct and train unsupervised LDA model + Determine optimal number of topics
umtopics, umscore = [], []
cvtopics, cvscore = [], []
#Compute coherence score using C_umass:
for i in range(3,15,1):
    model = LdaMulticore(corpus=corpus, id2word=token_dict, iterations=50, num_topics=i, workers=4, passes=10, random_state=47)
    #LdamultiCore uses multiple cores to speed up model training, use with caution if you have a weaker PC! (Find your max number of cores with ctrl+shift+esc, under CPU)
    cm = CoherenceModel(model=model, corpus=corpus, dictionary=token_dict, coherence='u_mass')

    umtopics.append(i)
    umscore.append(cm.get_coherence())
#Compute coherence score using C_v: 
for i in range (3,15,1):
    model = LdaMulticore(corpus=corpus, id2word=token_dict, iterations=10, num_topics=i, workers = 4, passes=10, random_state=47)
    cm = CoherenceModel(model=model, texts = data['tokens'], corpus=corpus, dictionary=token_dict, coherence='c_v')

    cvtopics.append(i)
    cvscore.append(cm.get_coherence())
#The difference in coherence score measures is the method in which the text is segmented and probability is calculated
#Adjustable threshold for visualising with red vertical lines
threshold=9
fig, (ax1, ax2) = plt.subplots(1,2)
fig.suptitle('Coherence score by topic count using C_umass and C_v measure')
fig.subplots_adjust(wspace=0.4)

ax1.plot(umtopics,umscore)
ax1.set_xlabel('Number of Topics')
ax1.set_ylabel('Coherence Score (C_umass)')
ax1.axvline(x=threshold,c='red')

ax2.plot(cvtopics,cvscore)
ax2.set_xlabel('Number of Topics')
ax2.set_ylabel('Coherence Score (C_v)')
ax2.axvline(x=threshold,c='red')
#For both scores, higher values are better. Choice of topic count is subjective, but both scores must be taken into account.
plt.show()
'''

"\n#Construct and train unsupervised LDA model + Determine optimal number of topics\numtopics, umscore = [], []\ncvtopics, cvscore = [], []\n#Compute coherence score using C_umass:\nfor i in range(3,15,1):\n    model = LdaMulticore(corpus=corpus, id2word=token_dict, iterations=50, num_topics=i, workers=4, passes=10, random_state=47)\n    #LdamultiCore uses multiple cores to speed up model training, use with caution if you have a weaker PC! (Find your max number of cores with ctrl+shift+esc, under CPU)\n    cm = CoherenceModel(model=model, corpus=corpus, dictionary=token_dict, coherence='u_mass')\n\n    umtopics.append(i)\n    umscore.append(cm.get_coherence())\n#Compute coherence score using C_v: \nfor i in range (3,15,1):\n    model = LdaMulticore(corpus=corpus, id2word=token_dict, iterations=10, num_topics=i, workers = 4, passes=10, random_state=47)\n    cm = CoherenceModel(model=model, texts = data['tokens'], corpus=corpus, dictionary=token_dict, coherence='c_v')\n\n    cvtopics.app

In [9]:
'''
um = pd.DataFrame({'number of topics': umtopics, 'score':umscore})
um['method'] = 'umass'
cv = pd.DataFrame({'number of topics': cvtopics, 'score':cvscore})
cv['method'] = 'cv'
coherence = pd.concat([um,cv])

fig = px.line(coherence, x="number of topics", y="score", facet_row="method", width=600, 
              title='Coherence score by topic count using C_umass and C_v measure')
fig.update_yaxes(matches=None)
fig.show()
'''

'\num = pd.DataFrame({\'number of topics\': umtopics, \'score\':umscore})\num[\'method\'] = \'umass\'\ncv = pd.DataFrame({\'number of topics\': cvtopics, \'score\':cvscore})\ncv[\'method\'] = \'cv\'\ncoherence = pd.concat([um,cv])\n\nfig = px.line(coherence, x="number of topics", y="score", facet_row="method", width=600, \n              title=\'Coherence score by topic count using C_umass and C_v measure\')\nfig.update_yaxes(matches=None)\nfig.show()\n'

In [10]:
#fig.write_image("./plots/coherence_score_by_topic_num.png", format='png', engine='kaleido')

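Based on the coherence scores, the best choice is usually the topic count with the maximum score. In this case we choose 6 topics, since that is the maximum of the C_v score while also having a relatively high C_umass score. (Note: the models trained below use 8 topics, the value of NUM_TOPICS; the coherence section at the end of this notebook finds that eight topics gives the best rank across four coherence measures.)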

In [11]:
#Construct final model with NUM_TOPICS topics (Increase iterations and passes because it is the final model)
# random_state is fixed so that repeated runs start from the same seed.
finalmodel = LdaMulticore(corpus=corpus, 
                          id2word=token_dict, 
                          iterations=100, 
                          num_topics=NUM_TOPICS, 
                          workers = 4, 
                          passes=100, 
                          random_state=47)

# Show the highest-weighted words of each topic.
finalmodel.print_topics()

[(0,
  '0.017*"energy" + 0.017*"business" + 0.015*"company" + 0.011*"oil" + 0.011*"economic" + 0.010*"financial" + 0.009*"crisis" + 0.009*"clean" + 0.009*"industry" + 0.008*"plan"'),
 (1,
  '0.016*"peace" + 0.015*"freedom" + 0.012*"democracy" + 0.011*"free" + 0.010*"human" + 0.010*"citizen" + 0.009*"europe" + 0.008*"generation" + 0.007*"century" + 0.007*"common"'),
 (2,
  '0.014*"progress" + 0.012*"africa" + 0.012*"partner" + 0.012*"region" + 0.010*"human" + 0.010*"global" + 0.010*"partnership" + 0.009*"asia" + 0.008*"democracy" + 0.008*"trade"'),
 (3,
  '0.017*"gun" + 0.016*"intelligence" + 0.012*"protect" + 0.012*"national" + 0.010*"court" + 0.009*"enforcement" + 0.009*"attack" + 0.008*"public" + 0.008*"terrorist" + 0.008*"review"'),
 (4,
  '0.022*"health" + 0.021*"care" + 0.015*"tax" + 0.015*"pay" + 0.013*"insurance" + 0.013*"business" + 0.011*"reform" + 0.010*"cut" + 0.010*"college" + 0.009*"education"'),
 (5,
  '0.031*"iraq" + 0.023*"military" + 0.020*"terrorist" + 0.017*"afghanis

In [None]:
#Construct the final model with a fixed 8 topics (Increase iterations and passes because it is the final model)
# NOTE(review): num_topics is hard-coded to 8 here rather than taken from NUM_TOPICS.
# While NUM_TOPICS == 8 this repeats the training of `finalmodel` with identical
# arguments; the rest of the notebook uses finalmodel_8.
finalmodel_8 = LdaMulticore(corpus=corpus, 
                          id2word=token_dict, 
                          iterations=100, 
                          num_topics=8, 
                          workers = 4, 
                          passes=100, 
                          random_state=47)

# Show the highest-weighted words of each topic.
finalmodel_8.print_topics()

In [None]:
# Tabulate the topic strings; the topic-id column ('del') is not needed.
save_model = pd.DataFrame(
    finalmodel_8.print_topics(), columns=['del', 'topic']
).drop(columns='del')
#save_model.to_csv('./topics/topics_8.csv', index=False)

In [None]:
finalmodel_8.get_topics() # shape: num_topics x vocabulary_size

In [None]:
[topic for num, topic in finalmodel_8.show_topics(num_topics=8, num_words=8,formatted=True)]

In [None]:
x = finalmodel_8.show_topics(num_topics=8, num_words=8, formatted=False)
# Keep only the words of each topic and drop their weights.
topics_words = [
    (topic_id, [word for word, _weight in word_weights])
    for topic_id, word_weights in x
]
# Print one "topic::[words]" line per topic, followed by a blank line.
for topic_id, word_list in topics_words:
    print(f"{topic_id}::{word_list}")
print()

In [None]:
#Checking for first speech - DNC keynote speech 2004
finalmodel_8[corpus][0]
#Main topic is Topic 3, which seems to be some sort of mixed bag. The speech itself covers his personal life and the democratic party.

What follows is an advanced, interactive visualisation of the topics. Each circle represents a topic; hovering over a circle shows, in the bars on the right, the frequency of the words that appear in that topic. Circles that lie closer together represent more closely related topics, which is logical since political arguments correlate highly with voting campaigns. The circles are positioned by projecting the distances between topics onto two dimensions (pyLDAvis uses principal coordinate analysis, PCoA, by default).

### The topic numbers and order in the plot below do NOT correspond to the topic numbers in print_topics

In [None]:
#       vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
lda_display = pyLDAvis.gensim_models.prepare(topic_model=finalmodel_8, corpus=corpus, dictionary=token_dict)
pyLDAvis.display(lda_display)

In [None]:
# Vocabulary listed in token-id order, so that label j belongs to column j of
# finalmodel_8.get_topics(). Indexing by id makes that order explicit instead
# of relying on the iteration order of dict(token_dict).
columns = [token_dict[token_id] for token_id in range(len(token_dict))]

In [None]:
# num topics x vocab size. finalmodel_8 is built with 8 topics and the vocabulary
# is capped at 1000 tokens (keep_n), so at most 8 x 1000 (an earlier note said 9).
topic_vals = pd.DataFrame(finalmodel_8.get_topics(), columns=columns)
topic_vals.shape

In [None]:
topic_by_doc=list(finalmodel_8[corpus])

In [None]:
#[doc][topic][0-id] or [1-value]
# Build a dense matrix: one row per document, one column per topic. Topics the
# model did not report for a document keep the value 0.
# The topic count is read from the model once; print_topics() lists at most 20
# topics by default and was previously re-evaluated on every loop iteration.
n_topics = finalmodel_8.num_topics
doc_topics = []
for topics_in_doc in topic_by_doc:
    topics_for_doc = [0] * n_topics
    for topic_id, weight in topics_in_doc:
        topics_for_doc[topic_id] = weight
    doc_topics.append(topics_for_doc)

In [None]:
# doc_topics is num_docs x num_topics: with the 743 documents loaded above and
# the 8 topics of finalmodel_8 that is 743 rows x 8 cols (an earlier note said 9)
doc_topics = pd.DataFrame(doc_topics)
doc_topics.shape

In [None]:
docs_words = doc_topics.dot(topic_vals)

In [None]:
# sum for each document should by 1-ish
docs_words.sum(axis=1) # 0-rows, 1-cols

### Load just the 101 speeches used for comparison

In [None]:
# Load up the files

paths = ['./speeches/']

speeches = []
# Paths of the files that were actually kept; index-aligned with `speeches`.
list_of_files = []

for path in paths:
    for root, dirs, files in os.walk(path):
        for file in files:
            if not file.endswith('.txt'):
                continue
            file_path = os.path.join(root, file)
            # The with-block closes the file on exit, so no explicit close() is needed.
            with open(file_path, encoding='utf-8') as f:
                text = f.read()
            # Skip empty files before either list grows. Previously empty texts
            # were dropped from `speeches` only, which left `list_of_files`
            # longer than `speeches` for the DataFrame built from both.
            if len(text) > 0:
                list_of_files.append(file_path)
                speeches.append(text)

#clean out goofy unicode  space characters 
speeches = [unicodedata.normalize("NFKD", speech) for speech in speeches]
#clean out xa0 space characters (the result is now assigned back; it used to be discarded)
speeches = [speech.replace(u'\xa0', '') for speech in speeches]
# remove [stuff] in between square brackets
def remove_bracket(text):
    r"""Strip square-bracketed annotations such as "[Applause] " from text.

    Every "[...]" group that is followed by a whitespace character is removed
    together with that one whitespace character; everything else is returned
    unchanged.

    The previous pattern used ``[^w]*`` between the brackets. That class means
    "any character except the letter w", so brackets containing a "w" were
    never removed, and a match could run from the first "[" to a much later
    "]", deleting the speech text in between. ``[^\]]*`` stops at the first
    closing bracket.
    """
    return re.sub(r'\[[^\]]*\]\s', '', text)
speeches = [remove_bracket(speech) for speech in speeches]
# Clean up whitespace: collapse every run of whitespace into a single space.
# The old pattern r'[\s+]' was a character class, so it replaced each
# whitespace character one-for-one and also turned '+' signs into spaces.
speeches = [re.sub(r'\s+', ' ', speech) for speech in speeches]

# The first 10 characters of each file name are parsed as the speech date
# (YYYY-MM-DD). Taking them from the base name rather than from a fixed offset
# into the full path keeps this correct for files in sub-directories.
date_text = [os.path.basename(file)[:10] for file in list_of_files]

df = pd.DataFrame({'date' : date_text,
                   'file' : list_of_files,
                   'text' : speeches})
df.date = pd.to_datetime(df.date, format='%Y-%m-%d')
speeches101 = df

### Load the 400+ American Rhetoric speeches

In [None]:
# Load up the files
#path = './DataUCSB/' # Smaller UCSB dataset
#path = './Data/' # larger American Rhetoric dataset
paths = ['./Data/']

speeches = []
# Paths of the files that were actually kept; index-aligned with `speeches`.
list_of_files = []

for path in paths:
    for root, dirs, files in os.walk(path):
        for file in files:
            if not file.endswith('.txt'):
                continue
            file_path = os.path.join(root, file)
            # The with-block closes the file on exit, so no explicit close() is needed.
            with open(file_path, encoding='utf-8') as f:
                text = f.read()
            # Skip empty files before either list grows. Previously empty texts
            # were dropped from `speeches` only, which left `list_of_files`
            # longer than `speeches` for the DataFrame built from both.
            if len(text) > 0:
                list_of_files.append(file_path)
                speeches.append(text)

#clean out goofy unicode  space characters 
speeches = [unicodedata.normalize("NFKD", speech) for speech in speeches]
#clean out xa0 space characters (the result is now assigned back; it used to be discarded)
speeches = [speech.replace(u'\xa0', '') for speech in speeches]
# remove [stuff] in between square brackets
def remove_bracket(text):
    r"""Strip square-bracketed annotations such as "[Applause] " from text.

    Every "[...]" group that is followed by a whitespace character is removed
    together with that one whitespace character; everything else is returned
    unchanged.

    The previous pattern used ``[^w]*`` between the brackets. That class means
    "any character except the letter w", so brackets containing a "w" were
    never removed, and a match could run from the first "[" to a much later
    "]", deleting the speech text in between. ``[^\]]*`` stops at the first
    closing bracket.
    """
    return re.sub(r'\[[^\]]*\]\s', '', text)
speeches = [remove_bracket(speech) for speech in speeches]
# Clean up whitespace: collapse every run of whitespace into a single space.
# The old pattern r'[\s+]' was a character class, so it replaced each
# whitespace character one-for-one and also turned '+' signs into spaces.
speeches = [re.sub(r'\s+', ' ', speech) for speech in speeches]

df = pd.DataFrame({'file' : list_of_files,
                   'text' : speeches})
# datetitle.csv supplies the date for each file; its 'url' column holds the
# file path and is renamed so it can serve as the merge key.
datetitle = pd.read_csv('datetitle.csv')
#datetitle.url = [file.replace('Data/', './Data/') for file in datetitle.url]
datetitle.date = pd.to_datetime(datetitle.date, format='%Y-%m-%d')
datetitle = datetitle.drop('title', axis=1)
datetitle = datetitle.rename(columns={'url': 'file'})
# Inner merge: files without a row in datetitle.csv are dropped.
df = pd.merge(df, datetitle, how='inner', on='file')
df = df.sort_values(by='date', ignore_index=True)
# .copy() makes amrhet an independent frame, so the column assignments made to
# it later do not act on a slice of df (SettingWithCopyWarning).
amrhet = df[['date', 'file', 'text']].copy()

In [None]:
nlp = en_core_web_md.load()
def prepare_text(text_list, dictionary=None):
    """Tokenize texts with spaCy and convert them to a bag-of-words corpus.

    Each text is run through the module-level `nlp` pipeline. Tokens are
    reduced to their lower-cased lemma; tokens that are non-alphabetic, spaCy
    stop words, or tagged with one of the excluded parts of speech are dropped.

    Parameters
    ----------
    text_list : pandas.Series of str
        One document per element.
    dictionary : gensim Dictionary, optional
        Dictionary used to turn tokens into ids. Pass the dictionary a model
        was trained with (its id2word) when the resulting corpus will be fed
        to that model, so that the token ids agree. When None (default), a new
        Dictionary is built from `text_list` and filtered; its ids are then
        specific to this call.

    Returns
    -------
    list
        [token_dict, corpus]: the dictionary that was used and one
        bag-of-words (list of (token_id, count) pairs) per document.
    """
    #Tags to remove
    extags = ['PRON','CCONJ','PUNCT','PART','DET','ADP','NUM','SYM','SPACE']
    docs = text_list.apply(nlp)
    #SpaCy tokenization + lemmatization + lowercase
    tokens = [
        [token.lemma_.lower() for token in speech
         if token.pos_ not in extags and not token.is_stop and token.is_alpha]
        for speech in docs
    ]

    if dictionary is None:
        token_dict = Dictionary(tokens)
        #Filter out tokens that appear in fewer than 5 speeches, and tokens that appear in more than
        #NO_ABOVE (a fraction of all speeches) since they are too general. Keep at most the 1000 most frequent tokens
        token_dict.filter_extremes(no_below=5,no_above=NO_ABOVE,keep_n=1000)
    else:
        token_dict = dictionary

    #Convert token counts into bag of words (BoW) corpus
    corpus = [token_dict.doc2bow(speech) for speech in tokens]

    return([token_dict, corpus])

In [None]:
# Optionally strip stop words, then tokenize both comparison datasets.
if REM_SW:
    amrhet.text=rem_stop_words(amrhet.text)
    speeches101.text=rem_stop_words(speeches101.text)
# NOTE(review): prepare_text builds a fresh Dictionary for each dataset, so the
# token ids in amrhet_corpus / speeches_corpus are NOT the ids of token_dict
# (the id2word dictionary finalmodel_8 was trained with). The ids must be
# translated before these corpora are passed to that model.
amrhet_token_dict, amrhet_corpus = prepare_text(amrhet.text)
speeches_token_dict, speeches_corpus = prepare_text(speeches101.text)

In [None]:
def _to_model_bow(bow, source_dict):
    """Re-express a bag-of-words in the token ids of `token_dict`.

    `bow` holds (token_id, count) pairs whose ids belong to `source_dict`.
    finalmodel_8 was trained with id2word=token_dict, so it interprets every
    id as a token_dict id. Each id is therefore translated through its token
    string; tokens that token_dict does not contain are dropped.
    """
    translated = []
    for token_id, count in bow:
        model_id = token_dict.token2id.get(source_dict[token_id])
        if model_id is not None:
            translated.append((model_id, count))
    return sorted(translated)


# Infer the topic mixture of every document once and reuse the result for both
# the plain lists and the DataFrame columns (previously inference ran twice,
# on corpora whose ids came from a different dictionary than the model's).
amrhet_topics = [finalmodel_8[_to_model_bow(bow, amrhet_token_dict)] for bow in amrhet_corpus]
speeches101_topics = [finalmodel_8[_to_model_bow(bow, speeches_token_dict)] for bow in speeches_corpus]
amrhet['topics'] = amrhet_topics
speeches101['topics'] = speeches101_topics

In [None]:
# Convert topic list to matrix of values to put in a data frame
def topic_to_matrix(topic_by_doc, num_topics=None):
    """Convert per-document topic lists into a dense list-of-lists matrix.

    Parameters
    ----------
    topic_by_doc : sequence
        One entry per document; each entry is an iterable of
        (topic_id, value) pairs.
    num_topics : int, optional
        Number of columns. Defaults to finalmodel_8.num_topics. The previous
        code used len(finalmodel_8.print_topics()), which lists at most 20
        topics by default and was re-evaluated for every document.

    Returns
    -------
    list of list
        One row per document and one column per topic; topics that are absent
        from a document's list keep the value 0.
    """
    if num_topics is None:
        num_topics = finalmodel_8.num_topics
    doc_topics = []
    for topics_in_doc in topic_by_doc:
        row = [0] * num_topics
        for topic_id, value in topics_in_doc:
            row[topic_id] = value
        doc_topics.append(row)
    return doc_topics

In [None]:
amrhet_topic_mat = pd.DataFrame(topic_to_matrix(amrhet_topics))
speeches101_topic_mat = pd.DataFrame(topic_to_matrix(speeches101_topics))

In [None]:
amrhet = pd.concat([amrhet,amrhet_topic_mat], axis=1)
speeches101 = pd.concat([speeches101,speeches101_topic_mat], axis=1)

In [None]:
# Leave commented as to not accidently overwrite something important
#amrhet.to_csv('topics_amrhet_oba.csv', index=False)
#speeches101.to_csv('topics_speeches_oba.csv', index=False)

In [None]:
tidy = pd.read_csv('tidy_data_oba.csv')
tidy.date = pd.to_datetime(tidy.date, format='%Y-%m-%d')

In [None]:
ar = speeches101.drop(['file', 'text', 'topics'], axis=1)

In [None]:
# Human-readable topic names, read from the topic_name column of the CSV.
topics = pd.read_csv('./topics/topics_8.csv')
topic_lst = list(topics.topic_name)
# Column labels for `ar`: 'date' followed by one label per topic.
col_lst = ['date'] + topic_lst
ar.columns = col_lst
topic_lst

In [None]:
tidy_topic = pd.merge(tidy, ar, how='left', on='date')

In [None]:
#tidy_topic.to_csv('tidy_topic_oba.csv', index=False)

In [None]:
correlation = tidy_topic.corr(numeric_only=True)

In [None]:
topic_corr = correlation.loc['ADJ':'chars_per_sent_std', 'economy':'civil rights']

In [None]:
topic_corr.columns = list(topics.topic_name)

In [None]:
def highlight_max(x, threshold=0.30):
    """Styler callback that bolds every value with a large absolute value.

    Despite its name, this does not mark the maximum: it returns a bold style
    for each entry whose absolute value is strictly greater than `threshold`.

    Parameters
    ----------
    x : iterable of numbers
        The values of one column or row, as passed by Styler.apply.
    threshold : float, optional
        Cut-off for the absolute value; defaults to 0.30.

    Returns
    -------
    list of str
        'font-weight: bold' or '' for each value, in order.
    """
    return ['font-weight: bold' if abs(v) > threshold else ''
                for v in x]

topic_corr.style.apply(highlight_max)

In [None]:
fig = px.scatter(tidy_topic, x="intl relations", y="NUM")
fig.show()

In [None]:
fig = px.scatter(tidy_topic, x="middle east", y="syl_per_word",
                title="Topic: Middle East vs Syllables per word")
fig.show()

In [None]:
fig.write_image("./plots/middle_east_vs_syllables_per_word.png", format='png', engine='kaleido')

In [None]:
# Scatter of the "middle east" topic weight against the SMOG score
# (figure title typo "readbility" corrected).
fig = px.scatter(tidy_topic, x="middle east", y="smog",
                title="Topic: Middle East vs SMOG readability index")
fig.show()

In [None]:
fig.write_image("./plots/middle_east_vs_SMOG.png", format='png', engine='kaleido')

In [None]:
import plotly.express as px
fig = px.scatter(tidy_topic, x="intl relations", y="anger",
                 labels = {'intl relations':'international relations'},
                title="International relations vs anger")
fig.show()

In [None]:
fig.write_image("./plots/scatter_internationalrelations_vs_anger.png", format='png', engine='kaleido')

In [None]:
nlp = spacy.load('en_core_web_md')
# remove [stuff] in between square brackets
def remove_bracket(text):
    r"""Strip square-bracketed annotations such as "[Applause] " from text.

    Every "[...]" group that is followed by a whitespace character is removed
    together with that one whitespace character; everything else is returned
    unchanged.

    The previous pattern used ``[^w]*`` between the brackets. That class means
    "any character except the letter w", so brackets containing a "w" were
    never removed, and a match could run from the first "[" to a much later
    "]", deleting the speech text in between. ``[^\]]*`` stops at the first
    closing bracket.
    """
    return re.sub(r'\[[^\]]*\]\s', '', text)
amrhet.text = amrhet.text.apply(remove_bracket)
def get_encodings(text):
    """Return the spaCy document vector of `text` as a list of its components."""
    doc = nlp(text)
    return list(doc.vector)
amrhet['enc'] = amrhet.text.apply(get_encodings)
pca_data = pd.DataFrame(amrhet['enc'].to_list(), index=amrhet.date)

In [None]:
#pca_data.to_csv('amrhet_spacy_encodings.csv')

In [None]:
# remove [stuff] in between square brackets
def remove_bracket(text):
    r"""Strip square-bracketed annotations such as "[Applause] " from text.

    Every "[...]" group that is followed by a whitespace character is removed
    together with that one whitespace character; everything else is returned
    unchanged.

    The previous pattern used ``[^w]*`` between the brackets. That class means
    "any character except the letter w", so brackets containing a "w" were
    never removed, and a match could run from the first "[" to a much later
    "]", deleting the speech text in between. ``[^\]]*`` stops at the first
    closing bracket.
    """
    return re.sub(r'\[[^\]]*\]\s', '', text)
speeches101.text = speeches101.text.apply(remove_bracket)
def get_encodings(text):
    """Return the spaCy document vector of `text` as a list of its components."""
    doc = nlp(text)
    return list(doc.vector)
speeches101['enc'] = speeches101.text.apply(get_encodings)
pca_data2 = pd.DataFrame(speeches101['enc'].to_list(), index=speeches101.date)

In [None]:
#pca_data2.to_csv('speeches_spacy_encodings.csv')

### Coherence revisited - investigate all four methods

In [None]:
#Construct and train unsupervised LDA model + Determine optimal number of topics
umtopics, umscore = [], []
cvtopics, cvscore = [], []
ucitopics, uciscore = [], []
npmitopics, npmiscore = [], []

# The three text-based measures are scored on one shared model per topic count.
# Previously an LdaMulticore with identical arguments was trained separately
# for each of them, i.e. three times per topic count.
text_measures = (
    ('c_v', cvtopics, cvscore),
    ('c_uci', ucitopics, uciscore),
    ('c_npmi', npmitopics, npmiscore),
)

for i in range(3,15,1):
    #Compute coherence score using C_umass:
    # NOTE(review): this model uses iterations=50 while the model for the other
    # measures uses iterations=10 (kept as in the original) - confirm intended.
    model = LdaMulticore(corpus=corpus, id2word=token_dict, iterations=50, num_topics=i, workers=4, passes=10, random_state=47)
    #LdamultiCore uses multiple cores to speed up model training, use with caution if you have a weaker PC! (Find your max number of cores with ctrl+shift+esc, under CPU)
    cm = CoherenceModel(model=model, corpus=corpus, dictionary=token_dict, coherence='u_mass')
    umtopics.append(i)
    umscore.append(cm.get_coherence())

    #Compute coherence score using C_v, C_uci and C_npmi:
    #The difference in coherence score measures is the method in which the text is segmented and probability is calculated
    model = LdaMulticore(corpus=corpus, id2word=token_dict, iterations=10, num_topics=i, workers = 4, passes=10, random_state=47)
    for measure, topic_list, score_list in text_measures:
        cm = CoherenceModel(model=model, texts = data['tokens'], corpus=corpus, dictionary=token_dict, coherence=measure)
        topic_list.append(i)
        score_list.append(cm.get_coherence())

In [None]:
#Adjustable threshold for visualising with red vertical lines
threshold=9
# One panel per coherence measure: (axis label, topic counts, scores).
panels = [
    ('C_umass', umtopics, umscore),
    ('C_v', cvtopics, cvscore),
    ('C_uci', ucitopics, uciscore),
    ('C_npmi', npmitopics, npmiscore),
]
# A wider figure gives the four side-by-side panels room for their labels.
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
# The title now names all four measures (it used to mention only two).
fig.suptitle('Coherence score by topic count using C_umass, C_v, C_uci and C_npmi measures')
fig.subplots_adjust(wspace=0.4)

for ax, (label, topic_counts, scores) in zip(axes, panels):
    ax.plot(topic_counts, scores)
    ax.set_xlabel('Number of Topics')
    ax.set_ylabel(f'Coherence Score ({label})')
    ax.axvline(x=threshold, c='red')

#For all four scores, higher values are better. Choice of topic count is subjective, but all scores should be taken into account.
plt.show()

In [None]:
# One long-format frame per coherence method, tagged with the method name ...
um = pd.DataFrame({'number of topics': umtopics, 'score': umscore}).assign(method='umass')
cv = pd.DataFrame({'number of topics': cvtopics, 'score': cvscore}).assign(method='cv')
uci = pd.DataFrame({'number of topics': ucitopics, 'score': uciscore}).assign(method='uci')
npmi = pd.DataFrame({'number of topics': npmitopics, 'score': npmiscore}).assign(method='npmi')
# ... stacked into a single frame for plotting and ranking.
coherence = pd.concat([um, cv, uci, npmi])

In [None]:
fig = px.line(coherence, x="number of topics", y="score", facet_row="method", width=600, 
              title='Coherence score by topic count using 4 methods')
fig.update_yaxes(matches=None)
fig.show()

In [None]:
fig.write_image("./plots/coherence_scores_4_methods.png", format='png', engine='kaleido')

In [None]:
# Rank the topic counts within each coherence method (1 = highest score).
# Only the 'score' column is ranked; the old call ranked every column and then
# picked out .score.
coherence['rank'] = coherence.groupby('method')['score'].rank(ascending=False)
# Sum of ranks per topic count (lower is better). Selecting ['rank'] replaces
# the old sum('rank'), which passed 'rank' as the numeric_only argument and
# only worked because a non-empty string is truthy. as_index=False keeps
# 'number of topics' as a regular column for plotting.
rank = coherence.groupby('number of topics', as_index=False)['rank'].sum()

Eight topics gives the best (lowest) sum of ranks across the four coherence measures.

In [None]:
fig = px.bar(rank, x="number of topics", y="rank", width=600, 
             labels={'rank':'sum of ranks'},
             title='Sum of coherence ranks for the four methods')
fig.show()

In [None]:
fig.write_image("./plots/coherence_score_rank.png", format='png', engine='kaleido')

In [None]:
finalmodel_8.show_topics(num_topics=8, num_words=20,formatted=True)[0] # economy

In [None]:
finalmodel_8.show_topics(num_topics=8, num_words=20,formatted=True)[7] # civil rights