# Phrase Modelling Part III - Analyzing the Models

In [1]:
import os
from gensim import corpora, models, utils
from pprint import pprint
import pandas as pd
import time

## Load data

Load dictionaries

In [2]:
path = os.getcwd() + '/tmp/'

sanctity_dict = corpora.Dictionary.load(path + 'sanctity_dict.dict')
degradation_dict = corpora.Dictionary.load(path + 'degradation_dict.dict')
fairness_dict = corpora.Dictionary.load(path + 'fairness_dict.dict')
cheating_dict = corpora.Dictionary.load(path + 'cheating_dict.dict')

Load corpora

In [3]:
path = os.getcwd() + '/tmp/'

sanctity_corpus = corpora.MmCorpus(path + 'sanctity_corpus.mm')
degradation_corpus = corpora.MmCorpus(path + 'degradation_corpus.mm')
fairness_corpus = corpora.MmCorpus(path + 'fairness_corpus.mm')
cheating_corpus = corpora.MmCorpus(path + 'cheating_corpus.mm')

Load models

In [4]:
path = os.getcwd() + '/tmp/'

sanctity_lda = models.ldamodel.LdaModel.load(path + 'sanctity_lda_model')
degradation_lda = models.ldamodel.LdaModel.load(path + 'degradation_lda_model')
fairness_lda = models.ldamodel.LdaModel.load(path + 'fairness_lda_model')
cheating_lda = models.ldamodel.LdaModel.load(path + 'cheating_lda_model')

In [5]:
pprint(degradation_lda.print_topics())

[(0,
  '0.026*"cases" + 0.014*"covid" + 0.009*"black" + 0.008*"plans" + '
  '0.008*"best" + 0.008*"coronavirus" + 0.008*"man" + 0.006*"iran" + '
  '0.006*"democrats" + 0.006*"supreme_court"'),
 (1,
  '0.016*"police" + 0.012*"oil" + 0.011*"coronavirus" + 0.010*"uk" + '
  '0.010*"stock" + 0.009*"market" + 0.008*"update" + 0.007*"hit" + '
  '0.007*"covid" + 0.007*"quarter"'),
 (2,
  '0.021*"china" + 0.021*"virus" + 0.014*"coronavirus" + 0.013*"amid" + '
  '0.013*"stocks" + 0.012*"million" + 0.010*"deal" + 0.009*"global" + '
  '0.009*"surge" + 0.008*"eu"'),
 (3,
  '0.029*"new" + 0.026*"trump" + 0.024*"says" + 0.017*"coronavirus" + '
  '0.008*"covid" + 0.007*"state" + 0.006*"report" + 0.005*"billion" + '
  '0.005*"india" + 0.005*"pandemic"')]


In [6]:
start = time.time()

def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(sanctity_lda, sanctity_corpus, sanctity_dict)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

end = time.time()
print("Elapsed:", end - start, 'seconds')

Elapsed: 1991.9742341041565 seconds


In [11]:
df_dominant_topic.head(100)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,3.0,0.3091,"china, pandemic, coronavirus, amid, covid, time, global, million, stock, people",american
1,1,3.0,0.3068,"china, pandemic, coronavirus, amid, covid, time, global, million, stock, people",bad
2,2,3.0,0.3038,"china, pandemic, coronavirus, amid, covid, time, global, million, stock, people",baghdadi
3,3,3.0,0.3062,"china, pandemic, coronavirus, amid, covid, time, global, million, stock, people",death
4,4,3.0,0.3062,"china, pandemic, coronavirus, amid, covid, time, global, million, stock, people",mean
...,...,...,...,...,...
95,95,3.0,0.3078,"china, pandemic, coronavirus, amid, covid, time, global, million, stock, people",mining
96,96,3.0,0.3110,"china, pandemic, coronavirus, amid, covid, time, global, million, stock, people",nevada
97,97,3.0,0.3093,"china, pandemic, coronavirus, amid, covid, time, global, million, stock, people",protect
98,98,3.0,0.3103,"china, pandemic, coronavirus, amid, covid, time, global, million, stock, people",rare


In [7]:
start = time.time()

# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(10)

end = time.time()
print("Elapsed:", end - start, 'seconds')

Elapsed: 0.08580493927001953 seconds


In [10]:
sent_topics_sorteddf_mallet.head(10)


Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,3.0,0.3275,"china, pandemic, coronavirus, amid, covid, time, global, million, stock, people",reinterpret
