# Topic Model

This code has been adapted from the article "Topic Modeling with Gensim (Python)" by Selva Prabhakaran, available at https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/

In [20]:
# Get Libraries
import nltk # Stopwords and other NLP
from nltk.corpus import stopwords
import re #Regex
import numpy as np #Math
import pandas as pd #Dataframe
from pprint import pprint
import spacy #Lemmatization

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

## Model & Data Prep

In [4]:
# Get the English stopword list from NLTK; remove_stopwords() filters tokens against it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded already — confirm.
stop_words = stopwords.words('english')

In [5]:
# Import sentiment dataframe (one row per sentence; the Sentence column holds
# the text that is topic-modeled below)
df = pd.read_csv("20201117_AM_Sentiment.csv") 
df.head()

Unnamed: 0.1,Unnamed: 0,Sentence,fileid,Sequence,Sentiment
0,0,baltimore 20 september 1836 dear heinrich: fr...,S10003-D023.txt,1,0.7263
1,1,it was a long and arduous voyage!,S10003-D023.txt,2,0.0
2,2,however we were and are all well and thank the...,S10003-D023.txt,3,0.8777
3,3,i had written down the events of the voyage fo...,S10003-D023.txt,4,0.2263
4,4,"as you know, we set sail on 12 july.",S10003-D023.txt,5,0.0


In [6]:
# Pull the Sentence column out as a plain Python list and preview the first five entries
data = df["Sentence"].tolist()
data[:5]

[' baltimore 20 september 1836 dear heinrich: friday evening, 16 september, anchor was dropped and we had safely arrived in the harbor!',
 'it was a long and arduous voyage!',
 'however we were and are all well and thank the father in heaven that he has protected us so far!',
 'i had written down the events of the voyage for you, but to my great annoyance i am now missing the whole notebook; perhaps i will find it later.',
 'as you know, we set sail on 12 july.']

In [7]:
# Function to tokenize 
def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Every item is converted with ``str()`` first, so non-string values do not
    raise.  ``deacc=True`` strips accent marks; punctuation is discarded by
    the tokenizer itself.

    Yields:
        One list of tokens per input sentence, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(text), deacc=True) for text in sentences)

# Materialize the generator so the tokenized sentences can be reused by later cells
data_words = list(sent_to_words(data))

# Peek at the first five tokenized sentences
print(data_words[:5])

[['baltimore', 'september', 'dear', 'heinrich', 'friday', 'evening', 'september', 'anchor', 'was', 'dropped', 'and', 'we', 'had', 'safely', 'arrived', 'in', 'the', 'harbor'], ['it', 'was', 'long', 'and', 'arduous', 'voyage'], ['however', 'we', 'were', 'and', 'are', 'all', 'well', 'and', 'thank', 'the', 'father', 'in', 'heaven', 'that', 'he', 'has', 'protected', 'us', 'so', 'far'], ['had', 'written', 'down', 'the', 'events', 'of', 'the', 'voyage', 'for', 'you', 'but', 'to', 'my', 'great', 'annoyance', 'am', 'now', 'missing', 'the', 'whole', 'notebook', 'perhaps', 'will', 'find', 'it', 'later'], ['as', 'you', 'know', 'we', 'set', 'sail', 'on', 'july']]


In [8]:
# Bigram model (no trigram model is built in this notebook).
# Phrases learns which adjacent token pairs to merge from data_words; per the
# gensim Phrases documentation, raising min_count or threshold yields fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=20, threshold=100)
# Phraser wraps the trained model; make_bigrams() applies it via bigram_mod[doc].
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [9]:
# Functions for stopwords, bigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token found in ``stop_words``.

    Each element of ``texts`` is passed through ``str()`` and
    ``simple_preprocess`` before filtering (in this notebook the elements are
    token lists).

    Args:
        texts: Iterable of documents.

    Returns:
        list[list[str]]: The surviving tokens of each document, in input order.
    """
    # Build the lookup set once per call: testing membership against the
    # module-level stopword *list* is a linear scan for every single token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Apply the trained phrase model ``bigram_mod`` to every document.

    Returns:
        list: The result of ``bigram_mod[doc]`` for each ``doc`` in ``texts``,
        in input order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only tokens with allowed POS tags.

    Each document's tokens are joined with single spaces and run through the
    module-level spaCy pipeline ``nlp``, which must be assigned before this
    function is called.  POS tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of token lists.
        allowed_postags: ``token.pos_`` values to keep.  The default list is
            only read, never mutated.

    Returns:
        list[list[str]]: For each input document, the lemmas of the kept tokens.
    """
    # nlp.pipe streams the documents through the pipeline in batches instead of
    # paying the overhead of a separate nlp(...) call per sentence; Docs are
    # yielded in input order, so the output lines up with ``texts``.
    docs = nlp.pipe(" ".join(sent) for sent in texts)
    return [[token.lemma_ for token in doc if token.pos_ in allowed_postags] for doc in docs]

In [10]:
# Remove stop words
data_words_nostops = remove_stopwords(data_words)

# Form bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy lemmatizer.
# lemmatization() reads `nlp` as a global, so it must be assigned before the call below.
# The parser and NER components are disabled; lemmatization() only reads
# token.lemma_ and token.pos_.
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams)

In [11]:
# Create dictionary (gensim Dictionary built from the lemmatized documents)
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus (alias only: `texts` is the same object as data_lemmatized)
texts = data_lemmatized

# Term Document Frequency: one bag-of-words vector per sentence, in the same
# order as `data`, so corpus[i] corresponds to data[i]
corpus = [id2word.doc2bow(text) for text in texts]

In [12]:
# Path to the Mallet launcher, relative to the working directory. Nothing is
# initialized here; the path is handed to gensim's LdaMallet wrapper when the
# model is trained.
# Obtained from http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = 'mallet-2.0.8/bin/mallet'

## Topics

In [14]:
# Run 9-topic model
# NOTE(review): no random seed is passed, so the topics (and their numbering)
# may differ between runs — confirm whether LdaMallet's seed argument should be set.
ldamallet_09 = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=9, id2word=id2word)

# Show Topics: with formatted=False each entry is (topic number, [(word, weight), ...])
pprint(ldamallet_09.show_topics(formatted=False))

# Compute Coherence Score ('c_v' measure, evaluated against the lemmatized texts)
coherence_model_ldamallet_09 = CoherenceModel(model=ldamallet_09, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet_09 = coherence_model_ldamallet_09.get_coherence()
print('Coherence Score: ', coherence_ldamallet_09)

[(0,
  [('day', 0.07479980811100041),
   ('leave', 0.036680320307022396),
   ('week', 0.027381084172847705),
   ('home', 0.024428945717554152),
   ('back', 0.022546957452304513),
   ('return', 0.021218495147422415),
   ('night', 0.01693789438724676),
   ('morning', 0.015757039005129342),
   ('room', 0.015277316506144138),
   ('hour', 0.014686888815085428)]),
 (1,
  [('place', 0.04003002251688766),
   ('house', 0.026376925551306336),
   ('mile', 0.024697094249258374),
   ('large', 0.020265198899174382),
   ('water', 0.015726080274491582),
   ('town', 0.01472532971156939),
   ('high', 0.013903284606311878),
   ('city', 0.013545873690982523),
   ('church', 0.013474391507916652),
   ('stand', 0.012545123128060331)]),
 (2,
  [('write', 0.06192957174001404),
   ('letter', 0.06122750618926209),
   ('send', 0.04870117873110889),
   ('long', 0.03857665447289658),
   ('time', 0.03798544137752651),
   ('hear', 0.03473376935299117),
   ('receive', 0.031888556331522745),
   ('answer', 0.01814285186

In [18]:
# Convert the Mallet-wrapper model into a gensim LDA model; the converted model
# is what the visualization and the per-sentence topic lookups below operate on.
mallet2ldaModel_09 = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(ldamallet_09)

## Evaluation

In [22]:
# Visualize the topics
pyLDAvis.enable_notebook()  # render the visualization inline in the notebook
# Prepare the visualization data from the converted model, the bag-of-words corpus and the dictionary
vis = pyLDAvis.gensim.prepare(mallet2ldaModel_09, corpus, id2word)
vis

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## Dominant topic in each sentence

In [24]:
def format_topics_sentences(model, corpus, texts):
    """Tabulate the dominant topic of every document alongside its text.

    Args:
        model: Topic model supporting ``model[corpus]`` (yielding, per document,
            an iterable of ``(topic_num, proportion)`` pairs) and
            ``model.show_topic(topic_num)`` (returning ``(word, weight)`` pairs).
        corpus: Corpus passed to ``model[...]``.
        texts: Original documents, in the same order as ``corpus``.

    Returns:
        pandas.DataFrame: Columns ``Dominant_Topic``, ``Perc_Contribution``
        (rounded to 4 decimals) and ``Topic_Keywords``, followed by an unnamed
        column (labelled ``0``) holding ``texts``.  Row ``i`` describes
        document ``i``.  A document with no topic assignment gets NaN/None in
        the topic columns so that later rows stay aligned with their text.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic_num -> "kw1, kw2, ..." (looked up once per topic)
    records = []

    # Get main topic in each document
    for row in model[corpus]:
        row = list(row)
        if not row:
            # Keep a placeholder row: silently skipping the document would
            # shift every later topic onto the wrong text in the concat below.
            records.append((float("nan"), float("nan"), None))
            continue

        # max() returns the first pair carrying the highest proportion
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            wp = model.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once; appending row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    sent_topics_df = pd.DataFrame(records, columns=column_names)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)

In [29]:
# Run the function: dominant topic per sentence, using the converted LDA model
# and the original (un-tokenized) sentence strings as the text column
df_topic_sents_keywords = format_topics_sentences(model=mallet2ldaModel_09, corpus=corpus, texts=data)

In [36]:
# Format output: reset_index() turns the row index into the first column,
# which is then labelled Document_No along with friendlier names for the rest
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# View
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,1.0,0.1525,"place, house, mile, large, water, town, high, ...",baltimore 20 september 1836 dear heinrich: fr...
1,1,6.0,0.142,"good, make, give, find, thing, hope, care, tur...",it was a long and arduous voyage!
2,2,7.0,0.1255,"work, man, great, call, school, order, person,...",however we were and are all well and thank the...
3,3,6.0,0.1575,"good, make, give, find, thing, hope, care, tur...",i had written down the events of the voyage fo...
4,4,0.0,0.1261,"day, leave, week, home, back, return, night, m...","as you know, we set sail on 12 july."
5,5,7.0,0.1408,"work, man, great, call, school, order, person,...","by the afternoon, when the pilot left us, one ..."
6,6,0.0,0.1566,"day, leave, week, home, back, return, night, m...",the next morning i felt ill and remained in be...
7,7,6.0,0.1285,"good, make, give, find, thing, hope, care, tur...",later i was always spared.
8,8,0.0,0.2015,"day, leave, week, home, back, return, night, m...",the wind was adverse from the very first eveni...
9,9,6.0,0.1424,"good, make, give, find, thing, hope, care, tur...",on the open sea things went fine in the beginn...


## Most representative sentence for each topic

In [37]:
# Keep the single most representative sentence (highest Perc_Contribution) for each topic
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# Collect one top row per topic and concatenate once, instead of growing a
# DataFrame inside the loop.
top_rows = [
    grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
    for _, grp in sent_topics_outdf_grpd
]

# pd.concat raises on an empty list, so fall back to an empty frame when there are no groups
sent_topics_sorteddf_mallet = pd.concat(top_rows, axis=0) if top_rows else pd.DataFrame()

In [51]:
# Reset Index (drop the original sentence positions; rows become 0..n-1)
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format: rename the four columns positionally (topic, contribution, keywords, text)
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

# Show
sent_topics_sorteddf_mallet.head()

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Text
0,0.0,0.446,"day, leave, week, home, back, return, night, m...","for your amusement, here's a typical week's me..."
1,1.0,0.4094,"place, house, mile, large, water, town, high, ...",standing as we did on the top of an almost bal...
2,2.0,0.2767,"write, letter, send, long, time, hear, receive...",i wrote you a long letter in january 1866 givi...
3,3.0,0.301,"time, people, present, bad, meet, world, lose,...",excerpts from a speech made in the house of re...
4,4.0,0.2469,"child, dear, live, sister, feel, family, love,...","god bless you my dear child, may wednesday pro..."


In [54]:
# Print the text of each topic's most representative sentence
# (the table above abbreviates long strings)
print(sent_topics_sorteddf_mallet['Text'].values)

["for your amusement, here's a typical week's menu: sun breakfast: bacon, eggs, creamed potato, raisins (no lunch on sunday) dinner: stuffed beef, beans, lettuce, rice, pickled cabbage, pudding mon breakfast: creamed potato, jam lunch: sandwich, lettuce, rice, applesauce dinner: pork and beans, radish, salad, raisins tues: breakfast: mush, apricot lunch: meatballs, rice, lettuce, pickle dinner: bologna, spaghetti, pickles, peas, pudding wed: breakfast: bran flakes, jam lunch: boiled beef, rice, lettuce, takuan dinner: salted mackerel, pickles, rice, peas, jello thu: breakfast: cornmeal, prunes lunch: salted pork and cabbage, beans, lettuce dinner: meatballs, rice, pickles, prunes fri: breakfast: bran flakes, jam lunch: cod fish, lettuce, rice dinner: liver, onion, lettuce, pudding sat: breakfast: mush, raisins lunch: curry-rice, salad, peach dinner: wiener, sauerkraut, rice, figs naturally, in addition, there is bread at each meal, coffee in the morning, and tea with the other meals."


In [55]:
# Number of sentences for each topic: value_counts() returns a Series indexed
# by topic number, ordered from most to least frequent
topic_counts = df_topic_sents_keywords['Dominant_Topic'].value_counts()
topic_counts

0.0    7004
4.0    5413
2.0    5406
8.0    4744
1.0    4505
6.0    4501
3.0    4303
7.0    4167
5.0    3982
Name: Dominant_Topic, dtype: int64

In [56]:
# Share of sentences falling under each dominant topic, to four decimal places
topic_contribution = (topic_counts / topic_counts.sum()).round(4)
topic_contribution

0.0    0.1591
4.0    0.1230
2.0    0.1228
8.0    0.1078
1.0    0.1023
6.0    0.1022
3.0    0.0977
7.0    0.0947
5.0    0.0904
Name: Dominant_Topic, dtype: float64

In [60]:
# Topic Number and Keywords: one row per distinct (topic, keywords) pair,
# ordered by topic number. The index still holds the position of the first
# sentence in which each topic was dominant.
topic_num_keywords = df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']].drop_duplicates().sort_values(by = 'Dominant_Topic')
topic_num_keywords

Unnamed: 0,Dominant_Topic,Topic_Keywords
4,0.0,"day, leave, week, home, back, return, night, m..."
0,1.0,"place, house, mile, large, water, town, high, ..."
16,2.0,"write, letter, send, long, time, hear, receive..."
23,3.0,"time, people, present, bad, meet, world, lose,..."
48,4.0,"child, dear, live, sister, feel, family, love,..."
11,5.0,"bring, small, hand, put, fine, foot, carry, op..."
1,6.0,"good, make, give, find, thing, hope, care, tur..."
2,7.0,"work, man, great, call, school, order, person,..."
27,8.0,"year, country, land, money, dollar, pay, state..."


In [61]:
# Replace the leftover sentence positions with a clean 0..n-1 index (modifies the frame in place)
topic_num_keywords.reset_index(drop=True, inplace=True)
topic_num_keywords

Unnamed: 0,Dominant_Topic,Topic_Keywords
0,0.0,"day, leave, week, home, back, return, night, m..."
1,1.0,"place, house, mile, large, water, town, high, ..."
2,2.0,"write, letter, send, long, time, hear, receive..."
3,3.0,"time, people, present, bad, meet, world, lose,..."
4,4.0,"child, dear, live, sister, feel, family, love,..."
5,5.0,"bring, small, hand, put, fine, foot, carry, op..."
6,6.0,"good, make, give, find, thing, hope, care, tur..."
7,7.0,"work, man, great, call, school, order, person,..."
8,8.0,"year, country, land, money, dollar, pay, state..."


In [62]:
# Order the keyword table by descending sentence count.  The order is taken
# from topic_counts (value_counts() sorts by frequency) rather than from a list
# hard-coded after one particular model run, so it stays correct when the
# model is retrained and the counts change.
topic_order = topic_counts.index.tolist()
topic_num_keywords = (
    topic_num_keywords
    .set_index(topic_num_keywords['Dominant_Topic'].values)  # label rows by topic number
    .reindex(topic_order)
)
topic_num_keywords

Unnamed: 0,Dominant_Topic,Topic_Keywords
0,0.0,"day, leave, week, home, back, return, night, m..."
4,4.0,"child, dear, live, sister, feel, family, love,..."
2,2.0,"write, letter, send, long, time, hear, receive..."
8,8.0,"year, country, land, money, dollar, pay, state..."
1,1.0,"place, house, mile, large, water, town, high, ..."
6,6.0,"good, make, give, find, thing, hope, care, tur..."
3,3.0,"time, people, present, bad, meet, world, lose,..."
7,7.0,"work, man, great, call, school, order, person,..."
5,5.0,"bring, small, hand, put, fine, foot, carry, op..."


In [63]:
# Concatenate Column wise: the keyword table, the per-topic sentence counts and
# the per-topic shares are joined side by side on their index labels, then
# ordered by topic number.
# NOTE(review): this relies on topic_num_keywords' index labels matching the
# topic numbers that index topic_counts — confirm if the topic set changes.
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts.rename('counts'), topic_contribution.rename('contribution')], axis=1).sort_values(by = 'Dominant_Topic')
df_dominant_topics

Unnamed: 0,Dominant_Topic,Topic_Keywords,counts,contribution
0,0.0,"day, leave, week, home, back, return, night, m...",7004,0.1591
1,1.0,"place, house, mile, large, water, town, high, ...",4505,0.1023
2,2.0,"write, letter, send, long, time, hear, receive...",5406,0.1228
3,3.0,"time, people, present, bad, meet, world, lose,...",4303,0.0977
4,4.0,"child, dear, live, sister, feel, family, love,...",5413,0.123
5,5.0,"bring, small, hand, put, fine, foot, carry, op...",3982,0.0904
6,6.0,"good, make, give, find, thing, hope, care, tur...",4501,0.1022
7,7.0,"work, man, great, call, school, order, person,...",4167,0.0947
8,8.0,"year, country, land, money, dollar, pay, state...",4744,0.1078


## Append topics to sentiment dataframe

This code is adapted from the work of Sunyam Bagga in Moody, A., & Bagga, S. (2020, July 22-24). A comparative study of sentiment and topics in migration related tweets. Digital Humanities 2020 Conference, Ottawa, ON, Canada. https://dh2020.adho.org/wp-content/uploads/2020/07/725_Acomparativestudyofsentimentandtopicsinmigrationrelatedtweets.html

In [67]:
# Map each sentence's text to the number of its dominant topic.
# Sentences with identical text share one dictionary entry (the last one processed wins).
map_sentenceText_topicNumber = {}

for position, sentence in enumerate(data):
    # (topic number, proportion) pairs for this sentence's bag-of-words vector
    distribution = mallet2ldaModel_09.get_document_topics(corpus[position])

    # Rank topics from largest to smallest proportion and keep the leader
    ranked = sorted(distribution, key=lambda pair: pair[1], reverse=True)
    map_sentenceText_topicNumber[sentence] = ranked[0][0]

In [68]:
# Preview the first few sentence -> topic entries instead of dumping the whole
# mapping (one entry per distinct sentence) into the notebook output
dict(list(map_sentenceText_topicNumber.items())[:5])

{' baltimore 20 september 1836 dear heinrich: friday evening, 16 september, anchor was dropped and we had safely arrived in the harbor!': 1,
 'it was a long and arduous voyage!': 6,
 'however we were and are all well and thank the father in heaven that he has protected us so far!': 7,
 'i had written down the events of the voyage for you, but to my great annoyance i am now missing the whole notebook; perhaps i will find it later.': 6,
 'as you know, we set sail on 12 july.': 0,
 'by the afternoon, when the pilot left us, one after the other began to hold his head overboard; bernhard, mrs schwarze, the children, and others.': 7,
 'the next morning i felt ill and remained in bed for two days, but during the whole time i only had to vomit four or five times; this was pretty much the same with most passengers.': 0,
 'later i was always spared.': 6,
 'the wind was adverse from the very first evening and remained so for almost three weeks; we did not pass the english channel until 2 august a

In [69]:
# Attach each sentence's dominant topic by looking its text up in the mapping.
# This adds the topicNumber column to df in place.
df['topicNumber'] = df['Sentence'].map(map_sentenceText_topicNumber)
df

Unnamed: 0.1,Unnamed: 0,Sentence,fileid,Sequence,Sentiment,topicNumber
0,0,baltimore 20 september 1836 dear heinrich: fr...,S10003-D023.txt,1,0.7263,1
1,1,it was a long and arduous voyage!,S10003-D023.txt,2,0.0000,6
2,2,however we were and are all well and thank the...,S10003-D023.txt,3,0.8777,7
3,3,i had written down the events of the voyage fo...,S10003-D023.txt,4,0.2263,6
4,4,"as you know, we set sail on 12 july.",S10003-D023.txt,5,0.0000,0
5,5,"by the afternoon, when the pilot left us, one ...",S10003-D023.txt,6,0.0000,7
6,6,the next morning i felt ill and remained in be...,S10003-D023.txt,7,0.5267,0
7,7,later i was always spared.,S10003-D023.txt,8,0.0000,6
8,8,the wind was adverse from the very first eveni...,S10003-D023.txt,9,-0.3612,0
9,9,on the open sea things went fine in the beginn...,S10003-D023.txt,10,0.8020,6


In [78]:
# Write the sentiment + topic table to disk without the row index
df.to_csv('20201119_AM_Latent2Merge.csv', index=False)