In this notebook we'll explore topic modeling to discover broad themes in a collection of movie summaries.

In [1]:
import nltk
import re
import gensim
from gensim import corpora
import operator

nltk.download('stopwords')
from nltk.corpus import stopwords

import numpy as np
import random

random.seed(1)
import pandas as pd
import os

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pcrrt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Clean Data

In [2]:
inpath2= os.path.dirname(os.path.dirname(os.getcwd()))+"\\Data\\Intermediate Data\\"
df=pd.read_csv(inpath2+'final_all_files.csv')

In [3]:
df['title2']=df['date']+" "+df['title']+" - "+df['event_type']
df=df[df['candidate']=='trump']

In [4]:
df2=df[df['event_type']=='rally'][['title2', 'text']]
df2.to_csv(inpath2+'trump_for_lda_rally.tsv', sep='\t', index=False, header=False, encoding='utf8')

In [5]:
df2=df[df['event_type']!='rally'][['title2', 'text']]
df2.to_csv(inpath2+'trump_for_lda_speech.tsv', sep='\t', index=False, header=False, encoding='utf8')

Since we're running topic modeling on texts with lots of names, we'll add the Jockers list of stopwords (which includes character names) to our stoplist.

In [6]:
def read_stopwords(filename):
    stopwords={}
    with open(filename) as file:
        for line in file:
            stopwords[line.rstrip()]=1
    return stopwords

In [7]:
stop_words = {k:1 for k in stopwords.words('english')}
stop_words.update(read_stopwords("C:/Users/pcrrt/anlp21/data/jockers.stopwords"))
stop_words["'s"]=1
stop_words=list(stop_words.keys())
excludewords=["ok", 're', "n't", "'ll", "okay", "uh", "'m", "'ve", "gon"]

In [8]:
def filter(word, stopwords):
    
    """ Function to exclude words from a text """
    
    # no stopwords
    if word in stopwords:
        return False
    if word in excludewords:
        return False
    
    # has to contain at least one letter
    if re.search("[A-Za-z]", word) is not None:
        return True
    
    return False

In [9]:
def read_docs(plotFile, stopwords):
    docs=[]
    names=[]
    with open(plotFile, encoding="utf-8") as file:
        for line in file:
            #print(line)
            #print(line)
            cols=line.rstrip().split("\t")
            title=cols[0]
            text=cols[1]
            tokens=nltk.word_tokenize(text.lower())
            tokens=[x for x in tokens if filter(x, stopwords)]
            docs.append(tokens)
            name=title
            if name=='':
                print(text)
            names.append(name)
    return docs, names

### Rallies Topic Modeling

In [10]:
plotFile=inpath2+'/trump_for_lda_rally.tsv'
data, doc_names=read_docs(plotFile, stop_words)

In [11]:
# Create vocab from data; restrict vocab to only the top 10K terms that show up in at least 5 documents 
# and no more than 50% of all documents
dictionary = corpora.Dictionary(data)
dictionary.filter_extremes(no_below=5, no_above=.5, keep_n=10000)

In [12]:
# Replace dataset with numeric ids words in vocab (and exclude all other words)
corpus = [dictionary.doc2bow(text) for text in data]

In [13]:
num_topics=10

Now let's run a topic model on this data using gensim's built-in LDA.

In [14]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=num_topics, 
                                           passes=10,
                                           alpha='auto')

We can get a sense of what the topics are by printing the top 10 words with highest $P(word \mid topic)$ for each topic

In [15]:
for i in range(num_topics):
    print("topic %s:\t%s" % (i, ' '.join([term for term, freq in lda_model.show_topic(i, topn=10)])))

topic 0:	biden ballots sleepy suburbs vaccine peace ban virus closed fracking
topic 1:	iowa coal nato west legislation dishonest miners violence forgotten healthcare
topic 2:	texas louisiana crooked abortion oil wisconsin corrupt biden impeachment healthcare
topic 3:	colorado crooked debates texas address alien fox poll figure wind
topic 4:	native test immune abandoned indian irredeemable voter husbands christmas subsidy
topic 5:	west coal kentucky minnesota alabama miners bills clean nfl points
topic 6:	arizona site cars tennessee location rockets lottery thom spend union
topic 7:	black biden minnesota sleepy texas thom community plane enthusiasm closed
topic 8:	montana coal tester texas clean caravan west testing miners indiana
topic 9:	mississippi wisconsin missouri indiana tennessee judge blue tuesday wealth leak


Another way of understanding topics is to print out the documents that have the highest topic representation -- i.e., for a given topic $k$, the documents with highest $P(topic=k | document)$.  How much do the documents listed here align with your understanding of the topics?

In [16]:
topic_model=lda_model 

topic_docs=[]
for i in range(num_topics):
    topic_docs.append({})
for doc_id in range(len(corpus)):
    doc_topics=topic_model.get_document_topics(corpus[doc_id])
    for topic_num, topic_prob in doc_topics:
        topic_docs[topic_num][doc_id]=topic_prob

for i in range(num_topics):
    print("%s\n" % ' '.join([term for term, freq in topic_model.show_topic(i, topn=10)]))
    sorted_x = sorted(topic_docs[i].items(), key=operator.itemgetter(1), reverse=True)
    for k, v in sorted_x[:5]:
        print("%s\t%.3f\t%s %s" % (i,v,doc_names[k], k))
    print()   

biden ballots sleepy suburbs vaccine peace ban virus closed fracking

0	1.000	09/17/2020 Speech: Donald Trump Holds a Campaign Rally in Mosinee, Wisconsin - rally 9
0	1.000	09/12/2020 Speech: Donald Trump Holds a Campaign Rally in Minden, Nevada - rally 10
0	1.000	09/19/2020 Speech: Donald Trump Holds a Campaign Rally in Fayetteville, North Carolina - rally 7
0	1.000	09/13/2020 Trump delivers remarks from Minden, Nevada - rally 103
0	1.000	09/22/2020 Speech: Donald Trump Holds a Campaign Rally in Moon Township, Pennsylvania - rally 5

iowa coal nato west legislation dishonest miners violence forgotten healthcare

1	1.000	08/22/2017 Speech: Donald Trump Holds a Political Rally in Phoenix, Arizona - rally 89
1	0.999	04/29/2017 Speech: Donald Trump Holds a Political Rally in Harrisburg, Pennsylvania - rally 94
1	0.999	02/18/2017 Speech: Donald Trump Holds a Political Rally in Melbourne, Florida - rally 97
1	0.999	07/25/2017 Speech: Donald Trump Holds a Political Rally in Youngstown, Ohio 

### Speech Topic Modeling

In [17]:
plotFile=inpath2+'/trump_for_lda_speech.tsv'
data, doc_names=read_docs(plotFile, stop_words)

In [18]:
# Create vocab from data; restrict vocab to only the top 10K terms that show up in at least 5 documents 
# and no more than 50% of all documents
dictionary = corpora.Dictionary(data)
dictionary.filter_extremes(no_below=5, no_above=.5, keep_n=10000)

In [19]:
# Replace dataset with numeric ids words in vocab (and exclude all other words)
corpus = [dictionary.doc2bow(text) for text in data]

In [20]:
num_topics=10

In [21]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=num_topics, 
                                           passes=10,
                                           alpha='auto')

In [22]:
for i in range(num_topics):
    print("topic %s:\t%s" % (i, ' '.join([term for term, freq in lda_model.show_topic(i, topn=10)])))

topic 0:	're police decision biden enforcement flu feel says answer test
topic 1:	nations iran syria security peace god middle turkey children regime
topic 2:	drug border penalty room nato opportunity criminal nuclear immigration russia
topic 3:	ventilators energy hospitals lose medical governor hands decision sudden shape
topic 4:	tax children god violence immigration schools constitution cuts cut tonight
topic 5:	god heroes justice army bless men honor open workers judge
topic 6:	election wall north remember corruption border vote fighting obama wrong
topic 7:	're obama russia ballots iran wants tax percent disaster campaign
topic 8:	virus governors governor ventilators masks medical workers testing businesses hospital
topic 9:	're trump biden justice south health school rate criminal election


In [23]:
topic_model=lda_model 

topic_docs=[]
for i in range(num_topics):
    topic_docs.append({})
for doc_id in range(len(corpus)):
    doc_topics=topic_model.get_document_topics(corpus[doc_id])
    for topic_num, topic_prob in doc_topics:
        topic_docs[topic_num][doc_id]=topic_prob

for i in range(num_topics):
    print("%s\n" % ' '.join([term for term, freq in topic_model.show_topic(i, topn=10)]))
    sorted_x = sorted(topic_docs[i].items(), key=operator.itemgetter(1), reverse=True)
    for k, v in sorted_x[:5]:
        print("%s\t%.3f\t%s %s" % (i,v,doc_names[k], k))
    print()   

're police decision biden enforcement flu feel says answer test

0	1.000	09/01/2020 Jacob Blake protests: Trump announces funding for Kenosha police, businesses during visit | FULL - speech 73
0	1.000	07/19/2020 President Trump goes one-on-one with Chris Wallace | Full Interview - speech 68
0	0.998	03/03/2020 Meeting wiht the National Institute of Health - speech 33
0	0.991	02/29/2020 Coronavirus Update Transcript Warns Not To Travel to Italy South Korea - speech 31
0	0.905	02/25/2020 New Delha News Conference - speech 29

nations iran syria security peace god middle turkey children regime

1	1.000	09/17/2017 Address to the United National General Assembly - speech 83
1	1.000	09/24/2019 Remarks at the United Nations General Assembly - speech 94
1	1.000	09/25/2018 Address at the 73rd Session of the United States General Assembly - speech 91
1	0.958	10/23/2019 Donald Trump Syria Press Conference Transcript Trump Orders All Turkey Sanctions Lifted - speech 14
1	0.952	01/08/2020 Trump Make