# How many themes are in GOP political debates?

In [None]:
import csv
import pandas as pd

# import packages for text processing
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re

import gensim
from gensim.corpora import Dictionary
from gensim.models import ldamodel

from gensim.models.coherencemodel import CoherenceModel
from gensim.models.wrappers import LdaVowpalWabbit, LdaMallet

import numpy
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')  # To ignore all warnings that arise here to enhance clarity

## Make the processes visible

In [2]:
# Logging setup, following the gensim tutorial:
# https://radimrehurek.com/gensim/tutorial.html
# Configuring the root logger at INFO level makes gensim's progress messages
# (dictionary building, LDA training passes, ...) show up in the cell outputs.

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Import data

We're importing our corpus now. This corpus includes GOP political debates.

In [3]:
# Read every row of the debate transcript CSV into a list; each element of
# `texts` is the list of fields that csv.reader produced for one row.
# The `with` block closes the file handle as soon as reading is finished
# (previously the handle was opened inline and never closed).
# NOTE: 'rb' is the mode the Python 2 csv module expects; on Python 3 the file
# would have to be opened in text mode with newline='' instead.
with open('data/politics2015_gop.csv', 'rb') as csv_file:
    texts = list(csv.reader(csv_file))
len(texts)

401

In [4]:
#first two rows
texts[0:2]

[["i'm jebbush, and i believe america's on the verge of its greatest century, and i'm ready to lead. i'm a committed, conservative reformer that cut taxes, that balanced budgets, that took on the special interest in florida, and we won. i look forward to talking tonight about how we can fix a broken washington d.c., and create an environment where people can rise up again in this great country. thank you. i think the voters will make that determination. but what i know to be true is that the next president of the unitedstates is going to have to fix an extraordinary difficult situation. this administration, with obama and clinton, has created insecurity the likes of which we never would've imagined. there's not a place in the world where we're better off today than six and a half years ago. and that requires a steadiness. that requires an understanding of how the world works. that requires an understanding and appreciation of american leadership in the world. you can't just, you know, 

### %%capture output 

Consider a situation in which a process takes too long to wait for. In that case, you can run the cell containing the long-running process and then close the Jupyter notebook. Even after the notebook is closed, the process keeps running (unless you completely shut down the notebook or the Jupyter server running on your machine). This is useful because, as long as Jupyter is still running, it keeps processing your request(s), and you can view the results of the process later.

To view the results of the process later, you need to start a cell with **%%capture output** (see below). Then, add **output.show()** in the following cell.

In [5]:
%%capture output

# Text-cleaning pipeline: raw CSV rows -> one list of normalized tokens per row.
# `documents` keeps the letter-only strings; `texts` is overwritten step by step
# and ends up holding the token lists used by the rest of the notebook.
# Each print marks the end of one step (shown later via output.show()).

# Keep letters only: every run of non-alphabetic characters (digits, punctuation,
# and the brackets/quotes introduced by str() on a row) becomes a single space.
documents = [re.sub("[^a-zA-Z]+", " ", str(text)) for text in texts]
print("done ...")
# Tokenize: lowercase, then split on whitespace.
texts = [text.lower().split() for text in documents]
print("done ...")
# Lemmatize each token with WordNet (e.g. friends --> friend).
# NOTE(review): lemmatize() is called without a part-of-speech tag, so verb forms
# such as "having" are presumably left unchanged - confirm against the NLTK docs.
lmtzr = WordNetLemmatizer()
texts = [[lmtzr.lemmatize(word) for word in text] for text in texts]
print("done ...")
# Alternative kept for reference: Porter stemming instead of lemmatization.
#porter_stemmer = PorterStemmer()
#texts = [[porter_stemmer.stem(word) for word in text ] for text in texts]
# Remove common English words; a set makes each membership test constant-time.
stoplist = set(stopwords.words('english'))
texts = [[word for word in text if word not in stoplist] for text in texts]
print("done ...")
# Remove short words (fewer than 3 characters).
texts = [[word for word in tokens if len(word) >= 3] for tokens in texts]
print("done ...")

In [6]:
output.show()

done ...
done ...
done ...
done ...
done ...


This is an optional step (removing extra stopwords)

In [7]:
# Extra stopwords specific to the debate transcripts (optional step: extend or
# shrink this list to control which additional words are dropped).
# The duplicate 'want' entry from the original list has been removed.
extra_stopwords = [
    'will', 'people', 'need', 'think', 'well', 'going', 'can', 'country', 'know', 'lot',
    'get', 'make', 'way', 'president', 'want', 'like', 'say', 'got', 'said', 'just',
    'something', 'tell', 'put', 'now', 'bad', 'back', 'right', 'every', 'one', 'use',
    'come', 'never', 'many', 'along', 'things', 'day', 'also', 'first', 'guy', 'great',
    'take', 'good', 'much', 'anderson', 'let', 'would', 'year', 'thing', 'america',
    'talk', 'talking', 'thank', 'does', 'give', 'look', 'believe', 'tonight', 'today', 'see',
]

# A set gives constant-time membership tests while filtering every token.
extra_stoplist = set(extra_stopwords)
texts = [[word for word in text if word not in extra_stoplist] for text in texts]
# See also: https://github.com/alexperrier/datatalks/blob/master/debates/R/stm.R

In [12]:
# Prepare the inputs Gensim needs for topic modeling: a dictionary and a
# bag-of-words corpus.

## Map every distinct token in the documents to an integer id.
dictionary = Dictionary(texts)

## Drop rare and very common tokens (an optional step).
# Tokens that occur in fewer than 5 documents, or in more than 50% of the
# documents, are removed from the dictionary.
# https://radimrehurek.com/gensim/corpora/dictionary.html#gensim.corpora.dictionary.Dictionary.filter_extremes
dictionary.filter_extremes(no_above=0.5, no_below=5)

# Convert each tokenized document to its bag-of-words vector.
corpus = []
for tokens in texts:
    corpus.append(dictionary.doc2bow(tokens))

2017-10-05 12:30:26,404 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-10-05 12:30:26,475 : INFO : built Dictionary(4375 unique tokens: [u'secondly', u'suicidal', u'pardon', u'limited', u'personally']...) from 401 documents (total 25211 corpus positions)
2017-10-05 12:30:26,492 : INFO : discarding 3335 tokens: [(u'expect', 2), (u'committed', 4), (u'imagined', 1), (u'reformer', 2), (u'insecurity', 1), (u'steadiness', 1), (u'insult', 2), (u'appreciation', 1), (u'jebbush', 3), (u'determination', 2)]...
2017-10-05 12:30:26,493 : INFO : keeping 1040 tokens which were in no less than 5 and no more than 200 (=50.0%) documents
2017-10-05 12:30:26,499 : INFO : resulting dictionary: Dictionary(1040 unique tokens: [u'secondly', u'evidence', u'chinese', u'saying', u'caused']...)


In [13]:
# Report the vocabulary size and the number of documents in the corpus.
print('Number of unique tokens: {:d}'.format(len(dictionary)))
print('Number of documents: {:d}'.format(len(corpus)))

Number of unique tokens: 1040
Number of documents: 401


# Determining the Best Model

In [None]:
%%capture output

# Model selection: train one LDA model per candidate number of topics and
# compare the models by their u_mass coherence score.

# Seed NumPy's global random generator once, before the first model is trained.
# NOTE(review): presumably LdaModel draws from this generator when no
# random_state is passed - confirm against the gensim documentation.
numpy.random.seed(1)
k_range = range(10, 40, 5)  # candidate topic counts: 10, 15, ..., 35
scores = []
for k in k_range:
    goodLdaModel = ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=k, passes=50)
    goodcm = CoherenceModel(model=goodLdaModel, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    # Compute the coherence once and reuse it for both the log line and the plot
    # (it used to be evaluated twice per model).
    coherence = goodcm.get_coherence()
    print('%s %s' % (k, coherence))
    scores.append(coherence)

# Coherence as a function of the number of topics.
plt.figure()
plt.plot(k_range, scores)
plt.xlabel('Number of topics (k)')
plt.ylabel('u_mass coherence')

# LDA Model Building

**passes** controls how often we train the model on the entire corpus; another word for passes might be "epochs". **iterations** is somewhat more technical, but essentially it controls how often we repeat a particular loop over each document. It is important to set the number of "passes" and "iterations" high enough.

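I suggest the following way to choose iterations and passes. First, enable **logging** (as described in many Gensim tutorials), and set **eval_every = 1** in LdaModel. When training the model, look for a line in the log that looks something like the one below. Note that this line is logged at DEBUG level, so it will not appear with the `level=logging.INFO` setting used at the top of this notebook; switch the level to `logging.DEBUG` to see it.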

2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations

If you set passes = 20 you will see this line 20 times. Make sure that by the final passes, most of the documents have **converged**. So you want to choose both passes and iterations to be high enough for this to happen.

https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/lda_training_tips.ipynb

In [14]:
%%capture output

numpy.random.seed(1) # set the random seed so that repeated runs give the same results
# Train the final model: 15 topics, 100 passes over the corpus. With eval_every = 1
# the training log includes a perplexity estimate for every pass (see the log
# output below); %time reports how long the training call took.
%time model = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=15, passes=100, eval_every = 1)

2017-10-05 12:30:31,623 : INFO : using symmetric alpha at 0.0666666666667
2017-10-05 12:30:31,625 : INFO : using symmetric eta at 0.000961538461538
2017-10-05 12:30:31,627 : INFO : using serial LDA version on this node
2017-10-05 12:30:31,730 : INFO : running online (multi-pass) LDA training, 15 topics, 100 passes over the supplied corpus of 401 documents, updating model once every 401 documents, evaluating perplexity every 401 documents, iterating 50x with a convergence threshold of 0.001000
2017-10-05 12:30:33,390 : INFO : -8.813 per-word bound, 449.9 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:30:33,391 : INFO : PROGRESS: pass 0, at document #401/401
2017-10-05 12:30:34,359 : INFO : topic #12 (0.067): 0.011*"state" + 0.010*"life" + 0.009*"money" + 0.009*"problem" + 0.008*"jake" + 0.008*"washington" + 0.007*"tax" + 0.007*"job" + 0.007*"world" + 0.006*"nation"
2017-10-05 12:30:34,384 : INFO : topic #2 (0.067): 0.012*"american" + 0.010

2017-10-05 12:30:45,269 : INFO : topic #7 (0.067): 0.018*"government" + 0.017*"company" + 0.017*"chris" + 0.016*"immigration" + 0.014*"city" + 0.014*"bankruptcy" + 0.014*"hundred" + 0.012*"time" + 0.012*"wall" + 0.011*"leader"
2017-10-05 12:30:45,271 : INFO : topic #5 (0.067): 0.023*"tax" + 0.017*"money" + 0.013*"job" + 0.012*"pay" + 0.010*"government" + 0.009*"family" + 0.009*"american" + 0.007*"socialsecurity" + 0.007*"actually" + 0.007*"million"
2017-10-05 12:30:45,273 : INFO : topic diff=0.412115, rho=0.377964
2017-10-05 12:30:46,423 : INFO : -7.136 per-word bound, 140.7 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:30:46,424 : INFO : PROGRESS: pass 6, at document #401/401
2017-10-05 12:30:46,902 : INFO : topic #2 (0.067): 0.015*"american" + 0.011*"change" + 0.009*"issue" + 0.009*"problem" + 0.009*"world" + 0.008*"better" + 0.008*"government" + 0.008*"time" + 0.008*"million" + 0.007*"state"
2017-10-05 12:30:46,903 : INFO : topic #12 

2017-10-05 12:30:56,112 : INFO : topic #5 (0.067): 0.024*"tax" + 0.018*"money" + 0.014*"job" + 0.012*"pay" + 0.010*"government" + 0.010*"family" + 0.009*"american" + 0.008*"socialsecurity" + 0.008*"million" + 0.007*"actually"
2017-10-05 12:30:56,114 : INFO : topic #4 (0.067): 0.020*"tax" + 0.017*"budget" + 0.017*"cut" + 0.016*"state" + 0.015*"governor" + 0.013*"isi" + 0.010*"balanced" + 0.010*"eight" + 0.009*"clinton" + 0.009*"spent"
2017-10-05 12:30:56,115 : INFO : topic diff=0.128101, rho=0.277350
2017-10-05 12:30:57,197 : INFO : -7.069 per-word bound, 134.3 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:30:57,198 : INFO : PROGRESS: pass 12, at document #401/401
2017-10-05 12:30:57,672 : INFO : topic #13 (0.067): 0.022*"court" + 0.011*"nation" + 0.011*"woman" + 0.010*"state" + 0.010*"government" + 0.009*"supreme" + 0.009*"justice" + 0.009*"senator" + 0.009*"unitedstates" + 0.009*"fact"
2017-10-05 12:30:57,673 : INFO : topic #9 (0.067): 

2017-10-05 12:31:05,910 : INFO : topic #12 (0.067): 0.015*"state" + 0.014*"jake" + 0.012*"life" + 0.012*"nation" + 0.010*"washington" + 0.010*"must" + 0.010*"money" + 0.010*"problem" + 0.009*"energy" + 0.008*"clinton"
2017-10-05 12:31:05,912 : INFO : topic #0 (0.067): 0.014*"time" + 0.013*"child" + 0.011*"state" + 0.010*"law" + 0.010*"gun" + 0.008*"percent" + 0.008*"world" + 0.008*"issue" + 0.008*"life" + 0.008*"federal"
2017-10-05 12:31:05,913 : INFO : topic diff=0.054649, rho=0.229416
2017-10-05 12:31:07,016 : INFO : -7.043 per-word bound, 131.9 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:31:07,017 : INFO : PROGRESS: pass 18, at document #401/401
2017-10-05 12:31:07,479 : INFO : topic #2 (0.067): 0.015*"american" + 0.012*"change" + 0.011*"better" + 0.010*"issue" + 0.009*"problem" + 0.009*"world" + 0.008*"life" + 0.008*"million" + 0.008*"republican" + 0.008*"vote"
2017-10-05 12:31:07,481 : INFO : topic #11 (0.067): 0.025*"big" + 0.021

2017-10-05 12:31:15,701 : INFO : topic #5 (0.067): 0.024*"tax" + 0.018*"money" + 0.015*"job" + 0.013*"pay" + 0.010*"family" + 0.010*"government" + 0.010*"socialsecurity" + 0.009*"million" + 0.009*"american" + 0.007*"actually"
2017-10-05 12:31:15,702 : INFO : topic #3 (0.067): 0.016*"family" + 0.014*"life" + 0.011*"kid" + 0.011*"fact" + 0.010*"school" + 0.010*"greatest" + 0.010*"time" + 0.009*"able" + 0.009*"jake" + 0.009*"simple"
2017-10-05 12:31:15,704 : INFO : topic diff=0.030008, rho=0.200000
2017-10-05 12:31:16,791 : INFO : -7.030 per-word bound, 130.7 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:31:16,792 : INFO : PROGRESS: pass 24, at document #401/401
2017-10-05 12:31:17,222 : INFO : topic #3 (0.067): 0.016*"family" + 0.014*"life" + 0.011*"kid" + 0.011*"fact" + 0.010*"school" + 0.010*"greatest" + 0.010*"time" + 0.009*"able" + 0.009*"jake" + 0.009*"simple"
2017-10-05 12:31:17,223 : INFO : topic #5 (0.067): 0.024*"tax" + 0.018*"mon

2017-10-05 12:31:24,886 : INFO : topic #9 (0.067): 0.018*"time" + 0.014*"job" + 0.014*"place" + 0.013*"tax" + 0.012*"obama" + 0.011*"government" + 0.010*"system" + 0.009*"business" + 0.009*"work" + 0.009*"making"
2017-10-05 12:31:24,888 : INFO : topic #1 (0.067): 0.025*"fed" + 0.022*"percent" + 0.018*"money" + 0.015*"problem" + 0.015*"business" + 0.014*"tax" + 0.013*"wallstreet" + 0.012*"job" + 0.010*"done" + 0.010*"time"
2017-10-05 12:31:24,890 : INFO : topic diff=0.019021, rho=0.179605
2017-10-05 12:31:25,982 : INFO : -7.021 per-word bound, 129.9 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:31:25,983 : INFO : PROGRESS: pass 30, at document #401/401
2017-10-05 12:31:26,420 : INFO : topic #11 (0.067): 0.025*"big" + 0.021*"problem" + 0.017*"deal" + 0.015*"government" + 0.014*"company" + 0.010*"win" + 0.009*"powerful" + 0.009*"making" + 0.009*"bank" + 0.008*"track"
2017-10-05 12:31:26,422 : INFO : topic #6 (0.067): 0.012*"probably" + 0.01

2017-10-05 12:31:34,040 : INFO : topic #9 (0.067): 0.018*"time" + 0.014*"place" + 0.014*"job" + 0.013*"tax" + 0.012*"obama" + 0.011*"government" + 0.010*"system" + 0.010*"business" + 0.009*"work" + 0.009*"making"
2017-10-05 12:31:34,042 : INFO : topic #2 (0.067): 0.017*"american" + 0.013*"better" + 0.012*"change" + 0.010*"issue" + 0.010*"problem" + 0.010*"republican" + 0.009*"vote" + 0.009*"life" + 0.008*"million" + 0.008*"world"
2017-10-05 12:31:34,044 : INFO : topic diff=0.013650, rho=0.164399
2017-10-05 12:31:35,124 : INFO : -7.016 per-word bound, 129.4 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:31:35,125 : INFO : PROGRESS: pass 36, at document #401/401
2017-10-05 12:31:35,567 : INFO : topic #10 (0.067): 0.016*"government" + 0.015*"week" + 0.013*"world" + 0.011*"growth" + 0.011*"election" + 0.010*"american" + 0.010*"law" + 0.010*"clinton" + 0.009*"economic" + 0.009*"lead"
2017-10-05 12:31:35,568 : INFO : topic #3 (0.067): 0.017*"fa

2017-10-05 12:31:44,228 : INFO : topic #3 (0.067): 0.017*"family" + 0.015*"life" + 0.011*"kid" + 0.011*"jake" + 0.011*"fact" + 0.010*"greatest" + 0.010*"school" + 0.010*"time" + 0.009*"able" + 0.009*"simple"
2017-10-05 12:31:44,230 : INFO : topic #11 (0.067): 0.025*"big" + 0.021*"problem" + 0.018*"deal" + 0.016*"government" + 0.013*"company" + 0.010*"win" + 0.009*"powerful" + 0.009*"making" + 0.009*"bank" + 0.008*"track"
2017-10-05 12:31:44,232 : INFO : topic diff=0.010576, rho=0.152499
2017-10-05 12:31:45,315 : INFO : -7.011 per-word bound, 129.0 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:31:45,316 : INFO : PROGRESS: pass 42, at document #401/401
2017-10-05 12:31:45,752 : INFO : topic #11 (0.067): 0.025*"big" + 0.021*"problem" + 0.018*"deal" + 0.015*"government" + 0.013*"company" + 0.010*"win" + 0.009*"powerful" + 0.009*"making" + 0.009*"track" + 0.009*"bank"
2017-10-05 12:31:45,753 : INFO : topic #8 (0.067): 0.011*"benefit" + 0.011*

2017-10-05 12:31:53,368 : INFO : topic #11 (0.067): 0.025*"big" + 0.021*"problem" + 0.018*"deal" + 0.015*"government" + 0.013*"company" + 0.010*"win" + 0.010*"track" + 0.009*"powerful" + 0.009*"making" + 0.009*"bank"
2017-10-05 12:31:53,370 : INFO : topic #14 (0.067): 0.022*"iran" + 0.020*"war" + 0.019*"syria" + 0.016*"iraq" + 0.015*"nuclear" + 0.014*"isi" + 0.014*"deal" + 0.012*"weapon" + 0.012*"world" + 0.012*"obama"
2017-10-05 12:31:53,372 : INFO : topic diff=0.008458, rho=0.142857
2017-10-05 12:31:54,472 : INFO : -7.007 per-word bound, 128.6 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:31:54,473 : INFO : PROGRESS: pass 48, at document #401/401
2017-10-05 12:31:54,901 : INFO : topic #9 (0.067): 0.018*"time" + 0.016*"place" + 0.014*"job" + 0.013*"obama" + 0.012*"tax" + 0.012*"government" + 0.011*"system" + 0.010*"business" + 0.010*"work" + 0.009*"making"
2017-10-05 12:31:54,902 : INFO : topic #3 (0.067): 0.017*"family" + 0.016*"life" 

2017-10-05 12:32:03,094 : INFO : topic #6 (0.067): 0.013*"probably" + 0.012*"actually" + 0.011*"bigger" + 0.011*"world" + 0.010*"mexico" + 0.009*"senator" + 0.009*"respect" + 0.008*"state" + 0.008*"american" + 0.008*"position"
2017-10-05 12:32:03,096 : INFO : topic #4 (0.067): 0.026*"budget" + 0.024*"tax" + 0.022*"cut" + 0.017*"governor" + 0.016*"state" + 0.016*"balanced" + 0.016*"billion" + 0.012*"isi" + 0.010*"time" + 0.010*"eight"
2017-10-05 12:32:03,098 : INFO : topic diff=0.006994, rho=0.134840
2017-10-05 12:32:04,201 : INFO : -7.004 per-word bound, 128.3 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:32:04,202 : INFO : PROGRESS: pass 54, at document #401/401
2017-10-05 12:32:04,623 : INFO : topic #1 (0.067): 0.026*"fed" + 0.022*"percent" + 0.018*"money" + 0.016*"problem" + 0.015*"business" + 0.014*"tax" + 0.013*"wallstreet" + 0.012*"job" + 0.011*"done" + 0.011*"time"
2017-10-05 12:32:04,624 : INFO : topic #7 (0.067): 0.025*"chris" +

2017-10-05 12:32:12,176 : INFO : topic #13 (0.067): 0.026*"court" + 0.013*"government" + 0.012*"woman" + 0.012*"supreme" + 0.010*"nation" + 0.010*"state" + 0.010*"unitedstates" + 0.010*"issue" + 0.009*"justice" + 0.009*"fact"
2017-10-05 12:32:12,178 : INFO : topic #0 (0.067): 0.014*"child" + 0.013*"state" + 0.013*"time" + 0.011*"gun" + 0.011*"law" + 0.010*"federal" + 0.009*"life" + 0.009*"issue" + 0.008*"percent" + 0.008*"government"
2017-10-05 12:32:12,179 : INFO : topic diff=0.005863, rho=0.128037
2017-10-05 12:32:13,264 : INFO : -7.001 per-word bound, 128.1 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:32:13,265 : INFO : PROGRESS: pass 60, at document #401/401
2017-10-05 12:32:13,693 : INFO : topic #4 (0.067): 0.026*"budget" + 0.025*"tax" + 0.022*"cut" + 0.017*"billion" + 0.017*"governor" + 0.017*"state" + 0.016*"balanced" + 0.012*"isi" + 0.010*"time" + 0.010*"eight"
2017-10-05 12:32:13,694 : INFO : topic #0 (0.067): 0.014*"child" + 0

2017-10-05 12:32:21,341 : INFO : topic #7 (0.067): 0.025*"chris" + 0.023*"company" + 0.018*"immigration" + 0.016*"city" + 0.016*"leader" + 0.015*"wall" + 0.014*"hundred" + 0.014*"bankruptcy" + 0.014*"law" + 0.013*"government"
2017-10-05 12:32:21,343 : INFO : topic diff=0.005018, rho=0.122169
2017-10-05 12:32:22,406 : INFO : -6.998 per-word bound, 127.9 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:32:22,406 : INFO : PROGRESS: pass 66, at document #401/401
2017-10-05 12:32:22,825 : INFO : topic #0 (0.067): 0.014*"child" + 0.013*"state" + 0.013*"time" + 0.011*"gun" + 0.011*"law" + 0.010*"federal" + 0.009*"life" + 0.009*"issue" + 0.008*"percent" + 0.008*"government"
2017-10-05 12:32:22,826 : INFO : topic #4 (0.067): 0.027*"budget" + 0.025*"tax" + 0.023*"cut" + 0.018*"billion" + 0.017*"state" + 0.017*"governor" + 0.016*"balanced" + 0.012*"isi" + 0.010*"time" + 0.010*"eight"
2017-10-05 12:32:22,828 : INFO : topic #6 (0.067): 0.013*"probably" 

2017-10-05 12:32:30,749 : INFO : topic #8 (0.067): 0.012*"putin" + 0.012*"russia" + 0.012*"benefit" + 0.011*"clinton" + 0.011*"talked" + 0.011*"unitedstates" + 0.010*"position" + 0.010*"plannedparenthood" + 0.010*"weakness" + 0.010*"agreement"
2017-10-05 12:32:30,751 : INFO : topic diff=0.004397, rho=0.117041
2017-10-05 12:32:32,197 : INFO : -6.997 per-word bound, 127.7 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:32:32,198 : INFO : PROGRESS: pass 72, at document #401/401
2017-10-05 12:32:32,780 : INFO : topic #4 (0.067): 0.027*"budget" + 0.025*"tax" + 0.023*"cut" + 0.019*"billion" + 0.017*"state" + 0.017*"governor" + 0.016*"balanced" + 0.012*"isi" + 0.010*"time" + 0.010*"eight"
2017-10-05 12:32:32,782 : INFO : topic #6 (0.067): 0.013*"probably" + 0.012*"actually" + 0.011*"world" + 0.011*"bigger" + 0.010*"mexico" + 0.009*"senator" + 0.009*"respect" + 0.008*"state" + 0.008*"american" + 0.008*"position"
2017-10-05 12:32:32,784 : INFO : to

2017-10-05 12:32:43,522 : INFO : topic #1 (0.067): 0.027*"fed" + 0.022*"percent" + 0.018*"money" + 0.016*"problem" + 0.015*"business" + 0.014*"tax" + 0.013*"wallstreet" + 0.012*"job" + 0.011*"done" + 0.011*"time"
2017-10-05 12:32:43,524 : INFO : topic diff=0.004040, rho=0.112509
2017-10-05 12:32:44,952 : INFO : -6.995 per-word bound, 127.6 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:32:44,954 : INFO : PROGRESS: pass 78, at document #401/401
2017-10-05 12:32:45,428 : INFO : topic #10 (0.067): 0.016*"government" + 0.015*"week" + 0.014*"world" + 0.012*"growth" + 0.011*"election" + 0.010*"american" + 0.010*"economic" + 0.010*"clinton" + 0.009*"law" + 0.009*"lead"
2017-10-05 12:32:45,430 : INFO : topic #4 (0.067): 0.027*"budget" + 0.026*"tax" + 0.023*"cut" + 0.019*"billion" + 0.017*"state" + 0.017*"governor" + 0.017*"balanced" + 0.012*"isi" + 0.010*"time" + 0.010*"eight"
2017-10-05 12:32:45,431 : INFO : topic #9 (0.067): 0.018*"time" + 0.01

2017-10-05 12:32:52,986 : INFO : topic #5 (0.067): 0.023*"tax" + 0.020*"money" + 0.017*"job" + 0.013*"socialsecurity" + 0.012*"pay" + 0.011*"family" + 0.010*"government" + 0.010*"million" + 0.008*"american" + 0.008*"work"
2017-10-05 12:32:52,988 : INFO : topic diff=0.003706, rho=0.108465
2017-10-05 12:32:54,090 : INFO : -6.993 per-word bound, 127.4 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:32:54,091 : INFO : PROGRESS: pass 84, at document #401/401
2017-10-05 12:32:54,511 : INFO : topic #9 (0.067): 0.018*"time" + 0.017*"place" + 0.015*"job" + 0.013*"obama" + 0.012*"government" + 0.012*"tax" + 0.011*"rate" + 0.011*"system" + 0.011*"business" + 0.010*"work"
2017-10-05 12:32:54,512 : INFO : topic #4 (0.067): 0.027*"budget" + 0.026*"tax" + 0.024*"cut" + 0.019*"billion" + 0.017*"state" + 0.017*"governor" + 0.017*"balanced" + 0.012*"isi" + 0.010*"time" + 0.010*"eight"
2017-10-05 12:32:54,514 : INFO : topic #6 (0.067): 0.013*"probably" + 0.0

2017-10-05 12:33:02,060 : INFO : topic #1 (0.067): 0.028*"fed" + 0.022*"percent" + 0.018*"money" + 0.016*"problem" + 0.015*"business" + 0.014*"tax" + 0.012*"wallstreet" + 0.012*"job" + 0.011*"done" + 0.011*"time"
2017-10-05 12:33:02,062 : INFO : topic diff=0.003485, rho=0.104828
2017-10-05 12:33:03,145 : INFO : -6.992 per-word bound, 127.3 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:33:03,150 : INFO : PROGRESS: pass 90, at document #401/401
2017-10-05 12:33:03,567 : INFO : topic #10 (0.067): 0.016*"government" + 0.015*"week" + 0.014*"world" + 0.012*"growth" + 0.011*"election" + 0.010*"american" + 0.010*"economic" + 0.010*"clinton" + 0.009*"lead" + 0.009*"saying"
2017-10-05 12:33:03,569 : INFO : topic #11 (0.067): 0.025*"big" + 0.021*"problem" + 0.017*"deal" + 0.014*"government" + 0.013*"company" + 0.011*"track" + 0.011*"win" + 0.009*"powerful" + 0.009*"woman" + 0.009*"time"
2017-10-05 12:33:03,570 : INFO : topic #5 (0.067): 0.023*"tax"

2017-10-05 12:33:11,257 : INFO : topic #2 (0.067): 0.017*"american" + 0.013*"better" + 0.013*"change" + 0.012*"vote" + 0.011*"issue" + 0.011*"republican" + 0.010*"problem" + 0.009*"life" + 0.009*"senate" + 0.008*"running"
2017-10-05 12:33:11,258 : INFO : topic diff=0.003243, rho=0.101535
2017-10-05 12:33:12,897 : INFO : -6.990 per-word bound, 127.1 perplexity estimate based on a held-out corpus of 401 documents with 19107 words
2017-10-05 12:33:12,898 : INFO : PROGRESS: pass 96, at document #401/401
2017-10-05 12:33:13,404 : INFO : topic #1 (0.067): 0.028*"fed" + 0.022*"percent" + 0.018*"money" + 0.016*"problem" + 0.015*"business" + 0.014*"tax" + 0.012*"wallstreet" + 0.012*"job" + 0.011*"done" + 0.011*"time"
2017-10-05 12:33:13,406 : INFO : topic #6 (0.067): 0.013*"probably" + 0.011*"actually" + 0.011*"world" + 0.011*"bigger" + 0.010*"senator" + 0.010*"mexico" + 0.009*"respect" + 0.008*"american" + 0.008*"state" + 0.008*"military"
2017-10-05 12:33:13,408 : INFO : topic #14 (0.067): 0.0

In [15]:
output.show()

CPU times: user 2min 46s, sys: 1.24 s, total: 2min 47s
Wall time: 2min 49s


# Prints the topics.

In [16]:
# Display the topics as (topic_id, "weight*word + ...") pairs.
# NOTE(review): the model was trained with 15 topics, so asking for 20 presumably
# just returns all of them - confirm against the gensim documentation.
model.show_topics(num_topics=20)
# Signature with its default arguments, for reference:
#show_topics(num_topics=10, num_words=10, log=False, formatted=True)

[(0,
  u'0.014*"child" + 0.013*"state" + 0.013*"time" + 0.012*"gun" + 0.011*"law" + 0.011*"federal" + 0.009*"life" + 0.009*"issue" + 0.009*"percent" + 0.008*"government"'),
 (1,
  u'0.029*"fed" + 0.022*"percent" + 0.018*"money" + 0.016*"problem" + 0.015*"business" + 0.014*"tax" + 0.012*"wallstreet" + 0.012*"job" + 0.011*"done" + 0.011*"time"'),
 (2,
  u'0.017*"american" + 0.013*"better" + 0.013*"change" + 0.012*"vote" + 0.011*"issue" + 0.011*"republican" + 0.010*"problem" + 0.009*"life" + 0.009*"senate" + 0.008*"running"'),
 (3,
  u'0.018*"life" + 0.016*"family" + 0.015*"jake" + 0.011*"fact" + 0.011*"kid" + 0.010*"greatest" + 0.009*"time" + 0.009*"able" + 0.009*"school" + 0.009*"child"'),
 (4,
  u'0.028*"tax" + 0.028*"budget" + 0.025*"cut" + 0.020*"billion" + 0.018*"state" + 0.017*"governor" + 0.017*"balanced" + 0.012*"isi" + 0.010*"time" + 0.010*"eight"'),
 (5,
  u'0.023*"tax" + 0.020*"money" + 0.017*"job" + 0.014*"socialsecurity" + 0.012*"pay" + 0.011*"family" + 0.011*"government" + 

In [17]:
# Print each topic on its own line as a (topic_id, "weight*word + ...") tuple,
# followed by one blank line.
# print(...) with a single argument produces the same output on Python 2 and 3,
# unlike the Python 2-only print statement used before.
for top in model.show_topics(num_topics=20):
    print(top)
print('')

(0, u'0.014*"child" + 0.013*"state" + 0.013*"time" + 0.012*"gun" + 0.011*"law" + 0.011*"federal" + 0.009*"life" + 0.009*"issue" + 0.009*"percent" + 0.008*"government"')
(1, u'0.029*"fed" + 0.022*"percent" + 0.018*"money" + 0.016*"problem" + 0.015*"business" + 0.014*"tax" + 0.012*"wallstreet" + 0.012*"job" + 0.011*"done" + 0.011*"time"')
(2, u'0.017*"american" + 0.013*"better" + 0.013*"change" + 0.012*"vote" + 0.011*"issue" + 0.011*"republican" + 0.010*"problem" + 0.009*"life" + 0.009*"senate" + 0.008*"running"')
(3, u'0.018*"life" + 0.016*"family" + 0.015*"jake" + 0.011*"fact" + 0.011*"kid" + 0.010*"greatest" + 0.009*"time" + 0.009*"able" + 0.009*"school" + 0.009*"child"')
(4, u'0.028*"tax" + 0.028*"budget" + 0.025*"cut" + 0.020*"billion" + 0.018*"state" + 0.017*"governor" + 0.017*"balanced" + 0.012*"isi" + 0.010*"time" + 0.010*"eight"')
(5, u'0.023*"tax" + 0.020*"money" + 0.017*"job" + 0.014*"socialsecurity" + 0.012*"pay" + 0.011*"family" + 0.011*"government" + 0.010*"million" + 0.008

In [18]:
# Print the 10 top words of every topic, without their probabilities.
# model.num_topics replaces the hard-coded 15, so the cell stays correct when the
# model is retrained with a different number of topics.
for topic_id in range(model.num_topics):
    top_words = model.show_topic(topic_id, 10)  # (word, probability) pairs
    print(', '.join([str(word) for word, _ in top_words]))

child, state, time, gun, law, federal, life, issue, percent, government
fed, percent, money, problem, business, tax, wallstreet, job, done, time
american, better, change, vote, issue, republican, problem, life, senate, running
life, family, jake, fact, kid, greatest, time, able, school, child
tax, budget, cut, billion, state, governor, balanced, isi, time, eight
tax, money, job, socialsecurity, pay, family, government, million, american, work
probably, actually, world, bigger, senator, mexico, respect, american, state, military
chris, company, city, immigration, law, leader, bankruptcy, wall, hundred, legally
russia, putin, benefit, talked, clinton, unitedstates, position, plannedparenthood, agreement, weakness
time, place, job, obama, government, tax, rate, system, business, work
government, week, world, growth, election, american, economic, clinton, lead, saying
big, problem, deal, government, company, track, win, record, woman, powerful
state, jake, nation, life, must, washington, e

# Assigns the topics to the documents in corpus

In [30]:
# Apply the trained model to the corpus: for each document this yields a list of
# (topic_id, probability) pairs, which is printed and collected in `results`.
lda_corpus = model[corpus]

results = []
for doc_topics in lda_corpus:
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print(doc_topics)
    results.append(doc_topics)
print('')

[(2, 0.41315935891788386), (8, 0.35266094809360732), (13, 0.21879504697612073)]
[(7, 0.57363729324593049), (8, 0.36799629462591371), (12, 0.043551558639079738)]
[(7, 0.90502711015325599), (10, 0.079215281793100459)]
[(0, 0.97407402208201865)]
[(4, 0.71574226310324485), (12, 0.2682083187461054)]
[(4, 0.98169929716189386)]
[(7, 0.63322969405715801), (9, 0.34908319676246663)]
[(7, 0.98685443560496877)]
[(7, 0.98095234807707321)]
[(2, 0.50284860326411929), (3, 0.1849465288458931), (12, 0.29651855469915761)]
[(2, 0.15453770250721743), (3, 0.36421871538171513), (7, 0.46695783742136504)]
[(2, 0.21471048869590093), (6, 0.26946502424193758), (7, 0.16558698326115454), (13, 0.33429543814520174)]
[(10, 0.48140964183892093), (13, 0.49843527883106448)]
[(0, 0.24191718048880528), (5, 0.74108932222487978)]
[(0, 0.983908018469551)]
[(12, 0.98205125274194061)]
[(1, 0.98564100611089578)]
[(9, 0.9747747295705399)]
[(9, 0.45098450815995095), (13, 0.52559201719766468)]
[(12, 0.63275750106441442), (13, 0.349

  chunks = self.iterencode(o, _one_shot=True)


In [31]:
# For every document keep only the id of its most probable topic.
toptopic = []
for doc_topics in results:
    # Each entry is a (topic_id, probability) pair; pick the pair with the
    # largest probability and keep its topic id.
    best_topic, _ = max(doc_topics, key=lambda pair: pair[1])
    toptopic.append(best_topic)
toptopic

[2,
 7,
 7,
 0,
 4,
 4,
 7,
 7,
 7,
 2,
 7,
 13,
 13,
 5,
 0,
 12,
 1,
 9,
 13,
 12,
 6,
 4,
 4,
 2,
 3,
 5,
 5,
 4,
 8,
 7,
 12,
 7,
 7,
 4,
 4,
 5,
 8,
 11,
 0,
 5,
 4,
 7,
 13,
 7,
 9,
 9,
 11,
 13,
 13,
 14,
 12,
 3,
 1,
 2,
 2,
 9,
 3,
 9,
 11,
 6,
 3,
 14,
 1,
 7,
 13,
 7,
 7,
 3,
 1,
 8,
 3,
 0,
 6,
 2,
 6,
 14,
 14,
 13,
 1,
 8,
 8,
 3,
 14,
 14,
 9,
 12,
 2,
 2,
 1,
 10,
 2,
 4,
 5,
 5,
 5,
 6,
 10,
 2,
 10,
 5,
 7,
 13,
 12,
 10,
 7,
 13,
 6,
 2,
 9,
 9,
 14,
 14,
 7,
 3,
 7,
 12,
 1,
 3,
 4,
 7,
 12,
 12,
 9,
 9,
 13,
 4,
 0,
 7,
 12,
 12,
 3,
 2,
 1,
 14,
 12,
 12,
 8,
 12,
 12,
 12,
 5,
 14,
 14,
 13,
 2,
 8,
 3,
 2,
 2,
 6,
 7,
 1,
 2,
 1,
 5,
 11,
 1,
 6,
 3,
 2,
 3,
 4,
 2,
 13,
 3,
 1,
 1,
 3,
 2,
 0,
 8,
 1,
 2,
 4,
 8,
 3,
 2,
 5,
 5,
 8,
 2,
 4,
 8,
 4,
 4,
 3,
 14,
 2,
 3,
 4,
 3,
 3,
 2,
 3,
 11,
 4,
 2,
 4,
 8,
 1,
 5,
 3,
 3,
 3,
 2,
 2,
 14,
 2,
 8,
 4,
 3,
 3,
 8,
 11,
 2,
 9,
 10,
 8,
 8,
 3,
 8,
 6,
 6,
 2,
 4,
 3,
 2,
 6,
 2,
 6,
 0,
 7,
 1,
 0,
 0,
 6,
 14

  chunks = self.iterencode(o, _one_shot=True)


In [32]:
# Put the document texts and their dominant topic side by side in one table.
# The topic column keeps pandas' default name 0.
toptopic = pd.DataFrame(toptopic)
documents = pd.DataFrame(documents).rename(columns={0: 'documents'})
# Join on the row index so that row i pairs document i with its top topic.
summary = documents.join(toptopic)
summary.head()

Unnamed: 0,documents,0
0,jebbush believe america verge greatest centur...,2
1,steady hand believe skill walker jake absolut...,7
2,maybe work clinton work anybody stage simple ...,7
3,make sure sophisticated weapon send signal ir...,0
4,well wrong lot thing wrong pro life governor ...,4


  chunks = self.iterencode(o, _one_shot=True)


In [33]:
# Count how many documents have each topic as their dominant topic
# (column 0 holds the topic id joined in from `toptopic`).
summary.groupby(0).count()

Unnamed: 0_level_0,documents
0,Unnamed: 1_level_1
0,14
1,29
2,38
3,39
4,31
5,28
6,29
7,33
8,37
9,22


  chunks = self.iterencode(o, _one_shot=True)


# Appendix 1

In [34]:
import pyLDAvis.gensim

  chunks = self.iterencode(o, _one_shot=True)


In [35]:
# Enable pyLDAvis rendering inside the notebook, then build the visualization
# from the trained model, the bag-of-words corpus and the dictionary.
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(model, corpus, dictionary)

  chunks = self.iterencode(o, _one_shot=True)


Topic numbers are changed in this visualization (LDAvis). 

- Tax cut / budget 
- War Syria Iraq Iran
- Immigration
- Plannedparenthood
- Big bank / wallstreet
- Judgement issue
- Iran, military
- Gun
- ...

# Appendix 2 

We want to show off the new `get_term_topics` and `get_document_topics` functionalities, and a good way to do so is to play around with words which might have different meanings in different context.

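In the tutorial this text is adapted from, the word `bank` is used as the example, because it can mean either the financial institution or a river bank.
The toy corpus presented there has 11 documents, 5 `river` related and 6 `finance` related. In this notebook we apply the same functions to the debate model instead, using words such as `border` and `tax`.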

### get_term_topics

The function `get_term_topics` returns the odds of that particular word belonging to a particular topic. 
A few examples:

In [36]:
# Topics associated with the word 'border', returned as (topic_id, value) pairs.
model.get_term_topics('border')

[(9, 0.014274237929471287)]

  chunks = self.iterencode(o, _one_shot=True)


In [37]:
# Same query for 'tax'; the result below lists several topics for this word.
model.get_term_topics('tax')

[(1, 0.010350983285945102),
 (6, 0.011640286399490168),
 (8, 0.030948322922213247),
 (12, 0.016718448584323437)]

  chunks = self.iterencode(o, _one_shot=True)


### get_document_topics 

`get_document_topics` is an already existing gensim functionality which uses the `inference` function to get the sufficient statistics and figure out the topic distribution of the document.

The addition to this is the ability for us to now know the topic distribution for each word in the document. 
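The original tutorial tests this with two different documents which have the word bank in them, one in the finance context and one in the river context. Here we test it with a single short document built from words in the debate vocabulary (`tax`, `cut`, `budget`, `border`).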

The `get_document_topics` method returns (along with the standard document topic proportion) the word_type followed by a list sorted with the most likely topic ids, when `per_word_topics` is set to `True`.

In [42]:
# Toy document: four words used to probe the model's per-word topic assignments.
bow = 'tax cut budget border'.split()

  chunks = self.iterencode(o, _one_shot=True)


In [43]:
# Convert the token list to bag-of-words format first: a list of
# (token_id, count) pairs built with the model's own dictionary.
bow = model.id2word.doc2bow(bow)
# Single-argument print(...) gives the same output on Python 2 and 3.
print(bow)

[(178, 1), (474, 1), (702, 1), (876, 1)]


  chunks = self.iterencode(o, _one_shot=True)


In [44]:
# With per_word_topics=True the call returns three values: the document-level
# topic distribution, the most likely topic ids for each word id, and the
# per-word phi values. The per-word topic lists are displayed here.
doc_topics, word_topics, phi_values = model.get_document_topics(bow, per_word_topics=True)
word_topics

[(178, [8]), (474, [9]), (702, [8]), (876, [8, 9])]

  chunks = self.iterencode(o, _one_shot=True)


In [45]:
# For each word id: the topics assigned to it, each paired with its phi value.
phi_values

[(178, [(8, 0.99999988974315979)]),
 (474, [(9, 0.99999962829594014)]),
 (702, [(8, 0.99999999317176114)]),
 (876, [(8, 0.96637311211886046), (9, 0.033626862030738697)])]

  chunks = self.iterencode(o, _one_shot=True)


In [52]:
# Print every token of the dictionary together with its integer id.
# .items() is available on both Python 2 and 3 (iteritems() exists only on
# Python 2), and the formatted print(...) gives the same "token id" output on both.
for token, token_id in dictionary.token2id.items():
    print('%s %s' % (token, token_id))

secondly 0
evidence 986
chinese 1
saying 532
caused 3
global 4
dollar 5
focus 6
month 7
four 8
island 904
higher 740
welfare 10
follow 11
moon 879
hate 13
increase 357
certainly 15
voter 16
catastrophe 18
zone 19
doe 523
malley 20
obamacare 21
young 144
send 23
environment 24
charge 25
program 26
voted 27
health 361
worth 30
sent 31
paying 696
woman 35
risk 36
advantage 37
sitting 38
far 39
billionaire 40
rise 41
choice 43
balancing 45
fall 47
telling 48
assad 277
trouble 50
difference 51
bringing 52
minute 53
entire 54
school 55
immediately 888
level 57
forth 362
solution 59
leave 61
race 62
saudi 63
team 64
small 65
economically 66
taxcode 955
prevent 67
revolution 68
force 69
ten 70
core 884
dealing 71
consistent 72
cyber 73
sign 74
past 75
second 76
street 77
video 78
governor 703
lawyer 80
wife 1027
poll 869
turning 365
folk 721
even 83
afghanistan 84
stood 85
constitution 86
newjersey 87
giving 88
saved 89
liberty 92
debate 544
waiting 94
run 885
capital 96
new 97
net 98
single 9

  chunks = self.iterencode(o, _one_shot=True)


In [51]:
# Find the token that has id 474 (one of the word ids in the results above).
# Indexing the Dictionary by id returns the token directly, so there is no need
# to scan the whole token2id mapping.
# NOTE: unlike the scan, this raises KeyError if the id is not in the dictionary.
token_id = 474
print('%s %s' % (dictionary[token_id], token_id))

border 474


  chunks = self.iterencode(o, _one_shot=True)
