In [62]:
import os
import string

from textblob import TextBlob

from nltk.corpus import stopwords

# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

In [8]:
path = os.getcwd()+'\\data'
fileList = os.listdir(path)

fileList[:4]

['2008_44_pid=29751.txt',
 '2008_44_pid=62272.txt',
 '2008_44_pid=73120.txt',
 '2008_44_pid=76232.txt']

In [22]:
pathToFile = path + '\\' + fileList[3]
with open(pathToFile, 'r') as f:
    txt = f.readlines()
    
txtList = txt[0].split('.')
txtList[:7]

['Thank you, Iowa',
 ' You know, they said this day would never come',
 ' They said our sights were set too high',
 ' They said this country was too divided; too disillusioned to ever come together around a common purpose',
 " But on this January night - at this defining moment in history - you have done what the cynics said we couldn't do",
 ' You have done what the state of New Hampshire can do in five days',
 ' You have done what America can do in this New Year, 2008']

In [83]:
stop = stopwords.words('english')
stop += ['.', ',', '(', ')', "'", '"']

def stringClean(str):
    str = ''.join([i for i in str if i not in string.punctuation])
    words = TextBlob(str).words
    words = ' '.join([w for w in words if w not in stop if len(w)> 1])
    
    return words

updatedTextList = []
for txt in txtList:
    updatedTextList.append(stringClean(txt))
    
updatedTextList

['Thank Iowa',
 'You know said day would never come',
 'They said sights set high',
 'They said country divided disillusioned ever come together around common purpose',
 'But January night defining moment history done cynics said couldnt',
 'You done state New Hampshire five days',
 'You done America New Year 2008',
 'In lines stretched around schools churches small towns big cities came together Democrats Republicans Independents stand say one nation one people time change come',
 'You said time come move beyond bitterness pettiness anger thats consumed Washington end political strategy thats division instead make addition build coalition change stretches Red States Blue States',
 'Because thats well win November thats well finally meet challenges face nation',
 'We choosing hope fear',
 'Were choosing unity division sending powerful message change coming America',
 'You said time come tell lobbyists think money influence speak louder voices dont government take back',
 'The time come

In [84]:
# Create a CountVectorizer for parsing/counting words
# make sure we are breaking out by word and not character.
doc_term_vector = CountVectorizer(analyzer='word',
                                  ngram_range=(1,2),
                                  stop_words='english',
                                  token_pattern = '\\b[a-z][a-z]+\\b'
                                 )

doc_term_vector.fit(txtList)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [85]:
doc_term_vector.get_feature_names()

['able',
 'able breathe',
 'able look',
 'addition',
 'addition build',
 'afford',
 'afford doctor',
 'afford health',
 'affordable',
 'affordable available',
 'ages',
 'ages common',
 'ahead',
 'ahead roadblocks',
 'america',
 'america did',
 'america differently',
 'america moment',
 'america new',
 'america remembered',
 'america sees',
 'america world',
 'american',
 'american ideas',
 'american way',
 'americans',
 'americans deserve',
 'americans participated',
 'anger',
 'anger consumed',
 'available',
 'available single',
 'awaits',
 'awaits courage',
 'band',
 'band colonists',
 'barriers',
 'barriers divided',
 'beat',
 'beat politics',
 'beat washington',
 'bed',
 'bed night',
 'bedrock',
 'bedrock nation',
 'began',
 'began streets',
 'belief',
 'belief destiny',
 'believe',
 'believe families',
 'believed',
 'believed deeply',
 'believes',
 'believes country',
 'better',
 'better awaits',
 'big',
 'big cities',
 'bit',
 'bit better',
 'bitterness',
 'bitterness pettiness',

In [86]:
# Create the term-document matrix
# Transpose it so the terms are the rows
term_docs = doc_term_vector.transform(updatedTextList).transpose()
term_docs.shape

(823, 47)

In [87]:
# Convert sparse matrix of counts to a gensim corpus
# here corpus will be fed into lda in iteration
corpus = matutils.Sparse2Corpus(term_docs)

In [88]:
id2word = dict((v,k) for k, v in doc_term_vector.vocabulary_.items())

In [89]:
# Create lda model (equivalent to "fit" in sklearn)
# we can also run lsa on this:
# lsa = models.LsiModel(corpus=corpus, num_topics)
lda = models.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, passes=20)

In [93]:
lda.print_topics(num_words=20, num_topics=5)

[(0,
  '0.009*know + 0.008*states + 0.008*thank + 0.007*come + 0.007*iowa + 0.005*moment + 0.005*said + 0.005*time + 0.005*country + 0.005*message + 0.005*time come + 0.005*politics + 0.005*change + 0.004*new + 0.004*hampshire + 0.004*new hampshire + 0.004*finally + 0.004*people + 0.004*stand + 0.004*campaign'),
 (1,
  '0.007*ill + 0.007*president + 0.006*change + 0.006*health + 0.006*health care + 0.006*care + 0.006*said + 0.004*hope + 0.004*finally + 0.004*common + 0.004*woman + 0.004*country + 0.004*way + 0.004*common threats + 0.004*american + 0.004*threats + 0.004*republicans + 0.004*democrats republicans + 0.004*democrats + 0.004*young woman'),
 (2,
  '0.011*america + 0.010*hope + 0.008*nation + 0.008*led + 0.008*moment + 0.006*night + 0.006*world + 0.004*new + 0.004*say + 0.004*young + 0.004*united + 0.004*tax + 0.004*able + 0.004*years + 0.004*say moment + 0.004*look + 0.004*sees + 0.004*men + 0.004*women + 0.002*days')]

In [54]:
# here we are looking into top words that pertain to topic 0
lda.get_topic_terms(2,topn=10)

[(435, 0.0076053105301616929),
 (283, 0.0076047394541385481),
 (163, 0.0058656881312008418),
 (563, 0.0058437955283436099),
 (511, 0.0058409355384847848),
 (483, 0.0041434358182013496),
 (440, 0.0041049268952231058),
 (115, 0.0041024087217231917),
 (576, 0.0041013536640147792),
 (687, 0.0040948915104582814)]

In [48]:
# above we see that word id 376 has the highest
# probability of being in topic 0
# from below the word is iowa
lda.id2word

{0: 'able',
 1: 'able breathe',
 2: 'able look',
 3: 'addition',
 4: 'addition build',
 5: 'afford',
 6: 'afford doctor',
 7: 'afford health',
 8: 'affordable',
 9: 'affordable available',
 10: 'ages',
 11: 'ages common',
 12: 'ahead',
 13: 'ahead roadblocks',
 14: 'america',
 15: 'america did',
 16: 'america differently',
 17: 'america moment',
 18: 'america new',
 19: 'america remembered',
 20: 'america sees',
 21: 'america world',
 22: 'american',
 23: 'american ideas',
 24: 'american way',
 25: 'americans',
 26: 'americans deserve',
 27: 'americans participated',
 28: 'anger',
 29: 'anger consumed',
 30: 'available',
 31: 'available single',
 32: 'awaits',
 33: 'awaits courage',
 34: 'band',
 35: 'band colonists',
 36: 'barriers',
 37: 'barriers divided',
 38: 'beat',
 39: 'beat politics',
 40: 'beat washington',
 41: 'bed',
 42: 'bed night',
 43: 'bedrock',
 44: 'bedrock nation',
 45: 'began',
 46: 'began streets',
 47: 'belief',
 48: 'belief destiny',
 49: 'believe',
 50: 'believ

# how about we combined many texts togather and try this out