In [21]:
import os
import string

from textblob import TextBlob

from nltk.corpus import stopwords

# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

In [22]:
path = os.getcwd()+'\\data'
fileList = os.listdir(path)

fileList[:4]

['2008_44_pid=29751.txt',
 '2008_44_pid=62272.txt',
 '2008_44_pid=73120.txt',
 '2008_44_pid=76232.txt']

In [23]:
pathToFile = path + '\\' + fileList[3]
with open(pathToFile, 'r') as f:
    txt = f.readlines()
    
txtList = txt[0].split('.')
txtList[:3]

['Thank you, Iowa',
 ' You know, they said this day would never come',
 ' They said our sights were set too high']

In [8]:
stop = stopwords.words('english')
stop += ['.', ',', '(', ')', "'", '"']

def stringClean(str):
    str = ''.join([i for i in str if i not in string.punctuation])
    words = TextBlob(str).words
    words = ' '.join([w for w in words if w not in stop if len(w)> 1])
    
    return words

updatedTextList = []
for txt in txtList:
    updatedTextList.append(stringClean(txt))
    
updatedTextList[:5]

['Thank Iowa',
 'You know said day would never come',
 'They said sights set high',
 'They said country divided disillusioned ever come together around common purpose',
 'But January night defining moment history done cynics said couldnt']

In [9]:
# Create a CountVectorizer for parsing/counting words
# make sure we are breaking out by word and not character.
doc_term_vector = CountVectorizer(analyzer='word',
                                  ngram_range=(1,2),
                                  stop_words='english',
                                  token_pattern = '\\b[a-z][a-z]+\\b'
                                 )

doc_term_vector.fit(txtList)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [10]:
doc_term_vector.get_feature_names()[:5]

['able', 'able breathe', 'able look', 'addition', 'addition build']

In [11]:
# Create the term-document matrix
# Transpose it so the terms are the rows
term_docs = doc_term_vector.transform(updatedTextList).transpose()
term_docs.shape

(823, 47)

In [12]:
# Convert sparse matrix of counts to a gensim corpus
# here corpus will be fed into lda in iteration
corpus = matutils.Sparse2Corpus(term_docs)

In [13]:
id2word = dict((v,k) for k, v in doc_term_vector.vocabulary_.items())

In [14]:
# Create lda model (equivalent to "fit" in sklearn)
# we can also run lsa on this:
# lsa = models.LsiModel(corpus=corpus, num_topics)
lda = models.LdaModel(corpus=corpus, num_topics=10, id2word=id2word, passes=20)

In [15]:
lda.print_topics(num_words=20, num_topics=5)

[(8,
  '0.013*common + 0.009*come + 0.009*said + 0.009*states + 0.009*make + 0.009*know + 0.009*change + 0.009*ill + 0.009*standing + 0.009*common threats + 0.009*threats + 0.005*disillusioned + 0.005*disillusioned come + 0.005*common purpose + 0.005*purpose + 0.005*country divided + 0.005*divided disillusioned + 0.005*come common + 0.005*said country + 0.005*divided'),
 (9,
  '0.010*states + 0.010*america + 0.010*united states + 0.010*states america + 0.010*united + 0.010*father kenya + 0.010*story happen + 0.010*kansas story + 0.010*led today + 0.010*kansas + 0.010*father + 0.010*happen united + 0.010*today father + 0.010*happen + 0.010*kenya mother + 0.010*mother kansas + 0.010*mother + 0.010*kenya + 0.010*today + 0.010*story'),
 (4,
  '0.022*thank + 0.015*america + 0.015*world + 0.015*thank iowa + 0.015*iowa + 0.008*change + 0.008*nation + 0.008*choosing + 0.008*division + 0.008*powerful + 0.008*unity division + 0.008*change coming + 0.008*choosing unity + 0.008*sending + 0.008*sen

In [16]:
# here we are looking into top words that pertain to topic 0
lda.get_topic_terms(2,topn=10)

[(817, 0.018116704726318947),
 (408, 0.013698446881998359),
 (337, 0.013698153360476072),
 (180, 0.0092793269994243706),
 (819, 0.0092787703726663704),
 (791, 0.0092787387663826153),
 (547, 0.0048611238068859922),
 (258, 0.0048609655306794172),
 (186, 0.0048609516907960814),
 (280, 0.0048609479210782397)]

In [17]:
lda_corpus = lda[corpus]

In [18]:
lda_docs = [doc for doc in lda_corpus]

In [19]:
lda_docs[0]

[(0, 0.025000000117549978),
 (1, 0.025000721463372268),
 (2, 0.025000000083206794),
 (3, 0.025003397250346206),
 (4, 0.77499400909080118),
 (5, 0.025000000126935967),
 (6, 0.025000000125384444),
 (7, 0.025001202016736005),
 (8, 0.025000669559327497),
 (9, 0.025000000166339856)]

In [20]:
corpus

<gensim.matutils.Sparse2Corpus at 0x7973770>

In [13]:
# above we see that word id 376 has the highest
# probability of being in topic 0
# from below the word is iowa
lda.id2word

{0: 'able',
 1: 'able breathe',
 2: 'able look',
 3: 'addition',
 4: 'addition build',
 5: 'afford',
 6: 'afford doctor',
 7: 'afford health',
 8: 'affordable',
 9: 'affordable available',
 10: 'ages',
 11: 'ages common',
 12: 'ahead',
 13: 'ahead roadblocks',
 14: 'america',
 15: 'america did',
 16: 'america differently',
 17: 'america moment',
 18: 'america new',
 19: 'america remembered',
 20: 'america sees',
 21: 'america world',
 22: 'american',
 23: 'american ideas',
 24: 'american way',
 25: 'americans',
 26: 'americans deserve',
 27: 'americans participated',
 28: 'anger',
 29: 'anger consumed',
 30: 'available',
 31: 'available single',
 32: 'awaits',
 33: 'awaits courage',
 34: 'band',
 35: 'band colonists',
 36: 'barriers',
 37: 'barriers divided',
 38: 'beat',
 39: 'beat politics',
 40: 'beat washington',
 41: 'bed',
 42: 'bed night',
 43: 'bedrock',
 44: 'bedrock nation',
 45: 'began',
 46: 'began streets',
 47: 'belief',
 48: 'belief destiny',
 49: 'believe',
 50: 'believ

# how about we combined many texts togather and try this out