In [1]:
# generic
import os
import string
from operator import itemgetter
from sklearn.externals import joblib

# text
from textblob import TextBlob
from nltk.corpus import stopwords

# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

In [2]:
path = os.getcwd()+'\\data'
fileList = os.listdir(path)

fileList[3]

'2008_44_pid=76232.txt'

In [3]:
pathToFile = path + '\\' + fileList[3]
with open(pathToFile, 'r') as f:
    txt = f.readlines()
    
txtList = txt[0].split('.')
txtList

['Thank you, Iowa',
 ' You know, they said this day would never come',
 ' They said our sights were set too high',
 ' They said this country was too divided; too disillusioned to ever come together around a common purpose',
 " But on this January night - at this defining moment in history - you have done what the cynics said we couldn't do",
 ' You have done what the state of New Hampshire can do in five days',
 ' You have done what America can do in this New Year, 2008',
 ' In lines that stretched around schools and churches; in small towns and big cities; you came together as Democrats, Republicans and Independents to stand up and say that we are one nation; we are one people; and our time for change has come',
 " You said the time has come to move beyond the bitterness and pettiness and anger that's consumed Washington; to end the political strategy that's been all about division and instead make it about addition - to build a coalition for change that stretches through Red States a

In [4]:
stop = stopwords.words('english')
stop += ['.', ',', '(', ')', "'", '"']

def stringClean(str):
    str = ''.join([i for i in str if i not in string.punctuation])
    words = TextBlob(str).words
    words = ' '.join([w for w in words if w not in stop if len(w)> 1])
    
    return words

updatedTextList = []
for txt in txtList:
    updatedTextList.append(stringClean(txt))
    
updatedTextList[:5]

['Thank Iowa',
 'You know said day would never come',
 'They said sights set high',
 'They said country divided disillusioned ever come together around common purpose',
 'But January night defining moment history done cynics said couldnt']

In [5]:
# Create a CountVectorizer for parsing/counting words
# make sure we are breaking out by word and not character.
doc_term_vector = CountVectorizer(analyzer='word',
                                  ngram_range=(1,2),
                                  stop_words='english',
                                  token_pattern = '\\b[a-z][a-z]+\\b'
                                 )

doc_term_vector.fit(txtList)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [6]:
doc_term_vector.get_feature_names()[:5]

['able', 'able breathe', 'able look', 'addition', 'addition build']

In [7]:
# Create the term-document matrix
# Transpose it so the terms are the rows
term_docs = doc_term_vector.transform(updatedTextList).transpose()
term_docs.shape

(823, 47)

In [8]:
# Convert sparse matrix of counts to a gensim corpus
# here corpus will be fed into lda in iteration
corpus = matutils.Sparse2Corpus(term_docs)

In [9]:
id2word = dict((v,k) for k, v in doc_term_vector.vocabulary_.items())

In [23]:
# Create lda model (equivalent to "fit" in sklearn)
# we can also run lsa on this:
# lsa = models.LsiModel(corpus=corpus, num_topics)
lda = models.LdaModel(corpus=corpus, num_topics=10, id2word=id2word, passes=20)

In [11]:
lda.print_topics(num_words=5, num_topics=10)

[(0, '0.013*nation + 0.009*world + 0.009*hope + 0.009*come + 0.009*time'),
 (1, '0.019*know + 0.019*america + 0.013*hope + 0.013*sees + 0.007*said'),
 (2, '0.015*led + 0.015*moment + 0.010*hope + 0.010*cause + 0.010*young'),
 (3, '0.014*states + 0.010*said + 0.010*washington + 0.010*tax + 0.005*led'),
 (4,
  '0.017*country + 0.010*health + 0.010*care + 0.010*health care + 0.010*finally'),
 (5,
  '0.023*iowa + 0.015*know + 0.015*thank iowa + 0.015*thank + 0.008*roadblocks'),
 (6,
  '0.018*america + 0.012*president + 0.012*common + 0.012*common threats + 0.012*threats'),
 (7,
  '0.015*little + 0.008*new + 0.008*sacrifice + 0.008*comes little + 0.008*pay lot'),
 (8, '0.027*thank + 0.018*obama + 0.010*closer + 0.010*love + 0.010*makes'),
 (9, '0.001*led + 0.001*young + 0.001*cause + 0.001*moment + 0.001*divided')]

In [12]:
# here we are looking into top words that pertain to topic 0
lda.get_topic_terms(2,topn=10)

[(408, 0.014951085364410407),
 (483, 0.014942603465743614),
 (337, 0.010129232864630223),
 (102, 0.010128715329547759),
 (817, 0.010128259597981019),
 (618, 0.0053070292962610725),
 (332, 0.0053065738275374641),
 (193, 0.0053065689793565726),
 (201, 0.0053065443770853333),
 (192, 0.0053065433258200458)]

In [17]:
lda_corpus = lda[corpus]

In [18]:
lda_docs = [doc for doc in lda_corpus]

In [19]:
lda_docs[0]

[(0, 0.025000000117549978),
 (1, 0.025000721463372268),
 (2, 0.025000000083206794),
 (3, 0.025003397250346206),
 (4, 0.77499400909080118),
 (5, 0.025000000126935967),
 (6, 0.025000000125384444),
 (7, 0.025001202016736005),
 (8, 0.025000669559327497),
 (9, 0.025000000166339856)]

In [20]:
corpus

<gensim.matutils.Sparse2Corpus at 0x7973770>

In [13]:
# above we see that word id 376 has the highest
# probability of being in topic 0
# from below the word is iowa
lda.id2word

{0: 'able',
 1: 'able breathe',
 2: 'able look',
 3: 'addition',
 4: 'addition build',
 5: 'afford',
 6: 'afford doctor',
 7: 'afford health',
 8: 'affordable',
 9: 'affordable available',
 10: 'ages',
 11: 'ages common',
 12: 'ahead',
 13: 'ahead roadblocks',
 14: 'america',
 15: 'america did',
 16: 'america differently',
 17: 'america moment',
 18: 'america new',
 19: 'america remembered',
 20: 'america sees',
 21: 'america world',
 22: 'american',
 23: 'american ideas',
 24: 'american way',
 25: 'americans',
 26: 'americans deserve',
 27: 'americans participated',
 28: 'anger',
 29: 'anger consumed',
 30: 'available',
 31: 'available single',
 32: 'awaits',
 33: 'awaits courage',
 34: 'band',
 35: 'band colonists',
 36: 'barriers',
 37: 'barriers divided',
 38: 'beat',
 39: 'beat politics',
 40: 'beat washington',
 41: 'bed',
 42: 'bed night',
 43: 'bedrock',
 44: 'bedrock nation',
 45: 'began',
 46: 'began streets',
 47: 'belief',
 48: 'belief destiny',
 49: 'believe',
 50: 'believ

# how about we combined many texts togather and try this out

In [2]:
path = os.getcwd()+'\\data'
fileList = os.listdir(path)

# in the part of file 'speech_data_pull_v3.ipynb' I have 
pathToFile = path + '\\' + 'bulkSpeeches.txt'
with open(pathToFile, 'r') as f:
    txt = f.readlines()

In [3]:
txtList = txt[0].split('.')
txtList[:5]

['Wherever I go, I talk about how we need to bring about real change in this country',
 ' And few understand the need for change as well as folks here in Michigan',
 " Because while we've been talking about a recession in this country for a few months now, Michigan has been living it for a very long time",
 ' Michigan has the highest unemployment rate in the nation and workers and communities across this state have been struggling for years with the downturn that all of America is feeling today',
 ' In fairness, some of these challenges are the product of larger forces beyond the control of government']

In [4]:
stop = stopwords.words('english')
stop += ['.', ',', '(', ')', "'", '"']

def stringClean(str):
    str = ''.join([i for i in str if i not in string.punctuation])
    words = TextBlob(str).words
    words = ' '.join([w for w in words if w not in stop if len(w)> 1])
    
    return words

updatedTextList = []
for txt in txtList:
    updatedTextList.append(stringClean(txt))
    
updatedTextList[:5]

['Wherever go talk need bring real change country',
 'And understand need change well folks Michigan',
 'Because weve talking recession country months Michigan living long time',
 'Michigan highest unemployment rate nation workers communities across state struggling years downturn America feeling today',
 'In fairness challenges product larger forces beyond control government']

In [5]:
len(updatedTextList)

30145

In [6]:
# Create a CountVectorizer for parsing/counting words
# make sure we are breaking out by word and not character.
doc_term_vector = CountVectorizer(analyzer='word',
                                  ngram_range=(1,2),
                                  stop_words='english',
                                  token_pattern = '\\b[a-z][a-z]+\\b'
                                 )

doc_term_vector.fit(updatedTextList)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='\\b[a-z][a-z]+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
doc_term_vector.get_feature_names()[:5]

['aan', 'aan energy', 'aaron', 'aaron joshua', 'aarp']

In [8]:
# Create the term-document matrix
# Transpose it so the terms are the rows
term_docs = doc_term_vector.transform(updatedTextList).transpose()
term_docs.shape

(136374, 30145)

In [9]:
# Convert sparse matrix of counts to a gensim corpus
# here corpus will be fed into lda in iteration
corpus = matutils.Sparse2Corpus(term_docs)

In [10]:
id2word = dict((v,k) for k, v in doc_term_vector.vocabulary_.items())

In [19]:
# Create lda model (equivalent to "fit" in sklearn)
# we can also run lsa on this:
# lsa = models.LsiModel(corpus=corpus, num_topics)

# !!! skip this part:this take a really long time, hence I am going to pickle it and run later

lda = models.LdaModel(corpus=corpus, num_topics=10, id2word=id2word, passes=20)

KeyboardInterrupt: 

In [11]:
lda

NameError: name 'lda' is not defined

In [19]:
# !!! skip this part:this take a really long time, hence I am going to pickle it and run later

joblib.dump(lda, 'lda_model.pkl')

['lda_model.pkl',
 'lda_model.pkl_01.npy',
 'lda_model.pkl_02.npy',
 'lda_model.pkl_03.npy',
 'lda_model.pkl_04.npy',
 'lda_model.pkl_05.npy']

In [13]:
lda_model = joblib.load('lda_model.pkl')

In [14]:
lda_model.print_topics(num_words=10, num_topics=10)

[(0,
  '0.014*think + 0.011*going + 0.008*im + 0.008*know + 0.007*obama + 0.006*dont + 0.005*look + 0.005*make + 0.004*people + 0.003*campaign'),
 (1,
  '0.014*care + 0.013*health + 0.012*credit + 0.012*card + 0.011*credit card + 0.010*health care + 0.008*ive + 0.007*got + 0.006*seen + 0.006*making'),
 (2,
  '0.005*let + 0.005*bad + 0.004*rules + 0.004*america + 0.004*god + 0.004*bush + 0.004*second + 0.004*opponent + 0.004*bless + 0.004*god bless'),
 (3,
  '0.013*tax + 0.012*mccain + 0.010*families + 0.010*john + 0.008*john mccain + 0.007*years + 0.006*americans + 0.005*senator + 0.004*thats + 0.004*dont'),
 (4,
  '0.008*make + 0.007*sure + 0.007*make sure + 0.005*need + 0.004*ill + 0.004*security + 0.004*lets + 0.004*help + 0.003*protect + 0.003*job'),
 (5,
  '0.006*women + 0.005*work + 0.005*men + 0.005*america + 0.005*jobs + 0.004*men women + 0.003*young + 0.003*gas + 0.003*economy + 0.003*decisions'),
 (6,
  '0.019*thats + 0.016*change + 0.015*american + 0.013*people + 0.009*need 

In [15]:
# note a document = a sentence
# we want to find which sentences belong to which topic

# !!! skip this part:this take a really long time, hence I am going to pickle it and run later

lda_corpus = lda_model[corpus]

In [26]:
# !!! skip this part: already pickled
lda_corpus_pickled = joblib.dump(lda_corpus, 'lda_corpus.pkl')

In [16]:
#lda_corpus_unpickled = joblib.load('lda_corpus.pkl')
#lda_docs = [doc for doc in lda_corpus_unpickled]

NameError: name 'lda_corpus_unpickled' is not defined

In [17]:
#!!!!!!!! don't run this already pickled below

lda_docs = [doc for doc in lda_corpus]

In [24]:
lda_docs_pickled = joblib.dump(lda_docs, 'model_data/lda_docs.pkl')

In [2]:
lda_docs_unpickled = joblib.load('model_data/lda_docs.pkl')

NameError: name 'joblib' is not defined

In [26]:
lda_doc_test = lda_docs_unpickled[0]

In [27]:
def getSentence(sentence):
    return sorted(sentence, key=itemgetter(1), reverse=True)[0][0]

sentenceAndTopicList = []
for index, doc in enumerate(lda_docs_unpickled):
    temp = [index, getSentence(doc)]
    sentenceAndTopicList.append(temp)
    
sentenceAndTopicList[:20]
len(sentenceAndTopicList)
#getSentence(lda_doc_test)

30145

In [30]:
import pandas as pd

def getSentence(data, textList):
    return textList[data]

df = pd.DataFrame(sentenceAndTopicList)
df.rename(columns={0: 'docIndex', 1: 'topic'}, inplace=True)
df['sentence'] = df.apply(lambda row: getSentence(row.docIndex, txtList), axis=1)
df.sort(['topic'], ascending=True, inplace=True)
df[df['topic']==1].head()



Unnamed: 0,docIndex,topic,sentence
26476,26476,1,I will reform our health care system so we ca...
24201,24201,1,"In the Senate, I worked across the aisle to c..."
26477,26477,1,"If you have health care, my plan will lower y..."
1828,1828,1,Other nations would feel great pressure to ac...
16984,16984,1,"And, you know, I've seen and heard worse"


In [42]:
topic1 = []
df_topic1 = df[df['topic'] == 1]
topic1_sentences = list(df_topic1.iloc[:,2].values)

topic1_sentences[:5]

[' I will reform our health care system so we can relieve families, businesses, and our economy from the crushing cost of health care by investing in new technology and preventative care',
 ' In the Senate, I worked across the aisle to crack down on these schemes',
 ' If you have health care, my plan will lower your premiums',
 ' Other nations would feel great pressure to accommodate Iranian demands',
 " And, you know, I've seen and heard worse"]

### This part is taken from markov.ipynb

In [None]:
import random
import string
from collections import defaultdict

# ======= (1) open a single file and upload its string into documents

speech = []
with open('obamaSpeech.txt', 'r') as f:
    # sppech is not a list where each element is a new line
    speech.extend(f.readlines())
    
document = ''.join(speech)

# ======= (2) remove all punctuation except '.' which could be used to split lines later 

strPunctuation = string.punctuation.replace('.', '').replace("'", '')

# remove all punctuation
for punct in strPunctuation:
    document = document.replace(punct, ' ')

# remove newLines, tabs and single quote
for i in ["\t","\n", "'"]:
    document = document.replace(i, '')

# ======= (3) Now split by '.'

docList = document.split('.')
docList = filter(lambda x: len(x)>1, docList) # get rid of any string that has length greater than 1
docList = map(lambda x: x.strip(), docList) # strip away all empty strings at the ends
docList = list(docList)

# ======= (4) Start doing n-grams

def find_ngrams(input_string, n):
    # replace double space by single space
    input_string = input_string.replace('  ', ' ')
    input_list = input_string.split(' ')
    return zip(*[input_list[i:] for i in range(n)])

# ======= (5) Start doing n-grams

def generateNGramDict(docList):
    d = defaultdict(list)
    for doc in docList:
        trigrams = list(find_ngrams(doc,3))
        for trigram in trigrams:
            d[trigram[:2]].append(trigram[2])
            
    return d

dtemp = generateNGramDict(docList)

# ======= (6) Test sentence generator

def generateText(triGramDict, numOfLoops, firstWord='getting', secondWord='the'):
    newSpeech = [firstWord,secondWord]
    counter = 0
    
    while counter < numOfLoops:
        try:
            firstWord, secondWord = secondWord, np.random.choice(triGramDict[(firstWord,secondWord)])
            #print(newSpeech)
            newSpeech.append(secondWord)
        except:
            break
        
    return ' '.join(newSpeech)

generateText(dtemp, 10, 'I', 'want')
#docList    
#dtemp[('getting','the')]