In [86]:
#download inaugural
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [119]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import inaugural, stopwords
from collections import Counter
from tqdm import tqdm_notebook as tqdm
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.model_selection import cross_val_score


In [78]:
def parse_sentence_president_nlp(text, president):
    nlp = spacy.load('en')
    tokens = nlp(text)
    return [[sent, president] for sent in tokens.sents]

In [79]:
def bag_of_words(text):
    nlp = spacy.load('en')
    tokens = nlp(text)
   
    # Filter out punctuation and stop words.
    allwords = ["{}_{}".format(token.lemma_, token.pos_)
                for token in tokens
                if not token.is_punct
                and not token.is_stop]
     
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(1000)]

In [80]:
def text_cleaner(text):
    text = re.sub(r'\n',' ',text)
    text = re.sub(r',','',text)
    text = re.sub(r':','',text)
    text = ' '.join(text.split())
    return text.lower()

In [81]:
# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    df['word_count'] = [sum(1 for token in sent if not token.is_punct and not token.is_stop) for sent in df['text_sentence']]
    df['punct_count'] = [sum(1 for token in sent if token.is_punct) for sent in df['text_sentence']]
    
    df['prev_word_count'] = [sum(1 for token in sent if not token.is_punct and not token.is_stop) for sent in df['text_sentence']]
    
    df['prev_word_count'] = df['word_count'].shift(1)
    df['prev_word_count'].fillna(0, inplace=True)
    
    df['next_word_count'] = df['word_count'].shift(-1)
    df['next_word_count'].fillna(0, inplace=True)
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(tqdm(df['text_sentence'])):
            
        words = ["{}_{}".format(token.lemma_, token.pos_)
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and "{}_{}".format(token.lemma_, token.pos_) in common_words
                 )]
        
        word_counts = Counter(words).items()
        
        # Populate the row with word counts.
        for word, count in word_counts:
            df.loc[i, word] = count
            
    return df

In [82]:
def my_preprocessor(doc):
    return(doc.lower())

In [83]:
def my_tokenizer(doc):
    nlp = spacy.load('en')
    tokens = nlp(doc)
    #return " ".join(["{}_{}".format(token.lemma_, token.pos_) for token in tokens if not token.is_punct and not token.is_stop])
    return(["{}_{}".format(token.lemma_, token.pos_) for token in tokens if not token.is_punct and not token.is_stop])

In [84]:
def get_sentences(sentences,president):
    inaug_sents=[]
    pres=[]
    for sent in sentences:
        inaug_sents.append(' '.join(sent))    
        pres.append(president)
    return inaug_sents, pres

In [128]:
sents

['Fellow - Citizens of the Senate and of the House of Representatives :',
 'Among the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order , and received on the 14th day of the present month .',
 'On the one hand , I was summoned by my Country , whose voice I can never hear but with veneration and love , from a retreat which I had chosen with the fondest predilection , and , in my flattering hopes , with an immutable decision , as the asylum of my declining years -- a retreat which was rendered every day more necessary as well as more dear to me by the addition of habit to inclination , and of frequent interruptions in my health to the gradual waste committed on it by time .',
 'On the other hand , the magnitude and difficulty of the trust to which the voice of my country called me , being sufficient to awaken in the wisest and most experienced of her citizens a distrustful scrutiny into his

In [86]:
#Get sentences in list format (before NLP)
file_pres = [['1789-Washington.txt','Washington'],['1793-Washington.txt','Washington'],['2009-Obama.txt','Obama']]

sents = []
pres = []
for i in file_pres:
    s, p = get_sentences(inaugural.sents(i[0]), i[1])
    sents = sents + s
    pres = pres + p
    
sentences_tfidf = pd.concat([pd.DataFrame(sents), pd.DataFrame(pres)],axis=1)
sentences_tfidf.columns = ['sentence','president']
sentences_tfidf.head()

Unnamed: 0,sentence,president
0,Fellow - Citizens of the Senate and of the Hou...,Washington
1,Among the vicissitudes incident to life no eve...,Washington
2,"On the one hand , I was summoned by my Country...",Washington
3,"On the other hand , the magnitude and difficul...",Washington
4,In this conflict of emotions all I dare aver i...,Washington


In [87]:
#Set up sentences
sentences = pd.DataFrame(parse_sentence_president_nlp(inaugural.raw('1789-Washington.txt'), 'Washington')  + 
    parse_sentence_president_nlp(inaugural.raw('1793-Washington.txt'), 'Washington') + 
    parse_sentence_president_nlp(inaugural.raw('2009-Obama.txt'), 'Obama'))

sentences.head()

Unnamed: 0,0,1
0,"(Fellow, -, Citizens, of, the, Senate, and, of...",Washington
1,"(On, the, one, hand, ,, I, was, summoned, by, ...",Washington
2,"(On, the, other, hand, ,, the, magnitude, and,...",Washington
3,"(In, this, conflict, of, emotions, all, I, dar...",Washington
4,"(All, I, dare, hope, is, that, if, ,, in, exec...",Washington


In [88]:
# Set up the bags.
washingtonwords = bag_of_words(text_cleaner(inaugural.raw('1789-Washington.txt'))) + bag_of_words(text_cleaner(inaugural.raw('1793-Washington.txt')))
obamawords = bag_of_words(text_cleaner(inaugural.raw('2009-Obama.txt')))
common_words = set(washingtonwords + obamawords)

print(len(obamawords))
print(len(common_words))
common_words

686
1050


{'ancestor_NOUN',
 'willingly_ADV',
 'responsibility_NOUN',
 'render_VERB',
 'charter_NOUN',
 'indifference_NOUN',
 'ceremony_NOUN',
 'entertain_VERB',
 'fist_NOUN',
 'depend_VERB',
 'demand_VERB',
 'observation_NOUN',
 'hero_NOUN',
 'false_ADJ',
 'endeavor_VERB',
 'thoroughly_ADV',
 'capital_NOUN',
 'government_NOUN',
 'advancement_NOUN',
 'parent_NOUN',
 'obscure_ADJ',
 'pure_ADJ',
 'productive_ADJ',
 'call_VERB',
 'humble_ADJ',
 'threat_NOUN',
 'understand_VERB',
 'school_NOUN',
 'sow_VERB',
 'benefit_NOUN',
 'creed_NOUN',
 'cost_NOUN',
 'political_ADJ',
 'citizenship_NOUN',
 'grateful_ADJ',
 'far_ADV',
 'tender_VERB',
 'suffering_NOUN',
 'service_NOUN',
 'wind_NOUN',
 'statistic_NOUN',
 'strengthen_VERB',
 'demand_NOUN',
 'hardship_NOUN',
 'city_NOUN',
 'missile_NOUN',
 'consistent_ADJ',
 'begin_VERB',
 'ordinary_ADJ',
 'easily_ADV',
 'law_NOUN',
 'care_NOUN',
 'remake_VERB',
 'proof_NOUN',
 'adversary_NOUN',
 'impression_NOUN',
 'show_VERB',
 'congenial_ADJ',
 'generation_VERB',
 

In [89]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()




Unnamed: 0,ancestor_NOUN,willingly_ADV,responsibility_NOUN,render_VERB,charter_NOUN,indifference_NOUN,ceremony_NOUN,entertain_VERB,fist_NOUN,depend_VERB,...,deliberate_VERB,habit_NOUN,silencing_NOUN,god_NOUN,text_sentence,text_source,word_count,punct_count,prev_word_count,next_word_count
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(Fellow, -, Citizens, of, the, Senate, and, of...",Washington,22,4,0.0,37.0
1,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,"(On, the, one, hand, ,, I, was, summoned, by, ...",Washington,37,10,22.0,30.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(On, the, other, hand, ,, the, magnitude, and,...",Washington,30,6,37.0,13.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(In, this, conflict, of, emotions, all, I, dar...",Washington,13,1,30.0,36.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,"(All, I, dare, hope, is, that, if, ,, in, exec...",Washington,36,7,13.0,16.0


## Testing Random Forest and Logistic Regression

In [109]:
rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.9651162790697675

Test set score: 0.8448275862068966


In [111]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(86, 1054) (86,)
Training set score: 0.9883720930232558

Test set score: 0.9310344827586207


In [118]:
preds = lr.predict(X)
word_counts['pred_source'] = preds
word_counts[['text_sentence','text_source','pred_source']].to_csv('predictions.csv')

In [126]:
print(cross_val_score(lr, X, Y,cv=5)) 

[0.89655172 0.89655172 0.89655172 0.93103448 0.82142857]


In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(preprocessor=my_preprocessor, 
                             tokenizer=my_tokenizer,
                             max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             #stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
inaug_sents_tfidf=vectorizer.fit_transform(sentences_tfidf['sentence'])
print("Number of features: %d" % inaug_sents_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf= train_test_split(inaug_sents_tfidf,sentences_tfidf['president'], test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]

#List of features
terms = vectorizer.get_feature_names()

#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

Number of features: 292


In [73]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train_tfidf, y_train_tfidf)
print(X_train_tfidf.shape, y_train_tfidf.shape)
print('Training set score:', lr.score(X_train_tfidf, y_train_tfidf))
print('\nTest set score:', lr.score(X_test_tfidf, y_test_tfidf))

(84, 292) (84,)
Training set score: 0.8095238095238095

Test set score: 0.7857142857142857


In [74]:

rfc = ensemble.RandomForestClassifier()

train = rfc.fit(X_train_tfidf, y_train_tfidf)

print('Training set score:', rfc.score(X_train_tfidf, y_train_tfidf))
print('\nTest set score:', rfc.score(X_test_tfidf, y_test_tfidf))

Training set score: 0.9880952380952381

Test set score: 0.8214285714285714


In [75]:
terms

['act_VERB',
 'administration_NOUN',
 'advance_VERB',
 'afford_VERB',
 'age_NOUN',
 'ambition_NOUN',
 'america_NOUN',
 'american_ADJ',
 'american_NOUN',
 'answer_NOUN',
 'article_NOUN',
 'assure_VERB',
 'awaken_VERB',
 'bear_VERB',
 'begin_VERB',
 'big_ADJ',
 'bind_VERB',
 'birth_NOUN',
 'bless_VERB',
 'blessing_NOUN',
 'blood_NOUN',
 'break_VERB',
 'bring_VERB',
 'build_VERB',
 'business_NOUN',
 'call_VERB',
 'capital_NOUN',
 'care_NOUN',
 'carry_VERB',
 'challenge_NOUN',
 'character_NOUN',
 'charter_NOUN',
 'child_NOUN',
 'choice_NOUN',
 'choose_VERB',
 'circumstance_NOUN',
 'citizen_NOUN',
 'civil_ADJ',
 'come_VERB',
 'common_ADJ',
 'community_NOUN',
 'confidence_NOUN',
 'conflict_NOUN',
 'consequence_NOUN',
 'consider_VERB',
 'constitution_NOUN',
 'constitutional_ADJ',
 'consume_VERB',
 'cooperation_NOUN',
 'country_NOUN',
 'courage_NOUN',
 'crisis_NOUN',
 'danger_NOUN',
 'dare_VERB',
 'dark_ADJ',
 'day_NOUN',
 'decide_VERB',
 'decision_NOUN',
 'decline_VERB',
 'defense_NOUN',
 'de