In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.corpus import state_union, stopwords

## Data Retrieval / Processing

In [2]:
# Make list of individual file ids
file_ids = state_union.fileids()
file_ids[58:]

['2001-GWBush-1.txt',
 '2001-GWBush-2.txt',
 '2002-GWBush.txt',
 '2003-GWBush.txt',
 '2004-GWBush.txt',
 '2005-GWBush.txt',
 '2006-GWBush.txt']

In [502]:
# Create blank labels list to populate
labels = []

# Iterate through list of files pulling out President name & year
for file in file_ids:
    
    # label president for each text.  Splice to extract name, remove non-alpha characters for 
    # occassions when a President gave more than one State of the Union address in the same year.
    president = re.sub("[^a-zA-Z]", '', file[5:-4])
    
    # Get year of each State of the Union Address
    year = file[:4]
    
    # Append to labels list
    labels.append([file, president, year])

    
# Create blank lists to populate with sentence level data
sent_list = []
pres_list = []
date_list = []

# Iterate through each State of the Union, create sentence level documents, maintain President
# and year information for each sentence
for i in range(len(labels)):
    sents = state_union.sents(labels[i][0]) #get sentences from document
    joined_sents = [(' '.join(sent), labels[i][1], labels[i][2])for sent in sents]
    
    # write out to individual lists for easier text processing
    for i in range(len(joined_sents)): 
        sent_list.append(joined_sents[i][0]) 
        pres_list.append(joined_sents[i][1])  
        date_list.append(joined_sents[i][2]) 

## Text Cleaning

In [506]:
# Create list to hold cleaned up sentences
sent_list_clean = []

# Iterate through sentence list, removing punctuation, numeric values,
# frequently occuring word 'applause' that acts like a stop word in this 
# context, and sentences with all capital letters that represent title 
# information for each speech.
for sent in sent_list:
    sent = re.sub("[^a-zA-Z]", ' ', sent) #remove numeric and punctuation
    sent = re.sub("Applause", '', sent)   #remove "Applause"
    if sent == sent.upper():              #removes ALL CAPS header sentences
        sent = ""                         #replace with blank
    sent_list_clean.append(sent)

# Inspect cleaned sentences
print(len(sent_list_clean))
sent_list_clean[:4]

17930


['',
 'April          ',
 'Mr   Speaker   Mr   President   Members of the Congress   It is with a heavy heart that I stand before you   my friends and colleagues   in the Congress of the United States  ',
 'Only yesterday   we laid to rest the mortal remains of our beloved President   Franklin Delano Roosevelt  ']

In [505]:
# Tokenize, get lemmas, and rejoin back to sentence level doc
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

# Create list to hold lemmatized sentences
lemma_sents = []

for sent in sent_list_clean:
    words = word_tokenize(sent)                                 #tokenize each word 
    word_lemma = [lemmatizer.lemmatize(word) for word in words] #lemmatize each word
    sent_lemma = ' '.join(word_lemma)                           #rejoin back into sentences
    lemma_sents.append(sent_lemma)

# Inspect results
lemma_sents[:5]

['',
 'April',
 'Mr Speaker Mr President Members of the Congress It is with a heavy heart that I stand before you my friend and colleague in the Congress of the United States',
 'Only yesterday we laid to rest the mortal remains of our beloved President Franklin Delano Roosevelt',
 'At a time like this word are inadequate']

In [402]:
# Send to data frame
df = pd.DataFrame()
df['sent'] = lemma_sents
df['president'] = pres_list
df['year'] = date_list
df.head()

Unnamed: 0,sent,president,year
0,,Truman,1945
1,April,Truman,1945
2,Mr Speaker Mr President Members of the Congres...,Truman,1945
3,Only yesterday we laid to rest the mortal rema...,Truman,1945
4,At a time like this word are inadequate,Truman,1945


0         True
1        False
2        False
3        False
4        False
5        False
6        False
7        False
8        False
9        False
10       False
11       False
12       False
13       False
14       False
15       False
16       False
17       False
18       False
19       False
20       False
21       False
22       False
23       False
24       False
25       False
26       False
27       False
28       False
29       False
         ...  
17900    False
17901    False
17902    False
17903    False
17904    False
17905    False
17906     True
17907    False
17908    False
17909    False
17910     True
17911    False
17912     True
17913    False
17914    False
17915    False
17916    False
17917    False
17918    False
17919    False
17920    False
17921    False
17922    False
17923    False
17924    False
17925    False
17926    False
17927    False
17928    False
17929     True
Name: sent, Length: 17930, dtype: bool

In [411]:
# Remove blank sentence documents created from removing 'applause'
# Isn't actually working and doesnt' seem more efficient.  Dropna() doesn't work w/ object
df = df[df.sent!=""]
df.head()

Unnamed: 0,sent,president,year
1,April,Truman,1945
2,Mr Speaker Mr President Members of the Congres...,Truman,1945
3,Only yesterday we laid to rest the mortal rema...,Truman,1945
4,At a time like this word are inadequate,Truman,1945
5,The most eloquent tribute would be a reverent ...,Truman,1945


## Train_test_split

In [412]:
from sklearn.model_selection import train_test_split

# Split data frame, reserving 25% for validation

df_train, df_test = train_test_split(df,
                                    stratify=df['president'],
                                    test_size=0.25,
                                    random_state=42)

print(df_train.shape)
print(df_test.shape)

(13007, 3)
(4336, 3)


## Tfidf Vectorizer

In [431]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english',
                             lowercase=True,
                             max_df=0.1, #take 80% of df...isn't changing output
                             min_df=2, #use words that appear atleast twice
                             use_idf=True,
                             smooth_idf=True, 
                             #max_features=3000
                             )

# Specify data to vectorize
X_train = df_train['sent']
X_test = df_test['sent']

# Vectorize
X_train_tfidf = vectorizer.fit_transform(X_train).tocsr()
X_test_tfidf = vectorizer.transform(X_test).tocsr()

In [432]:
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

(13007, 5965)
(4336, 5965)


## Latent Semantic Analysis / SVD

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

# Reduce matrix size 
svd = TruncatedSVD(100)
lsa = make_pipeline(svd, Normalizer())  #isn't tfidf normalized already?
#lsa = make_pipeline(svd)

X_train_lsa = lsa.fit_transform(X_train_tfidf)
variance_explained = svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print('Percent variance captured by components:', total_variance*100)

sent_by_component = pd.DataFrame(X_train_lsa, index=X_train)

for i in range(5):
    print('Component {}:'.format(i))
    print(sent_by_component.loc[:, i].sort_values(ascending=False)[:10])


## Similarity

In [180]:
#Memory error.  Better to do on full speeches?
#similarity = np.asarray(np.asmatrix(X_train_lsa) * np.asmatrix(X_train_lsa).T)

## Supervised Models

In [418]:
from sklearn.pipeline import Pipeline

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
#from sklearn.linear_model
# add svm classifier
# drop MLP?
from sklearn.model_selection import cross_val_score

In [419]:
X_train = X_train_tfidf
Y_train = df_train['president']

X_test = X_test_tfidf
Y_test = df_test['president']

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(13007, 5965) (13007,)
(4336, 5965) (4336,)


### Random Forest

In [424]:
rfc = RandomForestClassifier(max_depth=5)
rfc.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [425]:
print('Cross val scores:', cross_val_score(rfc, X_train, Y_train, cv=5))
print('Train accuracy:', rfc.score(X_train, Y_train))
print('Test accuracy:', rfc.score(X_test, Y_test))

Cross val scores: [0.20998081 0.22695853 0.21782559 0.21969988 0.21918336]
Train accuracy: 0.2205735373260552
Test accuracy: 0.21702029520295202


### Neural Network

In [381]:
mlp = MLPClassifier()
mlp.fit(X_train, Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [382]:
print('Cross val scores:', cross_val_score(mlp, X_train, Y_train, cv=5))
print('Train accuracy:', mlp.score(X_train, Y_train))
print('Test accuracy:', mlp.score(X_test, Y_test))

Cross val scores: [0.35759376 0.3604608  0.36728625 0.36123512 0.35158287]
Train accuracy: 0.9766490667063286
Test accuracy: 0.14231541378541154


### Multinomial NB

In [426]:
nb = MultinomialNB()
nb.fit(X_train, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [427]:
print('Cross val scores:', cross_val_score(nb, X_train, Y_train, cv=5))
print('Train accuracy:', nb.score(X_train, Y_train))
print('Test accuracy:', nb.score(X_test, Y_test))

Cross val scores: [0.38579655 0.37826421 0.38647714 0.38014621 0.39098613]
Train accuracy: 0.5294841239332667
Test accuracy: 0.3842250922509225
