In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import spacy
from nltk.corpus import state_union, stopwords
from collections import Counter

In [2]:
# Import Presidential State of the Unions file ids
nltk.download('state_union')
state_union.fileids()

[nltk_data] Downloading package state_union to C:\Users\Square
[nltk_data]     Bear\AppData\Roaming\nltk_data...
[nltk_data]   Package state_union is already up-to-date!


['1945-Truman.txt',
 '1946-Truman.txt',
 '1947-Truman.txt',
 '1948-Truman.txt',
 '1949-Truman.txt',
 '1950-Truman.txt',
 '1951-Truman.txt',
 '1953-Eisenhower.txt',
 '1954-Eisenhower.txt',
 '1955-Eisenhower.txt',
 '1956-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1958-Eisenhower.txt',
 '1959-Eisenhower.txt',
 '1960-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1962-Kennedy.txt',
 '1963-Johnson.txt',
 '1963-Kennedy.txt',
 '1964-Johnson.txt',
 '1965-Johnson-1.txt',
 '1965-Johnson-2.txt',
 '1966-Johnson.txt',
 '1967-Johnson.txt',
 '1968-Johnson.txt',
 '1969-Johnson.txt',
 '1970-Nixon.txt',
 '1971-Nixon.txt',
 '1972-Nixon.txt',
 '1973-Nixon.txt',
 '1974-Nixon.txt',
 '1975-Ford.txt',
 '1976-Ford.txt',
 '1977-Ford.txt',
 '1978-Carter.txt',
 '1979-Carter.txt',
 '1980-Carter.txt',
 '1981-Reagan.txt',
 '1982-Reagan.txt',
 '1983-Reagan.txt',
 '1984-Reagan.txt',
 '1985-Reagan.txt',
 '1986-Reagan.txt',
 '1987-Reagan.txt',
 '1988-Reagan.txt',
 '1989-Bush.txt',
 '1990-Bush.txt',
 '1991-Bush-1.txt',
 '1991-B

In [3]:
# Kennedy's famous 1963 SOTU and Carter's 1980 SOTU 
kennedy = state_union.raw('1963-Kennedy.txt')
carter = state_union.raw('1980-Carter.txt')

In [4]:

# Parse using SpaCy
nlp = spacy.load('en_core_web_sm')
kennedy_doc = nlp(kennedy)
carter_doc = nlp(carter)

In [5]:
# Group into sentences
carter_sents = [[sent, 'Carter'] for sent in carter_doc.sents]
kennedy_sents = [[sent, 'Kennedy'] for sent in kennedy_doc.sents]

# Combine
sentences = pd.DataFrame(carter_sents + kennedy_sents)
sentences.head()

Unnamed: 0,0,1
0,"(PRESIDENT, JIMMY, CARTER, 'S, ADDRESS, TO, A,...",Carter
1,"(This, last, few, months, has, not, been, an, ...",Carter
2,"(As, we, meet, tonight, ,, it, has, never, bee...",Carter
3,"(And, tonight, ,, as, throughout, our, own, ge...",Carter
4,"(The, 1980, 's, have, been, born, in, turmoil,...",Carter


In [6]:
# Look at excerpts from each 
print(carter_doc[:50])
print('\nCarter speech length:', len(carter_doc))

print('\n', kennedy_doc[:50])
print('\nKennedy speech length:', len(kennedy_doc))

PRESIDENT JIMMY CARTER'S ADDRESS TO A JOINT SESSION OF CONGRESS ON THE STATE OF THE UNION
 
This last few months has not been an easy time for any of us. As we meet tonight, it has never been more clear that the state of our Union

Carter speech length: 3881

 PRESIDENT JOHN F. KENNEDY'S ANNUAL ADDRESS TO A JOINT SESSION OF CONGRESS ON THE STATE OF THE UNION
 
I congratulate you all - not merely on your electoral victory but on your selected role in history. For you and I are privileged to serve the great Republic

Kennedy speech length: 6291


In [7]:
# Create bag of words function for each text
def bag_of_words(text):
    
    # filter out punctuation and stop words
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return most common words
    return [item[0] for item in Counter(allwords).most_common(500)]

# Get bags 
carter_words = bag_of_words(carter_doc)
kennedy_words = bag_of_words(kennedy_doc)

# Combine bags to create common set of unique words
common_words = set(carter_words + kennedy_words)

In [8]:
# Create bag of words data frame using combined common words and sentences
def bow_features(sentences, common_words):
    
    # Build data frame
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentences in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentences
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
    
    return df

In [9]:
# Create bow features 
bow = bow_features(sentences, common_words)
bow.head()

Unnamed: 0,sustain,determine,environment,communist,stable,opinion,fiscal,range,factor,Indian,...,young,private,superpower,limit,common,toil,CONGRESS,greatly,text_sentence,text_source
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,"(PRESIDENT, JIMMY, CARTER, 'S, ADDRESS, TO, A,...",Carter
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(This, last, few, months, has, not, been, an, ...",Carter
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(As, we, meet, tonight, ,, it, has, never, bee...",Carter
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(And, tonight, ,, as, throughout, our, own, ge...",Carter
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"(The, 1980, 's, have, been, born, in, turmoil,...",Carter


In [10]:
#TF-IDF Features
# Grab sentence level documents in NLTK
kennedy = state_union.sents('1963-Kennedy.txt')
carter = state_union.sents('1980-Carter.txt')

In [11]:
# Create list of text 
kennedy_list = [" ".join(sent) for sent in kennedy]
carter_list = [" ".join(sent) for sent in carter]
joined = kennedy_list + carter_list

In [12]:
# Vectorize
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, 
                             min_df=2, 
                             stop_words='english',   
                             use_idf=True,
                             norm=u'l2', 
                             smooth_idf=True 
                            )

tfidf = vectorizer.fit_transform(joined).tocsr()

In [13]:
#Supervised Learning Models
from sklearn.model_selection import cross_val_score

# Specify model inputs for each feature set

# BoW
X_bow = bow.drop(['text_sentence', 'text_source'], 1)
Y_bow = bow['text_source']

# Tfidf
X_tfidf = tfidf
Y_tfidf = ['Kennedy']*len(kennedy_list) + ['Carter']*len(carter_list)

In [14]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression

# BoW
lr = LogisticRegression()
lr_bow = lr.fit(X_bow, Y_bow)
print('BoW Logistic Regression Scores: ', cross_val_score(lr_bow, X_bow, Y_bow, cv=5))
print('Avg Score:', np.mean(cross_val_score(lr_bow, X_bow, Y_bow, cv=5)))

# Tfidf
lr = LogisticRegression()
lr_tfidf = lr.fit(X_tfidf, Y_tfidf)
print('\nTfidf Logistic Regression Scores:', cross_val_score(lr_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(lr_tfidf, X_tfidf, Y_tfidf, cv=5)))



BoW Logistic Regression Scores:  [0.7625     0.7375     0.72151899 0.73417722 0.65822785]




Avg Score: 0.7227848101265822





Tfidf Logistic Regression Scores: [0.69620253 0.74683544 0.72151899 0.65822785 0.58227848]




Avg Score: 0.6810126582278481


In [15]:
#Random Forest
from sklearn import ensemble

# BoW
rfc = ensemble.RandomForestClassifier()
rfc_bow = rfc.fit(X_bow, Y_bow)
print('BoW Random Forest Scores: ', cross_val_score(rfc_bow, X_bow, Y_bow, cv=5))
print('Avg Score:', np.mean(cross_val_score(rfc_bow, X_bow, Y_bow, cv=5)))

# Tfidf
rfc = ensemble.RandomForestClassifier()
rfc_tfidf = rfc.fit(X_tfidf, Y_tfidf)
print('\nTfidf Random Forest Scores:', cross_val_score(rfc_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(rfc_tfidf, X_tfidf, Y_tfidf, cv=5)))



BoW Random Forest Scores:  [0.5375     0.6625     0.67088608 0.65822785 0.59493671]
Avg Score: 0.6047784810126582





Tfidf Random Forest Scores: [0.56962025 0.62025316 0.72151899 0.62025316 0.56962025]
Avg Score: 0.6278481012658228


In [16]:
#Gradient Boosting
# BoW
clf = ensemble.GradientBoostingClassifier()
clf_bow = clf.fit(X_bow, Y_bow)
print('Bow Gradient Boosting Scores:', cross_val_score(clf_bow, X_bow,Y_bow, cv=5))
print('Avg Score:', np.mean(cross_val_score(clf_bow, X_bow, Y_bow, cv=5)))

# Tfidf
clf = ensemble.GradientBoostingClassifier()
clf_tfidf = clf.fit(X_tfidf, Y_tfidf)
print('\nTfidf Random Forest Scores:', cross_val_score(clf_tfidf, X_tfidf, Y_tfidf, cv=5))
print('Avg Score:', np.mean(cross_val_score(clf_tfidf, X_tfidf, Y_tfidf, cv=5)))

Bow Gradient Boosting Scores: [0.6875     0.5875     0.63291139 0.67088608 0.5443038 ]
Avg Score: 0.6296202531645569

Tfidf Random Forest Scores: [0.64556962 0.6835443  0.60759494 0.65822785 0.55696203]
Avg Score: 0.6354430379746835


In [17]:
#Pick A Model and Try to Increase Accuracy by 5%
#Model: Logistic Regression Using BoW Feature Set
# Increase BoW size

# Update function to include 1000 most common words
def bag_of_words(text):
    
    # filter out punctuation and stop words
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return most common words
    return [item[0] for item in Counter(allwords).most_common(1000)]

# Get bags 
carter_words = bag_of_words(carter_doc)
kennedy_words = bag_of_words(kennedy_doc)

# Combine bags to create common set of unique words
common_words = set(carter_words + kennedy_words)

In [18]:
# Create bow features 
big_bow = bow_features(sentences, common_words)

In [None]:
# Make new X and Y inputs
X_big_bow = big_bow.drop(['text_sentence', 'text_source'], 1)
Y_big_bow = big_bow['text_source']

# Rerun BoW
lr = LogisticRegression()
lr_big_bow = lr.fit(X_big_bow, Y_big_bow)
print('BoW (big) Logistic Regression Scores: ', cross_val_score(lr_big_bow, X_big_bow, Y_big_bow, cv=5))
print('Avg. Score ', np.mean(cross_val_score(lr_big_bow, X_big_bow, Y_big_bow, cv=5)))



BoW (big) Logistic Regression Scores:  [0.75       0.7125     0.72151899 0.73417722 0.63291139]




Avg. Score  0.7102215189873418


In [None]:

# Update function, go back to 500 most common words and add in punctuation
def bag_of_words(text):
    
    # filter out punctuation and stop words
    allwords = [token.lemma_
                for token in text
                if not token.is_stop]
                   
    # Return most common words
    return [item[0] for item in Counter(allwords).most_common(500)]

# Get bags 
carter_words = bag_of_words(carter_doc)
kennedy_words = bag_of_words(kennedy_doc)

# Combine bags to create common set of unique words
common_words = set(carter_words + kennedy_words)

In [None]:

# Create bow features 
bow = bow_features(sentences, common_words)

In [None]:
# Regenerate model features
X_bow = bow.drop(['text_sentence', 'text_source'], 1)
Y_bow = bow['text_source']

In [None]:
# Rerun model
lr = LogisticRegression(
    )
lr_bow = lr.fit(X_bow, Y_bow)
print('BoW #3 - Logistic Regression Scores: ', cross_val_score(lr_bow, X_bow, Y_bow, cv=5))
print('Avg. Score ', np.mean(cross_val_score(lr_bow, X_bow, Y_bow, cv=5)))