In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords, state_union
from sklearn.feature_extraction.text import CountVectorizer
import time

# Introduction  

In this challenge, I want to build a model to classify state of the union addresses. In this case I will only be concerned with two previous presidents: Ronald Reagan and Bill Clinton; given a single sentence can we predict which president said it? This project will involve 5 primary steps:

  1. Data cleaning / processing / language parsing
  2. Create features using two different NLP methods: For example, BoW vs tf-idf.
  3. Use the features to fit supervised learning models for each feature set to predict the category outcomes.
  4. Assess your models using cross-validation and determine whether one model performed better.
  5. Pick one of the models and try to increase accuracy by at least 5 percentage points.
  
## Read Data

In [2]:
# 1.  Get all Reagan transcripts
reagan_text = ""
files = ['1981-Reagan.txt', '1982-Reagan.txt',
         '1983-Reagan.txt','1984-Reagan.txt',
         '1985-Reagan.txt','1986-Reagan.txt',
         '1987-Reagan.txt','1988-Reagan.txt']

for fname in files:
    reagan_text += state_union.raw(fname).replace('\n',' ').lower()

In [3]:
# Bill Clinton

# 1.  Get all Reagan transcripts
files = ['1993-Clinton.txt', '1994-Clinton.txt', 
         '1995-Clinton.txt', '1996-Clinton.txt', 
         '1997-Clinton.txt', '1998-Clinton.txt', 
         '1999-Clinton.txt', '2000-Clinton.txt',]
clinton_text = ""

for fname in files:
    clinton_text += state_union.raw(fname).replace('\n',' ').lower()

**NLP Raw Text**

In [4]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
reagan_doc = nlp(reagan_text)
clinton_doc = nlp(clinton_text)

**Break into sentences**

In [5]:
# Group into sentences.
reagan_sents = [[sent, "Reagan"] for sent in reagan_doc.sents]
clinton_sents = [[sent, "Clinton"] for sent in clinton_doc.sents]

# Combine the sentences from the two novels into one data frame.
sentences = pd.DataFrame(reagan_sents + clinton_sents, columns=['text','president'])
sentences.head()

Unnamed: 0,text,president
0,"(president, ronald, reagan, 's, address, befor...",Reagan
1,"(speaker, ,, mr, .)",Reagan
2,"(president, ,, distinguished, members, of, con...",Reagan
3,"(i, 'm, here, tonight, to, reaffirm, that, ple...",Reagan
4,"(all, of, us, are, aware, of, the, punishing, ...",Reagan


First let us get words that are common to both Presidents.

## Bag of Words

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english') #, min_df=4, max_df=.7)
bow = cv.fit_transform(sentences.text.astype(str), sentences.president).toarray()
bow = pd.DataFrame(bow, columns=cv.get_feature_names())
bow.sum(axis=0).sort_values(ascending=False).head(10)

people        484
america       424
new           348
year          321
years         320
work          303
american      271
government    257
congress      253
world         244
dtype: int64

In [9]:
bow.shape

(4481, 6633)

## TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tcv = TfidfVectorizer(stop_words='english') #, min_df=4, max_df=.1 )
tfidf = tcv.fit_transform(sentences.text.astype(str), sentences.president).toarray()
tfidf = pd.DataFrame(tfidf, columns=tcv.get_feature_names())

tfidf.sum(axis=0).sort_values(ascending=False).head(10)

people       75.434830
america      74.819466
work         63.258179
let          60.248949
year         57.842328
new          55.523233
years        55.172134
americans    49.432571
american     48.379614
congress     48.119739
dtype: float64

In [11]:
tfidf.shape

(4481, 6633)

**Split** data into train and test sets

In [12]:
from sklearn.model_selection import train_test_split

X_bow_train, X_bow_test, y_train, y_test = train_test_split(bow, 
                                                    sentences.president,
                                                    test_size=0.1,
                                                    random_state=0)

X_tfidf_train, X_tfidf_test, y_train, y_test = train_test_split(tfidf, 
                                                    sentences.president,
                                                    test_size=0.1,
                                                    random_state=0)

# Classification Models  

## Bag of Words
  1. Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_bow_train, y_train)
print(X_bow_train.shape, y_train.shape)
print('Training set score:', lr.score(X_bow_train, y_train))
print('\nTest set score:', lr.score(X_bow_test, y_test))



(4032, 6633) (4032,)
Training set score: 0.9439484126984127

Test set score: 0.7639198218262806


As expected, logistic regression overfits the training data big time! SVM's should help with this.     
  2. Linear Support Vector Machine

In [14]:
from sklearn.svm import SVC

svc = SVC(kernel='poly', degree=1, gamma=.22)
train_svc = svc.fit(X_bow_train, y_train)
print(X_bow_train.shape, y_train.shape)
print('Training set score:', svc.score(X_bow_train, y_train))
print('\nTest set score:', svc.score(X_bow_test, y_test))

(4032, 6633) (4032,)
Training set score: 0.9109623015873016

Test set score: 0.7661469933184856


So we overfit our training data less, but we do see a lot of improvement on the test set.

## TF-IDF

In [15]:
lr2 = LogisticRegression()
train = lr2.fit(X_tfidf_train, y_train)
print(X_tfidf_train.shape, y_train.shape)
print('Training set score:', lr2.score(X_tfidf_train, y_train))
print('\nTest set score:', lr2.score(X_tfidf_test, y_test))

(4032, 6633) (4032,)
Training set score: 0.8772321428571429

Test set score: 0.7594654788418709




In [17]:
svc2 = SVC(kernel='linear')
train_svc2 = svc2.fit(X_tfidf_train, y_train)
print('Training set score:', svc2.score(X_tfidf_train, y_train))
print('\nTest set score:', svc2.score(X_tfidf_test, y_test))

Training set score: 0.9263392857142857

Test set score: 0.7616926503340757


TF-IDF appears to be less prone to overfitting, however this is not a great model.

# POS Tagging 
Now let's add a few features, such as POS tagging, sentence and word length, polarity and subjectivity.

## POS tagging plus bag of words

In [23]:
from nltk import pos_tag
STOPWORDS = set(stopwords.words('english'))

In [21]:
# should be more effiecent if we just replace every

def sent_to_token_tag(text):
    if text is not str:
        text = str(text)

    text = re.sub('[^a-zA-Z0-9]+', ' ', text)
    tokens = text.split(' ')
    # filter out punctuation and empty strings
    tokens_ = [token for token in tokens if token not in STOPWORDS and token != '']
    tt = dict(pos_tag(tokens_))
    return ' '.join([k+'_'+v for k, v in tt.items()]) + '.'

In [24]:
sentences_pos = [sent_to_token_tag(sent) for sent in sentences.text]

# create a 'bag of words' but this time a word is a token PLUS POS tag
cv_pos = CountVectorizer()
bow_pos = cv_pos.fit_transform(sentences_pos).toarray()
bow_pos = pd.DataFrame(bow_pos, columns=cv_pos.get_feature_names())

In [25]:
bow_pos.sum(axis=0).sort_values(ascending=False).head(20)

people_nns       446
must_md          426
us_prp           331
year_nn          292
new_jj           291
years_nns        288
american_jj      259
government_nn    231
world_nn         226
work_nn          220
children_nns     219
one_cd           212
americans_nns    202
every_dt         199
congress_nn      195
time_nn          192
america_nn       172
last_jj          172
let_vb           156
make_vbp         154
dtype: int64

Now we can add these 3 sets of features to our bag of words (or TF-IDF matrix).

In [35]:
X_bow = bow.join(bow_pos) #.join(sentences.sentiment_score)
X_tfidf = tfidf.join(bow_pos) #.join(sentences.sentiment_score)

In [32]:
X_tfidf.shape

(4481, 16069)

In [36]:
X_bow_pos_train, X_bow_pos_test, y_train, y_test = train_test_split(X_bow, 
                                                    sentences.president,
                                                    test_size=0.1,
                                                    random_state=0)

X_tfidf_pos_train, X_tfidf_pos_test, y_train, y_test = train_test_split(X_tfidf, 
                                                    sentences.president,
                                                    test_size=0.1,
                                                    random_state=0)

In [37]:
lr = LogisticRegression()
train = lr.fit(X_bow_pos_train, y_train)
print(X_bow_pos_train.shape, y_train.shape)
print('Training set score:', lr.score(X_bow_pos_train, y_train))
print('\nTest set score:', lr.score(X_bow_pos_test, y_test))



(4032, 16068) (4032,)
Training set score: 0.9749503968253969

Test set score: 0.7683741648106904


In [None]:
svc = SVC(kernel='poly', degree=1, gamma=.22)
train_svc = svc.fit(X_bow_pos_train, y_train)
print(X_bow_pos_train.shape, y_train.shape)
print('Training set score:', svc.score(X_bow_pos_train, y_train))
print('\nTest set score:', svc.score(X_bow_pos_test, y_test))

(4032, 16068) (4032,)
