In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import random
random.seed(42)
from nltk.corpus import stopwords, state_union
from sklearn.feature_extraction.text import CountVectorizer
import time
import warnings
warnings.simplefilter('ignore')

In [2]:
'''
For downloading Corpora
import nltk
nltk.download()

for download english langugage in spaCy
!python -m spacy download en
'''

'\nFor downloading Corpora\nimport nltk\nnltk.download()\n\nfor download english langugage in spaCy\n!python -m spacy download en\n'

# Introduction  

In this challenge, I want to build a model to classify state of the union addresses. In this case I will only be concerned with two previous presidents: Ronald Reagan and Bill Clinton; given a single sentence can we predict which president said it? This project will involve 5 primary steps:

  1. Data cleaning / processing / language parsing
  2. Create features using two different NLP methods: For example, BoW vs tf-idf.
  3. Use the features to fit supervised learning models for each feature set to predict the category outcomes.
  4. Assess your models using cross-validation and determine whether one model performed better.
  5. Pick one of the models and try to increase accuracy by at least 5 percentage points.
  
## Read Data

In [2]:
# 1.  Get all Reagan transcripts
reagan_text = ""
files = ['1981-Reagan.txt', '1982-Reagan.txt',
         '1983-Reagan.txt','1984-Reagan.txt',
         '1985-Reagan.txt','1986-Reagan.txt',
         '1987-Reagan.txt','1988-Reagan.txt']

for fname in files:
    reagan_text += state_union.raw(fname).replace('\n',' ').lower()
    
# Bill Clinton

# 1.  Get all Reagan transcripts
files = ['1993-Clinton.txt', '1994-Clinton.txt', 
         '1995-Clinton.txt', '1996-Clinton.txt', 
         '1997-Clinton.txt', '1998-Clinton.txt', 
         '1999-Clinton.txt', '2000-Clinton.txt',]
clinton_text = ""

for fname in files:
    clinton_text += state_union.raw(fname).replace('\n',' ').lower()

**NLP Raw Text**

In [3]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
reagan_doc = nlp(reagan_text)
clinton_doc = nlp(clinton_text)

**Break into sentences**

In [4]:
# Group into sentences.
reagan_sents = [sent.string for sent in reagan_doc.sents]
clinton_sents = [sent.string for sent in clinton_doc.sents]

So we have 1,883 total sentences for Reagan and 3,183 sentences from Clinton, so we need to resample Clinton's sentences so that they are of the same sample size. 

In [14]:
random.seed(0)
clinton_sents = random.sample(clinton_sents, len(reagan_sents))

sentences = reagan_sents + clinton_sents

a = np.zeros((len(reagan_sents),))
b = np.ones((len(clinton_sents), ))
target = np.concatenate((a, b), axis=0)

In [15]:
from nltk.stem import WordNetLemmatizer
STOPWORDS = set(stopwords.words('english'))

def tokenize_text(text):
    text = re.sub(r"[^a-z0-9]+", " ", text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in text.split(' ') if word not in STOPWORDS and 
              word != '' and not word.isnumeric()]
    return ' '.join(tokens)


lemmatizer = WordNetLemmatizer()
sentences = [tokenize_text(sent) for sent in sentences]

## Bag of Words

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english') #, min_df=10, max_df=.05)
bow = cv.fit_transform(sentences, target).toarray()
bow = pd.DataFrame(bow, columns=cv.get_feature_names())
bow.sum(axis=0).sort_values(ascending=False).head(10)

year          474
american      351
people        345
america       319
new           241
government    237
work          203
child         198
world         188
congress      183
dtype: int64

## TF-IDF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

tcv = TfidfVectorizer(stop_words='english') #, min_df=10, max_df=.05)
tfidf = tcv.fit_transform(sentences, target).toarray()
tfidf = pd.DataFrame(tfidf, columns=tcv.get_feature_names())

tfidf.sum(axis=0).sort_values(ascending=False).head(10)

year          82.611723
american      68.165984
people        67.440010
america       63.923792
let           49.954837
work          47.763782
government    47.491771
new           45.238571
child         42.333046
time          41.783926
dtype: float64

**Split** data into train and test sets

In [19]:
from sklearn.model_selection import train_test_split as tts

X_bow_train, X_bow_test, y_train, y_test = tts(bow, 
                                                    target,
                                                    test_size=0.2,
                                                    random_state=0)

X_tfidf_train, X_tfidf_test, y_train, y_test = tts(tfidf, 
                                                    target,
                                                    test_size=0.2,
                                                    random_state=0)

# Classification Models  

## Bag of Words
  1. Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_bow_train, y_train)
print(X_bow_train.shape, y_train.shape)
print('Training set score:', lr.score(X_bow_train, y_train))
print('\nTest set score:', lr.score(X_bow_test, y_test))

(3012, 5112) (3012,)
Training set score: 0.9309428950863213

Test set score: 0.7214854111405835


As expected, logistic regression overfits the training data big time! SVM's should help with this.     
  2. Linear Support Vector Machine

## TF-IDF

In [76]:
lr2 = LogisticRegression()
train = lr2.fit(X_tfidf_train, y_train)
print(X_tfidf_train.shape, y_train.shape)
print('Training set score:', lr2.score(X_tfidf_train, y_train))
print('\nTest set score:', lr2.score(X_tfidf_test, y_test))

(3012, 5102) (3012,)
Training set score: 0.8841301460823373

Test set score: 0.7360742705570292


As to be expected, using tf-idf results in less overfitting, but is only slightly more accurate. 

# doc2vec

Similar to word2vec but operates on a document basis, see more [here](https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5).

In [24]:
#Import all the dependencies
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

tagged_data_reagan = [TaggedDocument(words=word_tokenize(sent.string), tags=[0]) for sent in reagan_doc.sents]
tagged_data_clinton = [TaggedDocument(words=word_tokenize(sent.string), tags=[1]) for sent in clinton_doc.sents]
tagged_data_clinton = random.sample(tagged_data_clinton, len(tagged_data_reagan))

tagged_data = tagged_data_clinton + tagged_data_reagan

In [74]:
max_epochs = 110
vec_size = 10
alpha = 0.045

model = Doc2Vec(size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    if (epoch % 10)==0:
        print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0004
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")

iteration 0
iteration 10
iteration 20
iteration 30
iteration 40
iteration 50
iteration 60
iteration 70
iteration 80
iteration 90
iteration 100


In [75]:
# Group into sentences.
reagan_sents = [sent.string for sent in reagan_doc.sents]
clinton_sents = [sent.string for sent in clinton_doc.sents]
clinton_sents = random.sample(clinton_sents, len(reagan_sents))

sentences = reagan_sents + clinton_sents

a = np.zeros((len(reagan_sents),))
b = np.ones((len(clinton_sents), ))
target = np.concatenate((a, b), axis=0)

embeddings = np.zeros((len(sentences), vec_size))
for i, sentence in enumerate(sentences):
    embeddings[i] = model.infer_vector(word_tokenize(sentence))

X_train, X_test, y_train, y_test = tts(embeddings, target, 
                                       test_size=.2, random_state=42)

lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)
print("Train score: {:.2f}".format(lr.score(X_train, y_train)))
print("Test score: {:.2f}".format(lr.score(X_test, y_test)))

Train score: 0.72
Test score: 0.71
