For this challenge, you will need to choose a corpus of data from nltk or another source that includes categories you can predict and create an analysis pipeline that includes the following steps:
Data cleaning / processing / language parsing
Create features using two different NLP methods: For example, BoW vs tf-idf.
Use the features to fit supervised learning models for each feature set to predict the category outcomes.
Assess your models using cross-validation and determine whether one model performed better.
Pick one of the models and try to increase accuracy by at least 5 percentage points.
Write up your report in a Jupyter notebook. Be sure to explicitly justify the choices you make throughout, and submit it below.



In [8]:
import nltk
# Launch the installer to download "gutenberg" and "stop words" corpora.
#nltk.download()

In [13]:
from textstat.textstat import textstat
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
#import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from gensim.models import doc2vec
from collections import namedtuple
#nlp = spacy.load('en') 
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [4]:

# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?:-[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [5]:
from nltk.corpus import inaugural

inaugural.fileids()


['1789-Washington.txt',
 '1793-Washington.txt',
 '1797-Adams.txt',
 '1801-Jefferson.txt',
 '1805-Jefferson.txt',
 '1809-Madison.txt',
 '1813-Madison.txt',
 '1817-Monroe.txt',
 '1821-Monroe.txt',
 '1825-Adams.txt',
 '1829-Jackson.txt',
 '1833-Jackson.txt',
 '1837-VanBuren.txt',
 '1841-Harrison.txt',
 '1845-Polk.txt',
 '1849-Taylor.txt',
 '1853-Pierce.txt',
 '1857-Buchanan.txt',
 '1861-Lincoln.txt',
 '1865-Lincoln.txt',
 '1869-Grant.txt',
 '1873-Grant.txt',
 '1877-Hayes.txt',
 '1881-Garfield.txt',
 '1885-Cleveland.txt',
 '1889-Harrison.txt',
 '1893-Cleveland.txt',
 '1897-McKinley.txt',
 '1901-McKinley.txt',
 '1905-Roosevelt.txt',
 '1909-Taft.txt',
 '1913-Wilson.txt',
 '1917-Wilson.txt',
 '1921-Harding.txt',
 '1925-Coolidge.txt',
 '1929-Hoover.txt',
 '1933-Roosevelt.txt',
 '1937-Roosevelt.txt',
 '1941-Roosevelt.txt',
 '1945-Roosevelt.txt',
 '1949-Truman.txt',
 '1953-Eisenhower.txt',
 '1957-Eisenhower.txt',
 '1961-Kennedy.txt',
 '1965-Johnson.txt',
 '1969-Nixon.txt',
 '1973-Nixon.txt',
 '1

In [6]:
dem_list = ['2009-Obama.txt','1997-Clinton.txt','1993-Clinton.txt','1977-Carter.txt','1965-Johnson.txt', '1885-Cleveland.txt', '1893-Cleveland.txt',
            '1961-Kennedy.txt','1949-Truman.txt','1945-Roosevelt.txt','1941-Roosevelt.txt','1937-Roosevelt.txt','1933-Roosevelt.txt', '2013-Obama.txt']
rep_list = ['1953-Eisenhower.txt', '1957-Eisenhower.txt', '1969-Nixon.txt', '1973-Nixon.txt', '1981-Reagan.txt', '1985-Reagan.txt', '1989-Bush.txt','2001-Bush.txt', '2005-Bush.txt'
            '1861-Lincoln.txt', '1865-Lincoln.txt', '1869-Grant.txt', '1873-Grant.txt', '1877-Hayes.txt', '1881-Garfield.txt', '1889-Harrison.txt', '2017-trump.txt',
            '1897-McKinley.txt', '1901-McKinley.txt', '1905-Roosevelt.txt', '1909-Taft.txt', '1921-Harding.txt', '1925-Coolidge.txt', '1929-Hoover.txt']

#import another speech from text


obama13=open("C:/Users/boyerm/Documents/Thinkful/2013-Obama.txt",'rU')
raw_obama13=obama13.read()
tokens_obama13 = nltk.word_tokenize(raw_obama13)
text_obama13 = nltk.Text(tokens_obama13)

trump17=open("C:/Users/boyerm/Documents/Thinkful/2017-trump.txt",'rU')
raw_trump17=trump17.read()
tokens_trump17 = nltk.word_tokenize(raw_trump17)
text_trump17 = nltk.Text(tokens_trump17)

list_of_other_speeches = [] #inaugural.fileids()
list_of_other_speeches.append('2017-trump.txt')
list_of_other_speeches.append('2013-Obama.txt')
print(list_of_other_speeches)
print(text_files)

In [6]:
text_files = ['C:/Users/boyerm/Documents/Thinkful/2013-Obama.txt','C:/Users/boyerm/Documents/Thinkful/2017-trump.txt']

In [9]:
# make lists with the text and features
speeches = []
raw_text = []
clean_text = []
reading_ease = []
smog_index = []
flesch_kincaid_grade = []
coleman_liau_index = []
readability = []
chall_readability = []
diffwords = []
linsear_write_formula = []
gunning_fog = []
text_standard = []
party1 = []
twords = []
for p in inaugural.fileids():    
    speeches.append(p)
    x = inaugural.raw(p)
    raw_text.append(x)
    clean = text_cleaner(x)
    clean_text.append(clean)    
    
    ease = textstat.flesch_reading_ease(x)
    reading_ease.append(ease)
    smog = textstat.smog_index(x)
    smog_index.append(smog)
    fk_grade = textstat.flesch_kincaid_grade(x)
    flesch_kincaid_grade.append(fk_grade)
    liau = textstat.coleman_liau_index(x)
    coleman_liau_index.append(liau)
    read = textstat.automated_readability_index(x)
    readability.append(read)
    read2 = textstat.dale_chall_readability_score(x)
    chall_readability.append(read2)
    words = textstat.difficult_words(x)
    diffwords.append(words)
    write = textstat.linsear_write_formula(x)
    linsear_write_formula.append(write)
    fog = textstat.gunning_fog(x)
    gunning_fog.append(fog)
    standard = textstat.text_standard(x)
    text_standard.append(standard)
    
    token_words = nltk.word_tokenize(clean)
    twords.append(token_words)

    if p in dem_list:
        party1.append(1)    #'dem'
    elif p in rep_list:
        party1.append(0)   #'rep'
    else: 
        party1.append(-1)    #'xxx'
        


In [None]:
# my own  files
for p in text_files:
    z = open(p,'rU')
    x =z.read()
    raw_text.append(x)

    clean = text_cleaner(x)
    clean_text.append(clean)    
    ease = textstat.flesch_reading_ease(x)
    reading_ease.append(ease)
    smog = textstat.smog_index(x)
    smog_index.append(smog)
    fk_grade = textstat.flesch_kincaid_grade(x)
    flesch_kincaid_grade.append(fk_grade)
    liau = textstat.coleman_liau_index(x)
    coleman_liau_index.append(liau)
    read = textstat.automated_readability_index(x)
    readability.append(read)
    read2 = textstat.dale_chall_readability_score(x)
    chall_readability.append(read2)
    words = textstat.difficult_words(x)
    diffwords.append(words)
    write = textstat.linsear_write_formula(x)
    linsear_write_formula.append(write)
    fog = textstat.gunning_fog(x)
    gunning_fog.append(fog)
    standard = textstat.text_standard(x)
    text_standard.append(standard)
    token_words = nltk.word_tokenize(clean)
    twords.append(token_words)
    
    #tokens_trump17 = nltk.word_tokenize(raw_trump17)
    #text_trump17 = nltk.Text(tokens_trump17)

In [10]:
# make dataframe of the lists of features
sp = pd.DataFrame()
sp['speeches'] = speeches
sp['raw_text'] = raw_text
sp['clean_text'] = clean_text
sp['reading_ease'] = reading_ease
sp['smog_index'] = smog_index
sp['flesch_kincaid_grade'] = flesch_kincaid_grade
sp['coleman_liau_index'] = coleman_liau_index
sp['readability'] = readability
sp['chall_readability'] = chall_readability
sp['diffwords'] = diffwords
sp['linsear_write_formula'] = linsear_write_formula
sp['gunning_fog'] = gunning_fog
sp['text_standard'] = text_standard
sp['party'] = party1
sp['tokens'] = twords

In [11]:
sp

Unnamed: 0,speeches,raw_text,clean_text,reading_ease,smog_index,flesch_kincaid_grade,coleman_liau_index,readability,chall_readability,diffwords,linsear_write_formula,gunning_fog,text_standard,party,tokens
0,1789-Washington.txt,Fellow-Citizens of the Senate and of the House...,Fellow-Citizens of the Senate and of the House...,8.34,16.1,27.5,12.78,33.3,11.69,450,29.0,39.467413,12th and 13th grade,-1,"[Fellow-Citizens, of, the, Senate, and, of, th..."
1,1793-Washington.txt,"Fellow citizens, I am again called upon by the...","Fellow citizens, I am again called upon by the...",37.17,10.7,16.5,11.56,18.4,11.28,51,14.0,30.631111,11th and 12th grade,-1,"[Fellow, citizens, ,, I, am, again, called, up..."
2,1797-Adams.txt,"When it was first perceived, in early times, t...","When it was first perceived, in early times, t...",7.94,15.8,27.7,12.55,33.4,11.25,662,30.0,38.463641,12th and 13th grade,-1,"[When, it, was, first, perceived, ,, in, early..."
3,1801-Jefferson.txt,Friends and Fellow Citizens:\n\nCalled upon to...,Friends and Fellow Citizens: Called upon to un...,37.2,11.6,18.5,11.85,22.6,10.24,494,26.5,30.288436,11th and 12th grade,-1,"[Friends, and, Fellow, Citizens, :, Called, up..."
4,1805-Jefferson.txt,"Proceeding, fellow citizens, to that qualifica...","Proceeding, fellow citizens, to that qualifica...",31.11,12.5,20.9,12.43,26.0,10.4,600,18.666667,32.320332,11th and 12th grade,-1,"[Proceeding, ,, fellow, citizens, ,, to, that,..."
5,1809-Madison.txt,Unwilling to depart from examples of the most ...,Unwilling to depart from examples of the most ...,14.64,15.2,25.1,12.38,29.9,11.52,380,18.666667,37.33617,11th and 12th grade,-1,"[Unwilling, to, depart, from, examples, of, th..."
6,1813-Madison.txt,About to add the solemnity of an oath to the o...,About to add the solemnity of an oath to the o...,34.22,12.1,17.6,11.85,20.1,10.45,383,20.333333,29.341157,11th and 12th grade,-1,"[About, to, add, the, solemnity, of, an, oath,..."
7,1817-Monroe.txt,I should be destitute of feeling if I was not ...,I should be destitute of feeling if I was not ...,51.92,10.6,12.9,11.49,15.5,8.95,842,18.0,23.034065,10th and 11th grade,-1,"[I, should, be, destitute, of, feeling, if, I,..."
8,1821-Monroe.txt,"Fellow citizens, I shall not attempt to descri...","Fellow citizens, I shall not attempt to descri...",45.32,11.0,15.4,11.73,18.7,9.02,1046,14.25,25.000179,10th and 11th grade,-1,"[Fellow, citizens, ,, I, shall, not, attempt, ..."
9,1825-Adams.txt,In compliance with an usage coeval with the ex...,In compliance with an usage coeval with the ex...,31.48,13.2,18.7,12.83,22.2,10.09,831,28.0,29.163087,12th and 13th grade,-1,"[In, compliance, with, an, usage, coeval, with..."


In [23]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [24]:
# this picks the most common words in each speech:
allwords = []
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    for word in text:
        word = word.lower()
        word = wordnet_lemmatizer.lemmatize(word)
        if word.isalnum() == True:
            if word not in stopwords.words('english'):
                allwords.append(word)
            else:
                continue
        else:
            continue
            
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

In [25]:
# this goes through the whole speech dataframe to find the BOW
all_common_words = []
i = 0
for i in range(0,sp.shape[0]):
    z = bag_of_words(sp.tokens[i])
    all_common_words.append(z)
    z = []
    i =+ 1
    
# Can flatten list of lists    [[word, word],[word,word]]  ~  sum(list_of_lists, [])

In [26]:
# this finds the common words among all speeches top words:

cw=[]

for i in range(0,sp.shape[0]):
    for word in all_common_words[i]:
        if word not in cw:
            cw.append(word)
        else:
            continue

In [27]:
print(len(cw))  # of unique words

3287


In [28]:
# Ok, now i Have a list of the most common words (unique across all speeches). So I want to construct a dataframe that 
# counts how many times the word appears in the speech

In [29]:
wordcount = pd.DataFrame(columns=cw)
wordcount['text_sentence'] = sp.clean_text
#wordcount['text_source'] = sp.party
wordcount.loc[:, cw] = 0

list_of_words = []
for i in range(0,sp.shape[0]):
    
    for word in sp.tokens[i]:
        word = word.lower()
        word = wordnet_lemmatizer.lemmatize(word)
        if word.isalnum() == True:
            if word not in stopwords.words('english'):      
                if word in cw:
                    list_of_words.append(word)

                  
    # Populate the row with word counts.
    for w in list_of_words:
        wordcount.loc[i, w] += 1
    
    # reset list again
    list_of_words = []

In [30]:
wordcount

Unnamed: 0,every,government,public,may,present,country,duty,ha,wa,one,...,storm,faced,founding,feed,watching,search,spend,charity,defeat,text_sentence
0,9,9,6,6,5,5,5,5,4,4,...,0,0,0,0,0,0,0,0,0,Fellow-Citizens of the Senate and of the House...
1,0,1,0,1,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,"Fellow citizens, I am again called upon by the..."
2,5,18,6,13,2,10,3,7,8,1,...,0,0,0,0,0,0,0,0,0,"When it was first perceived, in early times, t..."
3,2,13,4,8,0,4,2,4,1,6,...,0,0,0,0,0,0,0,0,0,Friends and Fellow Citizens: Called upon to un...
4,4,3,14,10,3,5,8,5,3,1,...,0,0,0,0,0,0,0,0,0,"Proceeding, fellow citizens, to that qualifica..."
5,2,0,6,1,1,5,3,4,0,2,...,0,0,0,0,0,0,0,0,0,Unwilling to depart from examples of the most ...
6,5,3,2,2,0,6,1,6,7,2,...,0,0,0,0,0,0,0,0,0,About to add the solemnity of an oath to the o...
7,14,21,8,10,4,11,9,19,6,2,...,0,0,0,0,0,0,0,0,0,I should be destitute of feeling if I was not ...
8,13,14,5,15,3,8,6,35,16,6,...,0,0,0,0,0,0,0,0,0,"Fellow citizens, I shall not attempt to descri..."
9,5,21,9,3,1,10,9,27,9,4,...,0,0,0,0,0,0,0,0,0,In compliance with an usage coeval with the ex...


In [31]:
sp.rename(columns={'party': 'political_party'}, inplace=True)

result = pd.concat([wordcount, sp], axis=1)
result

Unnamed: 0,every,government,public,may,present,country,duty,ha,wa,one,...,flesch_kincaid_grade,coleman_liau_index,readability,chall_readability,diffwords,linsear_write_formula,gunning_fog,text_standard,political_party,tokens
0,9,9,6,6,5,5,5,5,4,4,...,27.5,12.78,33.3,11.69,450,29.0,39.467413,12th and 13th grade,-1,"[Fellow-Citizens, of, the, Senate, and, of, th..."
1,0,1,0,1,1,1,0,1,0,0,...,16.5,11.56,18.4,11.28,51,14.0,30.631111,11th and 12th grade,-1,"[Fellow, citizens, ,, I, am, again, called, up..."
2,5,18,6,13,2,10,3,7,8,1,...,27.7,12.55,33.4,11.25,662,30.0,38.463641,12th and 13th grade,-1,"[When, it, was, first, perceived, ,, in, early..."
3,2,13,4,8,0,4,2,4,1,6,...,18.5,11.85,22.6,10.24,494,26.5,30.288436,11th and 12th grade,-1,"[Friends, and, Fellow, Citizens, :, Called, up..."
4,4,3,14,10,3,5,8,5,3,1,...,20.9,12.43,26.0,10.4,600,18.666667,32.320332,11th and 12th grade,-1,"[Proceeding, ,, fellow, citizens, ,, to, that,..."
5,2,0,6,1,1,5,3,4,0,2,...,25.1,12.38,29.9,11.52,380,18.666667,37.33617,11th and 12th grade,-1,"[Unwilling, to, depart, from, examples, of, th..."
6,5,3,2,2,0,6,1,6,7,2,...,17.6,11.85,20.1,10.45,383,20.333333,29.341157,11th and 12th grade,-1,"[About, to, add, the, solemnity, of, an, oath,..."
7,14,21,8,10,4,11,9,19,6,2,...,12.9,11.49,15.5,8.95,842,18.0,23.034065,10th and 11th grade,-1,"[I, should, be, destitute, of, feeling, if, I,..."
8,13,14,5,15,3,8,6,35,16,6,...,15.4,11.73,18.7,9.02,1046,14.25,25.000179,10th and 11th grade,-1,"[Fellow, citizens, ,, I, shall, not, attempt, ..."
9,5,21,9,3,1,10,9,27,9,4,...,18.7,12.83,22.2,10.09,831,28.0,29.163087,12th and 13th grade,-1,"[In, compliance, with, an, usage, coeval, with..."


In [32]:
result = pd.get_dummies(data=result, columns=['text_standard'])

# Now do the BOW models

# Random Forest

In [33]:
# BOW [Random Forest]
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = result['political_party']
X = np.array(result.drop(['party','tokens','clean_text','speeches','text_sentence','raw_text'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.5,
                                                    random_state=0)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 1.0

Test set score: 0.535714285714


# Logistic Regression

In [34]:
# BOW with Logistic Regression

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

lr_predicted = lr.predict(X_test)
pd.crosstab(y_test, lr_predicted)

(28, 3306) (28,)
Training set score: 1.0

Test set score: 0.607142857143


col_0,-1,0,1
political_party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,8,1,2
0,1,6,5
1,0,2,3


# Gradient Boosting

In [35]:
# BOW with Gradient Boosting

clf = ensemble.GradientBoostingClassifier()
train = clf.fit(X_train, y_train)

print('Training set score:', clf.score(X_train, y_train))
print('\nTest set score:', clf.score(X_test, y_test))

Training set score: 1.0

Test set score: 0.964285714286


# tf-idf

In [25]:
X = clean_text
Y = party1

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.4, # drop words that occur in more than half the paragraphs
                             min_df=1, # only use words that appear at least twice
                             #analyzer = 'word',
                             #ngram_range=(1,3) ,
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


In [37]:
# Reference: https://www.quantstart.com/articles/Supervised-Learning-for-Document-Classification-with-Scikit-Learn

In [None]:
#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(X_tfidf, test_size=0.4, random_state=0)


# SVM Model

In [32]:
#Create and train the Support Vector Machine.
from sklearn.svm import SVC

svm = SVC(C=1000000.0,  kernel='rbf')
svm.fit(X_tfidf, Y)

svm = train_svm(X_train, y_train)

SVC(C=1000000.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [36]:
def train_svm(X, y):
    """
    Create and train the Support Vector Machine.
    """
    svm = SVC(C=1000000.0, kernel='rbf')
    svm.fit(X, y)
    return svm

from sklearn.metrics import confusion_matrix

    # Create and train the Support Vector Machine
svm = train_svm(X_train_tfidf, Y_train)

    # Make an array of predictions on the test set
pred = svm.predict(X_test_tfidf)

    # Output the hit-rate and the confusion matrix for each model
print(svm.score(X_test_tfidf, Y_test))
print(confusion_matrix(pred, Y_test))


0.608695652174
[[7 1 0]
 [2 6 3]
 [0 3 1]]


In [None]:
# my model knows when something isn't classified vs. when it is either dem/rep

# Naive Bayes

In [40]:

#initialize the Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_tfidf, Y_train)
nb_pred=clf.predict(X_test_tfidf)
print(accuracy_score(Y_test, nb_pred))

0.565217391304


# Random Forest

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, accuracy_score

rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
rf.fit(X_train_tfidf,Y_train)
rf_pred=rf.predict(X_test_tfidf)
accuracy_score(Y_test, rf_pred)

0.69565217391304346