In [1]:
import pandas as pd
import nltk
#nltk.download()
from nltk.corpus import gutenberg
from sklearn.model_selection import train_test_split

In [2]:
print(gutenberg.fileids())

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']


## The Task
- The aim of this project to determine features for chosen texts using two Natural Language Processing approaches: Bag of Words, and tf-idf. We will then use supervised learning models to distinguish between the texts and find out whether one or both NLP models work effectively to distinguish one body of text from another. The corpora to be used: 20th and 21st century US presidential inaugural speeches. We will see whether these models can reliably be used to determine whether a speech was delivered by a Republican or a Democrat.

In [3]:
from nltk.corpus import inaugural
print(inaugural.fileids())

['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', '1801-Jefferson.txt', '1805-Jefferson.txt', '1809-Madison.txt', '1813-Madison.txt', '1817-Monroe.txt', '1821-Monroe.txt', '1825-Adams.txt', '1829-Jackson.txt', '1833-Jackson.txt', '1837-VanBuren.txt', '1841-Harrison.txt', '1845-Polk.txt', '1849-Taylor.txt', '1853-Pierce.txt', '1857-Buchanan.txt', '1861-Lincoln.txt', '1865-Lincoln.txt', '1869-Grant.txt', '1873-Grant.txt', '1877-Hayes.txt', '1881-Garfield.txt', '1885-Cleveland.txt', '1889-Harrison.txt', '1893-Cleveland.txt', '1897-McKinley.txt', '1901-McKinley.txt', '1905-Roosevelt.txt', '1909-Taft.txt', '1913-Wilson.txt', '1917-Wilson.txt', '1921-Harding.txt', '1925-Coolidge.txt', '1929-Hoover.txt', '1933-Roosevelt.txt', '1937-Roosevelt.txt', '1941-Roosevelt.txt', '1945-Roosevelt.txt', '1949-Truman.txt', '1953-Eisenhower.txt', '1957-Eisenhower.txt', '1961-Kennedy.txt', '1965-Johnson.txt', '1969-Nixon.txt', '1973-Nixon.txt', '1977-Carter.txt', '1981-Reagan.txt', '1985-Reaga

In [4]:
dem = ['1913-Wilson.txt', '1917-Wilson.txt', '1933-Roosevelt.txt', 
       '1937-Roosevelt.txt', '1941-Roosevelt.txt', '1945-Roosevelt.txt', 
       '1949-Truman.txt', '1957-Eisenhower.txt', '1961-Kennedy.txt', 
       '1965-Johnson.txt', '1977-Carter.txt', '1993-Clinton.txt', 
       '1997-Clinton.txt', '2009-Obama.txt']

In [5]:
rep = ['1897-McKinley.txt', '1901-McKinley.txt', '1905-Roosevelt.txt', 
       '1909-Taft.txt', '1921-Harding.txt', '1925-Coolidge.txt', 
       '1929-Hoover.txt', '1953-Eisenhower.txt', '1957-Eisenhower.txt', 
       '1969-Nixon.txt', '1973-Nixon.txt', '1981-Reagan.txt', 
       '1985-Reagan.txt', '1989-Bush.txt', '2001-Bush.txt', '2005-Bush.txt']

In [6]:
demwords = {}
for (i,speech) in enumerate(dem):
    demwords['dem' + str(i)] = inaugural.raw(speech)

In [7]:
len(demwords)

14

In [8]:
repwords = {}
for (i,speech) in enumerate(rep):
    repwords['rep' + str(i)] = inaugural.raw(speech)

In [9]:
len(repwords)

16

In [10]:
demwords['dem8']

'Vice President Johnson, Mr. Speaker, Mr. Chief Justice, President Eisenhower, Vice President Nixon, President Truman, reverend clergy, fellow citizens, we observe today not a victory of party, but a celebration of freedom -- symbolizing an end, as well as a beginning -- signifying renewal, as well as change. For I have sworn I before you and Almighty God the same solemn oath our forebears l prescribed nearly a century and three quarters ago.\n\nThe world is very different now. For man holds in his mortal hands the power to abolish all forms of human poverty and all forms of human life. And yet the same revolutionary beliefs for which our forebears fought are still at issue around the globe -- the belief that the rights of man come not from the generosity of the state, but from the hand of God.\n\nWe dare not forget today that we are the heirs of that first revolution. Let the word go forth from this time and place, to friend and foe alike, that the torch has been passed to a new gener

In [11]:
import re
for key in demwords:
    demwords[key] = re.sub(r'--',' ',demwords[key])
    demwords[key] = ' '.join(demwords[key].split())
for key in repwords:
    repwords[key] = re.sub(r'--',' ',repwords[key])
    repwords[key] = ' '.join(repwords[key].split())

In [12]:
from nltk.corpus import stopwords

In [13]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [14]:
import spacy
import time
nlp = spacy.load('en')

start_time = time.time()
# All the processing work is done here, so it may take a while.
dem_full = ''
rep_full = ''

for key in demwords:
    dem_full += (' ' + str(key) + ' ' + demwords[key])
for key in repwords:
    rep_full += (' ' + str(key) + ' ' + repwords[key])

dem_doc = nlp(dem_full)
interval = time.time() - start_time
print('time taken: {} minutes.'.format('%.1f' % (interval / 60)))

time taken: 0.2 minutes.


In [15]:
dem_full.index('dem0')

1

In [16]:
start_time = time.time()
rep_doc = nlp(rep_full)
interval = time.time() - start_time
print('time taken: {} minutes.'.format('%.1f' % (interval / 60)))

time taken: 0.6 minutes.


In [19]:
dem_sents = [[sent, "dem"] for sent in dem_doc.sents]
rep_sents = [[sent, "rep"] for sent in rep_doc.sents]
all_sents = dem_sents + rep_sents

In [24]:
import pandas as pd
sentences = pd.DataFrame()
sentences[0] = [sent[0] for sent in all_sents]
sentences[1] = [sent[1] for sent in all_sents]

In [25]:
sentences[1][0]

'dem'

In [100]:
sentences[2] = ['' for i in range(0, len(sentences[0]))]
sentences[2][0] = str(sentences[0][0][1])
start_time = time.time()
for i in range(1, len(sentences[0])):
    if str(sentences[0][i][0]).startswith('dem') or str(sentences[0][i][0]).startswith('rep'):
        sentences[2][i] = str(sentences[0][i][0])
    else:
        sentences[2][i] = str(sentences[2][i-1])
print('Time taken: {} seconds.'.format(time.time() - start_time))

Time taken: 0.5485620498657227 seconds.


In [101]:
sentences[2].value_counts()

dem13    245
rep5     197
rep6     170
rep3     161
dem9     150
rep4     150
rep13    144
rep11    130
rep12    129
rep7     121
dem6     116
rep15    115
dem12    114
rep9     104
rep1     101
dem3      97
rep14     97
rep8      94
dem7      94
dem2      86
dem11     82
rep10     70
dem4      70
dem0      69
dem1      61
dem8      53
rep2      34
dem5      26
Name: 2, dtype: int64

## 1. Bag of Words Approach

In [145]:
from collections import Counter

# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [token.lemma_
                for token in text
                if not token.is_punct
                and not token.is_stop]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(2000)]
    

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_party'] = sentences[1]
    df['which_speech'] = sentences[2]
    df.loc[:, common_words] = 0
    
  # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df

# Set up the bags.
demowords = bag_of_words(dem_doc)
repuwords = bag_of_words(rep_doc)

# Combine bags to create a set of unique words.
common_words = set(repuwords + demowords)    

In [103]:
# Create our data frame with features. This can take a while to run.
word_counts = bow_features(sentences, common_words)
word_counts.head()

Processing row 0
Processing row 500
Processing row 1000
Processing row 1500
Processing row 2000
Processing row 2500
Processing row 3000


Unnamed: 0,space,negroes,pursuit,neighborhood,ample,fine,shallow,rights,kind,generous,...,person,porto,golden,cast,emphasize,practice,rid,text_sentence,text_party,which_speech
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"( , dem0)",dem,dem0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(There, has, been, a, change, of, government, .)",dem,dem0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(It, began, two, years, ago, ,, when, the, Hou...",dem,dem0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(It, has, now, been, completed, .)",dem,dem0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,"(The, Senate, about, to, assemble, will, also,...",dem,dem0


In [128]:
# To ensure all speeches are correctly labeled:
word_counts['text_party'] = ['rep' if word_counts['which_speech'][i].startswith('rep') else 'dem' 
                             for i in range(len(word_counts['which_speech']))]

In [130]:
# rep0 somehow got lost in the shuffle
word_counts[word_counts['which_speech'] == 'rep0']

Unnamed: 0,space,negroes,pursuit,neighborhood,ample,fine,shallow,rights,kind,generous,...,person,porto,golden,cast,emphasize,practice,rid,text_sentence,text_party,which_speech


In [131]:
speeches = list(word_counts['which_speech'].unique())
speeches

['dem0',
 'dem1',
 'dem2',
 'dem3',
 'dem4',
 'dem5',
 'dem6',
 'dem7',
 'dem8',
 'dem9',
 'dem11',
 'dem12',
 'dem13',
 'rep1',
 'rep2',
 'rep3',
 'rep4',
 'rep5',
 'rep6',
 'rep7',
 'rep8',
 'rep9',
 'rep10',
 'rep11',
 'rep12',
 'rep13',
 'rep14',
 'rep15']

## Results

The supervised learning results are shown below. Clearly there is no strog relationship between the words used and the party of the presiden who spoek them (at least, no relationship that the BoW and supervised learnings could detect.) Calculated accuracies are mostly not much better than chance. At the end of this section we will try the analysis using the tf-idf approach. 

### Random Forest Classifier

In [185]:
import numpy as np
from sklearn import ensemble

from sklearn.model_selection import cross_val_score

start_time = time.time()
# 
X = pd.DataFrame(word_counts.drop(['text_sentence','text_party', 'republican', 
                               'liberal', 'which_speech'], axis=1))
Y = word_counts['text_party']
rfc = ensemble.RandomForestClassifier()

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4)

train = rfc.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', rfc.score(X_train, y_train))

# Or could use np.array(Y).reshape(len(Y),) instead of np.array(Y).ravel():
score = cross_val_score(rfc, X, np.array(Y).ravel(), cv=6)
print(score)
print('\nAverage accuracy of cross-val data (incl. train data): {}'.format('%.4f' %(sum(score)/len(score))))
print('SD of accuracy: {}'.format('%.5f' % np.std(score)))
print('-' * 50)
print('\n Time taken: {} minutes.'.format('%.1f' % ((time.time() - start_time) / 60)))

(1848, 2791) (1848,)
Training set score: 0.968073593074
[ 0.5155642   0.50194553  0.4688716   0.40350877  0.53411306  0.4453125 ]

Average accuracy of cross-val data (incl. train data): 0.4782
SD of accuracy: 0.04441
--------------------------------------------------

 Time taken: 0.1 minutes.


In [186]:
success_count = 0
for speech in speeches:
    x = rfc.predict(X[word_counts['which_speech'] == speech])
    countdem = [1 for sentence in x if sentence == 'dem']
    fracdem = np.sum(countdem) / len(x)
    if fracdem > 0.5 and speech.startswith('dem'):
        success_count += 1
    elif fracdem < 0.5 and speech.startswith('rep'):
        success_count += 1
print('Success rate in predicting party of President: ', success_count / len(speeches))

Success rate in predicting party of President:  0.9642857142857143


In [244]:
# Define new set of training presidents:
train_pres = ['dem0', 'dem1', 'dem2', 'dem3', 'dem4', 'dem5', 'dem6', 
              'dem7', 'dem9', 'rep1', 'dem8', 
               'rep6', 'rep7', 'rep8', 'rep9', 'rep10', 'rep11', 'rep12', 'rep15', 'rep14', 'rep13']

test_pres = ['dem11', 'dem12', 'dem13','rep2', 'rep3', 'rep4', 'rep5']#, 

In [245]:
rfc.fit(X[word_counts['which_speech'].isin(train_pres)], Y[word_counts['which_speech'].isin(train_pres)])

success_count = 0
for speech in test_pres:
    phrase_predictions = rfc.predict(X[word_counts['which_speech'] == speech])
    countdem = [1 for value in phrase_predictions if value == 'dem']
    fracdem = np.sum(countdem) / len(phrase_predictions)
    if fracdem > 0.5 and speech.startswith('dem'):
        success_count += 1
    elif fracdem < 0.5 and speech.startswith('rep'):
        success_count += 1
print('Success rate in predicting party of Presidents: ', success_count / len(test_pres))

Success rate in predicting party of Presidents:  0.42857142857142855


### Logistic Regression

In [204]:
from sklearn.linear_model import LogisticRegression

start_time = time.time()
lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))


# Or could use np.array(Y).reshape(len(Y),) instead of np.array(Y).ravel():
score = cross_val_score(lr, X, np.array(Y).ravel(), cv=6)
print(score)
print('\nAverage accuracy of cross-val data (incl. train data): {}'.format('%.4f' %(sum(score)/len(score))))
print('SD of accuracy: {}'.format('%.5f' % np.std(score)))
print('-' * 50)
print('\nTime taken: {} minutes.'.format('%.1f' % ((time.time() - start_time) / 60)))

(1848, 2791) (1848,)
Training set score: 0.909090909091
[ 0.51945525  0.56420233  0.51167315  0.45224172  0.53216374  0.45703125]

Average accuracy of cross-val data (incl. train data): 0.5061
SD of accuracy: 0.03994
--------------------------------------------------

Time taken: 0.1 minutes.


In [220]:
success_count = 0
for speech in speeches:
    x = lr.predict(X[word_counts['which_speech'] == speech])
    countdem = [1 for sentence in x if sentence == 'dem']
#    print(x)
    fracdem = np.sum(countdem) / len(x)
#    print(fracdem)
    if fracdem > 0.5 and speech.startswith('dem'):
#        print('Success!')
        success_count += 1
    elif fracdem < 0.5 and speech.startswith('rep'):
#        print('Success!')
        success_count += 1
print('Success rate in predicting party of President: ', success_count / len(speeches))

Success rate in predicting party of President:  0.8571428571428571


In [221]:
lr.fit(X[word_counts['which_speech'].isin(train_pres)], Y[word_counts['which_speech'].isin(train_pres)])

success_count = 0
for speech in test_pres:
    phrase_predictions = lr.predict(X[word_counts['which_speech'] == speech])
    countdem = [1 for value in phrase_predictions if value == 'dem']
    fracdem = np.sum(countdem) / len(phrase_predictions)
    if fracdem > 0.5 and speech.startswith('dem'):
        success_count += 1
    elif fracdem < 0.5 and speech.startswith('rep'):
        success_count += 1
print('Success rate in predicting party of Presidents: ', success_count / len(test_pres))

Success rate in predicting party of Presidents:  0.5


### Gradient Boosting

In [214]:
params  =  {'n_estimators' : 100,
          'max_depth': 10,
          'loss': 'deviance'}

In [215]:
start_time = time.time()
clf = ensemble.GradientBoostingClassifier(**params)
train = clf.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', clf.score(X_train, y_train))


# Or could use np.array(Y).reshape(len(Y),) instead of np.array(Y).ravel():
score = cross_val_score(clf, X, np.array(Y).ravel(), cv=6)
print(score)
print('\nAverage accuracy of cross-val data (incl. train data): {}'.format('%.4f' %(sum(score)/len(score))))
print('SD of accuracy: {}'.format('%.5f' % np.std(score)))
print('-' * 50)
print('\nTime taken: {} minutes.'.format('%.1f' % ((time.time() - start_time) / 60)))

(1848, 2791) (1848,)
Training set score: 0.87012987013
[ 0.55447471  0.55447471  0.52529183  0.43859649  0.57309942  0.50195312]

Average accuracy of cross-val data (incl. train data): 0.5246
SD of accuracy: 0.04481
--------------------------------------------------

Time taken: 4.3 minutes.


In [216]:
clf.fit(X[word_counts['which_speech'].isin(train_pres)], Y[word_counts['which_speech'].isin(train_pres)])

success_count = 0
for speech in test_pres:
    phrase_predictions = clf.predict(X[word_counts['which_speech'] == speech])
    countdem = [1 for value in phrase_predictions if value == 'dem']
    fracdem = np.sum(countdem) / len(phrase_predictions)
    if fracdem > 0.5 and speech.startswith('dem'):
        success_count += 1
    elif fracdem < 0.5 and speech.startswith('rep'):
        success_count += 1
print('Success rate in predicting party of Presidents: ', success_count / len(test_pres))

Success rate in predicting party of Presidents:  0.5


### Support Vector Classifier

In [212]:
from sklearn.svm import SVC
start_time = time.time()
svc = SVC()
svc.fit(X_train, y_train)
print('Training set score:', svc.score(X_train, y_train))
# Or could use np.array(Y).reshape(len(Y),) instead of np.array(Y).ravel():
score = cross_val_score(svc, X, np.array(Y).ravel(), cv=6)
print(score)
print('\nAverage accuracy of cross-val data (incl. train data): {}'.format('%.4f' %(sum(score)/len(score))))
print('SD of accuracy: {}'.format('%.5f' % np.std(score)))
print('-' * 50)
print('\nTime taken: {} minutes.'.format('%.1f' % ((time.time() - start_time) / 60)))

Training set score: 0.599567099567
[ 0.58949416  0.58949416  0.58949416  0.59064327  0.59064327  0.58984375]

Average accuracy of cross-val data (incl. train data): 0.5899
SD of accuracy: 0.00052
--------------------------------------------------

Time taken: 4.3 minutes.


In [213]:
svc.fit(X[word_counts['which_speech'].isin(train_pres)], Y[word_counts['which_speech'].isin(train_pres)])

success_count = 0
for speech in test_pres:
    phrase_predictions = svc.predict(X[word_counts['which_speech'] == speech])
    countdem = [1 for value in phrase_predictions if value == 'dem']
    fracdem = np.sum(countdem) / len(phrase_predictions)
    if fracdem > 0.5 and speech.startswith('dem'):
        success_count += 1
    elif fracdem < 0.5 and speech.startswith('rep'):
        success_count += 1
print('Success rate in predicting party of Presidents: ', success_count / len(test_pres))

Success rate in predicting party of Presidents:  0.5


### K-Nearest Neighbors

In [217]:
# This function runs the KNN Classifier
# and performs cross-validation
from sklearn.neighbors import KNeighborsClassifier

start_time = time.time()
knn = KNeighborsClassifier(n_neighbors=250, weights='distance')
knn.fit(X_train, y_train)
print('Training set score:', knn.score(X_train, y_train))

score = cross_val_score(knn, X, np.array(Y).ravel(), cv=6)
print(score)  
print('-' * 50, '\n')
print('\nAverage accuracy of cross-val data (incl. train data): {}'.format('%.4f' %(sum(score)/len(score))))
print('SD of accuracy: {}'.format('%.5f' % np.std(score)))
print('-' * 50)
print('\nTime taken: {} minutes.'.format('%.1f' % ((time.time() - start_time) / 60)))

Training set score: 0.983225108225
[ 0.55642023  0.57003891  0.54863813  0.40935673  0.57894737  0.4765625 ]
-------------------------------------------------- 


Average accuracy of cross-val data (incl. train data): 0.5233
SD of accuracy: 0.06081
--------------------------------------------------

Time taken: 1.4 minutes.


In [218]:
count = 0
for speech in speeches:
    ans = knn.predict(X[word_counts['which_speech'] == speech])
    percentdem = list(ans).count('dem') / len(ans)
    if percentdem > 0.5 and speech.startswith('dem'):
        count +=1
print('Percentage accuracy of model: {}'.format(count / len(speeches)))

Percentage accuracy of model: 0.42857142857142855


In [219]:
knn.fit(X[word_counts['which_speech'].isin(train_pres)], Y[word_counts['which_speech'].isin(train_pres)])

success_count = 0
for speech in test_pres:
    phrase_predictions = knn.predict(X[word_counts['which_speech'] == speech])
    countdem = [1 for value in phrase_predictions if value == 'dem']
    fracdem = np.sum(countdem) / len(phrase_predictions)
    if fracdem > 0.5 and speech.startswith('dem'):
        success_count += 1
    elif fracdem < 0.5 and speech.startswith('rep'):
        success_count += 1
print('Success rate in predicting party of Presidents: ', success_count / len(test_pres))

Success rate in predicting party of Presidents:  0.5


### Naive Bayes Classifier

In [222]:
# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

In [223]:
count = 0
address = []
rep_or_dem = []
for speech in speeches:
    bnb.fit(X_train, y_train)
    ans = bnb.predict(X[word_counts['which_speech'] == speech])
    percentdem = list(ans).count('dem') / len(ans)
    address.append(speech)# * len(ans))
    rep_or_dem.extend(list(ans))
    if percentdem > 0.5 and speech.startswith('dem'):
        count +=1
print(len(address), len(rep_or_dem), len(ans))
print(address)
print('Percentage accuracy of model: {}'.format(count / len(speeches)))

28 3080 115
['dem0', 'dem1', 'dem2', 'dem3', 'dem4', 'dem5', 'dem6', 'dem7', 'dem8', 'dem9', 'dem11', 'dem12', 'dem13', 'rep1', 'rep2', 'rep3', 'rep4', 'rep5', 'rep6', 'rep7', 'rep8', 'rep9', 'rep10', 'rep11', 'rep12', 'rep13', 'rep14', 'rep15']
Percentage accuracy of model: 0.21428571428571427


In [224]:
bnb.fit(X[word_counts['which_speech'].isin(train_pres)], Y[word_counts['which_speech'].isin(train_pres)])

success_count = 0
for speech in test_pres:
    phrase_predictions = bnb.predict(X[word_counts['which_speech'] == speech])
    countdem = [1 for value in phrase_predictions if value == 'dem']
    fracdem = np.sum(countdem) / len(phrase_predictions)
    if fracdem > 0.5 and speech.startswith('dem'):
        success_count += 1
    elif fracdem < 0.5 and speech.startswith('rep'):
        success_count += 1
print('Success rate in predicting party of Presidents: ', success_count / len(test_pres))

Success rate in predicting party of Presidents:  0.5


## 2. tf-idf Approach 

In [17]:
dem_dict = {}
for index, speech in enumerate(dem):
    dem_dict['dem' + str(index)] = inaugural.paras(speech)
    dem_dict['dem' + str(index)][0][0].insert(0, 'dem' + str(index))

In [18]:
dem_dict = {}
dem_list = []
for index, speech in enumerate(dem):
    dem_dict['dem' + str(index)] = inaugural.paras(speech)
    dem_dict['dem' + str(index)][0][0].insert(0, 'dem' + str(index))
    words = dem_dict['dem' + str(index)]
    dem_list.append(words)

In [19]:
rep_dict = {}
rep_list = []
for index, speech in enumerate(rep):
    rep_dict['rep' + str(index)] = inaugural.paras(speech)
    rep_dict['rep' + str(index)][0][0].insert(0, 'rep' + str(index))
    words = rep_dict['rep' + str(index)]
    rep_list.append(words)

In [21]:
len(rep_list)

16

In [22]:
speech_paras = dem_dict['dem0']
for i in range(1, len(dem_dict)):
    x = dem_dict['dem' + str(i)]
    speech_paras += x
for key in rep_dict:
    speech_paras += rep_dict[key] 

In [23]:
dem_dict['dem4']

[[['dem4', 'On', 'each', 'national', 'day', 'of', 'inauguration', 'since', '1789', ',', 'the', 'people', 'have', 'renewed', 'their', 'sense', 'of', 'dedication', 'to', 'the', 'United', 'States', '.']], [['In', 'Washington', "'", 's', 'day', 'the', 'task', 'of', 'the', 'people', 'was', 'to', 'create', 'and', 'weld', 'together', 'a', 'nation', '.']], ...]

In [25]:
speech_paras

[[['There', 'has', 'been', 'a', 'change', 'of', 'government', '.'], ['It', 'began', 'two', 'years', 'ago', ',', 'when', 'the', 'House', 'of', 'Representatives', 'became', 'Democratic', 'by', 'a', 'decisive', 'majority', '.'], ['It', 'has', 'now', 'been', 'completed', '.'], ['The', 'Senate', 'about', 'to', 'assemble', 'will', 'also', 'be', 'Democratic', '.'], ['The', 'offices', 'of', 'President', 'and', 'Vice', '-', 'President', 'have', 'been', 'put', 'into', 'the', 'hands', 'of', 'Democrats', '.'], ['What', 'does', 'the', 'change', 'mean', '?'], ['That', 'is', 'the', 'question', 'that', 'is', 'uppermost', 'in', 'our', 'minds', 'today', '.'], ['That', 'is', 'the', 'question', 'I', 'am', 'going', 'to', 'try', 'to', 'answer', ',', 'in', 'order', ',', 'if', 'I', 'may', ',', 'to', 'interpret', 'the', 'occasion', '.']], [['It', 'means', 'much', 'more', 'than', 'the', 'mere', 'success', 'of', 'a', 'party', '.'], ['The', 'success', 'of', 'a', 'party', 'means', 'little', 'except', 'when', 'the'

In [26]:
# This be it!
final = {}
for key in dem_dict:
    final[key] = []
    for paragraph in dem_dict[key]:        
        final_para = paragraph[0]
        for i in range(1, len(paragraph)):
            final_para.extend(paragraph[i])           
        final[key].append(' '.join(final_para))

for key in rep_dict:
    final[key] = []
    for paragraph in rep_dict[key]:        
        final_para = paragraph[0]
        for i in range(1, len(paragraph)):
            final_para.extend(paragraph[i])           
        final[key].append(' '.join(final_para))

In [27]:
final['rep9']

['Senator Dirksen , Mr . Chief Justice , Mr . Vice President , President Johnson , Vice President Humphrey , my fellow Americans -- and my fellow citizens of the world community :',
 'I ask you to share with me today the majesty of this moment . In the orderly transfer of power , we celebrate the unity that keeps us free .',
 'Each moment in history is a fleeting time , precious and unique . But some stand out as moments of beginning , in which courses are set that shape decades or centuries .',
 'This can be such a moment .',
 "Forces now are converging that make possible , for the first time , the hope that many of man ' s deepest aspirations can at last be realized . The spiraling pace of change allows us to contemplate , within our own lifetime , advances that once would have taken centuries .",
 'In throwing wide the horizons of space , we have discovered new horizons on earth .',
 'For the first time , because the people of the world want peace , and the leaders of the world are 

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

#######X_train, X_test = train_test_split(final, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=1.0, # drop words that occur in more than half the paragraphs
                             min_df=1, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )

#Applying the vectorizer
final_tfidf = {}
final_tfidf_csr = {}
tfidf_bypara = {}
for key in final:
    final_tfidf[key]=vectorizer.fit_transform(final[key])
    print("Number of features: %d" % final_tfidf[key].get_shape()[1])
    final_tfidf_csr[key] = final_tfidf[key].tocsr()

#number of paragraphs
    n = final_tfidf_csr[key].shape[0]
#A list of dictionaries, one per paragraph
    tfidf_bypara[key] = [{} for _ in range(0,n)]
#List of features
    terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
    for i, j in zip(*final_tfidf_csr[key].nonzero()):
        tfidf_bypara[key][i][terms[j]] = final_tfidf_csr[key][i, j]
        
#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
    print('Original sentence:', final[key][0])
    print('Tf_idf vector:', tfidf_bypara[key][0])

Number of features: 497
Original sentence: There has been a change of government . It began two years ago , when the House of Representatives became Democratic by a decisive majority . It has now been completed . The Senate about to assemble will also be Democratic . The offices of President and Vice - President have been put into the hands of Democrats . What does the change mean ? That is the question that is uppermost in our minds today . That is the question I am going to try to answer , in order , if I may , to interpret the occasion .
Tf_idf vector: {'change': 0.25187927420212392, 'government': 0.092356583371141721, 'began': 0.16753603146542811, 'years': 0.12593963710106196, 'ago': 0.14320370059706508, 'house': 0.16753603146542811, 'representatives': 0.16753603146542811, 'democratic': 0.28640740119413016, 'decisive': 0.16753603146542811, 'majority': 0.16753603146542811, 'completed': 0.16753603146542811, 'senate': 0.16753603146542811, 'assemble': 0.16753603146542811, 'offices': 0.

Tf_idf vector: {'years': 0.14123029500002887, 'ago': 0.14123029500002887, 'met': 0.17133936759415175, 'inaugurate': 0.17133936759415175, 'president': 0.1537266891950988, 'republic': 0.28246059000005774, 'single': 0.17133936759415175, 'minded': 0.17133936759415175, 'anxiety': 0.17133936759415175, 'stood': 0.17133936759415175, 'spirit': 0.14123029500002887, 'dedicated': 0.17133936759415175, 'fulfillment': 0.17133936759415175, 'vision': 0.1537266891950988, 'speed': 0.17133936759415175, 'time': 0.17133936759415175, 'people': 0.10142826602685469, 'security': 0.17133936759415175, 'peace': 0.13153733862097758, 'essential': 0.1537266891950988, 'pursuit': 0.17133936759415175, 'happiness': 0.17133936759415175, 'pledged': 0.17133936759415175, 'drive': 0.1537266891950988, 'temple': 0.17133936759415175, 'ancient': 0.17133936759415175, 'faith': 0.17133936759415175, 'profaned': 0.17133936759415175, 'end': 0.17133936759415175, 'action': 0.1537266891950988, 'tireless': 0.17133936759415175, 'unafraid': 

Tf_idf vector: {'fellow': 0.093820398050541862, 'citizens': 0.12480818209187956, 'today': 0.070983467372623316, 'celebrate': 0.11194708969516989, 'mystery': 0.14293487373650762, 'american': 0.10197125141396102, 'renewal': 0.10197125141396102, 'ceremony': 0.12480818209187956, 'held': 0.14293487373650762, 'depth': 0.14293487373650762, 'winter': 0.14293487373650762, 'words': 0.14293487373650762, 'speak': 0.14293487373650762, 'faces': 0.14293487373650762, 'world': 0.21295040211786997, 'force': 0.12480818209187956, 'spring': 0.24961636418375913, 'reborn': 0.14293487373650762, 'oldest': 0.14293487373650762, 'democracy': 0.10197125141396102, 'brings': 0.14293487373650762, 'forth': 0.12480818209187956, 'vision': 0.12480818209187956, 'courage': 0.14293487373650762, 'reinvent': 0.14293487373650762, 'america': 0.25133045603681653, 'founders': 0.12480818209187956, 'boldly': 0.14293487373650762, 'declared': 0.14293487373650762, 'independence': 0.14293487373650762, 'purposes': 0.14293487373650762, '

Tf_idf vector: {'fellow': 0.10287378294496817, 'citizens': 0.10287378294496817, 'assembled': 0.11978620341297963, '4th': 0.11978620341297963, 'march': 0.11978620341297963, '1897': 0.11978620341297963, 'great': 0.081566653028723943, 'anxiety': 0.11978620341297963, 'regard': 0.11978620341297963, 'currency': 0.11978620341297963, 'credit': 0.11978620341297963, 'exists': 0.11978620341297963, 'treasury': 0.11978620341297963, 'receipts': 0.11978620341297963, 'inadequate': 0.11978620341297963, 'meet': 0.10287378294496817, 'current': 0.11978620341297963, 'obligations': 0.090874229658830588, 'government': 0.16313330605744789, 'sufficient': 0.11978620341297963, 'public': 0.067532005342018073, 'needs': 0.11978620341297963, 'surplus': 0.11978620341297963, 'instead': 0.11978620341297963, 'deficit': 0.11978620341297963, 'felt': 0.11978620341297963, 'constrained': 0.11978620341297963, 'convene': 0.11978620341297963, 'congress': 0.14792361838163823, 'extraordinary': 0.11978620341297963, 'session': 0.10

Tf_idf vector: {'countrymen': 0.16946444963274582, 'occasion': 0.199141885306999, 'administration': 0.15887211435815318, 'sacred': 0.22269819122470405, 'oath': 0.199141885306999, 'assumed': 0.22269819122470405, 'american': 0.16946444963274582, 'citizen': 0.199141885306999, 'dedication': 0.22269819122470405, 'consecration': 0.22269819122470405, 'god': 0.199141885306999, 'highest': 0.18242842027585821, 'office': 0.16946444963274582, 'service': 0.14215864932701239, 'people': 0.1139521023139965, 'assume': 0.22269819122470405, 'trust': 0.22269819122470405, 'humility': 0.22269819122470405, 'knowledge': 0.18242842027585821, 'guidance': 0.199141885306999, 'almighty': 0.199141885306999, 'providence': 0.22269819122470405, 'hope': 0.15887211435815318, 'discharge': 0.22269819122470405, 'increasing': 0.199141885306999, 'burdens': 0.199141885306999}
Number of features: 718
Original sentence: My friends , before I begin the expression of those thoughts that I deem appropriate to this moment , would y

Original sentence: President Clinton , distinguished guests and my fellow citizens , the peaceful transfer of authority is rare in history , yet common in our country . With a simple oath , we affirm old traditions and make new beginnings .
Tf_idf vector: {'president': 0.20046773203820425, 'clinton': 0.21682491049139002, 'distinguished': 0.23987905868123649, 'guests': 0.23987905868123649, 'fellow': 0.23987905868123649, 'citizens': 0.16864880348813219, 'peaceful': 0.23987905868123649, 'transfer': 0.23987905868123649, 'authority': 0.23987905868123649, 'rare': 0.23987905868123649, 'history': 0.20046773203820425, 'common': 0.18778011873502934, 'country': 0.15435943565851129, 'simple': 0.21682491049139002, 'oath': 0.23987905868123649, 'affirm': 0.20046773203820425, 'old': 0.21682491049139002, 'traditions': 0.23987905868123649, 'make': 0.20046773203820425, 'new': 0.18778011873502934, 'beginnings': 0.23987905868123649}
Number of features: 606


Original sentence: rep15 Vice President Cheney , Mr . Chief Justice , President Carter , President Bush , President Clinton , members of the United States Congress , reverend clergy , distinguished guests , fellow citizens : On this day , prescribed by law and marked by ceremony , we celebrate the durable wisdom of our Constitution , and recall the deep commitments that unite our country . I am grateful for the honor of this hour , mindful of the consequential times in which we live , and determined to fulfill the oath that I have sworn and you have witnessed . At this second gathering , our duties are defined not by the words I use , but by the history we have seen together . For a half a century , America defended our own freedom by standing watch on distant borders . After the shipwreck of communism came years of relative quiet , years of repose , years of sabbatical ¡ Xand then there came a day of fire . We have seen our vulnerability ¡ Xand we have seen its deepest source . For as

Tf_idf vector: {'rep15': 0.016910333934197906, 'vice': 0.016910333934197906, 'president': 0.067641335736791625, 'cheney': 0.016910333934197906, 'mr': 0.016910333934197906, 'chief': 0.016910333934197906, 'justice': 0.10146200360518744, 'carter': 0.016910333934197906, 'bush': 0.016910333934197906, 'clinton': 0.016910333934197906, 'members': 0.016910333934197906, 'united': 0.084551669670989524, 'states': 0.084551669670989524, 'congress': 0.016910333934197906, 'reverend': 0.016910333934197906, 'clergy': 0.016910333934197906, 'distinguished': 0.016910333934197906, 'guests': 0.016910333934197906, 'fellow': 0.050731001802593718, 'citizens': 0.10146200360518744, 'day': 0.10146200360518744, 'prescribed': 0.016910333934197906, 'law': 0.033820667868395812, 'marked': 0.016910333934197906, 'ceremony': 0.016910333934197906, 'celebrate': 0.016910333934197906, 'durable': 0.016910333934197906, 'wisdom': 0.016910333934197906, 'constitution': 0.016910333934197906, 'recall': 0.016910333934197906, 'deep': 




In [34]:
# Create a list to hold the names of the variables:
allwords = []
list_of_para_names = []
list_of_indices = []
for key in tfidf_bypara:
    for i, key2 in enumerate(tfidf_bypara[key]):
        list_of_para_names.append(str(key))
        list_of_indices.append(str(i))
        for key3 in tfidf_bypara[key][i]:
            allwords.append(key3)
#final_table
final_table = pd.DataFrame(columns=allwords)
final_table['para_name'] = list_of_para_names
final_table['para_index'] = list_of_indices
final_table.loc[:, allwords] = 0

In [35]:
final_table.head()

Unnamed: 0,change,government,began,years,ago,house,representatives,democratic,decisive,majority,...,strength,xtested,weary,xwe,ready,greatest,achievements,bless,para_name,para_index
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,dem0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,dem0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,dem0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,dem0,3
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,dem0,4


In [36]:
final_table['para_index'] = final_table['para_index'].astype(int)

In [37]:
type(final_table.loc[2, 'para_index'])

int

In [40]:
start_time = time.time()
for i in range(len(final_table)):
    row_dict = tfidf_bypara[final_table.loc[i, 'para_name']][final_table.loc[i, 'para_index']]
    for key in row_dict:
        final_table.loc[i, key] = row_dict[key]
interval = time.time() - start_time
print('Time taken: {} minutes'.format('%.1f' % (interval / 60)))

Time taken: 79.5 minutes


In [41]:
final_table.head()

Unnamed: 0,change,government,began,years,ago,house,representatives,democratic,decisive,majority,...,strength,xtested,weary,xwe,ready,greatest,achievements,bless,para_name,para_index
0,0.251879,0.0923566,0.167536,0.12594,0.143204,0.167536,0.167536,0.286407,0.167536,0.167536,...,0.0,0,0,0,0,0,0.0,0,dem0,0
1,0.0830581,0.0,0.0,0.0,0.0,0.0,0.0,0.0944439,0.0,0.0,...,0.0,0,0,0,0,0,0.0,0,dem0,1
2,0.0854528,0.062666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.113677,0,0,0,0,0,0.0,0,dem0,2
3,0.0,0.11471,0.0,0.0782107,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0.104043,0,dem0,3
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0,0,0,0,0,0.0,0,dem0,4


In [42]:
final_table['which_party'] = [final_table['para_name'][i][:3] for i in range(len(final_table['para_name']))]

In [43]:
final_table.head()

Unnamed: 0,change,government,began,years,ago,house,representatives,democratic,decisive,majority,...,xtested,weary,xwe,ready,greatest,achievements,bless,para_name,para_index,which_party
0,0.251879,0.0923566,0.167536,0.12594,0.143204,0.167536,0.167536,0.286407,0.167536,0.167536,...,0,0,0,0,0,0.0,0,dem0,0,dem
1,0.0830581,0.0,0.0,0.0,0.0,0.0,0.0,0.0944439,0.0,0.0,...,0,0,0,0,0,0.0,0,dem0,1,dem
2,0.0854528,0.062666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0.0,0,dem0,2,dem
3,0.0,0.11471,0.0,0.0782107,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0.104043,0,dem0,3,dem
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0.0,0,dem0,4,dem


In [112]:
start_time = time.time()

X = final_table.drop(['para_name', 'para_index', 'which_party'], axis = 1)
Y = final_table['which_party']

train_pres = ['dem0', 'dem1', 'dem2', 'dem3', 'dem4', 'dem5', 'dem6', 
              'dem7', 'dem9', 'rep1', 'dem8', 
               'rep6', 'rep7', 'rep8', 'rep9', 'rep10', 'rep11', 'rep12', 'rep15', 'rep14', 'rep13']

test_pres = ['dem11', 'dem12', 'dem13','rep2', 'rep3', 'rep4', 'rep5']


Xtrain = X[final_table['para_name'].isin(train_pres)]
Ytrain = Y[final_table['para_name'].isin(train_pres)]

Xtest = X[final_table['para_name'].isin(test_pres)]
Ytest = Y[final_table['para_name'].isin(test_pres)]

which_party  dem  rep
0                    
dem           20   46
rep           66   59
Success rate in predicting party of President from Inaugural Speech: 42.9%
Time taken: 0.0 minutes


## Results
The calculated predictions are shown below. Clearly the tf-idf model was not able to predict at the speech level (as dictinct from the paragraph level) the party affiliation of the speaker. The results for all models shown range from 42.9% to 57%. For this (tf-idf) approach, we used all words and did not restrict the model to the most common words, say 2000 or so. Restricting the word set (and other parameters of the tf-idf instantiaton) may have changed the outcome. This and other avenues could not be explored due to the considerable amount of time and energy already spent on this task, with other projects waiting.

### Random Forest Classifier

In [131]:
from sklearn import ensemble
rfc = ensemble.RandomForestClassifier()
fit_model = rfc.fit(Xtrain, Ytrain)

final_table_pres = final_table[final_table['para_name'].isin(test_pres)]
y_pred = pd.DataFrame(rfc.predict(Xtest), index=final_table_pres.index)

tab = pd.crosstab(y_pred[0], Ytest)
print(tab)
success_count = 0
for pres in test_pres:
#    affiliation.append(pres[:3])    
    pred_mat = y_pred[final_table_pres['para_name'] == pres]
    total_dem = sum(1 for value in pred_mat[0] if value == 'dem')
    frac_dem = total_dem / len(pred_mat)
    if frac_dem > 0.5 and pres[:3] == 'dem':
        success_count += 1
    elif frac_dem < 0.5 and pres[:3] == 'rep':
        success_count += 1

success_rate = success_count / len(test_pres)     
interval = time.time() - start_time

print('Success rate in predicting party of President from Inaugural Speech: {}%'.format('%.1f' % (success_rate * 100)))  
print('Time taken: {} minutes'.format('%.1f' % (interval / 60)))

which_party  dem  rep
0                    
dem           31   37
rep           55   68
Success rate in predicting party of President from Inaugural Speech: 57.1%
Time taken: 11.4 minutes


#### Paragraph by paragraph:

In [137]:
from sklearn.model_selection import cross_val_score
import numpy as np

score = cross_val_score(rfc, X, np.array(Y).ravel(), cv=6)
print(score)
print('\nAverage accuracy of cross-val data (incl. train data): {}'.format('%.4f' %(sum(score)/len(score))))
print('SD of accuracy: {}'.format('%.5f' % np.std(score)))
print('-' * 50)
print('\n Time taken: {} minutes.'.format('%.1f' % ((time.time() - start_time) / 60)))

[ 0.65644172  0.67484663  0.2962963   0.40372671  0.54037267  0.52173913]

Average accuracy of cross-val data (incl. train data): 0.5156
SD of accuracy: 0.13320
--------------------------------------------------

 Time taken: 14.0 minutes.


### Logistic Regression

In [113]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
fit_model = lr.fit(Xtrain, Ytrain)

final_table_pres = final_table[final_table['para_name'].isin(test_pres)]
y_pred = pd.DataFrame(lr.predict(Xtest), index=final_table_pres.index)

tab = pd.crosstab(y_pred[0], Ytest)
print(tab)
success_count = 0
for pres in test_pres:
#    affiliation.append(pres[:3])    
    pred_mat = y_pred[final_table_pres['para_name'] == pres]
    total_dem = sum(1 for value in pred_mat[0] if value == 'dem')
    frac_dem = total_dem / len(pred_mat)
    if frac_dem > 0.5 and pres[:3] == 'dem':
        success_count += 1
    elif frac_dem < 0.5 and pres[:3] == 'rep':
        success_count += 1

success_rate = success_count / len(test_pres)     
interval = time.time() - start_time

print('Success rate in predicting party of President from Inaugural Speech: {}%'.format('%.1f' % (success_rate * 100)))  
print('Time taken: {} minutes'.format('%.1f' % (interval / 60)))

which_party  dem  rep
0                    
dem           23   41
rep           63   64
Success rate in predicting party of President from Inaugural Speech: 42.9%
Time taken: 4.1 minutes


#### Paragraph by paragraph:

In [139]:
start_time = time.time()
score = cross_val_score(lr, X, np.array(Y).ravel(), cv=6)
print(score)
print('\nAverage accuracy of cross-val data (incl. train data): {}'.format('%.4f' %(sum(score)/len(score))))
print('SD of accuracy: {}'.format('%.5f' % np.std(score)))
print('-' * 50)
print('\n Time taken: {} minutes.'.format('%.1f' % ((time.time() - start_time) / 60)))

[ 0.67484663  0.61349693  0.36419753  0.38509317  0.57142857  0.52173913]

Average accuracy of cross-val data (incl. train data): 0.5218
SD of accuracy: 0.11389
--------------------------------------------------

 Time taken: 0.3 minutes.


### Gradient Boosting

In [129]:
params  =  {'n_estimators' : 500,
          'max_depth': 10,
          'loss': 'deviance'}

In [130]:
start_time = time.time()
clf = ensemble.GradientBoostingClassifier(**params)
fit_model = clf.fit(Xtrain, Ytrain)

final_table_pres = final_table[final_table['para_name'].isin(test_pres)]
y_pred = pd.DataFrame(clf.predict(Xtest), index=final_table_pres.index)

tab = pd.crosstab(y_pred[0], Ytest)
print(tab)
success_count = 0
for pres in test_pres:
#    affiliation.append(pres[:3])    
    pred_mat = y_pred[final_table_pres['para_name'] == pres]
    total_dem = sum(1 for value in pred_mat[0] if value == 'dem')
    frac_dem = total_dem / len(pred_mat)
    if frac_dem > 0.5 and pres[:3] == 'dem':
        success_count += 1
    elif frac_dem < 0.5 and pres[:3] == 'rep':
        success_count += 1

success_rate = success_count / len(test_pres)     
interval = time.time() - start_time

print('Success rate in predicting party of President from Inaugural Speech: {}%'.format('%.1f' % (success_rate * 100)))  
print('Time taken: {} minutes'.format('%.1f' % (interval / 60)))

which_party  dem  rep
0                    
dem           20   37
rep           66   68
Success rate in predicting party of President from Inaugural Speech: 42.9%
Time taken: 11.3 minutes


#### Paragraph by paragraph:

In [140]:
start_time = time.time()
score = cross_val_score(clf, X, np.array(Y).ravel(), cv=6)
print(score)
print('\nAverage accuracy of cross-val data (incl. train data): {}'.format('%.4f' %(sum(score)/len(score))))
print('SD of accuracy: {}'.format('%.5f' % np.std(score)))
print('-' * 50)
print('\n Time taken: {} minutes.'.format('%.1f' % ((time.time() - start_time) / 60)))

[ 0.67484663  0.56441718  0.37037037  0.50931677  0.57142857  0.55900621]

Average accuracy of cross-val data (incl. train data): 0.5416
SD of accuracy: 0.09113
--------------------------------------------------

 Time taken: 85.6 minutes.


### Support Vector Classifier

In [118]:
from sklearn.svm import SVC
start_time = time.time()
svc = SVC()
fit_model = svc.fit(Xtrain, Ytrain)

final_table_pres = final_table[final_table['para_name'].isin(test_pres)]
y_pred = pd.DataFrame(svc.predict(Xtest), index=final_table_pres.index)

tab = pd.crosstab(y_pred[0], Ytest)
print(tab)
success_count = 0
for pres in test_pres:
#    affiliation.append(pres[:3])    
    pred_mat = y_pred[final_table_pres['para_name'] == pres]
    total_dem = sum(1 for value in pred_mat[0] if value == 'dem')
    frac_dem = total_dem / len(pred_mat)
    if frac_dem > 0.5 and pres[:3] == 'dem':
        success_count += 1
    elif frac_dem < 0.5 and pres[:3] == 'rep':
        success_count += 1

success_rate = success_count / len(test_pres)     
interval = time.time() - start_time

print('Success rate in predicting party of President from Inaugural Speech: {}%'.format('%.1f' % (success_rate * 100)))  
print('Time taken: {} minutes'.format('%.1f' % (interval / 60)))

which_party  dem  rep
0                    
rep           86  105
Success rate in predicting party of President from Inaugural Speech: 57.1%
Time taken: 0.5 minutes


#### Paragraph by paragraph:

In [141]:
start_time = time.time()
score = cross_val_score(svc, X, np.array(Y).ravel(), cv=6)
print(score)
print('\nAverage accuracy of cross-val data (incl. train data): {}'.format('%.4f' %(sum(score)/len(score))))
print('SD of accuracy: {}'.format('%.5f' % np.std(score)))
print('-' * 50)
print('\n Time taken: {} minutes.'.format('%.1f' % ((time.time() - start_time) / 60)))

[ 0.56441718  0.56441718  0.5617284   0.56521739  0.56521739  0.56521739]

Average accuracy of cross-val data (incl. train data): 0.5644
SD of accuracy: 0.00123
--------------------------------------------------

 Time taken: 5.8 minutes.


### K Nearest Neighbors

In [128]:
from sklearn.neighbors import KNeighborsClassifier

start_time = time.time()
knn = KNeighborsClassifier(n_neighbors=250, weights='distance')

#start_time = time.time()
#svc = SVC()
fit_model = knn.fit(Xtrain, Ytrain)

final_table_pres = final_table[final_table['para_name'].isin(test_pres)]
y_pred = pd.DataFrame(knn.predict(Xtest), index=final_table_pres.index)

tab = pd.crosstab(y_pred[0], Ytest)
print(tab)
success_count = 0
for pres in test_pres:
#    affiliation.append(pres[:3])    
    pred_mat = y_pred[final_table_pres['para_name'] == pres]
    total_dem = sum(1 for value in pred_mat[0] if value == 'dem')
    frac_dem = total_dem / len(pred_mat)
    if frac_dem > 0.5 and pres[:3] == 'dem':
        success_count += 1
    elif frac_dem < 0.5 and pres[:3] == 'rep':
        success_count += 1

success_rate = success_count / len(test_pres)     
interval = time.time() - start_time

print('Success rate in predicting party of President from Inaugural Speech: {}%'.format('%.1f' % (success_rate * 100)))  
print('Time taken: {} minutes'.format('%.1f' % (interval / 60)))

which_party  dem  rep
0                    
rep           86  105
Success rate in predicting party of President from Inaugural Speech: 57.1%
Time taken: 0.2 minutes


#### Paragraph by paragraph:

In [142]:
start_time = time.time()
score = cross_val_score(knn, X, np.array(Y).ravel(), cv=6)
print(score)
print('\nAverage accuracy of cross-val data (incl. train data): {}'.format('%.4f' %(sum(score)/len(score))))
print('SD of accuracy: {}'.format('%.5f' % np.std(score)))
print('-' * 50)
print('\n Time taken: {} minutes.'.format('%.1f' % ((time.time() - start_time) / 60)))

[ 0.56441718  0.56441718  0.32098765  0.56521739  0.56521739  0.56521739]

Average accuracy of cross-val data (incl. train data): 0.5242
SD of accuracy: 0.09090
--------------------------------------------------

 Time taken: 1.6 minutes.


### Naive Bayes Classifier

In [127]:
# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

fit_model = bnb.fit(Xtrain, Ytrain)

final_table_pres = final_table[final_table['para_name'].isin(test_pres)]
y_pred = pd.DataFrame(bnb.predict(Xtest), index=final_table_pres.index)

tab = pd.crosstab(y_pred[0], Ytest)
print(tab)
success_count = 0
for pres in test_pres:
#    affiliation.append(pres[:3])    
    pred_mat = y_pred[final_table_pres['para_name'] == pres]
    total_dem = sum(1 for value in pred_mat[0] if value == 'dem')
    frac_dem = total_dem / len(pred_mat)
    if frac_dem > 0.5 and pres[:3] == 'dem':
        success_count += 1
    elif frac_dem < 0.5 and pres[:3] == 'rep':
        success_count += 1

success_rate = success_count / len(test_pres)     
interval = time.time() - start_time

print('Success rate in predicting party of President from Inaugural Speech: {}%'.format('%.1f' % (success_rate * 100)))  
print('Time taken: {} minutes'.format('%.1f' % (interval / 60)))

which_party  dem  rep
0                    
dem           20   47
rep           66   58
Success rate in predicting party of President from Inaugural Speech: 42.9%
Time taken: 5.2 minutes


#### Paragraph by paragraph

In [143]:
start_time = time.time()
score = cross_val_score(bnb, X, np.array(Y).ravel(), cv=6)
print(score)
print('\nAverage accuracy of cross-val data (incl. train data): {}'.format('%.4f' %(sum(score)/len(score))))
print('SD of accuracy: {}'.format('%.5f' % np.std(score)))
print('-' * 50)
print('\n Time taken: {} minutes.'.format('%.1f' % ((time.time() - start_time) / 60)))

[ 0.65030675  0.68711656  0.40740741  0.34782609  0.55279503  0.48447205]

Average accuracy of cross-val data (incl. train data): 0.5217
SD of accuracy: 0.12217
--------------------------------------------------

 Time taken: 0.4 minutes.
