# Prediction by Individual Tweet

In [3]:
import pandas as pd
import json
import nltk
from nltk.collocations import *
from nltk.stem import PorterStemmer
from nltk.util import ngrams

In [None]:
Final_Congress_tweets = pd.read_csv('final congress.csv')

In [10]:
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
Final_Congress_tweets["tweet"] = Final_Congress_tweets['tweet'].apply(lambda x: nltk.regexp_tokenize(x,pattern))

In [None]:
with open('vocab_total_D_R.json', 'r') as f:
        vocab_total_D_R = json.load(f)

In [None]:
def stem_to_final_list(list_of_tokens):
    porter = PorterStemmer()
    line = [porter.stem(word) for word in list_of_tokens]
    line_stem = [word for word in line if word in vocab_total_D_R]
    return line_stem

In [15]:
for i in range(len(Final_Congress_tweets)):
    Final_Congress_tweets['tweet'][i] = stem_to_final_list(Final_Congress_tweets['tweet'][i]) 



In [175]:
Final_Congress_tweets

Unnamed: 0,tweet,username,Party
0,.@repjohnlewis was gracious enough to spend ti...,repterrisewell,D
1,"At 450 pages, the Mueller Report is lengthy an...",repterrisewell,D
2,"I am so grateful to have Samuel, our Selma int...",repterrisewell,D
3,Anyone who knows me knows my dad was an instru...,repterrisewell,D
4,Please help me welcome our newest summer inter...,repterrisewell,D
5,I always love meeting with talented students f...,repterrisewell,D
6,No American is above the law & for Congress to...,repterrisewell,D
7,I am thrilled to welcome our newest summer int...,repterrisewell,D
8,"Our DC summer interns — John, Diamond, Marjori...",repterrisewell,D
9,: Please join me this Friday in Pickens County...,repterrisewell,D


In [17]:
Final_Congress_tweets.to_csv('Congress_tweets.csv', encoding='utf-8', index=False)

## Predictions

In [4]:
df1  = pd.read_csv('Congress_tweets.csv')
df1.head()

Unnamed: 0,tweet,username,Party
0,"['repjohnlewi', 'wa', 'enough', 'spend', 'time...",repterrisewell,D
1,"['page', 'mueller', 'report', 'make', 'clear',...",repterrisewell,D
2,"['grate', 'samuel', 'selma', 'intern', 'team',...",repterrisewell,D
3,"['anyon', 'know', 'know', 'dad', 'wa', 'instru...",repterrisewell,D
4,"['pleas', 'help', 'welcom', 'newest', 'summer'...",repterrisewell,D


In [5]:
df1.set_index('username', inplace = True)
df1.head()

Unnamed: 0_level_0,tweet,Party
username,Unnamed: 1_level_1,Unnamed: 2_level_1
repterrisewell,"['repjohnlewi', 'wa', 'enough', 'spend', 'time...",D
repterrisewell,"['page', 'mueller', 'report', 'make', 'clear',...",D
repterrisewell,"['grate', 'samuel', 'selma', 'intern', 'team',...",D
repterrisewell,"['anyon', 'know', 'know', 'dad', 'wa', 'instru...",D
repterrisewell,"['pleas', 'help', 'welcom', 'newest', 'summer'...",D


In [6]:
data = df1['tweet']
target = df1['Party']

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import metrics
import numpy as np
import xgboost as xgb
import os
os.environ['KMP_DUPLICATE_LIB_OK']= 'True'

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data , target, test_size=0.20)

## TF-IDF Vectorizer

In [9]:
vectorizer = TfidfVectorizer()
tf_idf_data_train = vectorizer.fit_transform(X_train)
tf_idf_data_test = vectorizer.transform(X_test)

In [10]:
tf_idf_data_train.shape

(141024, 9554)

In [11]:
df_idf_visual = pd.DataFrame(tf_idf_data_train.todense(),columns = vectorizer.get_feature_names())

In [12]:
non_zero_cols = tf_idf_data_train.nnz / float(tf_idf_data_train.shape[0])
print("Average Number of Non-Zero Elements in Vectorized Articles: {}".format(non_zero_cols))

percent_sparse = 1 - (non_zero_cols / float(tf_idf_data_train.shape[1]))
print('Percentage of columns containing 0: {}'.format(percent_sparse))

Average Number of Non-Zero Elements in Vectorized Articles: 19.253850408441117
Percentage of columns containing 0: 0.9979847341000166


## Naive Bayes - TF-IDF

In [13]:
nb_classifier = MultinomialNB()
rf_classifier = RandomForestClassifier(n_estimators=150,max_depth=180, n_jobs=-1)
xgb_classifier = xgb.XGBClassifier()

In [14]:
nb_classifier.fit(tf_idf_data_train, y_train)
nb_train_preds = nb_classifier.predict(tf_idf_data_train)
nb_test_preds = nb_classifier.predict(tf_idf_data_test)

In [15]:
confusion = metrics.confusion_matrix(y_test, nb_test_preds, labels = ['D','R'])
confusion

array([[20669,  1778],
       [ 5047,  7762]])

## Random Forest - TF-IDF

In [16]:
rf_classifier.fit(tf_idf_data_train, y_train)
rf_train_preds = rf_classifier.predict(tf_idf_data_train)
rf_test_preds = rf_classifier.predict(tf_idf_data_test)

In [17]:
nb_train_score = accuracy_score(y_train, nb_train_preds)
nb_test_score = accuracy_score(y_test, nb_test_preds)
rf_train_score = accuracy_score(y_train, rf_train_preds)
rf_test_score = accuracy_score(y_test, rf_test_preds)

print("Multinomial Naive Bayes")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))
print("")
print('-'*70)
print("")
print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))



Multinomial Naive Bayes
Training Accuracy: 0.8196 		 Testing Accuracy: 0.8064

----------------------------------------------------------------------

Random Forest
Training Accuracy: 0.9646 		 Testing Accuracy: 0.7859


## XGboost TF-IDF

In [18]:
xgb_classifier.fit(tf_idf_data_train, y_train)
xgb_train_preds = xgb_classifier.predict(tf_idf_data_train)
xgb_test_preds = xgb_classifier.predict(tf_idf_data_test)
xgb_train_score = accuracy_score(y_train, xgb_train_preds)
xgb_test_score = accuracy_score(y_test, xgb_test_preds)
print('xgboost')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(xgb_train_score, xgb_test_score))

xgboost
Training Accuracy: 0.706 		 Testing Accuracy: 0.7022


In [19]:
confusion = metrics.confusion_matrix(y_test, xgb_test_preds, labels = ['D','R'])
confusion

array([[21780,   667],
       [ 9833,  2976]])

# Count Vectorizer with Bigrams and Trigrams

In [20]:
count_vectorizer = CountVectorizer(ngram_range=(1, 3),min_df  = 0.0001, max_df = 0.01)
count_data_train = count_vectorizer.fit_transform(X_train)
count_data_test = count_vectorizer.transform(X_test)

In [21]:
count_data_train.shape

(141024, 38243)

## Naive Bayes - Count Vectorizer

In [22]:
nb_classifier.fit(count_data_train, y_train)
nb_train_preds = nb_classifier.predict(count_data_train)
nb_test_preds = nb_classifier.predict(count_data_test)

In [23]:
confusion = metrics.confusion_matrix(y_test, nb_test_preds, labels = ['D','R'])
confusion

array([[18344,  4103],
       [ 2474, 10335]])

In [24]:
nb_train_score = accuracy_score(y_train, nb_train_preds)
nb_test_score = accuracy_score(y_test, nb_test_preds)
print("Multinomial Naive Bayes")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))
print("")


Multinomial Naive Bayes
Training Accuracy: 0.8352 		 Testing Accuracy: 0.8135



In [25]:
rep_class_prob_sorted = nb_classifier.feature_log_prob_[0, :].argsort()
dem_class_prob_sorted = nb_classifier.feature_log_prob_[1, :].argsort()

print(np.take(count_vectorizer.get_feature_names(), rep_class_prob_sorted[:35]))
print(np.take(count_vectorizer.get_feature_names(), dem_class_prob_sorted[:35]))


['westmichigan' 'blunt senat news' 'presid ronald' 'senat iqextranet view'
 'senat iqextranet' 'regulatori reform' 'leftist' 'illeg alien'
 'forc vote born' 'longest serv republican' 'senat health committe'
 'press releas amata' 'senat health' 'in north dakota'
 'press releas blunt' 'press releas byrn' 'central arkansa' 'talofa'
 'countri illeg' 'in northeast wisconsin' 'press releas cassidi'
 'everi human life' 'senalexand statu' 'central will' 'regulatori relief'
 'sen mike' 'senat lankford' 'herrerabeutl hous news' 'countymeet'
 'straight month' 'here soundcloud user' 'congressman ben cline'
 'via dailycal' 'senat news dr' 'evansvil']
['wast border' 'promis mexico' 'machinistsunion' 'seawal'
 'promis mexico would' 'fight help prevent' 'due trumpshutdown'
 'fight health care' 'fight health' 'be check' 'fight gun'
 'fight gender equal' 'fight gender' 'in sustain' 'fight full equal'
 'time re open' 'fight free open' 'fight forthepeopl' 'durbin senat'
 'fight equalpay' 'fight equal righ

## Random Forest - Count Vectorizer

In [26]:
rf_classifier.fit(count_data_train, y_train)
rf_train_preds = rf_classifier.predict(count_data_train)
rf_test_preds = rf_classifier.predict(count_data_test)

In [27]:
rf_train_score = accuracy_score(y_train, rf_train_preds)
rf_test_score = accuracy_score(y_test, rf_test_preds)

print('-'*70)
print("")
print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))

----------------------------------------------------------------------

Random Forest
Training Accuracy: 0.8232 		 Testing Accuracy: 0.7621


In [28]:
df_count = pd.DataFrame(count_data_train.todense(),columns = count_vectorizer.get_feature_names())
feature_importance = pd.DataFrame(rf_classifier.feature_importances_, index=df_count.columns, columns=['importance']).sort_values('importance',ascending=False)
feature_importance.head(25)

Unnamed: 0,importance
foxnew,0.007137
secur border,0.006894
arkansa,0.00556
hoosier,0.005499
alaska,0.005441
indiana,0.005084
endtheshutdown,0.004504
ohio,0.004498
lgbtq,0.004316
via,0.00425


## Xgboost - Count Vectorizer

In [29]:
xgb_classifier.fit(count_data_train, y_train)
xgb_train_preds = xgb_classifier.predict(count_data_train)
xgb_test_preds = xgb_classifier.predict(count_data_test)

In [30]:
xgb_train_score = accuracy_score(y_train, xgb_train_preds)
xgb_test_score = accuracy_score(y_test, xgb_test_preds)
print('xgboost')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(xgb_train_score, xgb_test_score))

xgboost
Training Accuracy: 0.694 		 Testing Accuracy: 0.6906


In [31]:
confusion = metrics.confusion_matrix(y_test, xgb_test_preds, labels = ['D','R'])
confusion

array([[22126,   321],
       [10586,  2223]])

In [32]:
df_count = pd.DataFrame(count_data_train.todense(),columns = count_vectorizer.get_feature_names())
feature_importance = pd.DataFrame(xgb_classifier.feature_importances_, index=df_count.columns, columns=['importance']).sort_values('importance',ascending=False)
feature_importance.head(25)

Unnamed: 0,importance
foxnew,0.017794
igshid,0.016014
secureourbord,0.016014
senat public,0.016014
endtheshutdown,0.014235
lgbtq,0.014235
protectourcar,0.014235
secur border,0.012456
illeg,0.012456
alaska,0.010676


# Count vector with only words (no bigrams or trigrams)

In [33]:
count_vectorizer = CountVectorizer(ngram_range=(1, 1))
count_data_train = count_vectorizer.fit_transform(X_train)
count_data_test = count_vectorizer.transform(X_test)

In [34]:
count_data_train.shape

(141024, 9554)

## Naive Bayes - Count Vectorizer (no bigrams or trigrams)

In [35]:
nb_classifier.fit(count_data_train, y_train)
nb_train_preds = nb_classifier.predict(count_data_train)
nb_test_preds = nb_classifier.predict(count_data_test)

In [36]:
confusion = metrics.confusion_matrix(y_test, nb_test_preds, labels = ['D','R'])
confusion

array([[18537,  3910],
       [ 2956,  9853]])

In [37]:
nb_train_score = accuracy_score(y_train, nb_train_preds)
nb_test_score = accuracy_score(y_test, nb_test_preds)
print("Multinomial Naive Bayes")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))
print("")

Multinomial Naive Bayes
Training Accuracy: 0.813 		 Testing Accuracy: 0.8053



In [38]:
rep_class_prob_sorted = nb_classifier.feature_log_prob_[0, :].argsort()
dem_class_prob_sorted = nb_classifier.feature_log_prob_[1, :].argsort()

print(np.take(count_vectorizer.get_feature_names(), rep_class_prob_sorted[:40]))
print(np.take(count_vectorizer.get_feature_names(), dem_class_prob_sorted[:40]))

['notforthepeopleact' 'heraldtribun' 'hice' 'hoeven' 'holdtothecap'
 'homelandgop' 'rephartzl' 'honda' 'hoosierhighlight' 'hoosierhuddl'
 'austinscott' 'housebudgetgop' 'repmikejohnson' 'housesciencegop' 'hrg'
 'hughhewitt' 'hydesmith' 'idahoan' 'iditarod' 'westerncaucu'
 'repannwagner' 'westmichigan' 'ijr' 'ik' 'asignaci' 'weallbleedblu'
 'haut' 'washex' 'backtheblu' 'gaport' 'geneviev' 'gleason' 'riponadv'
 'gojackswbb' 'riggleman' 'gopheat' 'gophous' 'waco' 'goptaxcut' 'wafb']
['theblackcaucu' 'housebluedog' 'nevadacurr' 'housedemwomen' 'conform'
 'housegvp' 'housejudiciary' 'housenewdem' 'ayanna' 'ayannapressley'
 'virginiabeach' 'nevadaproud' 'liberian' 'concordmonitor' 'sunrisemvmt'
 'lgbtqia' 'backgroundcheck' 'vial' 'backpaynow' 'lgbteqcaucu' 'swindl'
 'newdemcoalit' 'commondream' 'houlahan' 'stub' 'sterigen' 'stevekerr'
 'hofel' 'ppact' 'homag' 'homebuy' 'homecar' 'stopandshop' 'stopbernhardt'
 'networklobbi' 'stopendlesswar' 'stopextinct' 'stoptheban' 'stopwheel'
 'liftameric

## Random Forest - Count Vectorizer (no bigrams or trigrams)

In [39]:
rf_classifier.fit(count_data_train, y_train)
rf_train_preds = rf_classifier.predict(count_data_train)
rf_test_preds = rf_classifier.predict(count_data_test)

In [40]:
rf_train_score = accuracy_score(y_train, rf_train_preds)
rf_test_score = accuracy_score(y_test, rf_test_preds)

print('-'*70)
print("")
print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))

----------------------------------------------------------------------

Random Forest
Training Accuracy: 0.9612 		 Testing Accuracy: 0.7911


In [41]:
confusion = metrics.confusion_matrix(y_test, rf_test_preds, labels = ['D','R'])
confusion

array([[21220,  1227],
       [ 6138,  6671]])

In [42]:
df_count = pd.DataFrame(count_data_train.todense(),columns = count_vectorizer.get_feature_names())
feature_importance = pd.DataFrame(rf_classifier.feature_importances_, index=df_count.columns, columns=['importance']).sort_values('importance',ascending=False)
feature_importance.head(25)

Unnamed: 0,importance
trump,0.010707
democrat,0.006389
bit,0.0062
great,0.005874
border,0.005107
potu,0.004922
forthepeopl,0.00481
trumpshutdown,0.004788
in,0.004404
climat,0.004381


## Xgboost - Count Vectorizer (no bigrams or trigrams)

In [43]:
xgb_classifier.fit(count_data_train, y_train)
xgb_train_preds = xgb_classifier.predict(count_data_train)
xgb_test_preds = xgb_classifier.predict(count_data_test)

In [44]:
xgb_train_score = accuracy_score(y_train, xgb_train_preds)
xgb_test_score = accuracy_score(y_test, xgb_test_preds)
print('xgboost')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(xgb_train_score, xgb_test_score))

xgboost
Training Accuracy: 0.7037 		 Testing Accuracy: 0.7006


In [45]:
confusion = metrics.confusion_matrix(y_test, xgb_test_preds, labels = ['D','R'])
confusion

array([[21795,   652],
       [ 9904,  2905]])

In [46]:
df_count = pd.DataFrame(count_data_train.todense(),columns = count_vectorizer.get_feature_names())
feature_importance = pd.DataFrame(xgb_classifier.feature_importances_, index=df_count.columns, columns=['importance']).sort_values('importance',ascending=False)
feature_importance.head(25)

Unnamed: 0,importance
trump,0.030744
democrat,0.02589
forthepeopl,0.024272
trumpshutdown,0.022654
border,0.021036
foxnew,0.017799
climat,0.017799
potu,0.016181
presid,0.014563
politician,0.012945
