# Prediction with Total Individual Tweets

In [1]:
import pandas as pd
import json
import nltk
from nltk.collocations import *
from nltk.stem import PorterStemmer
from nltk.util import ngrams

### Democrats

In [None]:
Democrats  = pd.read_csv('Democrats_final.csv')
Democrats.drop(['Unnamed: 0'], axis=1, inplace  = True)
Democrats =  Democrats.groupby('username')['tweet'].apply(' '.join).reset_index()

In [None]:
pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
Democrats["tweet"] =Democrats["tweet"].apply(lambda x: nltk.regexp_tokenize(x,pattern))

In [None]:
with open('vocab_total_D_R.json', 'r') as f:
        vocab_total_D_R = json.load(f)

In [None]:
def stem_to_final_list(list_of_tokens):
    porter = PorterStemmer()
    line = [porter.stem(word) for word in list_of_tokens]
    line_stem = [word for word in line if word in vocab_total_D_R]
    return line_stem

In [None]:
for i in range(len(Democrats)):
    Democrats['tweet'][i] = stem_to_final_list(Democrats['tweet'][i])

In [None]:
Democrats['Target'] = 'D'

In [None]:
Democrats.head()

In [None]:
Democrats.to_csv('joint_democrats.csv')

#### Republican

In [None]:
Republican  = pd.read_csv('Final_Rep.csv')
Republican.drop(['Unnamed: 0'], axis=1, inplace  = True)
Republican =  Republican.groupby('username')['tweet'].apply(' '.join).reset_index()

In [None]:
Republican["tweet"] =Republican["tweet"].apply(lambda x: nltk.regexp_tokenize(x,pattern))

In [None]:
for i in range(len(Republican)):
    Republican['tweet'][i] = stem_to_final_list(Republican['tweet'][i])

In [None]:
Republican['Target'] = 'R'

In [None]:
Republican.to_csv('joint_republican.csv')

## Creating Data Frame by User

In [5]:
Republican  = pd.read_csv('joint_republican.csv')
Republican.drop(['Unnamed: 0'], axis=1, inplace  = True)
Democrats  = pd.read_csv('joint_democrats.csv')
Democrats.drop(['Unnamed: 0'], axis=1, inplace  = True)

In [6]:
Final_RD_Tweet_User = pd.concat([Democrats, Republican], axis=0, ignore_index=True)

In [8]:
Final_RD_Tweet_User.to_csv('Final_RD_Tweet_User.csv')

In [2]:
Final_RD_Tweet_User = pd.read_csv('Final_RD_Tweet_User.csv')

In [5]:
Final_RD_Tweet_User.head(3)
Final_RD_Tweet_User.drop('Unnamed: 0',axis = 1, inplace = True)

In [6]:
Final_RD_Tweet_User.set_index('username', inplace = True)

## Predictions

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn import metrics
import numpy as np
import xgboost as xgb
import os
os.environ['KMP_DUPLICATE_LIB_OK']= 'True'

In [8]:
data = Final_RD_Tweet_User['tweet']
target = Final_RD_Tweet_User['Target']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(data , target, test_size=0.20)

In [54]:
def print_wrong_prediction(actual, prediction):
    df5 = pd.DataFrame(actual)
    df6 = pd.DataFrame(prediction, columns = ['PRED'])
    df5['PRED'] = df6['PRED'].values
    df5.reset_index(inplace = True)
    for i in range(len(df5)):
        if df5.Target[i] != df5.PRED[i]:
            print (df5.username[i],df5.Target[i])

## TF-IDF Vectorizer

In [10]:
vectorizer = TfidfVectorizer()
tf_idf_data_train = vectorizer.fit_transform(X_train)
tf_idf_data_test = vectorizer.transform(X_test)

In [11]:
tf_idf_data_train.shape

(406, 9517)

In [12]:
nb_classifier = MultinomialNB()
rf_classifier = RandomForestClassifier( n_jobs=-1)
xgb_classifier = xgb.XGBClassifier()

## Naive Bayes - TF-IDF Vectorizer

In [60]:
nb_classifier.fit(tf_idf_data_train, y_train)
nb_train_preds = nb_classifier.predict(tf_idf_data_train)
nb_test_preds = nb_classifier.predict(tf_idf_data_test)

In [61]:
nb_train_score = accuracy_score(y_train, nb_train_preds)
nb_test_score = accuracy_score(y_test, nb_test_preds)
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))
print("")
print('-'*70)

Training Accuracy: 0.9187 		 Testing Accuracy: 0.7745

----------------------------------------------------------------------


In [62]:
confusion = metrics.confusion_matrix(y_test, nb_test_preds, labels = ['D','R'])
confusion

array([[54,  0],
       [23, 25]])

In [63]:
print ('TRAINING_wrong:')
print_wrong_prediction(y_train, nb_train_preds)
print ('\n')
print ('TESTING_Wrong:')
print_wrong_prediction(y_test, nb_test_preds)

TRAINING_wrong:
repdancrenshaw R
michaelcburgess R
repdougcollins R
repthomasmassie R
repjohnkatko R
repbrianmast R
senatorcollins R
senatorburr R
reptomreed R
repbrianfitz R
susanwbrooks R
sencorygardner R
hurdonthehill R
repgusbilirakis R
sencapito R
repsmucker R
reprutherfordfl R
senronjohnson R
kencalvert R
rodneydavis R
mariodb R
repmarkmeadows R
robwittman R
repkaygranger R
reppaulcook R
randpaul R
sentoomey R
repgregwalden R
mcconnellpress R
senmikelee R
herrerabeutler R
repdonbacon R
repkinzinger R


TESTING_Wrong:
senjoniernst R
senrickscott R
repdonyoung R
mactxpress R
repkevinbrady R
repdavejoyce R
repgallagher R
repmckinley R
repmoolenaar R
senatorlankford R
repstevestivers R
repagonzalez R
senatortimscott R
repjenniffer R
lisamurkowski R
sendansullivan R
drnealdunnfl2 R
reppeteking R
replamalfa R
markamodeinv2 R
senrobportman R
repleezeldin R
repbost R


## Random Forest - TF-IDF Vectorizer

In [67]:
rf_classifier.fit(tf_idf_data_train, y_train)
rf_train_preds = rf_classifier.predict(tf_idf_data_train)
rf_test_preds = rf_classifier.predict(tf_idf_data_test)

In [68]:
rf_train_score = accuracy_score(y_train, rf_train_preds)
rf_test_score = accuracy_score(y_test, rf_test_preds)
print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))

Random Forest
Training Accuracy: 1.0 		 Testing Accuracy: 0.9216


In [69]:
confusion = metrics.confusion_matrix(y_test, rf_test_preds, labels = ['D','R'])
confusion

array([[50,  4],
       [ 4, 44]])

In [70]:
print ('TRAINING_wrong:')
print_wrong_prediction(y_train, rf_train_preds)
print ('\n')
print ('TESTING_Wrong:')
print_wrong_prediction(y_test, rf_test_preds)

TRAINING_wrong:


TESTING_Wrong:
senrickscott R
reprichmond D
johncornyn R
repaoc D
reppeteking R
senmurphyoffice D
sanfordbishop D
repleezeldin R


In [72]:
df_idf = pd.DataFrame(tf_idf_data_train.todense(),columns = vectorizer.get_feature_names())
feature_importance = pd.DataFrame(rf_classifier.feature_importances_, index=df_idf.columns, columns=['importance']).sort_values('importance',ascending=False)
feature_importance.head(25)

Unnamed: 0,importance
equalityact,0.06485
trumpshutdown,0.050488
equalpayday,0.037397
coverag,0.035883
action,0.034674
ban,0.03099
fight,0.025213
end,0.021686
worker,0.021574
usmca,0.021125


## Xgboost - TF-IDF Vectorizer

In [56]:
xgb_classifier.fit(tf_idf_data_train, y_train)
xgb_train_preds = xgb_classifier.predict(tf_idf_data_train)
xgb_test_preds = xgb_classifier.predict(tf_idf_data_test)

In [57]:
xgb_train_score = accuracy_score(y_train, xgb_train_preds)
xgb_test_score = accuracy_score(y_test, xgb_test_preds)
print('xgboost')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(xgb_train_score, xgb_test_score))

xgboost
Training Accuracy: 1.0 		 Testing Accuracy: 0.951


In [58]:
confusion = metrics.confusion_matrix(y_test, xgb_test_preds, labels = ['D','R'])
confusion

array([[51,  3],
       [ 2, 46]])

In [59]:
print ('TRAINING_wrong:')
print_wrong_prediction(y_train, xgb_train_preds)
print ('\n')
print ('TESTING_Wrong:')
print_wrong_prediction(y_test, xgb_test_preds)

TRAINING_wrong:


TESTING_Wrong:
replouiegohmert R
repjenniffer R
sen_joemanchin D
repaoc D
senmurphyoffice D


In [73]:
df_idf = pd.DataFrame(tf_idf_data_train.todense(),columns = vectorizer.get_feature_names())
feature_importance = pd.DataFrame(xgb_classifier.feature_importances_, index=df_idf.columns, columns=['importance']).sort_values('importance',ascending=False)
feature_importance.head(25)

Unnamed: 0,importance
forthepeopl,0.046392
netneutr,0.036082
aliv,0.036082
equalityact,0.033505
discrimin,0.030928
statu,0.028351
administr,0.025773
climat,0.023196
lgbtq,0.023196
afford,0.023196


# Count Vectorizer

In [23]:
count_vectorizer = CountVectorizer()
count_data_train = count_vectorizer.fit_transform(X_train)
count_data_test = count_vectorizer.transform(X_test)

In [24]:
count_data_train.shape

(406, 9517)

## Naive Bayes - Count Vectorizer

In [25]:
nb_classifier.fit(count_data_train, y_train)
nb_train_preds = nb_classifier.predict(count_data_train)
nb_test_preds = nb_classifier.predict(count_data_test)

In [26]:
nb_train_score = accuracy_score(y_train, nb_train_preds)
nb_test_score = accuracy_score(y_test, nb_test_preds)
print ("NB")
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(nb_train_score, nb_test_score))
print("")
print('-'*70)

NB
Training Accuracy: 0.9901 		 Testing Accuracy: 0.9902

----------------------------------------------------------------------


In [27]:
confusion = metrics.confusion_matrix(y_test, nb_test_preds, labels = ['D','R'])
confusion

array([[53,  1],
       [ 0, 48]])

In [47]:
print ('TRAINING_wrong:')
print_wrong_prediction(y_train, nb_train_preds)
print ('\n')
print ('TESTING_Wrong:')
print_wrong_prediction(y_test, nb_test_preds)

TRAINING_wrong:
senatorcollins R
repcuellar D
repvisclosky D
repconorlamb D


TESTING_Wrong:
sen_joemanchin D


In [64]:
rep_class_prob_sorted = nb_classifier.feature_log_prob_[0, :].argsort()
dem_class_prob_sorted = nb_classifier.feature_log_prob_[1, :].argsort()

print(np.take(count_vectorizer.get_feature_names(), rep_class_prob_sorted[:30]))
print(np.take(count_vectorizer.get_feature_names(), dem_class_prob_sorted[:30]))

['spillway' 'congressmanhic' 'walberg' 'confuciu' 'americafirst'
 'wallswork' 'qi' 'thirddistrictthursday' 'kilmead' 'wafb' 'racin'
 'kentuckynewera' 'razorbackbsb' 'kenoshanew' 'kenosha' 'amata' 'kennedyn'
 'washex' 'thefederalist' 'radewagen' 'kamalmaz' 'waco' 'tikz' 'lakeeri'
 'endinfanticid' 'lakeconew' 'countymeet' 'tipton' 'kymx' 'plano']
['endcorruptionnow' 'lgbteqcaucu' 'seiu' 'lgbtqia' 'liberian'
 'liftamerica' 'seawal' 'seasact' 'bigoil' 'sear' 'lightfoot'
 'seacoastonlin' 'seacoast' 'scrutin' 'lipinski' 'binghamton' 'biodivers'
 'loudli' 'loudoun' 'cozi' 'louis' 'scanlon' 'scan' 'louisa' 'savetp'
 'savetitlex' 'bhm' 'savetheinternet' 'senateapprop' 'bexar']


## Random Forest - Count Vectorizer

In [28]:
rf_classifier.fit(count_data_train, y_train)
rf_train_preds = rf_classifier.predict(count_data_train)
rf_test_preds = rf_classifier.predict(count_data_test)

In [29]:
rf_train_score = accuracy_score(y_train, rf_train_preds)
rf_test_score = accuracy_score(y_test, rf_test_preds)
print('Random Forest')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(rf_train_score, rf_test_score))

Random Forest
Training Accuracy: 1.0 		 Testing Accuracy: 0.902


In [30]:
confusion = metrics.confusion_matrix(y_test, rf_test_preds, labels = ['D','R'])
confusion

array([[49,  5],
       [ 5, 43]])

In [48]:
print ('TRAINING_wrong:')
print_wrong_prediction(y_train, rf_train_preds)
print ('\n')
print ('TESTING_Wrong:')
print_wrong_prediction(y_test, rf_test_preds)

TRAINING_wrong:


TESTING_Wrong:
senrickscott R
reprichardneal D
repkevinbrady R
senatortimscott R
lisamurkowski R
repaoc D
senmurphyoffice D
repgonzalez D
sanfordbishop D
johnboozman R


In [65]:
df_count = pd.DataFrame(count_data_train.todense(),columns = count_vectorizer.get_feature_names())
feature_importance = pd.DataFrame(rf_classifier.feature_importances_, index=df_count.columns, columns=['importance']).sort_values('importance',ascending=False)
feature_importance.head(30)

Unnamed: 0,importance
protectourcar,0.066359
trumpshutdown,0.041246
aliv,0.040916
paycheckfair,0.036446
woman,0.035387
discrimin,0.034234
pre,0.03242
color,0.030295
public,0.029829
shoot,0.027157


## XGboost - Count Vectorizer

In [31]:
xgb_classifier.fit(count_data_train, y_train)
xgb_train_preds = xgb_classifier.predict(count_data_train)
xgb_test_preds = xgb_classifier.predict(count_data_test)

In [32]:
xgb_train_score = accuracy_score(y_train, xgb_train_preds)
xgb_test_score = accuracy_score(y_test, xgb_test_preds)
print('xgboost')
print("Training Accuracy: {:.4} \t\t Testing Accuracy: {:.4}".format(xgb_train_score, xgb_test_score))

xgboost
Training Accuracy: 1.0 		 Testing Accuracy: 0.9412


In [33]:
confusion = metrics.confusion_matrix(y_test, xgb_test_preds, labels = ['D','R'])
confusion

array([[50,  4],
       [ 2, 46]])

In [49]:
print ('TRAINING_wrong:')
print_wrong_prediction(y_train, xgb_train_preds)
print ('\n')
print ('TESTING_Wrong:')
print_wrong_prediction(y_test, xgb_test_preds)

TRAINING_wrong:


TESTING_Wrong:
reprichmond D
replouiegohmert R
repjenniffer R
repaoc D
senmurphyoffice D
sanfordbishop D


In [66]:
df_count = pd.DataFrame(count_data_train.todense(),columns = count_vectorizer.get_feature_names())
feature_importance = pd.DataFrame(xgb_classifier.feature_importances_, index=df_count.columns, columns=['importance']).sort_values('importance',ascending=False)
feature_importance.head(25)

Unnamed: 0,importance
forthepeopl,0.046392
netneutr,0.036082
aliv,0.036082
equalityact,0.033505
discrimin,0.030928
statu,0.028351
administr,0.025773
climat,0.023196
lgbtq,0.023196
afford,0.023196
