In [1]:
from __future__ import print_function

import pandas as pd 
import numpy as np 
import sklearn

# NLTK/NLP
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
from nltk import FreqDist, word_tokenize
import string, re
import urllib
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from gensim.models import word2vec
from nltk.collocations import *
import gensim
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Classifiers 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

# Metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve
from sklearn.model_selection import train_test_split


#Sampling
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

import sklearn.decomposition as decomposition

#Visualization
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

import warnings 
warnings.filterwarnings("ignore")

import csv

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot

from sklearn.metrics import average_precision_score, auc, roc_curve, precision_recall_curve



Using TensorFlow backend.


In [2]:
# import customized functions
# import import_ipynb
# from custom_functions import *

%run custom_functions.ipynb

In [3]:
df = pd.read_csv('data/cleaned-reshuffled.csv')
df.drop(['Unnamed: 0'], axis = 1, inplace = True)
df.lem_tweet= df.lem_tweet.apply(str)
df.stem_tweet= df.stem_tweet.apply(str)
df.tokenized_tweet.apply(eval)
df.stemmed_tokens.apply(eval)
df.lemmatized_tokens.apply(eval)

0        [sad, see, the, scene, hooligan, pre, engrus, ...
1        [gooddyeyoung, yoyoyo, super, happy, apa, the,...
2        [queen, evil, bihday, lnic, lnicjustanevilbday...
3        [you, might, libtard, libtard, sjw, liberal, p...
4          [what, are, your, goal, find, out, here, smile]
5                                   [retweets, nuascannan]
6                               [classic, trump, follower]
7        [the, mixture, emotion, here, one, from, the, ...
8        [the, meps, bear, that, travelled, sandy, hook...
9        [just, because, affected, her, son, son, abuse...
10       [thomas, always, say, live, dream, world, this...
11       [when, the, local, library, try, charge, you, ...
12       [cia, nc, sockpuppetarmy, hbgary, troll, snowd...
13       [year, anniversary, michael, jackson, vindicat...
14       [over, everyone, that, say, cringe, cuz, cool,...
15       [good, morning, life, blackhair, travel, home,...
16       [yay, let, talk, food, the, every, day, the, m.

## Train / Test Split for TF-IDF

In [4]:
X = df.drop(['label'], axis = 1)

In [5]:
y = df['label']

In [6]:
#splitting into train and test 
X_model, X_test, y_model, y_test = train_test_split(X, y, stratify = y,  test_size=0.20, random_state=123)

#splitting "model" into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_model, y_model, test_size=0.20, random_state=123)

# df_train_full = X_train.copy()
# df_train_full['label']= y_train
# train_full_df.to_csv('train_full_df.csv')

In [7]:
y.value_counts(normalize=True)

0    0.929854
1    0.070146
Name: label, dtype: float64

### Upsampling and Downsampling Training Data

In [8]:
upsample_training_data(X_train, y_train)

Unnamed: 0,id,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet,stemmed_tokens,lemmatized_tokens,lem_tweet,stem_tweet,label
565,20676,@user f*** this ð¦ðº government that deli...,this government that deliberately toures #refu...,this government that deliberately toures refug...,"['this', 'government', 'that', 'deliberately',...","['this', 'govern', 'that', 'deliber', 'tour', ...","['this', 'government', 'that', 'deliberately',...",this government that deliberately toures refug...,this government that deliberately toures refug...,1
21531,24025,despite a demoralizing 2016: may ur #newyear20...,despite demoralizing may #newyear #classism fr...,despite demoralizing may newyear classism free...,"['despite', 'demoralizing', 'may', 'newyear', ...","['despit', 'demor', 'may', 'newyear', 'classis...","['despite', 'demoralizing', 'may', 'newyear', ...",despite demoralizing may newyear classism free...,despite demoralizing may newyear classism free...,1
13300,18145,"@user #koreans &amp; joseon people in japan, w...",#koreans amp joseon people japan will abuse th...,koreans amp joseon people japan will abuse the...,"['koreans', 'amp', 'joseon', 'people', 'japan'...","['korean', 'amp', 'joseon', 'peopl', 'japan', ...","['korean', 'amp', 'joseon', 'people', 'japan',...",koreans amp joseon people japan will abuse the...,koreans amp joseon people japan will abuse the...,1
18925,8506,@user @user @user @user classic ! yet you jewi...,classic yet you jewish bastards wonder why you...,classic yet you jewish bastards wonder why you...,"['classic', 'yet', 'you', 'jewish', 'bastards'...","['classic', 'yet', 'you', 'jewish', 'bastard',...","['classic', 'yet', 'you', 'jewish', 'bastard',...",classic yet you jewish bastards wonder why you...,classic yet you jewish bastards wonder why you...,1
12619,15464,@user did someone say #antisemetic ? gee (((@u...,did someone say #antisemetic gee you bit trigg...,did someone say antisemetic gee you bit triggered,"['did', 'someone', 'say', 'antisemetic', 'gee'...","['did', 'someon', 'say', 'antisemet', 'gee', '...","['did', 'someone', 'say', 'antisemetic', 'gee'...",did someone say antisemetic gee you bit triggered,did someone say antisemetic gee you bit trigg,1
26964,28937,couldn't have said this any better nor truthfu...,couldn have said this any better nor truthfull...,couldn have said this any better nor truthfull...,"['couldn', 'have', 'said', 'this', 'any', 'bet...","['couldn', 'have', 'said', 'this', 'ani', 'bet...","['couldn', 'have', 'said', 'this', 'any', 'bet...",couldn have said this any better nor truthfull...,couldn have said this any better nor truthfull...,1
17273,25291,@user racism stuffed into skinny jeans with a ...,racism stuffed into skinny jeans with hipster ...,racism stuffed into skinny jeans with hipster ...,"['racism', 'stuffed', 'into', 'skinny', 'jeans...","['racism', 'stuf', 'into', 'skinni', 'jean', '...","['racism', 'stuffed', 'into', 'skinny', 'jean'...",racism stuffed into skinny jeans with hipster ...,racism stuffed into skinny jeans with hipster ...,1
1561,12717,the end of #me #selfie # #love #messi #cr7 #...,the end #me #selfie #love #messi #cr #religion...,the end me selfie love messi cr religion chris...,"['the', 'end', 'me', 'selfie', 'love', 'messi'...","['the', 'end', 'me', 'selfi', 'love', 'messi',...","['the', 'end', 'me', 'selfie', 'love', 'messi'...",the end me selfie love messi cr religion chris...,the end me selfie love messi cr religion chris...,1
17875,11612,trump ally wishes mad cow disease death for ob...,trump ally wishes mad cow disease death for ob...,trump ally wishes mad cow disease death for ob...,"['trump', 'ally', 'wishes', 'mad', 'cow', 'dis...","['trump', 'alli', 'wish', 'mad', 'cow', 'disea...","['trump', 'ally', 'wish', 'mad', 'cow', 'disea...",trump ally wishes mad cow disease death for ob...,trump ally wishes mad cow disease death for ob...,1
17184,20554,opinion: is rife in the #lgbt community. #gay...,opinion rife the #lgbt community #gay people c...,opinion rife the lgbt community gay people can...,"['opinion', 'rife', 'the', 'lgbt', 'community'...","['opinion', 'rife', 'the', 'lgbt', 'communiti'...","['opinion', 'rife', 'the', 'lgbt', 'community'...",opinion rife the lgbt community gay people can...,opinion rife the lgbt community gay people can...,1


In [9]:
train_upsampled = upsample_training_data(X_train, y_train)

X_train_up = train_upsampled.drop(['label'], axis = 1)
y_train_up = pd.DataFrame(train_upsampled.label)

In [10]:
train_upsampled['label'].value_counts()

1    18991
0    18991
Name: label, dtype: int64

In [11]:
train_downsampled = downsample_training_data(X_train, y_train)

X_train_down = train_downsampled.drop(['label'], axis = 1)
y_train_down = pd.DataFrame(train_downsampled.label)

In [12]:
train_downsampled['label'].value_counts()

1    1464
0    1464
Name: label, dtype: int64

## Comparing Vectorization and Method Performance

In [13]:
count_vect = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer(min_df=.001)
tfidf_ngram = TfidfVectorizer(ngram_range=(1,2), min_df=.001)
tfidf_ngram2 = TfidfVectorizer(ngram_range=(2,3),min_df=.001)

logreg = LogisticRegression()
rfc = RandomForestClassifier(random_state=10)
nb = GaussianNB()
svc = SVC(random_state=10)

vectorization_list = [('COUNT_VECTORIZER', count_vect),
                      ('TFIDF_VECTORIZER', tfidf_vectorizer),
                      ('TFIDF_NGRAM_1_2', tfidf_ngram),
                      ('TFIDF_NGRAM_2_3', tfidf_ngram2)]



In [14]:
%run custom_functions.ipynb

## Naive Bayes

In [15]:
np.random.seed(0)

In [16]:
NB_compare_vectorization_model(X_train.lem_tweet, y_train, 
                                   X_val.lem_tweet, y_val, GaussianNB())

KeyboardInterrupt: 

## Logistic Regression

In [17]:
test = wrapper_single_vectorization(X_train.lem_tweet, y_train, X_val.lem_tweet, y_val, 
                                    logreg, tfidf_vectorizer, apply_smote = False)
test

Train Accuracy: 0.95
Train Precision: 0.88
Train Recall: 0.33
Train F1: 0.48


Validation Accuracy: 0.95
Validation Precision: 0.83
Validation Recall: 0.3
Validation F1: 0.44


In [18]:
test = wrapper_single_vectorization(X_train.lem_tweet, y_train, X_val.lem_tweet, y_val, 
                                    logreg, tfidf_vectorizer, apply_smote = True)
test

Train Accuracy: 0.89
Train Precision: 0.38
Train Recall: 0.85
Train F1: 0.53


Validation Accuracy: 0.88
Validation Precision: 0.31
Validation Recall: 0.73
Validation F1: 0.43


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4243,541,4784
1,89,241,330
All,4332,782,5114


In [19]:
%run custom_functions.ipynb

In [20]:
LR_cw_lemm = wrapper_compare_vectorizations(X_train.lem_tweet, 
                            y_train, X_val.lem_tweet, y_val, 
                            LogisticRegression(class_weight='balanced', solver = 'lbfgs'), 
                                            vectorization_list, apply_smote = False)
LR_cw_lemm
# pd.DataFrame(LR_cw_lemm)

({'COUNT_VECTORIZER': {'Train Accuracy': 0.99,
   'Train Precision': 0.86,
   'Train Recall': 1.0,
   'Train F1': 0.92,
   'Validation Accuracy': 0.95,
   'Validation Precision': 0.6,
   'Validation Recall': 0.69,
   'Validation F1': 0.64},
  'TFIDF_VECTORIZER': {'Train Accuracy': 0.89,
   'Train Precision': 0.38,
   'Train Recall': 0.92,
   'Train F1': 0.54,
   'Validation Accuracy': 0.87,
   'Validation Precision': 0.3,
   'Validation Recall': 0.77,
   'Validation F1': 0.43},
  'TFIDF_NGRAM_1_2': {'Train Accuracy': 0.89,
   'Train Precision': 0.4,
   'Train Recall': 0.93,
   'Train F1': 0.56,
   'Validation Accuracy': 0.88,
   'Validation Precision': 0.31,
   'Validation Recall': 0.77,
   'Validation F1': 0.44},
  'TFIDF_NGRAM_2_3': {'Train Accuracy': 0.41,
   'Train Precision': 0.1,
   'Train Recall': 0.94,
   'Train F1': 0.19,
   'Validation Accuracy': 0.38,
   'Validation Precision': 0.09,
   'Validation Recall': 0.89,
   'Validation F1': 0.16}},
 <20455x598 sparse matrix of type 

In [21]:
LR_cw_lemm = wrapper_compare_vectorizations(X_train.lem_tweet, 
                            y_train, X_val.lem_tweet, y_val, 
                            LogisticRegression(class_weight='balanced', solver = 'lbfgs'),
                            vectorization_list, apply_smote= True)
pd.DataFrame(LR_cw_lemm)

Unnamed: 0,COUNT_VECTORIZER,TFIDF_VECTORIZER,TFIDF_NGRAM_1_2,TFIDF_NGRAM_2_3
Train Accuracy,0.96,0.89,0.9,0.42
Train F1,0.75,0.53,0.54,0.18
Train Precision,0.66,0.38,0.4,0.1
Train Recall,0.87,0.85,0.86,0.92
Validation Accuracy,0.89,0.88,0.88,0.39
Validation F1,0.44,0.43,0.45,0.15
Validation Precision,0.32,0.31,0.32,0.08
Validation Recall,0.68,0.73,0.74,0.86


In [None]:
with open('mycsvfile.csv','a') as f:
    w = csv.writer(f)
    w.writerow('LogisticRegression')
    w.writerows(LR_cw_lemm.items())

In [None]:
(pd.DataFrame.from_dict(data= LR_cw_lemm)
   .to_csv('dict_file.csv', header=False))

In [None]:
#Logistic Regression: compare vectorizers using stemming + class balances
pd.DataFrame(wrapper_compare_vectorizations(X_train.stem_tweet, 
                            y_train, X_val.stem_tweet, y_val, 
                            LogisticRegression(class_weight='balanced', solver = 'lbfgs'),
                            vectorization_list, apply_smote= True))

##### Regularization:

- Count Vectorizer:   

l2 (default), no alpha tuning: F1: 0.99, 0.66
C = .1:  .91,  .52
C = .2:  .96,  .57
C = .3:  .98,  .58
C = .01:  .67,  .39
C = .001:  .62, .39

In [None]:
count_vect

In [None]:
X_train_up.lem_tweet.shape

In [None]:
y_train_up.shape

In [None]:
X_val.lem_tweet.shape

In [None]:
y_val.shape

In [47]:
%run custom_functions.ipynb

In [49]:
single_vector_model(X_train.lem_tweet, y_train, X_val.lem_tweet, y_val, LogisticRegression(), count_vect)

Train Accuracy: 0.98
Train Precision: 0.99
Train Recall: 0.8
Train F1: 0.88


Validation Accuracy: 0.96
Validation Precision: 0.86
Validation Recall: 0.5
Validation F1: 0.63


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4756,28,4784
1,164,166,330
All,4920,194,5114


In [77]:
%run custom_functions.ipynb

In [79]:
single_vector_model(X_train_up.lem_tweet, y_train_up, X_val.lem_tweet, y_val,
                            LogisticRegression(class_weight='balanced', penalty = 'l1'), count_vect)

Train Accuracy: 0.99
Train Precision: 0.99
Train Recall: 0.99
Train F1: 0.99


Validation Accuracy: 0.95
Validation Precision: 0.6
Validation Recall: 0.66
Validation F1: 0.63


ValueError: Shape of passed values is (1, 2), indices imply (37982, 2)

In [73]:
wrapper_single_vectorization(X_train_up.lem_tweet, y_train_up, X_val.lem_tweet, y_val,
                            LogisticRegression(class_weight='balanced', penalty = 'l1', C=.05), count_vect,
                            apply_smote = False)

Train Accuracy: 0.86
Train Precision: 0.89
Train Recall: 0.83
Train F1: 0.86


Validation Accuracy: 0.89
Validation Precision: 0.32
Validation Recall: 0.72
Validation F1: 0.45


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4289,495,4784
1,93,237,330
All,4382,732,5114


In [74]:
wrapper_single_vectorization(X_train_up.lem_tweet, y_train_up, X_val.lem_tweet, y_val,
                            LogisticRegression(class_weight='balanced', penalty = 'l1', C=.05), count_vect,
                            apply_smote = True)

Train Accuracy: 0.86
Train Precision: 0.89
Train Recall: 0.83
Train F1: 0.86


Validation Accuracy: 0.89
Validation Precision: 0.32
Validation Recall: 0.72
Validation F1: 0.45


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4290,494,4784
1,93,237,330
All,4383,731,5114


### Test Functions

In [68]:
%run custom_functions.ipynb

In [69]:
wrapper_single_vectorization(X_train_up.lem_tweet, y_train_up, X_val.lem_tweet, y_val, 
                            LogisticRegression(class_weight='balanced', penalty = 'l1'), 
                            count_vect, apply_smote = False)

Train Accuracy: 0.99
Train Precision: 0.99
Train Recall: 0.99
Train F1: 0.99


Validation Accuracy: 0.95
Validation Precision: 0.6
Validation Recall: 0.66
Validation F1: 0.63


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4638,146,4784
1,113,217,330
All,4751,363,5114


In [67]:
wrapper_single_vectorization(X_train_up.lem_tweet, y_train_up, X_val.lem_tweet, y_val, 
                            LogisticRegression(class_weight='balanced', penalty = 'l1'), 
                            count_vect, apply_smote = True)

Train Accuracy: 0.99
Train Precision: 0.99
Train Recall: 0.99
Train F1: 0.99


Validation Accuracy: 0.95
Validation Precision: 0.6
Validation Recall: 0.66
Validation F1: 0.63


Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4638,146,4784
1,112,218,330
All,4750,364,5114


#### Multiple Comparison Functions

In [60]:
return_dict, transformed_x, transformed_y = compare_vectorization_model(X_train.lem_tweet, 
                            y_train, X_val.lem_tweet, y_val, 
                            LogisticRegression(class_weight='balanced', solver = 'lbfgs'),
                            vectorization_list)

In [52]:
return_dict

{'COUNT_VECTORIZER': {'Train Accuracy': 0.99,
  'Train Precision': 0.86,
  'Train Recall': 1.0,
  'Train F1': 0.92,
  'Validation Accuracy': 0.95,
  'Validation Precision': 0.6,
  'Validation Recall': 0.69,
  'Validation F1': 0.65},
 'TFIDF_VECTORIZER': {'Train Accuracy': 0.88,
  'Train Precision': 0.37,
  'Train Recall': 0.91,
  'Train F1': 0.53,
  'Validation Accuracy': 0.87,
  'Validation Precision': 0.3,
  'Validation Recall': 0.75,
  'Validation F1': 0.43},
 'TFIDF_NGRAM_1_2': {'Train Accuracy': 0.89,
  'Train Precision': 0.39,
  'Train Recall': 0.92,
  'Train F1': 0.54,
  'Validation Accuracy': 0.87,
  'Validation Precision': 0.31,
  'Validation Recall': 0.76,
  'Validation F1': 0.44},
 'TFIDF_NGRAM_2_3': {'Train Accuracy': 0.4,
  'Train Precision': 0.1,
  'Train Recall': 0.94,
  'Train F1': 0.18,
  'Validation Accuracy': 0.37,
  'Validation Precision': 0.08,
  'Validation Recall': 0.9,
  'Validation F1': 0.15}}

In [75]:
wrapper_compare_vectorizations(X_train_up.lem_tweet, y_train_up, X_val.lem_tweet, y_val, 
                            LogisticRegression(class_weight='balanced', penalty = 'l1'), 
                            vectorization_list, apply_smote = False)

({'COUNT_VECTORIZER': {'Train Accuracy': 0.99,
   'Train Precision': 0.99,
   'Train Recall': 0.99,
   'Train F1': 0.99,
   'Validation Accuracy': 0.95,
   'Validation Precision': 0.6,
   'Validation Recall': 0.66,
   'Validation F1': 0.63},
  'TFIDF_VECTORIZER': {'Train Accuracy': 0.95,
   'Train Precision': 0.94,
   'Train Recall': 0.96,
   'Train F1': 0.95,
   'Validation Accuracy': 0.91,
   'Validation Precision': 0.39,
   'Validation Recall': 0.72,
   'Validation F1': 0.51},
  'TFIDF_NGRAM_1_2': {'Train Accuracy': 0.96,
   'Train Precision': 0.95,
   'Train Recall': 0.98,
   'Train F1': 0.96,
   'Validation Accuracy': 0.92,
   'Validation Precision': 0.44,
   'Validation Recall': 0.73,
   'Validation F1': 0.55},
  'TFIDF_NGRAM_2_3': {'Train Accuracy': 0.74,
   'Train Precision': 0.89,
   'Train Recall': 0.54,
   'Train F1': 0.67,
   'Validation Accuracy': 0.9,
   'Validation Precision': 0.28,
   'Validation Recall': 0.35,
   'Validation F1': 0.31}},
 <37982x970 sparse matrix of ty

In [76]:
wrapper_compare_vectorizations(X_train_up.lem_tweet, y_train_up, X_val.lem_tweet, y_val, 
                            LogisticRegression(class_weight='balanced', penalty = 'l1'), 
                            vectorization_list, apply_smote = True)

{'COUNT_VECTORIZER': {'Train Accuracy': 0.99,
  'Train Precision': 0.99,
  'Train Recall': 0.99,
  'Train F1': 0.99,
  'Validation Accuracy': 0.95,
  'Validation Precision': 0.6,
  'Validation Recall': 0.66,
  'Validation F1': 0.63},
 'TFIDF_VECTORIZER': {'Train Accuracy': 0.95,
  'Train Precision': 0.94,
  'Train Recall': 0.96,
  'Train F1': 0.95,
  'Validation Accuracy': 0.91,
  'Validation Precision': 0.39,
  'Validation Recall': 0.72,
  'Validation F1': 0.51},
 'TFIDF_NGRAM_1_2': {'Train Accuracy': 0.96,
  'Train Precision': 0.95,
  'Train Recall': 0.98,
  'Train F1': 0.96,
  'Validation Accuracy': 0.92,
  'Validation Precision': 0.44,
  'Validation Recall': 0.73,
  'Validation F1': 0.55},
 'TFIDF_NGRAM_2_3': {'Train Accuracy': 0.74,
  'Train Precision': 0.89,
  'Train Recall': 0.54,
  'Train F1': 0.67,
  'Validation Accuracy': 0.9,
  'Validation Precision': 0.28,
  'Validation Recall': 0.35,
  'Validation F1': 0.31}}

#### Final Log Model

In [None]:
wrapper_single_vectorization(X_train_up.lem_tweet, y_train_up, X_val.lem_tweet, y_val, 
                            LogisticRegression(class_weight='balanced', penalty = 'l1'), 
                            count_vect, apply_smote = True)

In [None]:
log = LogisticRegression(class_weight='balanced', penalty = 'l1')

In [None]:
X_train_countvect =  count_vect.fit_transform(X_train_up.lem_tweet)
X_val_countvect =  count_vect.transform(X_val.lem_tweet)

In [None]:
log = logreg.fit(X_train_countvect, y_train_up)

In [None]:
y_val_pred = logreg.predict(X_val_countvect)

In [None]:
y_val_pred 

In [None]:
pd.DataFrame(confusion_matrix(y_val, y_val_pred), index = ['actual 0', 'actual 1'], columns = ['predicted 0', 'predicted 1'])

In [None]:
y_score = log.decision_function(X_val_countvect)
   
fpr, tpr, thresholds = roc_curve(y_val, y_score)
y_train_score = model_log.decision_function(X_train_countvect)
train_fpr, train_tpr, thresholds = roc_curve(y_train_up, y_train_score)
#Seaborns Beautiful Styling
sns.set_style("darkgrid", {"axes.facecolor": ".9"})
plt.figure(figsize=(10,8))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve')
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.yticks([i/20.0 for i in range(21)])
plt.xticks([i/20.0 for i in range(21)])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) Curve for Validation Set')
plt.legend(loc="lower right")
print('AUC: {}'.format(auc(fpr, tpr)))
plt.show()

In [None]:
average_precision = average_precision_score(y_val, y_val_pred)

print('Average precision-recall score RF: {}'.format(average_precision))

In [None]:
weighted_predictions = []
for item in logreg.predict_proba(X_val_countvect):
    if item[0] <= .85:
        weighted_predictions.append(1)
    else:
        weighted_predictions.append(0)

In [None]:
#original predictions
pd.DataFrame(confusion_matrix(y_val, y_val_pred), index = ['actual 0', 'actual 1'], columns = ['predicted 0', 'predicted 1'])

In [None]:
#with threshhold adjustment
pd.DataFrame(confusion_matrix(y_val, weighted_predictions), index = ['actual 0', 'actual 1'], columns = ['predicted 0', 'predicted 1'])

In [None]:
y_val_predict_prob = log.predict_proba(X_val_countvect)

y_val_predict_prob

In [None]:
pred_df = pd.DataFrame(y_val_predict_prob)
y_val = pd.DataFrame(y_val)
y_val.reset_index(drop=True, inplace=True)

In [None]:
y_val = pd.DataFrame(y_val)
y_val.reset_index(drop=True, inplace=True)

In [None]:
pred_df['actual_class']=y_val
pred_df['predicted_class']=y_val_pred

In [None]:
pred_df.head()

In [None]:
pred_df[pred_df['actual_class'] != pred_df['predicted_class']]

In [None]:
df.tidy_tweet

## SVM


In [None]:
#class weight = balanced + lemmatized
compare_vectorization_model(X_train.lem_tweet, y_train, X_val.lem_tweet, y_val, 
                                   SVC(class_weight ='balanced', gamma='auto', ))

In [None]:
#upsampling + lemmatized
compare_vectorization_model(X_train_up.lem_tweet, y_train_up, X_val.lem_tweet, y_val, 
                                   SVC(class_weight ='balanced', gamma ='auto'))

In [None]:
#SMOTE + lemmatized 
SMOTE_compare_vectorization_model(X_train.lem_tweet, y_train, X_val.lem_tweet, 
                                    y_val, SVC(class_weight ='balanced', gamma='auto', ))

#### Grid Searching:

In [None]:
X_train_tfid2 =  tfidf_ngram2.fit_transform(X_train_up.lemmatized_tweet)
X_val_tfid2 =  tfidf_ngram2.transform(X_val.lemmatized_tweet)

In [None]:
# svc = SVC(kernel='linear', C=1, gamma=1, class_weight ='balanced')

params = {
'C': [0.1,.2, .3, 0.8,1,1.2,1.4],
'kernel':['linear', 'rbf'],
'gamma' :[0.1,0.8,1,1.2,1.4]
}

svm_gs= GridSearchCV(svc, param_grid = params, cv = 3)

scores = ['f1','accuracy','recall']

In [None]:
svm_gs.fit(X_train_tfid2, y_train_up)

In [None]:
svm_gs.best_estimator_

In [None]:
single_vector_model()

In [None]:
compare_vectorization_model(X_train_up.lemmatized_tweet, y_train_up, X_val.lemmatized_tweet, y_val, 
                                   SVC(C=1.2, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1.4, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))

# Random Forest

### Compare Multiple Methods:

In [None]:
np.random.seed(0)

In [None]:
# Random Forest: compare vectorizers with class weight balances + lemmatizing 

pd.DataFrame(compare_vectorization_model(X_train.lem_tweet, y_train, X_val.lem_tweet, y_val, 
                                   RandomForestClassifier(max_depth= 20, 
                                   n_estimators = 100, class_weight='balanced', random_state=10)))

In [None]:
# Random Forest: compare vectorizers with upsampling + lemmatizing 
compare_vectorization_model(X_train_up.lem_tweet, y_train_up, X_val.lem_tweet, y_val, 
                                   RandomForestClassifier(max_depth= 20,
                                   n_estimators = 100, class_weight='balanced', random_state=10))

In [None]:
# Random Forest: compare vectorizers with SMOTE + lemmatizing  
SMOTE_compare_vectorization_model(X_train.lem_tweet, y_train, X_val.lem_tweet, y_val, 
                                   RandomForestClassifier(max_depth= 20,
                                   n_estimators = 100, class_weight = 'balanced', random_state=10))

In [None]:
# Random Forest: compare vectorizers with upsampling + stemming
compare_vectorization_model(X_train.stem_tweet, y_train, X_val.stem_tweet, y_val, 
                                   RandomForestClassifier(max_depth= 20,
                                   n_estimators = 100, class_weight='balanced', random_state=10))

#### Random Forest Fine-Tuning Hyperparameters: Max depth 10.... regularization??

In [None]:
# Random Forest: compare vectorizers with upsampling + lemmatizing 
compare_vectorization_model(X_train_up.lem_tweet, y_train_up, X_val.lem_tweet, y_val, 
                                   RandomForestClassifier(max_depth= 10,
                                   n_estimators = 100, class_weight='balanced', random_state=10))

In [None]:
pd.DataFrame(compare_vectorization_model(X_train_up.lem_tweet, y_train_up, X_val.lem_tweet, y_val, 
                                   RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=200, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=10, verbose=0, warm_start=False)))

### Grid-Searching

In [None]:
np.random.seed(0)
# rfc = RandomForestClassifier(n_estimators=60, max_depth=6, random_state=10, class_weight = 'balanced')

In [None]:
X_train_countvect =  count_vect.fit_transform(X_train_up.lem_tweet)
X_val_countvect =  count_vect.transform(X_val.lem_tweet)
# X_test_countvect = count_vect.transform(X_test.lemmatized_tweet)

In [None]:
np.random.seed(0)

parameters = {'n_estimators' : [40, 60, 80, 100],
'max_leaf_nodes' : [200, 400, 600],
'random_state' : [10],
'max_depth': [5, 7, 10, 20],
 'verbose' : [0],
'class_weight': ['balanced']
             }
          
rfc_gs = GridSearchCV(
    RandomForestClassifier(class_weight='balanced', random_state = 10), param_grid=parameters, cv = 3)

In [None]:
rfc_gs.fit(X_train_countvect, y_train_up)

In [None]:
rfc_gs.best_params_

In [None]:
rfc_gs.best_estimator_

In [None]:
rfc_gs.score(X_val_countvect, y_val)

In [None]:
rfc2 = RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=200, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=10, verbose=0, warm_start=False)

In [None]:
rfc2.fit (X_train_countvect, y_train_up)

In [None]:
y_train_predict = rfc2.predict(X_train_countvect)
metrics.f1_score(y_train_up, y_train_predict)

In [None]:
y_val_predict = rfc2.predict(X_val_countvect)

In [None]:
metrics.f1_score(y_val, y_val_predict)

In [None]:
compare_vectorization_model(X_train.lem_tweet, y_train, X_val.lem_tweet, y_val, 
                     RandomForestClassifier(class_weight='balanced'))

## Word2Vec

In [None]:
import gensim
# https://radimrehurek.com/gensim/models/word2vec.html

In [None]:
np.random.seed(0)

In [None]:
# word2vec = gensim.models.Word2Vec()

In [None]:
# t = time()

# word2vec.build_vocab(df_tokenized_list, progress_per=10000)

# print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

# Word to Vec

In [None]:
type(df.tokenized_tweet[0])

In [None]:
(X_train.tokenized_tweet[0])

In [None]:
X_train.tokenized_tweet.head()

### X-train pre-processing

In [None]:
X_train.tokenized_tweet.shape

In [None]:
X_train.columns

In [None]:
X_train.head()

In [None]:
# X_train['tokenized_tweet']= X_train['tokenized_tweet'].apply(eval)

In [None]:
X_train_token_list = list(X_train.tokenized_tweet)

In [None]:
X_train_token_list 

In [None]:
type(X_train_token_list[0])

In [None]:
X_train_token_sumlist = sum(X_train_token_list,[])

In [None]:
X_train_token_sumlist = sum(X_train_token_list,[])

In [None]:
X_train_unique_tokens = set(X_train_token_sumlist)
print('The unique number of words in the training dataset is: {}'.format(len(X_train_unique_tokens)))

In [None]:
X_train_unique_tokens

In [None]:
X_train_token_list

#### X-val pre-processing

In [None]:
X_val['tokenized_tweet']= X_val['tokenized_tweet'].apply(eval)

In [None]:
X_val_token_list = list(X_val['tokenized_tweet'])
X_val_token_sumlist = sum(X_val_token_list,[])
X_val_unique_tokens = set(X_val_token_sumlist)

print('The unique number of words in the validation dataset is: {}'.format(len(X_val_unique_tokens)))

#### X-test pre-processing

In [None]:
# X_test_token_list = list(X_test['tokenized_tweet'])
# X_test_token_sumlist = sum(X_test_token_list,[])

# X_test_unique_tokens = set(X_test_token_sumlist)
# print('The unique number of words in the training dataset is: {}'.format(len(X_test_unique_tokens)))

### Modeling

In [None]:
from time import time
t = time()

w2v = gensim.models.Word2Vec(X_train_token_list, sg=1, min_count=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
w2v.train(X_train_token_list, total_examples=w2v.corpus_count, epochs=w2v.epochs)

In [None]:
#w2v.save('w2v-min1.model')
# w2v = gensim.models.Word2Vec.load('w2v-min1.model')

In [None]:
w2v.corpus_count

In [None]:
w2v_vocab= w2v.wv.vocab

In [None]:
len(w2v_vocab)

In [None]:
w2v.wv.vectors.shape

In [None]:
w2v.wv['trump']

In [None]:
w2v.wv.most_similar(['trump'])

In [None]:
w2v.wv.most_similar(positive=['lazy','black'])

In [None]:
w2v.wv

In [None]:
w2v.wv.get_keras_embedding

In [None]:
w2v_X = w2v.wv.vectors

#### Classification with Word2Vec

In [None]:
sentence = X_train_token_list[1]
sentence

In [None]:
np.mean([w2v[w] for w in sentence if w in w2v]
                   or [np.zeros(100)], axis=0)

In [None]:
input_to_lr = np.empty((31410, 100))
for sentence in X_train_token_list:
    np.append(input_to_lr, np.mean([w2v[w] for w in sentence if w in w2v]
                   or [np.zeros(100)], axis=0))
# np.mean([w2v[w] for w in sentence if w in w2v], axis=0)

In [None]:
input_to_lr[0]

In [None]:
input_to_lr[0].shape

In [None]:
X_temp = input_to_lr

In [None]:
X_temp_df = pd.DataFrame(X_temp)

In [None]:
a.fit(X_train_temp, y)
a.score(X_train_temp, y)
c = a.predict(X_train_temp)
# print scores  
print('Train Accuracy: ' + str(round(metrics.f1_score(y, c),2)))

## RNN 

In [None]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

In [None]:
X_train_sample, X_train_remainder, y_train_sample, y_train_remainder = train_test_split(X_train, y_train, test_size=0.99, random_state=123)

In [None]:
X_train_sample.shape

In [None]:
X_RNN_sample= X_train_sample['tokenized_tweet']

In [None]:
X_RNN_sample

In [None]:
y_RNN_sample=y_train_sample
y_RNN_sample.shape

In [None]:
# define documents
docs = X_RNN_sample
# define class labels
labels = y_RNN_sample

In [None]:
# integer encode the documents
vocab_size = 100
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)

## Word2Vec Visualization

In [None]:
# model = gensim.models.Word2Vec(df_tokenized_list, size=dimsize, window=5, min_count=50, workers=4)

In [None]:
# tsne_plot(w2v_model)

### Extra

In [None]:
%run custom_functions.ipynb

In [None]:
#size of hidden layer (length of continuous word representation)
dimsize= 100

# model_w2v = gensim.models.Word2Vec(X_train_token_list, size= dimsize, window=5, min_count=1, workers=4)
model_w2v = gensim.models.Word2Vec(X_train_token_list, size= dimsize,min_count=1)

#create average vector for train and test from model
#returned list of numpy arrays are then stacked 
X_train_w2v = np.concatenate([avg_word_vectors(w, dimsize, model_w2v) for w in X_train_token_list])
X_val_w2v = np.concatenate([avg_word_vectors(w,dimsize, model_w2v) for w in X_val_token_list])

In [None]:
X_train_w2v[0]

In [None]:
X_val_w2v[0]

In [None]:
X_train_token_list.shape

In [None]:
input_to_lr = np.empty((31410, 100))
for sentence in X_train_token_list:
    np.append(input_to_lr, np.mean([model_w2v[w] for w in sentence if w in model_w2v]
                   or [np.zeros(100)], axis=0))

In [None]:
input_to_lr[0]

In [None]:
def smote_w2v_model (X_train_w2v, y_train, X_val_w2v, y_val, classifier):
        
    smote = SMOTE(random_state=1, sampling_strategy='not majority')
        
    pipe = make_pipeline(smote, classifier) 
    
    model = pipe.fit(X_train_w2v, y_train)
    
    train_predictions = model.predict(X_train_w2v)
    val_predictions = model.predict (X_val_w2v)
    
   # print scores  
    print('Train Accuracy: ' + str(round(metrics.accuracy_score(y_train, train_predictions),2)))
    print('Train Precision: ' + str(round(metrics.precision_score(y_train, train_predictions),2)))
    print('Train Recall: ' + str(round(metrics.recall_score(y_train, train_predictions),2)))
    print('Train F1: ' + str(round(metrics.f1_score(y_train, train_predictions),2)))
    print('\n')
    print('Validation Accuracy: ' + str(round(metrics.accuracy_score(y_val, val_predictions),2)))
    print('Validation Precision: ' + str(round(metrics.precision_score(y_val, val_predictions),2)))
    print('Validation Recall: ' + str(round(metrics.recall_score(y_val, val_predictions),2)))
    print('Validation F1: ' + str(round(metrics.f1_score(y_val, val_predictions),2)))

    log_confusion_test = pd.crosstab(y_val, val_predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)
    
    return log_confusion_test

In [None]:
smote_w2v_model(X_train_w2v, y_train, X_val_w2v, y_val, LogisticRegression(solver='lbfgs'))

In [None]:
smote_w2v_model(X_train_w2v, y_train, X_val_w2v, y_val, RandomForestClassifier(n_estimators=100, max_depth= 20))

In [None]:
def pca_smote_w2v_model (X_train_w2v, y_train, X_val_w2v, y_val, classifier):
    
    pca = decomposition.PCA(n_components=50)
    
    smote = SMOTE(random_state=1, sampling_strategy='not majority')
        
    pipe = make_pipeline(pca, smote, classifier) 
    
    model = pipe.fit(X_train_w2v, y_train)
    
    train_predictions = model.predict(X_train_w2v)
    val_predictions = model.predict (X_val_w2v)
    
   # print scores  
    print('Train Accuracy: ' + str(round(metrics.accuracy_score(y_train, train_predictions),2)))
    print('Train Precision: ' + str(round(metrics.precision_score(y_train, train_predictions),2)))
    print('Train Recall: ' + str(round(metrics.recall_score(y_train, train_predictions),2)))
    print('Train F1: ' + str(round(metrics.f1_score(y_train, train_predictions),2)))
    print('\n')
    print('Validation Accuracy: ' + str(round(metrics.accuracy_score(y_val, val_predictions),2)))
    print('Validation Precision: ' + str(round(metrics.precision_score(y_val, val_predictions),2)))
    print('Validation Recall: ' + str(round(metrics.recall_score(y_val, val_predictions),2)))
    print('Validation F1: ' + str(round(metrics.f1_score(y_val, val_predictions),2)))

    log_confusion_test = pd.crosstab(y_val, val_predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)
    
    return log_confusion_test

In [None]:
pca_smote_w2v_model(X_train_w2v, y_train, X_val_w2v, y_val, LogisticRegression(solver='lbfgs'))

In [None]:
pca_smote_w2v_model(X_train_w2v, y_train, X_val_w2v, y_val, RandomForestClassifier(n_estimators=100, max_depth=20))

## Glove Embeddings

In [None]:
# !pip install glove_python

In [None]:
# ! pip install glove

In [None]:
# ! pip install glovepy

In [None]:
from gensim.models import KeyedVectors
filename = 'glove.twitter.27B.100d.txt'

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = 'glove.twitter.27B.100d.txt'
glove_output_file = 'glove.txt.word2vec'
glove2word2vec(glove_input_file, glove_output_file)

In [None]:
glove_model = KeyedVectors.load_word2vec_format('glove.txt.word2vec', binary=False)

In [None]:
glove_model.most_similar('love')

In [None]:
glove_model['love']

In [None]:
len(glove_model.vocab)

In [None]:
X_train_glove = np.concatenate([avg_word_vectors(w, dimsize, glove_model) for w in X_train_token_list])
X_val_glove = np.concatenate([avg_word_vectors(w, dimsize, glove_model) for w in X_val_token_list])

In [None]:
X_train_glove[255]

In [None]:
X_train_glove_2 = np.empty((31410, 100))
for sentence in X_train_token_list:
    np.append(input_to_lr, np.mean([glove_model[w] for w in sentence if w in glove_model]
                   or [np.zeros(100)], axis=0))

In [None]:
X_train_glove_2[225]

In [None]:
X_train_glove.shape

In [None]:
X_train.shape

####  Learnco 

In [None]:
glove_dict = {}
with open('glove.twitter.27B.100d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in X_train_unique_tokens:
            vector = np.array(parts[1:], dtype=np.float32)
            glove_dict[word] = vector

In [None]:
glove_dict['love']

In [None]:
input_to_lr = np.empty((31410, 100))
for sentence in X_train_token_list:
    np.append(input_to_lr, np.mean([w2v[w] for w in sentence if w in w2v]
                   or [np.zeros(100)], axis=0))

In [None]:
input_to_lr['love']

## Classification with Word Embeddings

In [None]:
def smote_w2v_model (X_train_w2v, y_train, X_val_w2v, y_val, classifier):
        
    smote = SMOTE(random_state=1, sampling_strategy='not majority')
        
    pipe = make_pipeline(smote, classifier) 
    
    model = pipe.fit(X_train_w2v, y_train)
    
    train_predictions = model.predict(X_train_w2v)
    val_predictions = model.predict (X_val_w2v)
    
   # print scores  
    print('Train Accuracy: ' + str(round(metrics.accuracy_score(y_train, train_predictions),2)))
    print('Train Precision: ' + str(round(metrics.precision_score(y_train, train_predictions),2)))
    print('Train Recall: ' + str(round(metrics.recall_score(y_train, train_predictions),2)))
    print('Train F1: ' + str(round(metrics.f1_score(y_train, train_predictions),2)))
    print('\n')
    print('Validation Accuracy: ' + str(round(metrics.accuracy_score(y_val, val_predictions),2)))
    print('Validation Precision: ' + str(round(metrics.precision_score(y_val, val_predictions),2)))
    print('Validation Recall: ' + str(round(metrics.recall_score(y_val, val_predictions),2)))
    print('Validation F1: ' + str(round(metrics.f1_score(y_val, val_predictions),2)))

    log_confusion_test = pd.crosstab(y_val, val_predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)
    
    return log_confusion_test

In [None]:
smote_w2v_model(X_train_glove, y_train, X_val_glove, y_val, RandomForestClassifier(n_estimators=100, max_depth=10))

In [None]:
smote_w2v_model (X_train_glove, y_train, X_val_glove, y_val, RandomForestClassifier(max_depth=10))

In [None]:
smote_w2v_model (X_train_glove, y_train, X_val_glove, y_val, RandomForestClassifier(max_depth=10))

In [None]:
smote_w2v_model (X_train_glove, y_train, X_val_glove, y_val, LogisticRegression(penalty ='l1', C = 10,
                                                                            class_weight='balanced'))

In [None]:
smote_w2v_model (X_train_glove, y_train, X_val_glove, y_val, LogisticRegression(penalty ='l1', C = .001,
                                                                            class_weight='balanced'))

In [None]:
smote_w2v_model (X_train_glove, y_train, X_val_glove, y_val, LogisticRegression(penalty ='l1', class_weight='balanced'))

In [None]:
smote_w2v_model (X_train_glove, y_train, X_val_glove, y_val, 
                 LogisticRegression(penalty ='l2', C = 5, class_weight ={0: 5 , 1: 5}))

In [None]:
smote_w2v_model (X_train_glove, y_train, X_val_glove, y_val, 
                 LogisticRegression(penalty ='l2', C = .1, class_weight ={0: 5 , 1: 5}))

# Testing Trump Tweets

In [None]:
trump_df= pd.read_csv('data/cleaned-trump-tweet.csv')
trump_df.head()

In [None]:
X_train_countvect =  count_vect.fit_transform(X_train_up.lem_tweet)

In [None]:
logreg = LogisticRegression(class_weight='balanced')

In [None]:
logreg.fit(X_train_countvect, y_train_up)

In [None]:
X_trump = count_vect.transform(trump_df.lem_tweet)

In [None]:
X_trump = X_trump.toarray()

In [None]:
X_trump.shape

In [None]:
y_trump_predict = logreg.predict(X_trump)

In [None]:
y_trump_predict

In [None]:
trump_df['predictions'] = y_trump_predict

In [None]:
y_trump_predict_prob = logreg.predict_proba(X_trump)

In [None]:
y_trump_predict_prob = pd.DataFrame(y_trump_predict_prob)

In [None]:
trump_df['predict_probability'] = y_trump_predict_prob[0]

In [None]:
trump_df = trump_df[['tweet','predictions', 'predict_probability']]


In [None]:
trump0 = trump_df[trump_df.predictions == 0]

In [None]:
pd.set_option('display.max_colwidth', -1)

In [None]:
trump0.tweet

In [None]:
trump_df

In [None]:
trump_df[trump_df.predictions == 0]

In [None]:
trump1 = trump_df[trump_df.predictions == 1]

In [None]:
trump1.tweet