In [1]:
from __future__ import print_function

import pandas as pd 
import numpy as np 
import sklearn

# NLTK/NLP
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
from nltk import FreqDist, word_tokenize
import string, re
import urllib
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from gensim.models import word2vec
from nltk.collocations import *
import gensim

# Classifiers 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

# Metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve
from sklearn.model_selection import train_test_split


#Sampling
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

import sklearn.decomposition as decomposition

#Visualization
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

import warnings 
warnings.filterwarnings("ignore")

from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

Using TensorFlow backend.


In [2]:
!ls

Data Cleaning-TrumpTwitter.ipynb
Dataset2-Cleaning.ipynb
Dataset2-Notebook.ipynb
Study2-Data Exploration-Visualizations.ipynb
Trump-Twitter-Practice.ipynb
custom_functions_2.ipynb
data2-cleaned.csv
dataset2.csv
trump_tweet.csv


In [3]:
# import customized functions
# import import_ipynb
# from custom_functions import *

%run custom_functions.ipynb

E0724 12:56:09.825612 140735734276992 execution.py:701] File `'custom_functions.ipynb.py'` not found.


In [4]:
df = pd.read_csv('data2-cleaned.csv')
df.drop(['Unnamed: 0'], axis = 1, inplace = True)
df.lem_tweet= df.lem_tweet.apply(str)
df.stem_tweet= df.stem_tweet.apply(str)

In [5]:
df.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet,stemmed_tokens,lemmatized_tokens,lem_tweet,stem_tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,woman shouldn complain about cleaning your hou...,woman shouldn complain about cleaning your hou...,"['woman', 'shouldn', 'complain', 'about', 'cle...","['woman', 'shouldn', 'complain', 'about', 'cle...","['woman', 'shouldn', 'complain', 'about', 'cle...",woman shouldn complain about cleaning your hou...,woman shouldn complain about cleaning your hou...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,dats cold tyga cuffin place,dats cold tyga cuffin place,"['dats', 'cold', 'tyga', 'cuffin', 'place']","['dat', 'cold', 'tyga', 'cuffin', 'place']","['dat', 'cold', 'tyga', 'cuffin', 'place']",dats cold tyga cuffin place,dats cold tyga cuffin plac
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Dawg ever fuck bitch start confused shit,Dawg ever fuck bitch start confused shit,"['Dawg', 'ever', 'fuck', 'bitch', 'start', 'co...","['dawg', 'ever', 'fuck', 'bitch', 'start', 'co...","['Dawg', 'ever', 'fuck', 'bitch', 'start', 'co...",Dawg ever fuck bitch start confused shit,dawg ever fuck bitch start confused shit
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,look like tranny,look like tranny,"['look', 'like', 'tranny']","['look', 'like', 'tranni']","['look', 'like', 'tranny']",look like tranny,look like tranni
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,shit hear about might true might faker than bi...,shit hear about might true might faker than bi...,"['shit', 'hear', 'about', 'might', 'true', 'mi...","['shit', 'hear', 'about', 'might', 'true', 'mi...","['shit', 'hear', 'about', 'might', 'true', 'mi...",shit hear about might true might faker than bi...,shit hear about might true might faker than bi...


## Train / Test Split for TF-IDF

In [13]:
X = df.drop(['class'], axis = 1)

In [14]:
y = df['class']

In [15]:
#splitting into train and test 
X_model, X_test, y_model, y_test = train_test_split(X, y, stratify = y,  test_size=0.20, random_state=123)

#splitting "model" into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_model, y_model, test_size=0.20, random_state=123)

# df_train_full = X_train.copy()
# df_train_full['label']= y_train
# train_full_df.to_csv('train_full_df.csv')

In [16]:
y.value_counts(normalize=True)

1    0.774321
2    0.167978
0    0.057701
Name: class, dtype: float64

## Comparing Vectorization and Method Performance

In [10]:
count_vect = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer(min_df=.001)
tfidf_ngram = TfidfVectorizer(ngram_range=(1,2), min_df=.001)
tfidf_ngram2 = TfidfVectorizer(ngram_range=(2,3),min_df=.001)

logreg = LogisticRegression()
rfc = RandomForestClassifier(random_state=10)
nb = GaussianNB()
svc = SVC(random_state=10)

vectorization_list = [('COUNT_VECTORIZER', count_vect),
                      ('TFIDF_VECTORIZER', tfidf_vectorizer),
                      ('TFIDF_NGRAM_1_2', tfidf_ngram),
                      ('TFIDF_NGRAM_2_3', tfidf_ngram2)]



## Naive Bayes

In [11]:
np.random.seed(0)

In [17]:
NB_compare_vectorization_model(X_train.lem_tweet, y_train, 
                                   X_val.lem_tweet, y_val, GaussianNB())

NameError: name 'NB_compare_vectorization_model' is not defined

## Logistic Regression

In [None]:
SMOTE_vector_model(X_train.lem_tweet, y_train, X_val.lem_tweet, y_val, tfidf_vectorizer, logreg)

In [None]:
%run custom_functions-2.ipynb

In [None]:
single_vector_model(X_train.lem_tweet, y_train, X_val.lem_tweet, y_val, tfidf_vectorizer, logreg)

In [None]:
#Logistic Regression: compare vectorizers with class weight balances + lemmatizing
LR_cw_lemm = compare_vectorization_model(X_train.lem_tweet, 
                            y_train, X_val.lem_tweet, y_val, 
                            LogisticRegression(class_weight='balanced', solver = 'lbfgs'))

In [None]:
pd.DataFrame(LR_cw_lemm)

In [None]:
#Logistic Regression: compare vectorizers with SMOTE + lemmatizing
LR_smote_lemm = SMOTE_compare_vectorization_model(X_train.lem_tweet, y_train, X_val.lem_tweet, 
                                    y_val, LogisticRegression(class_weight='balanced', solver= 'lbfgs'))

In [None]:
LR_smote_lemm 

In [None]:
#Logistic Regression: compare vectorizers with upsampling + lemmatizing
compare_vectorization_model(X_train_up.lem_tweet, y_train_up, X_val.lem_tweet, y_val, 
                                   LogisticRegression(class_weight='balanced', solver='lbfgs'))

In [None]:
#Logistic Regression: compare vectorizers using stemming + class balances
pd.DataFrame(compare_vectorization_model(X_train.stem_tweet, y_train, X_val.stem_tweet, 
                                    y_val, LogisticRegression(class_weight='balanced', solver='lbfgs')))

##### Regularization:

- Count Vectorizer:   

l2 (default), no alpha tuning: F1: 0.99, 0.66
C = .1:  .91,  .52
C = .2:  .96,  .57
C = .3:  .98,  .58
C = .01:  .67,  .39
C = .001:  .62, .39

In [None]:
single_vector_model(X_train_up.lem_tweet, y_train_up, X_val.lem_tweet, y_val, count_vect, 
                   LogisticRegression(penalty = 'l1', C = .1,  class_weight='balanced'))

## SVM


In [None]:
#class weight = balanced + lemmatized
compare_vectorization_model(X_train.lem_tweet, y_train, X_val.lem_tweet, y_val, 
                                   SVC(class_weight ='balanced', gamma='auto', ))

In [None]:
#upsampling + lemmatized
compare_vectorization_model(X_train_up.lem_tweet, y_train_up, X_val.lem_tweet, y_val, 
                                   SVC(class_weight ='balanced', gamma ='auto'))

In [None]:
#SMOTE + lemmatized 
SMOTE_compare_vectorization_model(X_train.lem_tweet, y_train, X_val.lem_tweet, 
                                    y_val, SVC(class_weight ='balanced', gamma='auto', ))

#### Grid Searching:

In [None]:
X_train_tfid2 =  tfidf_ngram2.fit_transform(X_train_up.lemmatized_tweet)
X_val_tfid2 =  tfidf_ngram2.transform(X_val.lemmatized_tweet)

In [None]:
# svc = SVC(kernel='linear', C=1, gamma=1, class_weight ='balanced')

params = {
'C': [0.1,.2, .3, 0.8,1,1.2,1.4],
'kernel':['linear', 'rbf'],
'gamma' :[0.1,0.8,1,1.2,1.4]
}

svm_gs= GridSearchCV(svc, param_grid = params, cv = 3)

scores = ['f1','accuracy','recall']

In [None]:
svm_gs.fit(X_train_tfid2, y_train_up)

In [None]:
svm_gs.best_estimator_

In [None]:
single_vector_model()

In [None]:
compare_vectorization_model(X_train_up.lemmatized_tweet, y_train_up, X_val.lemmatized_tweet, y_val, 
                                   SVC(C=1.2, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1.4, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False))

# Random Forest

### Compare Multiple Methods:

In [None]:
np.random.seed(0)

In [None]:
# Random Forest: compare vectorizers with class weight balances + lemmatizing 
pd.DataFrame(compare_vectorization_model(X_train.lem_tweet, y_train, X_val.lem_tweet, y_val, 
                                   RandomForestClassifier(max_depth= 20, 
                                   n_estimators = 100, class_weight='balanced', random_state=10)))

In [None]:
# Random Forest: compare vectorizers with upsampling + lemmatizing 
compare_vectorization_model(X_train_up.lemmatized_tweet, y_train_up, X_val.lemmatized_tweet, y_val, 
                                   RandomForestClassifier(max_depth= 20,
                                   n_estimators = 100, class_weight='balanced', random_state=10))

In [None]:
# Random Forest: compare vectorizers with SMOTE + lemmatizing  
SMOTE_compare_vectorization_model(X_train.lemmatized_tweet, y_train, X_val.lemmatized_tweet, y_val, 
                                   RandomForestClassifier(max_depth= 20,
                                   n_estimators = 100, class_weight = 'balanced', random_state=10))

In [None]:
# Random Forest: compare vectorizers with upsampling + stemming
compare_vectorization_model(X_train.stemmed_tweet_meta, y_train, X_val.stemmed_tweet_meta, y_val, 
                                   RandomForestClassifier(max_depth= 20,
                                   n_estimators = 100, class_weight='balanced', random_state=10))

#### Random Forest Fine-Tuning Hyperparameters: Max depth 10.... regularization??

In [None]:
# Random Forest: compare vectorizers with upsampling + lemmatizing 
compare_vectorization_model(X_train_up.lemmatized_tweet, y_train_up, X_val.lemmatized_tweet, y_val, 
                                   RandomForestClassifier(max_depth= 10,
                                   n_estimators = 100, class_weight='balanced', random_state=10))

In [None]:
compare_vectorization_model(X_train_up.lemmatized_tweet, y_train_up, X_val.lemmatized_tweet, y_val, 
                                   RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=200, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=10, verbose=0, warm_start=False))

### Grid-Searching

In [None]:
np.random.seed(0)
# rfc = RandomForestClassifier(n_estimators=60, max_depth=6, random_state=10, class_weight = 'balanced')

In [None]:
np.random.seed(0)

parameters = {'n_estimators' : [40, 60, 80, 100],
'max_leaf_nodes' : [200, 400, 600],
'random_state' : [10],
'max_depth': [5, 7, 10, 20],
 'verbose' : [0],
'class_weight': ['balanced']
             }
          
rfc_gs = GridSearchCV(
    RandomForestClassifier(class_weight='balanced', random_state = 10), param_grid=parameters, cv = 3)

In [None]:
rfc_gs.fit(X_train_countvect, y_train_up)

In [None]:
rfc_gs.best_params_

In [None]:
rfc_gs.best_estimator_

In [None]:
rfc_gs.score(X_val_countvect, y_val)

### Attempt with New Data

In [20]:
X_train_countvect =  count_vect.fit_transform(X_train.lem_tweet)
X_val_countvect =  count_vect.transform(X_val.lem_tweet)
# X_test_countvect = count_vect.transform(X_test.lemmatized_tweet)

In [21]:
rfc2 = RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=200, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=10, verbose=0, warm_start=False)

In [22]:
rfc2.fit (X_train_countvect, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=200, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=10, verbose=0, warm_start=False)

In [23]:
y_train_predict = rfc2.predict(X_train_countvect)
metrics.accuracy_score(y_train, y_train_predict)

0.8404161412358133

In [24]:
metrics.f1_score(y_train, y_train_predict, average ='macro')

0.7382561705291922

In [25]:
y_train.shape

(15860,)

In [29]:
y_val_pred = rfc2.predict(X_val_countvect)

In [45]:
y_val_prob

array([[0.30287502, 0.30506359, 0.39206139],
       [0.29946016, 0.32940845, 0.37113139],
       [0.29315212, 0.30766772, 0.39918016],
       ...,
       [0.30128052, 0.4242076 , 0.27451188],
       [0.39402892, 0.29757306, 0.30839802],
       [0.3128928 , 0.41573666, 0.27137054]])

In [31]:
pd.DataFrame(y_val_pred)

Unnamed: 0,0
0,2
1,2
2,2
3,1
4,1
5,2
6,1
7,1
8,1
9,1


In [32]:
confusion_test = pd.crosstab(y_val, y_val_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

confusion_test

Predicted,0,1,2,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,108,44,47,199
1,215,2430,433,3078
2,12,12,665,689
All,335,2486,1145,3966


In [35]:
metrics.f1_score(y_val, y_val_pred,  average ='macro')

0.6677191812625272

In [37]:
metrics.precision_score(y_val, y_val_pred,  average ='macro')

0.626882646494138

In [38]:
metrics.recall_score(y_val, y_val_pred, average = 'macro')

0.7657847202042857

In [39]:
df.columns

Index(['count', 'hate_speech', 'offensive_language', 'neither', 'class',
       'tweet', 'tidy_tweet', 'no_hash_tweet', 'tokenized_tweet',
       'stemmed_tokens', 'lemmatized_tokens', 'lem_tweet', 'stem_tweet'],
      dtype='object')

In [40]:
df['class'].value_counts(normalize=True)

1    0.774321
2    0.167978
0    0.057701
Name: class, dtype: float64

In [41]:
y_val_prob = rfc2.predict_proba(X_val_countvect)
pd.DataFrame(y_val_prob).head()

Unnamed: 0,0,1,2
0,0.302875,0.305064,0.392061
1,0.29946,0.329408,0.371131
2,0.293152,0.307668,0.39918
3,0.320898,0.4056,0.273502
4,0.281834,0.453192,0.264974


In [43]:
pred_df = pd.DataFrame([y_val_pred, y_val]).T

pred_df.columns = ['predictions', 'actual']

pred_df.head()

Unnamed: 0,predictions,actual
0,2,2
1,2,1
2,2,1
3,1,1
4,1,1


In [44]:
df.tidy_tweet[0:5]

0    woman shouldn complain about cleaning your hou...
1                          dats cold tyga cuffin place
2             Dawg ever fuck bitch start confused shit
3                                     look like tranny
4    shit hear about might true might faker than bi...
Name: tidy_tweet, dtype: object

## Vader

In [None]:
analyzer = SentimentIntensityAnalyzer()
sentence = 'people suck'

In [None]:
def sentiment_analyzer_scores(sentence):
    score = analyzer.polarity_scores(sentence)['compound']
    print("{:-<40} {}".format(sentence, str(score)))

    if compound > 0:
        return 1  ## positive
    else:
        return 0 ## negative
   # else:
        #return "Neutral"     
    return compound

sentiment_analyzer_scores(sentence)

In [None]:
report = classification_report(y_val, y_val_pred, margins = True)
report

In [None]:
compare_vectorization_model(X_train.lemmatized_tweet, y_train, X_val.lemmatized_tweet, y_val, 
                     RandomForestClassifier(class_weight='balanced'))

## Word2Vec

In [None]:
import gensim
# https://radimrehurek.com/gensim/models/word2vec.html

In [None]:
np.random.seed(0)

In [None]:
# word2vec = gensim.models.Word2Vec()

In [None]:
# t = time()

# word2vec.build_vocab(df_tokenized_list, progress_per=10000)

# print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

# Word to Vec

### X-train pre-processing

In [None]:
X_train.tokenized_tweet.head()

In [None]:
X_train.tokenized_tweet.shape

In [None]:
X_train_token_list = list(X_train['tokenized_tweet'])
X_train_token_sumlist = sum(X_train_token_list,[])

In [None]:
X_train_unique_tokens = set(X_train_token_sumlist)
print('The unique number of words in the training dataset is: {}'.format(len(X_train_unique_tokens)))

In [None]:
X_train_unique_tokens

In [None]:
X_train_token_list

#### X-val pre-processing

In [None]:
# X_val_token_list = list(X_val['tokenized_tweet'])
# X_val_token_sumlist = sum(X_val_token_list,[])
# X_val_unique_tokens = set(X_val_token_sumlist)

# print('The unique number of words in the validation dataset is: {}'.format(len(X_val_unique_tokens)))

#### X-test pre-processing

In [None]:
# X_test_token_list = list(X_test['tokenized_tweet'])
# X_test_token_sumlist = sum(X_test_token_list,[])

# X_test_unique_tokens = set(X_test_token_sumlist)
# print('The unique number of words in the training dataset is: {}'.format(len(X_test_unique_tokens)))

### Modeling

In [None]:
from time import time
t = time()

w2v = gensim.models.Word2Vec(X_train_token_list, sg=1, min_count=1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
w2v.train(X_train_token_list, total_examples=w2v.corpus_count, epochs=w2v.epochs)

In [None]:
#w2v.save('w2v-min1.model')
# w2v = gensim.models.Word2Vec.load('w2v-min1.model')

In [None]:
w2v.corpus_count

In [None]:
w2v_vocab= w2v.wv.vocab

In [None]:
len(w2v_vocab)

In [None]:
w2v.wv.vectors.shape

In [None]:
w2v.wv['trump']

In [None]:
w2v.wv.most_similar(['trump'])

In [None]:
w2v.wv.most_similar(positive=['lazy','black'])

In [None]:
w2v.wv

In [None]:
w2v.wv.get_keras_embedding

In [None]:
w2v_X = w2v.wv.vectors

#### Classification with Word2Vec

In [None]:
np.mean([self.w2v[w] for w in words if w in self.w2v]
                   or [np.zeros(self.dimensions)], axis=0) for words in X])

In [None]:
sentence = df_tokenized_list[1]
sentence

In [None]:
np.mean([w2v[w] for w in sentence if w in w2v]
                   or [np.zeros(100)], axis=0)

In [None]:
 np.mean([w2v[w] for w in sentence if w in w2v]  or np.zeros(100)

In [None]:
input_to_lr = np.empty((31410, 100))
for sentence in X_train_token_list:
    np.append(input_to_lr, np.mean([w2v[w] for w in sentence if w in w2v]
                   or [np.zeros(100)], axis=0))
# np.mean([w2v[w] for w in sentence if w in w2v], axis=0)

In [None]:
input_to_lr[0]

In [None]:
input_to_lr[0].shape

In [None]:
X_temp = input_to_lr

In [None]:
X_temp_df = pd.DataFrame(X_temp)

In [None]:
a.fit(X_train_temp, y)
a.score(X_train_temp, y)
c = a.predict(X_train_temp)
# print scores  
print('Train Accuracy: ' + str(round(metrics.f1_score(y, c),2)))

## NN

## RNN 

In [None]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

In [None]:
X_train_sample, X_train_remainder, y_train_sample, y_train_remainder = train_test_split(X_train, y_train, test_size=0.99, random_state=123)

In [None]:
X_train_sample.shape

In [None]:
X_RNN_sample= X_train_sample['tokenized_tweet']

In [None]:
X_RNN_sample

In [None]:
y_RNN_sample=y_train_sample
y_RNN_sample.shape

In [None]:
# define documents
docs = X_RNN_sample
# define class labels
labels = y_RNN_sample

In [None]:
# integer encode the documents
vocab_size = 100
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)

## Word2Vec Visualization

In [None]:
# model = gensim.models.Word2Vec(df_tokenized_list, size=dimsize, window=5, min_count=50, workers=4)

In [None]:
# tsne_plot(w2v_model)

### Extra

In [None]:
%run custom_functions.ipynb

In [None]:
#size of hidden layer (length of continuous word representation)
dimsize= 100

# model_w2v = gensim.models.Word2Vec(X_train_token_list, size= dimsize, window=5, min_count=1, workers=4)
model_w2v = gensim.models.Word2Vec(X_train_token_list, size= dimsize,min_count=1)



#create average vector for train and test from model
#returned list of numpy arrays are then stacked 
X_train_w2v = np.concatenate([avg_word_vectors(w, dimsize, model_w2v) for w in X_train_token_list])
X_val_w2v = np.concatenate([avg_word_vectors(w,dimsize, model_w2v) for w in X_val_token_list])

In [None]:
X_train_w2v[0]

In [None]:
X_val_w2v[0]

In [None]:
X_train_token_list.shape

In [None]:
input_to_lr = np.empty((31410, 100))
for sentence in X_train_token_list:
    np.append(input_to_lr, np.mean([model_w2v[w] for w in sentence if w in model_w2v]
                   or [np.zeros(100)], axis=0))

In [None]:
input_to_lr[0]

In [None]:
def smote_w2v_model (X_train_w2v, y_train, X_val_w2v, y_val, classifier):
        
    smote = SMOTE(random_state=1, sampling_strategy='not majority')
        
    pipe = make_pipeline(smote, classifier) 
    
    model = pipe.fit(X_train_w2v, y_train)
    
    train_predictions = model.predict(X_train_w2v)
    val_predictions = model.predict (X_val_w2v)
    
   # print scores  
    print('Train Accuracy: ' + str(round(metrics.accuracy_score(y_train, train_predictions),2)))
    print('Train Precision: ' + str(round(metrics.precision_score(y_train, train_predictions),2)))
    print('Train Recall: ' + str(round(metrics.recall_score(y_train, train_predictions),2)))
    print('Train F1: ' + str(round(metrics.f1_score(y_train, train_predictions),2)))
    print('\n')
    print('Validation Accuracy: ' + str(round(metrics.accuracy_score(y_val, val_predictions),2)))
    print('Validation Precision: ' + str(round(metrics.precision_score(y_val, val_predictions),2)))
    print('Validation Recall: ' + str(round(metrics.recall_score(y_val, val_predictions),2)))
    print('Validation F1: ' + str(round(metrics.f1_score(y_val, val_predictions),2)))

    log_confusion_test = pd.crosstab(y_val, val_predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)
    
    return log_confusion_test

In [None]:
smote_w2v_model(X_train_w2v, y_train, X_val_w2v, y_val, LogisticRegression(solver='lbfgs'))

In [None]:
smote_w2v_model(X_train_w2v, y_train, X_val_w2v, y_val, RandomForestClassifier(n_estimators=100, max_depth= 20))

In [None]:
def pca_smote_w2v_model (X_train_w2v, y_train, X_val_w2v, y_val, classifier):
    
    pca = decomposition.PCA(n_components=50)
    
    smote = SMOTE(random_state=1, sampling_strategy='not majority')
        
    pipe = make_pipeline(pca, smote, classifier) 
    
    model = pipe.fit(X_train_w2v, y_train)
    
    train_predictions = model.predict(X_train_w2v)
    val_predictions = model.predict (X_val_w2v)
    
   # print scores  
    print('Train Accuracy: ' + str(round(metrics.accuracy_score(y_train, train_predictions),2)))
    print('Train Precision: ' + str(round(metrics.precision_score(y_train, train_predictions),2)))
    print('Train Recall: ' + str(round(metrics.recall_score(y_train, train_predictions),2)))
    print('Train F1: ' + str(round(metrics.f1_score(y_train, train_predictions),2)))
    print('\n')
    print('Validation Accuracy: ' + str(round(metrics.accuracy_score(y_val, val_predictions),2)))
    print('Validation Precision: ' + str(round(metrics.precision_score(y_val, val_predictions),2)))
    print('Validation Recall: ' + str(round(metrics.recall_score(y_val, val_predictions),2)))
    print('Validation F1: ' + str(round(metrics.f1_score(y_val, val_predictions),2)))

    log_confusion_test = pd.crosstab(y_val, val_predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)
    
    return log_confusion_test

In [None]:
pca_smote_w2v_model(X_train_w2v, y_train, X_val_w2v, y_val, LogisticRegression(solver='lbfgs'))

In [None]:
pca_smote_w2v_model(X_train_w2v, y_train, X_val_w2v, y_val, RandomForestClassifier(n_estimators=100, max_depth=20))

## Glove Embeddings

In [None]:
# !pip install glove_python

In [None]:
# ! pip install glove

In [None]:
# ! pip install glovepy

In [None]:
from gensim.models import KeyedVectors
filename = 'glove.twitter.27B.100d.txt'

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = 'glove.twitter.27B.100d.txt'
glove_output_file = 'glove.txt.word2vec'
glove2word2vec(glove_input_file, glove_output_file)

In [None]:
glove_model = KeyedVectors.load_word2vec_format('glove.txt.word2vec', binary=False)

In [None]:
glove_model.most_similar('love')

In [None]:
glove_model['love']

In [None]:
len(glove_model.vocab)

In [None]:
X_train_glove = np.concatenate([avg_word_vectors(w, dimsize, glove_model) for w in X_train_token_list])
X_val_glove = np.concatenate([avg_word_vectors(w, dimsize, glove_model) for w in X_val_token_list])

In [None]:
X_train_glove[255]

In [None]:
X_train_glove_2 = np.empty((31410, 100))
for sentence in X_train_token_list:
    np.append(input_to_lr, np.mean([glove_model[w] for w in sentence if w in glove_model]
                   or [np.zeros(100)], axis=0))

In [None]:
X_train_glove_2[225]

In [None]:
X_train_glove.shape

In [None]:
X_train.shape

####  Learnco 

In [None]:
glove_dict = {}
with open('glove.twitter.27B.100d.txt', 'rb') as f:
    for line in f:
        parts = line.split()
        word = parts[0].decode('utf-8')
        if word in X_train_unique_tokens:
            vector = np.array(parts[1:], dtype=np.float32)
            glove_dict[word] = vector

In [None]:
glove_dict['love']

In [None]:
input_to_lr = np.empty((31410, 100))
for sentence in X_train_token_list:
    np.append(input_to_lr, np.mean([w2v[w] for w in sentence if w in w2v]
                   or [np.zeros(100)], axis=0))

In [None]:
input_to_lr['love']

## Classification with Word Embeddings

In [None]:
pca_smote_w2v_model(X_train_glove, y_train, X_val_glove, y_val, RandomForestClassifier(n_estimators=100, max_depth=10))

In [None]:
def smote_w2v_model (X_train_w2v, y_train, X_val_w2v, y_val, classifier):
        
    smote = SMOTE(random_state=1, sampling_strategy='not majority')
        
    pipe = make_pipeline(smote, classifier) 
    
    model = pipe.fit(X_train_w2v, y_train)
    
    train_predictions = model.predict(X_train_w2v)
    val_predictions = model.predict (X_val_w2v)
    
   # print scores  
    print('Train Accuracy: ' + str(round(metrics.accuracy_score(y_train, train_predictions),2)))
    print('Train Precision: ' + str(round(metrics.precision_score(y_train, train_predictions),2)))
    print('Train Recall: ' + str(round(metrics.recall_score(y_train, train_predictions),2)))
    print('Train F1: ' + str(round(metrics.f1_score(y_train, train_predictions),2)))
    print('\n')
    print('Validation Accuracy: ' + str(round(metrics.accuracy_score(y_val, val_predictions),2)))
    print('Validation Precision: ' + str(round(metrics.precision_score(y_val, val_predictions),2)))
    print('Validation Recall: ' + str(round(metrics.recall_score(y_val, val_predictions),2)))
    print('Validation F1: ' + str(round(metrics.f1_score(y_val, val_predictions),2)))

    log_confusion_test = pd.crosstab(y_val, val_predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)
    
    return log_confusion_test

In [None]:
smote_w2v_model (X_train_glove, y_train, X_val_glove, y_val, RandomForestClassifier(max_depth=10))

In [None]:
smote_w2v_model (X_train_glove, y_train, X_val_glove, y_val, RandomForestClassifier(max_depth=10))

In [None]:
smote_w2v_model (X_train_glove, y_train, X_val_glove, y_val, LogisticRegression(penalty ='l1', C = 10,
                                                                            class_weight='balanced'))

In [None]:
smote_w2v_model (X_train_glove, y_train, X_val_glove, y_val, LogisticRegression(penalty ='l1', C = .001,
                                                                            class_weight='balanced'))

In [None]:
smote_w2v_model (X_train_glove, y_train, X_val_glove, y_val, LogisticRegression(penalty ='l1', class_weight='balanced'))

In [None]:
smote_w2v_model (X_train_glove, y_train, X_val_glove, y_val, 
                 LogisticRegression(penalty ='l2', C = 5, class_weight ={0: 5 , 1: 5}))

In [None]:
smote_w2v_model (X_train_glove, y_train, X_val_glove, y_val, 
                 LogisticRegression(penalty ='l2', C = .1, class_weight ={0: 5 , 1: 5}))

# Testing Trump Tweets

In [None]:
trump_df= pd.read_csv('data/cleaned-trump-tweet.csv')
trump_df.head()

In [None]:
trump_df.stem_tweet

In [None]:
X_train_countvect =  count_vect.fit_transform(X_train_up.lem_tweet)

In [None]:
logreg.fit(X_train_countvect, y_train_up)

In [None]:
X_trump = count_vect.transform(trump_df.lem_tweet)

In [None]:
X_trump = X_trump.toarray()

In [None]:
X_trump.shape

In [None]:
X_train_up.shape

In [None]:
X_train.shape