In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import preprocessor as p
import string
import re
from nltk import TweetTokenizer
import pickle

In [None]:
# Read only CSV columns 0 and 5 and name them 'label' and 'tweet'.
larger_tweets = pd.read_csv(
    'Desktop/full_tweets.csv',
    usecols=[0, 5],
    names=['label', 'tweet'],
    encoding='latin-1',
)

In [None]:
# Collapse the raw labels into a binary target: 0 becomes 1, while 2 and 4 become 0.
# NOTE(review): this overwrites 'label' in place, so re-running the cell remaps
# the already-remapped values.
binary_label_map = {0: 1, 2: 0, 4: 0}
larger_tweets['label'] = larger_tweets['label'].map(binary_label_map)

In [None]:
# Configure the tweet preprocessor with the URL, MENTION and NUMBER options,
# clean every tweet with it, then drop the '#' characters that remain.
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.NUMBER)
larger_tweets['tweet_cleaned'] = (
    larger_tweets['tweet']
    .apply(p.clean)
    .apply(lambda text: re.sub(r'#', '', text))
)

In [None]:
# Tokenize the cleaned tweets with NLTK's TweetTokenizer
# (preserve_case=False, reduce_len=True).
tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True)
larger_tweets['tokenized_tweets'] = larger_tweets['tweet_cleaned'].apply(tokenizer.tokenize)

In [None]:
# Display the frame to inspect the label, raw, cleaned and tokenized tweet columns.
larger_tweets

In [None]:
# Load the GloVe Twitter embeddings for every dimensionality.
# Result: dic_lst[str(dim)] is a dict mapping each word to its float32 vector.
dic_lst = {}
for dim in tqdm([25,50,100,200]):
    glove_file = 'glove.twitter.27B.' + str(dim) + 'd.txt'
    emb_dict = {}
    # The context manager closes the file even if parsing raises. The explicit
    # encoding avoids depending on the platform default
    # (assumes the embedding files are UTF-8 text - confirm).
    with open('Desktop/glove_embeddings/' + glove_file, encoding='utf-8') as glove:
        for line in glove:
            # Split on the literal space only: a bare split() also splits on every
            # Unicode whitespace character, so a token consisting of such a
            # character would vanish and shift the vector by one column.
            values = line.rstrip().split(' ')
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            emb_dict[word] = vector
    dic_lst[str(dim)] = emb_dict

In [None]:
# Spot check: look up one word in the 25-dimensional embedding dictionary.
dic_lst['25']['booty']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the tokenized tweets for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    larger_tweets['tokenized_tweets'],
    larger_tweets['label'],
    test_size=.25,
    random_state=42,
)

In [None]:
def word_vec_generator(data, vectors, vec_length):
    """Embed each tokenized tweet as the mean of its known word vectors.

    Parameters
    ----------
    data : pandas.Series or iterable
        One sequence of tokens per tweet.
    vectors : dict
        Mapping from word to a 1-D vector of length ``vec_length``.
    vec_length : int
        Dimensionality of the vectors in ``vectors``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(number of tweets, vec_length)``. Row ``i`` is
        the average of the vectors of the tokens of tweet ``i`` that are keys
        of ``vectors``; tweets with no known token get an all-zero row.
    """
    tweets = data.values if isinstance(data, pd.Series) else list(data)
    # Preallocating keeps the result 2-D even for empty input and avoids
    # building an intermediate list of per-tweet arrays.
    features = np.zeros((len(tweets), vec_length))
    for idx, tweet in enumerate(tqdm(tweets)):
        words = [word for word in tweet if word in vectors]
        if not words:
            # No known token: leave the row at zero.
            continue
        row = features[idx]  # view into `features`, updated in place
        for word in words:
            row += vectors[word]
        row /= len(words)
    return features

In [None]:
# Averaged-embedding features for the training tweets, one matrix per GloVe dimensionality.
(X_train_feats_25,
 X_train_feats_50,
 X_train_feats_100,
 X_train_feats_200) = (
    word_vec_generator(X_train, dic_lst[str(dim)], dim)
    for dim in (25, 50, 100, 200)
)

In [None]:
# Averaged-embedding features for the test tweets, one matrix per GloVe dimensionality.
(X_test_feats_25,
 X_test_feats_50,
 X_test_feats_100,
 X_test_feats_200) = (
    word_vec_generator(X_test, dic_lst[str(dim)], dim)
    for dim in (25, 50, 100, 200)
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardize every feature set. Each scaler is fitted on the training matrix
# only and then applied to both the training and the test matrix.
train_lst = [X_train_feats_25, X_train_feats_50, X_train_feats_100, X_train_feats_200]
test_lst = [X_test_feats_25, X_test_feats_50, X_test_feats_100, X_test_feats_200]
scaled_train_feats, scaled_test_feats = [], []
for train_feats, test_feats in zip(train_lst, test_lst):
    scaler = StandardScaler().fit(train_feats)
    scaled_train_feats.append(scaler.transform(train_feats))
    scaled_test_feats.append(scaler.transform(test_feats))

In [None]:
# Grid-search logistic regression (penalty x C) on the first 10,000 training
# rows of each unscaled feature set. refit=False because only the
# cross-validation scores are collected.
params = {'penalty': ['l1', 'l2'], 'C': [.1, 1, 10, 100]}
scores = ['accuracy', 'precision', 'recall', 'f1']
logreg = LogisticRegression(solver='saga')
subset_size = 10000
raw_train_sets = [X_train_feats_25, X_train_feats_50, X_train_feats_100, X_train_feats_200]
results = []
for feat_matrix in raw_train_sets:
    search = GridSearchCV(logreg, params, scoring=scores, refit=False)
    search.fit(feat_matrix[:subset_size], y_train[:subset_size])
    results.append(search.cv_results_)

In [None]:
# Collect the mean cross-validation metrics of every grid-search run into one table.
# Each row is labelled with the word-vector length it belongs to. The number of
# labels per run is taken from the run itself, so the table stays valid when the
# parameter grid changes.
vector_lengths = ['25', '50', '100', '200']
feats = []
df_lst = []
for feat_length, result in zip(vector_lengths, results):
    feats.extend([feat_length] * len(result['params']))
    df_lst.append(pd.DataFrame({
        'params': result['params'],
        'accuracy': result['mean_test_accuracy'],
        'precision': result['mean_test_precision'],
        'recall': result['mean_test_recall'],
        'f1_score': result['mean_test_f1'],
    }))
full_metric_df_lr = pd.concat(df_lst, axis = 0)
full_metric_df_lr.insert(0,'word_vector_length',feats)

In [None]:
# Display the logistic regression grid-search summary for the unscaled features.
full_metric_df_lr

In [None]:
# Same logistic regression grid search as for the raw features, run on the
# first 10,000 rows of each standardized training set.
params = {'penalty': ['l1', 'l2'], 'C': [.1, 1, 10, 100]}
scores = ['accuracy', 'precision', 'recall', 'f1']
logreg = LogisticRegression(solver='saga')
subset_size = 10000
scaled_train_sets = [scaled_train_feats[0], scaled_train_feats[1],
                     scaled_train_feats[2], scaled_train_feats[3]]
scaled_results = []
for feat_matrix in scaled_train_sets:
    search = GridSearchCV(logreg, params, scoring=scores, refit=False)
    search.fit(feat_matrix[:subset_size], y_train[:subset_size])
    scaled_results.append(search.cv_results_)

In [None]:
# Collect the mean cross-validation metrics of the scaled-feature grid searches
# into one table. Each row is labelled with its word-vector length; the number
# of labels per run is taken from the run itself rather than hard-coded.
vector_lengths = ['25', '50', '100', '200']
feats = []
df_lst = []
for feat_length, result in zip(vector_lengths, scaled_results):
    feats.extend([feat_length] * len(result['params']))
    df_lst.append(pd.DataFrame({
        'params': result['params'],
        'accuracy': result['mean_test_accuracy'],
        'precision': result['mean_test_precision'],
        'recall': result['mean_test_recall'],
        'f1_score': result['mean_test_f1'],
    }))
full_metric_df_scaled_lr = pd.concat(df_lst, axis = 0)
full_metric_df_scaled_lr.insert(0,'word_vector_length',feats)

In [None]:
# Display the logistic regression grid-search summary for the standardized features.
full_metric_df_scaled_lr

In [None]:
# Grid-search a random forest on the first 10,000 training rows of each
# unscaled feature set, collecting the cross-validation scores only (refit=False).
# 'sqrt' replaces 'auto': for classifiers 'auto' meant sqrt(n_features), and
# recent scikit-learn releases no longer accept 'auto'.
params = {'n_estimators': [500,1000], 'max_features': ['sqrt','log2']}
scores = ['accuracy', 'precision', 'recall', 'f1']
rf = RandomForestClassifier()
results_rf = []
for feat_set in [X_train_feats_25[:10000], X_train_feats_50[:10000], X_train_feats_100[:10000], X_train_feats_200[:10000]]:
    search = GridSearchCV(rf, params, scoring = scores, refit = False, n_jobs=3)
    search.fit(feat_set, y_train[:10000])
    results_rf.append(search.cv_results_)

In [None]:
# Collect the mean cross-validation metrics of the random forest grid searches
# into one table. Each row is labelled with its word-vector length; the number
# of labels per run is taken from the run itself rather than hard-coded.
vector_lengths = ['25', '50', '100', '200']
feats = []
df_lst = []
for feat_length, result in zip(vector_lengths, results_rf):
    feats.extend([feat_length] * len(result['params']))
    df_lst.append(pd.DataFrame({
        'params': result['params'],
        'accuracy': result['mean_test_accuracy'],
        'precision': result['mean_test_precision'],
        'recall': result['mean_test_recall'],
        'f1_score': result['mean_test_f1'],
    }))
full_metric_df_rf = pd.concat(df_lst, axis = 0)
full_metric_df_rf.insert(0,'word_vector_length',feats)

In [None]:
# Display the random forest grid-search summary.
full_metric_df_rf

In [None]:
from xgboost import XGBClassifier

In [None]:
%%time
# Grid-search XGBoost (n_estimators x max_depth x eta) on the first 10,000
# training rows of each unscaled feature set, keeping only the
# cross-validation scores (refit=False).
params = {'n_estimators': [10, 100, 250], 'max_depth': [10, 50], 'eta': [.1, .3, .5]}
scores = ['accuracy', 'precision', 'recall', 'f1']
xgb = XGBClassifier(n_jobs=3)
subset_size = 10000
raw_train_sets = [X_train_feats_25, X_train_feats_50, X_train_feats_100, X_train_feats_200]
results_xgb = []
for feat_matrix in raw_train_sets:
    search = GridSearchCV(xgb, params, scoring=scores, refit=False, n_jobs=3)
    search.fit(feat_matrix[:subset_size], y_train[:subset_size])
    results_xgb.append(search.cv_results_)

In [None]:
# Collect the mean cross-validation metrics of the XGBoost grid searches into
# one table. Each row is labelled with its word-vector length; the number of
# labels per run is taken from the run itself rather than hard-coded.
vector_lengths = ['25', '50', '100', '200']
feats = []
df_lst = []
for feat_length, result in zip(vector_lengths, results_xgb):
    feats.extend([feat_length] * len(result['params']))
    df_lst.append(pd.DataFrame({
        'params': result['params'],
        'accuracy': result['mean_test_accuracy'],
        'precision': result['mean_test_precision'],
        'recall': result['mean_test_recall'],
        'f1_score': result['mean_test_f1'],
    }))
full_metric_df_xgb = pd.concat(df_lst, axis = 0)
full_metric_df_xgb.insert(0,'word_vector_length',feats)

In [None]:
# Show the last 20 rows of the XGBoost grid-search summary.
full_metric_df_xgb.tail(20)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
import pickle

In [None]:
# Train logistic regression on growing prefixes of the 200-d training features
# and keep each fitted model together with its test predictions.
# NOTE(review): the test matrix is sliced with the *training* size, so runs with
# a small training size are scored on a smaller test prefix - confirm intended.
training_sizes = [1200, 12000, 120000, 1200000]
lr_models, lr_preds = [], {}
for size in tqdm(training_sizes):
    logreg = LogisticRegression(C=1, penalty='l1', solver='saga')
    logreg.fit(X_train_feats_200[:size], y_train[:size])
    lr_models.append(logreg)
    lr_preds[size] = logreg.predict(X_test_feats_200[:size])

In [None]:
# Print the test metrics of the logistic regression model for each training size.
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# accuracy and F1 are unaffected but precision and recall swap places.
for train_size, pred_values in lr_preds.items():
    # Predictions were made on a prefix of the test set; align the labels to it.
    y_true = y_test[:len(pred_values)]
    acc = accuracy_score(y_true, pred_values)
    prec = precision_score(y_true, pred_values)
    rec = recall_score(y_true, pred_values)
    f1 = f1_score(y_true, pred_values)
    print(train_size)
    print('accuracy:' + str(acc))
    print('precision:' + str(prec))
    print('recall:' + str(rec))
    print('f1 score:' + str(f1))

In [None]:
from xgboost import XGBClassifier
# Train XGBoost on growing prefixes of the 200-d training features and keep
# each fitted model together with its test predictions.
training_sizes = [1200,12000,120000,500000]
# Stored under a dedicated name (mirroring `lr_models`) so the fitted models
# survive later cells that rebind the generic name `models`.
xgb_models = []
models = xgb_models  # backward-compatible alias for the original variable name
xgb_preds = {}
for size in tqdm(training_sizes):
    xgb = XGBClassifier(max_depth=50, n_estimators = 250, eta = .5, n_jobs=3)
    xgb.fit(X_train_feats_200[:size], y_train[:size])
    xgb_models.append(xgb)
    # NOTE(review): the test matrix is sliced with the *training* size - confirm intended.
    size_preds = xgb.predict(X_test_feats_200[:size])
    xgb_preds[size] = size_preds

In [None]:
# Print the test metrics of the XGBoost model for each training size.
# scikit-learn metrics take (y_true, y_pred). With the arguments reversed,
# accuracy and F1 are unaffected but precision and recall swap places.
for train_size, pred_values in xgb_preds.items():
    # Predictions were made on a prefix of the test set; align the labels to it.
    y_true = y_test[:len(pred_values)]
    acc = accuracy_score(y_true, pred_values)
    prec = precision_score(y_true, pred_values)
    rec = recall_score(y_true, pred_values)
    f1 = f1_score(y_true, pred_values)
    print(train_size)
    print('accuracy:' + str(acc))
    print('precision:' + str(prec))
    print('recall:' + str(rec))
    print('f1 score:' + str(f1))

In [None]:
import altair as alt

In [None]:
# Hand-entered metric values (in percent) for each model and training size:
# the first four entries are logistic regression, the last four XGBoost.
train_size = [1200, 12000, 120000, 1200000, 1200, 12000, 120000, 500000]
models = ['Logistic Regression'] * 4 + ['XGB Classifier'] * 4
acc = [72.9, 75.4, 76.0, 76.0, 69.8, 73.2, 75.5, 76.2]
prec = [72.8, 73.8, 75.3, 75.5, 67.3, 72.2, 75.6, 76.5]
rec = [73.0, 76.2, 76.3, 76.2, 70.9, 73.5, 75.3, 76.0]
f1 = [72.9, 75.0, 75.8, 75.8, 69.1, 72.9, 75.5, 76.3]

glove_models = pd.DataFrame({
    'Model': models,
    'Training Size': train_size,
    'Accuracy': acc,
    'Precision': prec,
    'Recall': rec,
    'F1 Score': f1,
})

# Accuracy against training size (log-scaled x axis), one line per model.
size_axis = alt.X('Training Size', scale=alt.Scale(type='log'))
accuracy_axis = alt.Y('Accuracy', scale=alt.Scale(domain=[65, 80]))
acc_glove = (
    alt.Chart(glove_models)
    .mark_line()
    .encode(x=size_axis, y=accuracy_axis, color='Model')
    .properties(title='GloVE Embeddings: Accuracy vs. Training Size')
)

acc_glove.show()

In [None]:
from sklearn.metrics import classification_report
# The original assignment had no right-hand side, which is a SyntaxError.
# NOTE(review): presumably the predictions of the model trained on the largest
# training size, which the classification report below reads - confirm.
logreg_preds = lr_preds[1200000]

In [None]:
# Build a display table from the classification report of the logistic
# regression model trained on the largest training size.
logreg_report = classification_report(y_test, lr_preds[1200000], output_dict=True)
logreg_report = pd.DataFrame(logreg_report).T
logreg_report = logreg_report.rename(index={'1':'Negative (0)', '0':'Postive/Neutral (4)'})
metric_cols = ['precision','recall','f1-score']
logreg_report[metric_cols] = logreg_report[metric_cols]*100
logreg_report = logreg_report.round(1)
# Cast to object before blanking cells: writing '' into float columns relies on
# silent upcasting, which newer pandas versions deprecate.
logreg_report = logreg_report.astype(object)
# Row 2 is expected to be the summary 'accuracy' row (after the two class rows);
# blank its precision, recall and support cells, leaving one value in 'f1-score'.
logreg_report.iloc[2,:2] = ''
logreg_report.iloc[2,3] = ''
logreg_report.index.name = 'Logistic Regression'
logreg_report

In [None]:
# Build a display table from the classification report of the XGBoost model
# trained on the largest training size.
xgbreport = classification_report(y_test, xgb_preds[500000], output_dict=True)
xgbreport = pd.DataFrame(xgbreport).T
xgbreport = xgbreport.rename(index={'1':'Negative (0)', '0':'Postive/Neutral (4)'})
metric_cols = ['precision','recall','f1-score']
xgbreport[metric_cols] = xgbreport[metric_cols]*100
xgbreport = xgbreport.round(1)
# Cast to object before blanking cells: writing '' into float columns relies on
# silent upcasting, which newer pandas versions deprecate.
xgbreport = xgbreport.astype(object)
# Row 2 is expected to be the summary 'accuracy' row (after the two class rows);
# blank its precision, recall and support cells, leaving one value in 'f1-score'.
xgbreport.iloc[2,:2] = ''
xgbreport.iloc[2,3] = ''
xgbreport.index.name = 'XGBoost'
xgbreport