In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import preprocessor as p
import string
import re

In [None]:
import nltk
from nltk import TweetTokenizer
from nltk.stem import WordNetLemmatizer

In [None]:
# Read only two columns of the headerless CSV: position 0 becomes 'label'
# and position 5 becomes 'tweet'. The file is decoded as latin-1.
larger_tweets = pd.read_csv(
    'Desktop/full_tweets.csv',
    encoding='latin-1',
    names=['label', 'tweet'],
    usecols=[0, 5],
)

In [None]:
# Collapse the raw codes into a binary target: 0 -> 1, while 2 and 4 -> 0.
# (Presumably 0/2/4 are negative/neutral/positive polarity codes -- confirm
# against the dataset description.)
#
# The raw codes are preserved in 'label_raw' the first time this cell runs and
# the mapping is always applied to that copy. Mapping 'label' onto itself would
# make a second execution of the cell turn every 1 into NaN and every 0 into 1.
if 'label_raw' not in larger_tweets.columns:
    larger_tweets['label_raw'] = larger_tweets['label']
larger_tweets['label'] = larger_tweets['label_raw'].map({0: 1, 2: 0, 4: 0})

In [None]:
# Select the URL, MENTION and NUMBER options of the tweet preprocessor; the
# setting is global to the `p` module and is used by every later p.clean call.
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.NUMBER)

# Clean each raw tweet, then drop every literal '#' character while keeping
# the text that followed it.
larger_tweets['tweet_cleaned'] = (
    larger_tweets['tweet']
    .apply(p.clean)
    .str.replace('#', '', regex=False)
)

In [None]:
# Tweet-aware tokenizer configured with preserve_case=False and reduce_len=True.
tokenizer = TweetTokenizer(reduce_len=True, preserve_case=False)

# One token list per cleaned tweet.
larger_tweets['tokenized_tweets'] = larger_tweets['tweet_cleaned'].apply(tokenizer.tokenize)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [None]:
# Hold out 25% of the tokenized tweets for testing; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    larger_tweets['tokenized_tweets'],
    larger_tweets['label'],
    random_state=42,
    test_size=.25,
)

In [None]:
def word_vec_generator_ft(data, vectors, vec_size):
    """Embed every tweet as the mean of its token vectors.

    Parameters
    ----------
    data : object exposing ``.values`` (e.g. a pandas Series)
        Each element is a sequence of tokens for one tweet.
    vectors : object with a ``get_word_vector(word)`` method
        Looked up once per token; each returned vector must be addable to an
        array of length ``vec_size``.
    vec_size : int
        Length of the vector produced for each tweet.

    Returns
    -------
    numpy.ndarray
        One row per tweet. A tweet with no tokens yields an all-zero row.
    """
    rows = []
    for tokens in tqdm(data.values):
        acc = np.zeros((vec_size,))
        n_tokens = len(tokens)
        if n_tokens:
            # Sum the token vectors, then divide by the token count.
            for tok in tokens:
                acc += vectors.get_word_vector(tok)
            acc = acc / n_tokens
        rows.append(acc)
    return np.array(rows)

In [None]:
import fasttext.util
# Binary fastText model file, resolved relative to the working directory.
FT_MODEL_PATH = 'cc.en.300.bin'
ft_eng = fasttext.load_model(FT_MODEL_PATH)

In [None]:
# Embed both splits with 300-length vectors (train first, then test).
X_train_300, X_test_300 = (
    word_vec_generator_ft(split, ft_eng, 300) for split in (X_train, X_test)
)

In [None]:
# Reduce the shared model to 200 dimensions. The return value is discarded, so
# the cells below rely on `ft_eng` itself being modified by this call.
fasttext.util.reduce_model(ft_eng, 200)

# Re-embed both splits with the reduced model (train first, then test).
X_train_200, X_test_200 = (
    word_vec_generator_ft(split, ft_eng, 200) for split in (X_train, X_test)
)

In [None]:
# Reduce the shared model to 100 dimensions. The return value is discarded, so
# the cells below rely on `ft_eng` itself being modified by this call.
fasttext.util.reduce_model(ft_eng, 100)

# Re-embed both splits with the reduced model (train first, then test).
X_train_100, X_test_100 = (
    word_vec_generator_ft(split, ft_eng, 100) for split in (X_train, X_test)
)

In [None]:
# Reduce the shared model to 50 dimensions. The return value is discarded, so
# this relies on `ft_eng` itself being modified by the call.
fasttext.util.reduce_model(ft_eng, 50)

# Re-embed both splits with the reduced model (train first, then test).
X_train_50, X_test_50 = (
    word_vec_generator_ft(split, ft_eng, 50) for split in (X_train, X_test)
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [None]:
# Feature matrices ordered by embedding size: 50, 100, 200, 300.
train_lst = [X_train_50, X_train_100, X_train_200, X_train_300]
test_lst = [X_test_50, X_test_100, X_test_200, X_test_300]

scaled_train_feats, scaled_test_feats = [], []
for train_matrix, test_matrix in zip(train_lst, test_lst):
    # Fit on the training matrix only, then apply the same transform to both
    # splits so no test-set statistics enter the scaler.
    fitted_scaler = StandardScaler().fit(train_matrix)
    scaled_train_feats.append(fitted_scaler.transform(train_matrix))
    scaled_test_feats.append(fitted_scaler.transform(test_matrix))

In [None]:
# Grid search for logistic regression on the unscaled embeddings.
# 2 penalties x 4 values of C = 8 candidates per embedding size.
params = {'penalty': ['l1', 'l2'], 'C': [.1, 1, 10, 100]}
scores = ['accuracy', 'precision', 'recall', 'f1']

# saga is a stochastic solver that shuffles the data, so pin its seed; without
# it the reported metrics change from one run of the notebook to the next.
logreg = LogisticRegression(solver='saga', random_state=42)

# One cv_results_ dict per embedding size, in the order 50, 100, 200, 300.
results = []
for feat_set in (X_train_50, X_train_100, X_train_200, X_train_300):
    # refit=False: only the CV metrics are needed, and multi-metric scoring
    # would otherwise require naming a single metric to refit on.
    search = GridSearchCV(logreg, params, scoring=scores, refit=False)
    # Only the first 10000 training rows are searched.
    search.fit(feat_set[:10000], y_train[:10000])
    results.append(search.cv_results_)

In [None]:
# Assemble the mean cross-validated metrics of every logistic-regression
# candidate into one table.
#
# `results` holds one cv_results_ dict per embedding size, in the order the
# searches were run: 50, 100, 200, 300. The labels must follow that order
# (the earlier '25','50','100','200' labelling did not match the features).
vector_lengths = ['50', '100', '200', '300']
df_lst = []
for vec_len, result in zip(vector_lengths, results):
    metrics = pd.DataFrame({
        'params': result['params'],
        'accuracy': result['mean_test_accuracy'],
        'precision': result['mean_test_precision'],
        'recall': result['mean_test_recall'],
        'f1_score': result['mean_test_f1'],
    })
    # A scalar label is broadcast to every row, so this stays correct if the
    # number of grid candidates changes.
    metrics.insert(0, 'word_vector_length', vec_len)
    df_lst.append(metrics)
full_metric_df_lr = pd.concat(df_lst, axis=0)

In [None]:
# Display the metrics table built from the logistic-regression grid search on
# the unscaled embeddings.
full_metric_df_lr

In [None]:
# Grid search for logistic regression on the standardized embeddings.
# 2 penalties x 4 values of C = 8 candidates per embedding size.
params = {'penalty': ['l1', 'l2'], 'C': [.1, 1, 10, 100]}
scores = ['accuracy', 'precision', 'recall', 'f1']

# saga is a stochastic solver that shuffles the data, so pin its seed; without
# it the reported metrics change from one run of the notebook to the next.
logreg = LogisticRegression(solver='saga', random_state=42)

# One cv_results_ dict per entry of scaled_train_feats (built in the order
# 50, 100, 200, 300).
scaled_results = []
for feat_set in scaled_train_feats[:4]:
    # refit=False: only the CV metrics are needed, and multi-metric scoring
    # would otherwise require naming a single metric to refit on.
    search = GridSearchCV(logreg, params, scoring=scores, refit=False)
    # Only the first 10000 training rows are searched.
    search.fit(feat_set[:10000], y_train[:10000])
    scaled_results.append(search.cv_results_)

In [None]:
# Assemble the mean cross-validated metrics of every logistic-regression
# candidate fitted on the standardized embeddings into one table.
#
# `scaled_results` holds one cv_results_ dict per embedding size, in the order
# 50, 100, 200, 300. The labels must follow that order (the earlier
# '25','50','100','200' labelling did not match the features).
#
# NOTE: this reuses the name `full_metric_df_lr`, replacing the table built
# from the unscaled features.
vector_lengths = ['50', '100', '200', '300']
df_lst = []
for vec_len, result in zip(vector_lengths, scaled_results):
    metrics = pd.DataFrame({
        'params': result['params'],
        'accuracy': result['mean_test_accuracy'],
        'precision': result['mean_test_precision'],
        'recall': result['mean_test_recall'],
        'f1_score': result['mean_test_f1'],
    })
    # A scalar label is broadcast to every row, so this stays correct if the
    # number of grid candidates changes.
    metrics.insert(0, 'word_vector_length', vec_len)
    df_lst.append(metrics)
full_metric_df_lr = pd.concat(df_lst, axis=0)

In [None]:
# Display the metrics table for the standardized-feature runs (the name was
# reassigned in the preceding cell).
full_metric_df_lr

In [None]:
%%time
# Grid search for a random forest on the unscaled embeddings.
# 2 forest sizes x 2 max_features settings = 4 candidates per embedding size.
# 'sqrt' replaces 'auto': for classifiers 'auto' was an alias of 'sqrt', and
# newer scikit-learn releases reject 'auto' outright.
params = {'n_estimators': [500, 1000], 'max_features': ['sqrt', 'log2']}
scores = ['accuracy', 'precision', 'recall', 'f1']

# Seed the forest so bootstrap samples and feature draws repeat between runs.
rf = RandomForestClassifier(random_state=42)

# One cv_results_ dict per embedding size, in the order 50, 100, 200, 300.
results_rf = []
for feat_set in (X_train_50, X_train_100, X_train_200, X_train_300):
    # refit=False: only the CV metrics are needed, and multi-metric scoring
    # would otherwise require naming a single metric to refit on.
    search = GridSearchCV(rf, params, scoring=scores, refit=False, n_jobs=3)
    # Only the first 10000 training rows are searched.
    search.fit(feat_set[:10000], y_train[:10000])
    results_rf.append(search.cv_results_)

In [None]:
# Label column: each embedding size repeated once per grid candidate (4 per
# search), in the same order the searches were appended to results_rf.
feats = [size for size in ['50', '100', '200', '300'] for _ in range(4)]

# One frame of mean cross-validated metrics per search.
df_lst = [
    pd.DataFrame({
        'params': cv['params'],
        'accuracy': cv['mean_test_accuracy'],
        'precision': cv['mean_test_precision'],
        'recall': cv['mean_test_recall'],
        'f1_score': cv['mean_test_f1'],
    })
    for cv in results_rf
]

# Stack the frames and put the embedding-size label first.
full_metric_df_rf = pd.concat(df_lst, axis=0)
full_metric_df_rf.insert(0, 'word_vector_length', feats)

In [None]:
# Display the metrics table built from the random-forest grid search.
full_metric_df_rf

In [None]:
from xgboost import XGBClassifier

In [None]:
%%time
# Grid search for XGBoost on the unscaled embeddings.
# 3 n_estimators x 2 max_depth x 3 eta = 18 candidates per embedding size.
params = {
    'n_estimators': [10, 100, 250],
    'max_depth': [10, 50],
    'eta': [.1, .3, .5],
}
scores = ['accuracy', 'precision', 'recall', 'f1']
xgb = XGBClassifier(n_jobs=3)

# One cv_results_ dict per embedding size, in the order 50, 100, 200, 300.
results_xgb = []
for train_matrix in (X_train_50, X_train_100, X_train_200, X_train_300):
    grid = GridSearchCV(xgb, params, refit=False, scoring=scores, n_jobs=3)
    # Only the first 10000 training rows are searched.
    grid.fit(train_matrix[:10000], y_train[:10000])
    results_xgb.append(grid.cv_results_)

In [None]:
# Label column: each embedding size repeated once per grid candidate (18 per
# search), in the same order the searches were appended to results_xgb.
feats = [size for size in ['50', '100', '200', '300'] for _ in range(18)]

# One frame of mean cross-validated metrics per search.
df_lst = [
    pd.DataFrame({
        'params': cv['params'],
        'accuracy': cv['mean_test_accuracy'],
        'precision': cv['mean_test_precision'],
        'recall': cv['mean_test_recall'],
        'f1_score': cv['mean_test_f1'],
    })
    for cv in results_xgb
]

# Stack the frames and put the embedding-size label first.
full_metric_df_xgb = pd.concat(df_lst, axis=0)
full_metric_df_xgb.insert(0, 'word_vector_length', feats)

In [None]:
# Display the last 20 rows of the XGBoost metrics table.
full_metric_df_xgb.tail(20)