In [28]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import preprocessor as p
import string
import re

In [29]:
import nltk
from nltk import TweetTokenizer
from nltk.stem import WordNetLemmatizer

In [30]:
larger_tweets = pd.read_csv('Desktop/full_tweets.csv', usecols = [0,5], names = ['label','tweet'], encoding = 'latin-1')

In [31]:
larger_tweets['label'] = larger_tweets['label'].map({0:1, 2:0, 4:0})

In [5]:
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.NUMBER)
larger_tweets['tweet_cleaned'] = larger_tweets['tweet'].apply(lambda x : p.clean(x))
larger_tweets['tweet_cleaned'] = larger_tweets['tweet_cleaned'].apply(lambda x : re.sub(r'#', '', x))

In [6]:
tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True)
larger_tweets['tokenized_tweets'] = larger_tweets['tweet_cleaned'].apply(
    lambda x : tokenizer.tokenize(x))

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [8]:
X_train, X_test, y_train, y_test = train_test_split(larger_tweets['tokenized_tweets'], larger_tweets['label'],
                                                   test_size = .25, random_state = 42)

In [9]:
def word_vec_generator_ft(data, vectors, vec_size):
    dense_feature_lst = []
    for tweet in tqdm(data.values):
        tot_vec = np.zeros((vec_size,))
        if len(tweet) == 0:
            dense_feature_lst.append(tot_vec)
            continue
        for word in tweet:
            tot_vec += vectors.get_word_vector(word)
        tot_vec = tot_vec / len(tweet)
        dense_feature_lst.append(tot_vec)
    return np.array(dense_feature_lst)

In [10]:
import fasttext.util
ft_eng = fasttext.load_model('cc.en.300.bin')



In [11]:
X_train_300 = word_vec_generator_ft(X_train, ft_eng, 300)
X_test_300 = word_vec_generator_ft(X_test, ft_eng, 300)

100%|███████████████████████████████| 1200000/1200000 [10:29<00:00, 1905.09it/s]
100%|█████████████████████████████████| 400000/400000 [04:40<00:00, 1427.40it/s]


In [12]:
fasttext.util.reduce_model(ft_eng, 200)
X_train_200 = word_vec_generator_ft(X_train, ft_eng, 200)
X_test_200 = word_vec_generator_ft(X_test, ft_eng, 200)

100%|███████████████████████████████| 1200000/1200000 [06:58<00:00, 2865.36it/s]
100%|█████████████████████████████████| 400000/400000 [03:06<00:00, 2142.98it/s]


In [13]:
fasttext.util.reduce_model(ft_eng, 100)
X_train_100 = word_vec_generator_ft(X_train, ft_eng, 100)
X_test_100 = word_vec_generator_ft(X_test, ft_eng, 100)

100%|███████████████████████████████| 1200000/1200000 [05:32<00:00, 3613.02it/s]
100%|█████████████████████████████████| 400000/400000 [01:46<00:00, 3756.95it/s]


In [14]:
fasttext.util.reduce_model(ft_eng, 50)
X_train_50 = word_vec_generator_ft(X_train, ft_eng, 50)
X_test_50 = word_vec_generator_ft(X_test, ft_eng, 50)

100%|███████████████████████████████| 1200000/1200000 [02:46<00:00, 7208.94it/s]
100%|█████████████████████████████████| 400000/400000 [00:48<00:00, 8177.17it/s]


In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [35]:
train_lst = [X_train_50, X_train_100, X_train_200, X_train_300]
test_lst = [X_test_50, X_test_100, X_test_200, X_test_300]
scaled_train_feats = []
scaled_test_feats = []
for feats in zip(train_lst,test_lst):
    scaler = StandardScaler()
    fitted = scaler.fit(feats[0])
    scaled_train_feats.append(fitted.transform(feats[0]))
    scaled_test_feats.append(fitted.transform(feats[1]))

In [40]:
params = {'penalty': ['l1', 'l2'], 'C': [.1,1,10,100]}
scores = ['accuracy', 'precision', 'recall', 'f1']
logreg = LogisticRegression(solver='saga')
results = []
for feat_set in [X_train_50[:10000], X_train_100[:10000], X_train_200[:10000], X_train_300[:10000]]:
    search = GridSearchCV(logreg, params, scoring = scores, refit = False)
    search.fit(feat_set, y_train[:10000])
    results.append(search.cv_results_)



In [41]:
feats = []
for feat_length in ['25','50','100','200']:
    feats.append([feat_length]*8)
feats = [val for lst in feats for val in lst]
df_lst = []
for result in results:
    dic = {}
    dic['params'] = result['params']
    dic['accuracy'] = result['mean_test_accuracy']
    dic['precision'] = result['mean_test_precision']
    dic['recall'] = result['mean_test_recall']
    dic['f1_score'] = result['mean_test_f1']
    df_lst.append(pd.DataFrame(dic))
full_metric_df_lr = pd.concat(df_lst, axis = 0)
full_metric_df_lr.insert(0,'word_vector_length',feats)

In [42]:
full_metric_df_lr

Unnamed: 0,word_vector_length,params,accuracy,precision,recall,f1_score
0,25,"{'C': 0.1, 'penalty': 'l1'}",0.6653,0.660333,0.667879,0.663887
1,25,"{'C': 0.1, 'penalty': 'l2'}",0.664,0.657023,0.671919,0.66425
2,25,"{'C': 1, 'penalty': 'l1'}",0.6985,0.696885,0.692121,0.694405
3,25,"{'C': 1, 'penalty': 'l2'}",0.6954,0.691978,0.693535,0.692661
4,25,"{'C': 10, 'penalty': 'l1'}",0.6991,0.697352,0.692929,0.695077
5,25,"{'C': 10, 'penalty': 'l2'}",0.7006,0.698833,0.694545,0.696613
6,25,"{'C': 100, 'penalty': 'l1'}",0.699,0.697362,0.692525,0.694877
7,25,"{'C': 100, 'penalty': 'l2'}",0.6985,0.696899,0.691919,0.694338
0,50,"{'C': 0.1, 'penalty': 'l1'}",0.6651,0.660266,0.667071,0.663486
1,50,"{'C': 0.1, 'penalty': 'l2'}",0.6726,0.669172,0.669697,0.669318


In [37]:
params = {'penalty': ['l1', 'l2'], 'C': [.1,1,10,100]}
scores = ['accuracy', 'precision', 'recall', 'f1']
logreg = LogisticRegression(solver='saga')
scaled_results = []
for feat_set in [scaled_train_feats[0][:10000], scaled_train_feats[1][:10000], scaled_train_feats[2][:10000], scaled_train_feats[3][:10000]]:
    search = GridSearchCV(logreg, params, scoring = scores, refit = False)
    search.fit(feat_set, y_train[:10000])
    scaled_results.append(search.cv_results_)



In [38]:
feats = []
for feat_length in ['25','50','100','200']:
    feats.append([feat_length]*8)
feats = [val for lst in feats for val in lst]
df_lst = []
for result in scaled_results:
    dic = {}
    dic['params'] = result['params']
    dic['accuracy'] = result['mean_test_accuracy']
    dic['precision'] = result['mean_test_precision']
    dic['recall'] = result['mean_test_recall']
    dic['f1_score'] = result['mean_test_f1']
    df_lst.append(pd.DataFrame(dic))
full_metric_df_lr = pd.concat(df_lst, axis = 0)
full_metric_df_lr.insert(0,'word_vector_length',feats)

In [39]:
full_metric_df_lr

Unnamed: 0,word_vector_length,params,accuracy,precision,recall,f1_score
0,25,"{'C': 0.1, 'penalty': 'l1'}",0.6987,0.697065,0.692323,0.694603
1,25,"{'C': 0.1, 'penalty': 'l2'}",0.699,0.697383,0.692525,0.694875
2,25,"{'C': 1, 'penalty': 'l1'}",0.6986,0.697032,0.691919,0.694409
3,25,"{'C': 1, 'penalty': 'l2'}",0.6991,0.697672,0.692121,0.694826
4,25,"{'C': 10, 'penalty': 'l1'}",0.699,0.697524,0.692121,0.694755
5,25,"{'C': 10, 'penalty': 'l2'}",0.699,0.697524,0.692121,0.694755
6,25,"{'C': 100, 'penalty': 'l1'}",0.6991,0.697586,0.692323,0.694888
7,25,"{'C': 100, 'penalty': 'l2'}",0.6991,0.697586,0.692323,0.694888
0,50,"{'C': 0.1, 'penalty': 'l1'}",0.7186,0.72059,0.705051,0.712663
1,50,"{'C': 0.1, 'penalty': 'l2'}",0.7158,0.717795,0.702222,0.709779


In [47]:
%%time
params = {'n_estimators': [500,1000], 'max_features': ['auto','log2']}
scores = ['accuracy', 'precision', 'recall', 'f1']
rf = RandomForestClassifier()
results_rf = []
for feat_set in [X_train_50[:10000], X_train_100[:10000], X_train_200[:10000], X_train_300[:10000]]:
    search = GridSearchCV(rf, params, scoring = scores, refit = False, n_jobs=3)
    search.fit(feat_set, y_train[:10000])
    results_rf.append(search.cv_results_)

CPU times: user 390 ms, sys: 431 ms, total: 821 ms
Wall time: 18min 32s


In [48]:
feats = []
for feat_length in ['50','100','200','300']:
    feats.append([feat_length]*4)
feats = [val for lst in feats for val in lst]
df_lst = []
for result in results_rf:
    dic = {}
    dic['params'] = result['params']
    dic['accuracy'] = result['mean_test_accuracy']
    dic['precision'] = result['mean_test_precision']
    dic['recall'] = result['mean_test_recall']
    dic['f1_score'] = result['mean_test_f1']
    df_lst.append(pd.DataFrame(dic))
full_metric_df_rf = pd.concat(df_lst, axis = 0)
full_metric_df_rf.insert(0,'word_vector_length',feats)

In [49]:
full_metric_df_rf

Unnamed: 0,word_vector_length,params,accuracy,precision,recall,f1_score
0,50,"{'max_features': 'auto', 'n_estimators': 500}",0.6772,0.678179,0.662222,0.669926
1,50,"{'max_features': 'auto', 'n_estimators': 1000}",0.6804,0.681838,0.664242,0.672841
2,50,"{'max_features': 'log2', 'n_estimators': 500}",0.6757,0.677654,0.657778,0.667481
3,50,"{'max_features': 'log2', 'n_estimators': 1000}",0.6793,0.680422,0.66404,0.672052
0,100,"{'max_features': 'auto', 'n_estimators': 500}",0.6761,0.676694,0.66202,0.669139
1,100,"{'max_features': 'auto', 'n_estimators': 1000}",0.678,0.679396,0.661818,0.670334
2,100,"{'max_features': 'log2', 'n_estimators': 500}",0.6773,0.678491,0.661414,0.669754
3,100,"{'max_features': 'log2', 'n_estimators': 1000}",0.6801,0.680737,0.666465,0.673384
0,200,"{'max_features': 'auto', 'n_estimators': 500}",0.686,0.68607,0.674343,0.680083
1,200,"{'max_features': 'auto', 'n_estimators': 1000}",0.6839,0.682945,0.674949,0.678732


In [50]:
from xgboost import XGBClassifier

In [56]:
%%time
params = {'n_estimators': [10,100,250], 'max_depth':[10,50], 'eta':[.1,.3,.5]}
scores = ['accuracy', 'precision', 'recall', 'f1']
xgb = XGBClassifier(n_jobs=3)
results_xgb = []
for feat_set in [X_train_50[:10000], X_train_100[:10000], X_train_200[:10000], X_train_300[:10000]]:
    search = GridSearchCV(xgb, params, scoring = scores, refit = False, n_jobs=3)
    search.fit(feat_set, y_train[:10000])
    results_xgb.append(search.cv_results_)

Wall time: 1h 32min 13s


In [57]:
feats = []
for feat_length in ['50','100','200','300']:
    feats.append([feat_length]*18)
feats = [val for lst in feats for val in lst]
df_lst = []
for result in results_xgb:
    dic = {}
    dic['params'] = result['params']
    dic['accuracy'] = result['mean_test_accuracy']
    dic['precision'] = result['mean_test_precision']
    dic['recall'] = result['mean_test_recall']
    dic['f1_score'] = result['mean_test_f1']
    df_lst.append(pd.DataFrame(dic))
full_metric_df_xgb = pd.concat(df_lst, axis = 0)
full_metric_df_xgb.insert(0,'word_vector_length',feats)

In [58]:
full_metric_df_xgb.tail(20)

Unnamed: 0,word_vector_length,params,accuracy,precision,recall,f1_score
16,200,"{'eta': 0.5, 'max_depth': 50, 'n_estimators': ...",0.6888,0.686275,0.68404,0.685
17,200,"{'eta': 0.5, 'max_depth': 50, 'n_estimators': ...",0.6952,0.694278,0.686667,0.690342
0,300,"{'eta': 0.1, 'max_depth': 10, 'n_estimators': 10}",0.6535,0.6463,0.662828,0.654392
1,300,"{'eta': 0.1, 'max_depth': 10, 'n_estimators': ...",0.7012,0.697711,0.699596,0.698538
2,300,"{'eta': 0.1, 'max_depth': 10, 'n_estimators': ...",0.7106,0.707039,0.709293,0.708054
3,300,"{'eta': 0.1, 'max_depth': 50, 'n_estimators': 10}",0.6411,0.637426,0.637778,0.637507
4,300,"{'eta': 0.1, 'max_depth': 50, 'n_estimators': ...",0.6998,0.695472,0.7,0.697591
5,300,"{'eta': 0.1, 'max_depth': 50, 'n_estimators': ...",0.7105,0.707856,0.707071,0.707303
6,300,"{'eta': 0.3, 'max_depth': 10, 'n_estimators': 10}",0.6531,0.650081,0.648081,0.648967
7,300,"{'eta': 0.3, 'max_depth': 10, 'n_estimators': ...",0.6978,0.695633,0.693333,0.694125











changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.



