In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
import ast

# Cap DataFrame reprs at 10 rows so printed frames stay short.
pd.options.display.max_rows = 10

## Logistic regression on word-level TF-IDF features, all labels

In [None]:
# Load the two pre-cleaned comment frames. Pickle files execute code on load, so
# only read pickles that were produced by a trusted step of this project.
full_set, full_set_sw = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('full_cleaned.pkl', 'full_cleaned_sw.pkl')
)

# NOTE(review): `Word2Vec` is not imported in the visible import cell, and the
# `labels` / `n_train` names used by every later cell are not defined in any
# visible cell -- confirm where they come from before a Restart & Run All.
word2vec_model = Word2Vec.load('models/Myword2vec.model')

In [34]:
%%time

# Bookkeeping frames: one row of tuning results per label, and one column of
# predicted probabilities per label for every row of `full_set`.
word_lg_records = pd.DataFrame(
    columns=['best_params', 'cv_score', 'train_score', 'test_score'],
    index=labels,
)
word_lg_full = pd.DataFrame(columns=labels, index=full_set.index)

for label in labels:
    print('Start training classifier on {} label'.format(label))

    # Stratified 80/20 hold-out split of the first `n_train` rows. The positional
    # row numbers are split alongside the data so each side's membership is known.
    x_train, x_test, y_train, y_test, train_idx, test_idx = train_test_split(
        full_set.cleaned_text[:n_train],
        full_set[label][:n_train],
        np.arange(n_train),
        test_size=0.2,
        random_state=2018,
        stratify=full_set[label][:n_train],
    )

    # Default (word-level) TF-IDF features feeding a logistic regression.
    word_lg = Pipeline(steps=[
        ('tfidf', TfidfVectorizer()),
        ('lg', LogisticRegression()),
    ])
    parameters = {
        'tfidf__max_df': [0.5, 0.4, 0.2],
        'tfidf__min_df': [0.00005, 0.0001, 0.0002],
        'lg__class_weight': ['balanced'],
        'lg__solver': ['lbfgs'],
        'lg__C': [0.05, 0.1, 0.2, 0.4, 0.8, 1],
    }

    # 5-fold grid search scored by ROC AUC, fitted on the training side only.
    word_lg_cv = GridSearchCV(
        estimator=word_lg,
        param_grid=parameters,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    word_lg_cv.fit(x_train, y_train)

    # Record the winning configuration, its CV score, and the ROC AUC of the
    # search object's predictions on the training and hold-out sides.
    word_lg_records.loc[label, 'best_params'] = str(word_lg_cv.best_params_)
    word_lg_records.loc[label, 'cv_score'] = word_lg_cv.best_score_
    for score_column, split_x, split_y in (
        ('train_score', x_train, y_train),
        ('test_score', x_test, y_test),
    ):
        word_lg_records.loc[label, score_column] = metrics.roc_auc_score(
            split_y, word_lg_cv.predict_proba(split_x)[:, 1]
        )
    print('Best params: {}, cv_score: {}, train_score: {}, test_score: {}'.format(*word_lg_records.loc[label]))

    # Refit the pipeline with the winning parameters on all `n_train` rows, then
    # store the positive-class probability for every row of `full_set`.
    word_lg.set_params(**word_lg_cv.best_params_)
    print('Fitting final model and making prediction for submission on {} label'.format(label))
    word_lg.fit(full_set.cleaned_text[:n_train], full_set[label][:n_train])
    word_lg_full[label] = word_lg.predict_proba(full_set.cleaned_text)[:, 1]

Start training classifier on toxic label
Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 17.4min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 31.8min finished


Best params: {'lg__C': 0.8, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.5, 'tfidf__min_df': 5e-05}, cv_score: 0.9686441612554454, train_score: 0.9872585613774817, test_score: 0.9717791800531692
Fitting final model and making prediction for submission on toxic label
Start training classifier on severe_toxic label
Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 31.9min finished


Best params: {'lg__C': 0.1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.5, 'tfidf__min_df': 5e-05}, cv_score: 0.9851474552431835, train_score: 0.9917170363946917, test_score: 0.9833844191221379
Fitting final model and making prediction for submission on severe_toxic label
Start training classifier on obscene label
Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 15.8min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 28.9min finished


Best params: {'lg__C': 0.4, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.2, 'tfidf__min_df': 5e-05}, cv_score: 0.9848137941650619, train_score: 0.9943263376125865, test_score: 0.9831182110502592
Fitting final model and making prediction for submission on obscene label
Start training classifier on threat label
Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 16.1min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 30.3min finished


Best params: {'lg__C': 0.1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.5, 'tfidf__min_df': 0.0001}, cv_score: 0.9811428041068667, train_score: 0.9973667522113111, test_score: 0.9736231038582817
Fitting final model and making prediction for submission on threat label
Start training classifier on insult label
Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 29.5min finished


Best params: {'lg__C': 0.4, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.2, 'tfidf__min_df': 5e-05}, cv_score: 0.9761933842973832, train_score: 0.9889081949539981, test_score: 0.9758932102834541
Fitting final model and making prediction for submission on insult label
Start training classifier on identity_hate label
Fitting 5 folds for each of 54 candidates, totalling 270 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 16.1min
[Parallel(n_jobs=-1)]: Done 270 out of 270 | elapsed: 30.6min finished


Best params: {'lg__C': 0.2, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.5, 'tfidf__min_df': 5e-05}, cv_score: 0.9727445487435387, train_score: 0.9927667679299728, test_score: 0.9790201632236318
Fitting final model and making prediction for submission on identity_hate label
Wall time: 3h 11min 54s


In [35]:
# One best-parameter string per line, in the row order of the records frame.
print('\n'.join(str(best) for best in word_lg_records['best_params']))
# Score columns only (everything after `best_params`), followed by their column means.
print(word_lg_records.iloc[:, 1:], word_lg_records.iloc[:, 1:].mean(), sep='\n')

{'lg__C': 0.8, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.5, 'tfidf__min_df': 5e-05}
{'lg__C': 0.1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.5, 'tfidf__min_df': 5e-05}
{'lg__C': 0.4, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.2, 'tfidf__min_df': 5e-05}
{'lg__C': 0.1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.5, 'tfidf__min_df': 0.0001}
{'lg__C': 0.4, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.2, 'tfidf__min_df': 5e-05}
{'lg__C': 0.2, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.5, 'tfidf__min_df': 5e-05}
               cv_score train_score test_score
toxic          0.968644    0.987259   0.971779
severe_toxic   0.985147    0.991717   0.983384
obscene        0.984814    0.994326   0.983118
threat         0.981143    0.997367   0.973623
insult         0.976193    0.988908   0.975893
identity_hate  0.9727

{'lg__C': 0.8, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.0001}
{'lg__C': 0.1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.0001}
{'lg__C': 0.4, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001}
{'lg__C': 0.1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.0001}
{'lg__C': 0.4, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001}
{'lg__C': 0.2, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.0001}
               cv_score train_score test_score
toxic           0.96785     0.98584   0.971031
severe_toxic   0.984672    0.991401    0.98346
obscene        0.984222    0.993775   0.981968
threat         0.981143    0.997367   0.973623
insult          0.97526    0.988024   0.975399
identity_hate  0.971298    0.992237   0.978517
cv_score       0.977407
train_score    0.991440
test_score     0.977333
dtype: float64

{'lg__C': 0.8, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.5, 'tfidf__min_df': 5e-05}
{'lg__C': 0.1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.5, 'tfidf__min_df': 5e-05}
{'lg__C': 0.4, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.2, 'tfidf__min_df': 5e-05}
{'lg__C': 0.1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.5, 'tfidf__min_df': 0.0001}
{'lg__C': 0.4, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.2, 'tfidf__min_df': 5e-05}
{'lg__C': 0.2, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__max_df': 0.5, 'tfidf__min_df': 5e-05}
               cv_score train_score test_score
toxic          0.968644    0.987259   0.971779
severe_toxic   0.985147    0.991717   0.983384
obscene        0.984814    0.994326   0.983118
threat         0.981143    0.997367   0.973623
insult         0.976193    0.988908   0.975893
identity_hate  0.972745    0.992767    0.97902
cv_score       0.978114
train_score    0.992057
test_score     0.977803

                  'tfidf__max_df': [0.5, 0.4, 0.3, 0.2],\
                  'tfidf__min_df': [0.00005, 0.0001, 0.0002, 0.0003, 0.001, 0.003],\
                  'lg__class_weight': ['balanced'],\
                  'lg__solver': ['lbfgs'],\
                  'lg__C': [0.05, 0.1, 0.2, 0.4, 0.8, 1]

In [43]:
# Persist the per-row probabilities without an index column, and the per-label
# tuning records together with their label index.
word_lg_full.to_csv(path_or_buf='word_lg_full.csv', index=False)
word_lg_records.to_csv(path_or_buf='word_lg_records.csv', index=True)

## Logistic regression on character-level TF-IDF features, all labels

In [37]:
%%time

# Bookkeeping frames: one row of tuning results per label, and one column of
# predicted probabilities per label for every row of `full_set`.
char_lg_records = pd.DataFrame(
    columns=['best_params', 'cv_score', 'train_score', 'test_score'],
    index=labels,
)
char_lg_full = pd.DataFrame(columns=labels, index=full_set.index)

for label in labels:
    print('Start training classifier on {} label'.format(label))

    # Stratified 80/20 hold-out split of the first `n_train` rows. The positional
    # row numbers are split alongside the data so each side's membership is known.
    x_train, x_test, y_train, y_test, train_idx, test_idx = train_test_split(
        full_set.cleaned_text[:n_train],
        full_set[label][:n_train],
        np.arange(n_train),
        test_size=0.2,
        random_state=2018,
        stratify=full_set[label][:n_train],
    )

    # TF-IDF features feeding a logistic regression; the grid below switches the
    # vectorizer to character n-grams of length 1 to 5.
    char_lg = Pipeline(steps=[
        ('tfidf', TfidfVectorizer()),
        ('lg', LogisticRegression()),
    ])
    parameters = {
        'tfidf__analyzer': ['char'],
        'tfidf__max_df': [0.4, 0.2],
        'tfidf__min_df': [0.0001, 0.0003],
        'tfidf__ngram_range': [(1,5)],
        # 'tfidf__max_features': [40000],  (option currently disabled)
        'lg__class_weight': ['balanced'],
        'lg__solver': ['lbfgs'],
        'lg__C': [0.1, 0.2, 0.4, 0.8, 1],
    }

    # 3-fold grid search scored by ROC AUC, fitted on the training side only.
    char_lg_cv = GridSearchCV(
        estimator=char_lg,
        param_grid=parameters,
        scoring='roc_auc',
        cv=3,
        n_jobs=-1,
        verbose=1,
    )
    char_lg_cv.fit(x_train, y_train)

    # Record the winning configuration, its CV score, and the ROC AUC of the
    # search object's predictions on the training and hold-out sides.
    char_lg_records.loc[label, 'best_params'] = str(char_lg_cv.best_params_)
    char_lg_records.loc[label, 'cv_score'] = char_lg_cv.best_score_
    for score_column, split_x, split_y in (
        ('train_score', x_train, y_train),
        ('test_score', x_test, y_test),
    ):
        char_lg_records.loc[label, score_column] = metrics.roc_auc_score(
            split_y, char_lg_cv.predict_proba(split_x)[:, 1]
        )
    print('Best params: {}, cv_score: {}, train_score: {}, test_score: {}'.format(*char_lg_records.loc[label]))

    # Refit the pipeline with the winning parameters on all `n_train` rows, then
    # store the positive-class probability for every row of `full_set`.
    char_lg.set_params(**char_lg_cv.best_params_)
    print('Fitting final model and making prediction for submission on {} label'.format(label))
    char_lg.fit(full_set.cleaned_text[:n_train], full_set[label][:n_train])
    char_lg_full[label] = char_lg.predict_proba(full_set.cleaned_text)[:, 1]

Start training classifier on toxic label
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 76.6min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 104.6min finished


Best params: {'lg__C': 1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9688033831267864, train_score: 0.9923210747108505, test_score: 0.9725522185757541
Fitting final model and making prediction for submission on toxic label
Start training classifier on severe_toxic label
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 71.0min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 99.7min finished


Best params: {'lg__C': 0.2, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9872511038013121, train_score: 0.993530780682829, test_score: 0.9875079917659509
Fitting final model and making prediction for submission on severe_toxic label
Start training classifier on obscene label
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 77.2min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 166.8min finished


Best params: {'lg__C': 1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9886951396500243, train_score: 0.9973444868645687, test_score: 0.988596326368802
Fitting final model and making prediction for submission on obscene label
Start training classifier on threat label
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 82.1min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 111.3min finished


Best params: {'lg__C': 0.4, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9761448347308186, train_score: 0.999219692732018, test_score: 0.9760630113558986
Fitting final model and making prediction for submission on threat label
Start training classifier on insult label
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 87.7min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 123.4min finished


Best params: {'lg__C': 1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9778383091315479, train_score: 0.9935849276972556, test_score: 0.9778042816335499
Fitting final model and making prediction for submission on insult label
Start training classifier on identity_hate label
Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 87.3min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 122.9min finished


Best params: {'lg__C': 0.4, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9795780395096644, train_score: 0.9965552579050235, test_score: 0.9858488220588821
Fitting final model and making prediction for submission on identity_hate label
Wall time: 13h 26min 45s


In [38]:
# One best-parameter string per line, in the row order of the records frame.
print('\n'.join(str(best) for best in char_lg_records['best_params']))
# Score columns only (everything after `best_params`), followed by their column means.
print(char_lg_records.iloc[:, 1:], char_lg_records.iloc[:, 1:].mean(), sep='\n')

{'lg__C': 1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}
{'lg__C': 0.2, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'lg__C': 1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}
{'lg__C': 0.4, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}
{'lg__C': 1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}
{'lg__C': 0.4, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__min_df': 

{'lg__C': 0.8, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}
{'lg__C': 0.1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}
{'lg__C': 0.8, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}
{'lg__C': 0.2, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.4, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}
{'lg__C': 0.8, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}
{'lg__C': 0.4, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}
               cv_score train_score test_score
toxic          0.967772    0.988358   0.971176
severe_toxic   0.987057    0.992343    0.98716
obscene        0.988175    0.996573   0.988169
threat         0.975422    0.998486   0.976327
insult         0.977073    0.991987   0.977095
identity_hate  0.979267    0.996085   0.985326
cv_score       0.979128
train_score    0.993972
test_score     0.980875
dtype: float64

{'lg__C': 1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}
{'lg__C': 0.2, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'lg__C': 1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}
{'lg__C': 0.4, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}
{'lg__C': 1, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001, 'tfidf__ngram_range': (1, 5)}
{'lg__C': 0.4, 'lg__class_weight': 'balanced', 'lg__solver': 'lbfgs', 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
               cv_score train_score test_score
toxic          0.968803    0.992321   0.972552
severe_toxic   0.987251    0.993531   0.987508
obscene        0.988695    0.997344   0.988596
threat         0.976145     0.99922   0.976063
insult         0.977838    0.993585   0.977804
identity_hate  0.979578    0.996555   0.985849
cv_score       0.979718
train_score    0.995426
test_score     0.981395

              'tfidf__analyzer': ['char'],\
              'tfidf__max_df': [0.4, 0.2],\
              'tfidf__min_df': [0.0001],\
              'tfidf__ngram_range': [(1,5)],\
              'tfidf__max_features': [40000],\
              'lg__class_weight': ['balanced'],\
              'lg__solver': ['lbfgs'],\
              'lg__C': [0.1, 0.2, 0.4, 0.8]

In [44]:
# Persist the per-row probabilities without an index column, and the per-label
# tuning records together with their label index.
char_lg_full.to_csv(path_or_buf='char_lg_full.csv', index=False)
char_lg_records.to_csv(path_or_buf='char_lg_records.csv', index=True)

## Logistic regression on averaged word2vec embeddings, all labels

In [None]:
%%time
def get_comment_vect(comment_column, model=None):
    """Embed each comment as the mean of its in-vocabulary word vectors.

    Each comment is split on whitespace; words missing from the model's
    vocabulary are skipped. A comment with no in-vocabulary word is mapped to
    an all-zero vector of the model's own dimensionality, so every row has the
    same length whatever size the model was trained with (the previous
    hard-coded 300 produced ragged rows for a model of any other size).

    Parameters
    ----------
    comment_column : iterable of str
        The comments to embed, e.g. a pandas Series of cleaned text.
    model : optional
        Object exposing `vector_size` and a `wv` attribute with `vocab` and
        `word_vec(word)`. Defaults to the notebook-level `word2vec_model`.

    Returns
    -------
    numpy.ndarray
        One row per comment and one column per embedding dimension
        (an empty array when `comment_column` is empty).
    """
    if model is None:
        model = word2vec_model

    keyed_vectors = model.wv
    # Fallback row for comments without any known word; sized from the model.
    zero_vector = np.zeros(model.vector_size)

    rows = []
    for comment in comment_column:
        known_vectors = [
            keyed_vectors.word_vec(word)
            for word in comment.split()
            if word in keyed_vectors.vocab
        ]
        rows.append(np.mean(known_vectors or [zero_vector], axis=0))
    return np.array(rows)
# Embed the `cleaned_text` column of `full_set_sw` and wrap the resulting array
# in a DataFrame (default integer row and column labels).
comment_vect = pd.DataFrame(data=get_comment_vect(full_set_sw['cleaned_text']))

In [114]:
%%time

# Bookkeeping frames: one row of tuning results per label, and one column of
# predicted probabilities per label, indexed like `full_set`.
w2v_lg_records = pd.DataFrame(
    columns=['best_params', 'cv_score', 'train_score', 'test_score'],
    index=labels,
)
w2v_lg_full = pd.DataFrame(columns=labels, index=full_set.index)

for label in labels:
    print('Start training classifier on {} label'.format(label))

    # Stratified 80/20 hold-out split of the first `n_train` embedding rows. The
    # positional row numbers are split alongside so each side's membership is known.
    x_train, x_test, y_train, y_test, train_idx, test_idx = train_test_split(
        comment_vect[:n_train],
        full_set_sw[label][:n_train],
        np.arange(n_train),
        test_size=0.2,
        random_state=2018,
        stratify=full_set_sw[label][:n_train],
    )

    # Plain logistic regression on the comment embeddings; only C is searched.
    w2v_lg = LogisticRegression()
    parameters = {
        'class_weight': ['balanced'],
        'solver': ['lbfgs'],
        'C': [0.001, 0.003, 0.01, 0.03, 0.1, 0.3],
    }

    # 5-fold grid search scored by ROC AUC, fitted on the training side only.
    w2v_lg_cv = GridSearchCV(
        estimator=w2v_lg,
        param_grid=parameters,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
        verbose=5,
    )
    w2v_lg_cv.fit(x_train, y_train)

    # Record the winning configuration, its CV score, and the ROC AUC of the
    # search object's predictions on the training and hold-out sides.
    w2v_lg_records.loc[label, 'best_params'] = str(w2v_lg_cv.best_params_)
    w2v_lg_records.loc[label, 'cv_score'] = w2v_lg_cv.best_score_
    for score_column, split_x, split_y in (
        ('train_score', x_train, y_train),
        ('test_score', x_test, y_test),
    ):
        w2v_lg_records.loc[label, score_column] = metrics.roc_auc_score(
            split_y, w2v_lg_cv.predict_proba(split_x)[:, 1]
        )
    print('Best params: {}, cv_score: {}, train_score: {}, test_score: {}'.format(*w2v_lg_records.loc[label]))

    # Refit with the winning parameters on all `n_train` rows, then store the
    # positive-class probability for every row of `comment_vect`.
    w2v_lg.set_params(**w2v_lg_cv.best_params_)
    print('Fitting final model and making prediction for submission on {} label'.format(label))
    w2v_lg.fit(comment_vect[:n_train], full_set_sw[label][:n_train])
    w2v_lg_full[label] = w2v_lg.predict_proba(comment_vect)[:, 1]

Start training classifier on toxic label
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   46.8s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.6min finished


Best params: {'C': 0.3, 'class_weight': 'balanced', 'solver': 'lbfgs'}, cv_score: 0.9596047524368049, train_score: 0.9621766444419689, test_score: 0.9625495850799717
Fitting final model and making prediction for submission on toxic label
Start training classifier on severe_toxic label
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   38.7s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.8min finished


Best params: {'C': 0.01, 'class_weight': 'balanced', 'solver': 'lbfgs'}, cv_score: 0.9829759387488306, train_score: 0.9855514617060257, test_score: 0.9818582944311431
Fitting final model and making prediction for submission on severe_toxic label
Start training classifier on obscene label
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   52.3s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.9min finished


Best params: {'C': 0.1, 'class_weight': 'balanced', 'solver': 'lbfgs'}, cv_score: 0.9716778450816964, train_score: 0.9749443268377088, test_score: 0.9706400418948615
Fitting final model and making prediction for submission on obscene label
Start training classifier on threat label
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   38.9s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.8min finished


Best params: {'C': 0.01, 'class_weight': 'balanced', 'solver': 'lbfgs'}, cv_score: 0.9775136135931969, train_score: 0.9867680969787161, test_score: 0.9751129762615628
Fitting final model and making prediction for submission on threat label
Start training classifier on insult label
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.9min finished


Best params: {'C': 0.1, 'class_weight': 'balanced', 'solver': 'lbfgs'}, cv_score: 0.9675018756965723, train_score: 0.9705305976921453, test_score: 0.9692536752780653
Fitting final model and making prediction for submission on insult label
Start training classifier on identity_hate label
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   43.9s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  3.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  3.2min finished


Best params: {'C': 0.01, 'class_weight': 'balanced', 'solver': 'lbfgs'}, cv_score: 0.960261719137092, train_score: 0.9681775813986767, test_score: 0.9641517066753486
Fitting final model and making prediction for submission on identity_hate label
Wall time: 19min 26s


In [112]:
# One best-parameter string per line, in the row order of the records frame.
print('\n'.join(str(best) for best in w2v_lg_records['best_params']))
# Score columns only (everything after `best_params`), followed by their column means.
print(w2v_lg_records.iloc[:, 1:], w2v_lg_records.iloc[:, 1:].mean(), sep='\n')

{'C': 0.3, 'class_weight': 'balanced', 'solver': 'lbfgs'}
{'C': 0.01, 'class_weight': 'balanced', 'solver': 'lbfgs'}
{'C': 0.1, 'class_weight': 'balanced', 'solver': 'lbfgs'}
{'C': 0.01, 'class_weight': 'balanced', 'solver': 'lbfgs'}
{'C': 0.1, 'class_weight': 'balanced', 'solver': 'lbfgs'}
{'C': 0.01, 'class_weight': 'balanced', 'solver': 'lbfgs'}
               cv_score train_score test_score
toxic          0.959605    0.962177    0.96255
severe_toxic   0.982976    0.985551   0.981858
obscene        0.971678    0.974944    0.97064
threat         0.977514    0.986768   0.975113
insult         0.967502    0.970531   0.969254
identity_hate  0.960262    0.968178   0.964152
cv_score       0.969923
train_score    0.974691
test_score     0.970594
dtype: float64


word2vec: size=300, window=5
{'C': 0.3, 'class_weight': 'balanced', 'solver': 'lbfgs'}
{'C': 0.01, 'class_weight': 'balanced', 'solver': 'lbfgs'}
{'C': 0.1, 'class_weight': 'balanced', 'solver': 'lbfgs'}
{'C': 0.01, 'class_weight': 'balanced', 'solver': 'lbfgs'}
{'C': 0.1, 'class_weight': 'balanced', 'solver': 'lbfgs'}
{'C': 0.01, 'class_weight': 'balanced', 'solver': 'lbfgs'}
               cv_score train_score test_score
toxic          0.959605    0.962177    0.96255
severe_toxic   0.982976    0.985551   0.981858
obscene        0.971678    0.974944    0.97064
threat         0.977514    0.986768   0.975113
insult         0.967502    0.970531   0.969254
identity_hate  0.960262    0.968178   0.964152
cv_score       0.969923
train_score    0.974691
test_score     0.970594

word2vec: size=100, window=3
{'C': 1, 'class_weight': 'balanced', 'solver': 'lbfgs'}
{'C': 0.01, 'class_weight': 'balanced', 'solver': 'lbfgs'}
{'C': 10, 'class_weight': 'balanced', 'solver': 'lbfgs'}
{'C': 0.01, 'class_weight': 'balanced', 'solver': 'lbfgs'}
{'C': 0.1, 'class_weight': 'balanced', 'solver': 'lbfgs'}
{'C': 0.01, 'class_weight': 'balanced', 'solver': 'lbfgs'}
               cv_score train_score test_score
toxic           0.95375    0.954787   0.957034
severe_toxic     0.9822    0.983757   0.982192
obscene        0.964333    0.965793   0.966064
threat          0.97269    0.979577   0.972365
insult         0.963145    0.964357   0.964598
identity_hate  0.957263    0.961801   0.959968
cv_score       0.965564
train_score    0.968345
test_score     0.967037


In [113]:
# Persist the per-row probabilities without an index column, and the per-label
# tuning records together with their label index.
w2v_lg_full.to_csv(path_or_buf='w2v_lg_full.csv', index=False)
w2v_lg_records.to_csv(path_or_buf='w2v_lg_records.csv', index=True)

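- Create the train and test sets for stacking: out-of-fold predictions for the training rows, and the saved full-model predictions for the remaining rows.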

In [170]:
%%time

# Build the stacking inputs for the word2vec logistic-regression model:
#   * `w2v_lg_stack_train` -- out-of-fold probabilities for the first `n_train` rows
#   * `w2v_lg_stack_test`  -- previously saved full-model probabilities for the rest
#
# The splitter comes from `sklearn.model_selection` (already used by this notebook
# for `train_test_split` / `GridSearchCV`); the `sklearn.cross_validation` API that
# the top import cell still references was removed in scikit-learn 0.20. It is
# imported under the module name so the notebook-level `StratifiedKFold` binding is
# left untouched for any other cell that still relies on the legacy call style.
from sklearn import model_selection

n_folds = 5

# One column per label (sized from `labels` rather than a hard-coded 6).
w2v_lg_stack_train = pd.DataFrame(np.zeros((n_train, len(labels))), columns=labels)
w2v_lg_stack_test = pd.read_csv('w2v_lg_full.csv')[n_train:]
# The records CSV was written with its label index, so restore that index and
# look parameters up by label instead of by row position.
w2v_lg_records = pd.read_csv('w2v_lg_records.csv', index_col=0)

for j, label in enumerate(labels):
    print('Start generating out-of-sample predictions on {} label for stacking'.format(label))
    target = full_set_sw[label].iloc[:n_train]
    # Unshuffled stratified folds, materialised as (train, test) positional index arrays.
    skf = list(
        model_selection.StratifiedKFold(n_splits=n_folds).split(comment_vect.iloc[:n_train], target)
    )
    # `best_params` is stored as the string form of a dict; parse it back safely.
    w2v_lg = LogisticRegression(**ast.literal_eval(w2v_lg_records.loc[label, 'best_params']))
    for i, (train, test) in enumerate(skf):
        print("Fold", i+1)
        # Fold indices are positions, so select with `.iloc` on both features and
        # target; label-based lookup would only be right for a default integer index.
        X_train = comment_vect.iloc[train, :]
        y_train = target.iloc[train]
        X_test = comment_vect.iloc[test, :]
        w2v_lg.fit(X_train, y_train)
        # Each training row is predicted by the fold model that did not see it.
        w2v_lg_stack_train.iloc[test, j] = w2v_lg.predict_proba(X_test)[:, 1]

Start generating out-of-sample predictions on toxic label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Start generating out-of-sample predictions on severe_toxic label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Start generating out-of-sample predictions on obscene label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Start generating out-of-sample predictions on threat label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Start generating out-of-sample predictions on insult label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Start generating out-of-sample predictions on identity_hate label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5


In [176]:
# Persist both stacking frames; the index is written as the first column
# (pandas' default, spelled out here).
w2v_lg_stack_train.to_csv(path_or_buf='w2v_lg_stack_train.csv', index=True)
# NOTE(review): this target has no `.csv` extension, unlike every other export in
# the notebook -- looks like a typo, but confirm what downstream code reads
# before renaming.
w2v_lg_stack_test.to_csv(path_or_buf='w2v_lg_stack_test', index=True)