In [4]:
import numpy as np
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn import metrics
from sklearn.pipeline import Pipeline, FeatureUnion
import ast

pd.set_option("display.max_rows",10)

## Multinomial Naive Bayes (MNB) model with word-level TF-IDF features, all labels

In [25]:
# Train / test CSVs; the cells visible here use them only for their row counts
# and for the label column names.
train = pd.read_csv('data/train.csv', sep=',', encoding='utf8')
test = pd.read_csv('data/test.csv', sep=',', encoding='utf8')
n_train, n_test = len(train), len(test)

# Label names are columns 2..7 (positional) of the training file.
labels = list(train.columns[2:8])

# Pre-cleaned text + labels. NOTE(review): unpickling can execute arbitrary
# code, so only load this file from a trusted source.
full_set = pd.read_pickle('full_cleaned.pkl')

In [33]:
%%time

# Score sheet: one row per label with the winning grid-search settings and the
# ROC-AUC from cross-validation, on the fitting split and on the hold-out split.
word_mnb_records = pd.DataFrame(
    index=labels,
    columns=['best_params', 'cv_score', 'train_score', 'test_score'],
)
# One probability column per label, covering every row of full_set.
word_mnb_full = pd.DataFrame(index=full_set.index, columns=labels)

for label in labels:
    print('Start training classifier on {} label'.format(label))

    # Work on the first n_train rows of full_set and carve out a stratified
    # 20% hold-out with a fixed seed.
    labelled_text = full_set.cleaned_text[:n_train]
    labelled_target = full_set[label][:n_train]
    x_train, x_test, y_train, y_test, train_idx, test_idx = train_test_split(
        labelled_text,
        labelled_target,
        np.arange(n_train),
        test_size=0.2,
        random_state=2018,
        stratify=labelled_target,
    )

    # Grid-search the TF-IDF document-frequency cut-offs and the NB smoothing
    # term, scored by ROC-AUC with 5-fold CV.
    word_mnb = Pipeline([('tfidf', TfidfVectorizer()), ('nb', MultinomialNB())])
    parameters = {
        'tfidf__max_df': [0.4, 0.2, 0.15],
        'tfidf__min_df': [0.0001, 0.0003, 0.003],
        'nb__alpha': [0.6, 0.5, 0.4, 0.2],
    }
    word_mnb_cv = GridSearchCV(
        word_mnb,
        parameters,
        n_jobs=-1,
        scoring='roc_auc',
        verbose=1,
        cv=5,
    )
    word_mnb_cv.fit(x_train, y_train)

    # Record the search result and the ROC-AUC of the selected model on both splits.
    fit_split_proba = word_mnb_cv.predict_proba(x_train)[:, 1]
    holdout_proba = word_mnb_cv.predict_proba(x_test)[:, 1]
    word_mnb_records.loc[label, 'best_params'] = str(word_mnb_cv.best_params_)
    word_mnb_records.loc[label, 'cv_score'] = word_mnb_cv.best_score_
    word_mnb_records.loc[label, 'train_score'] = metrics.roc_auc_score(y_train, fit_split_proba)
    word_mnb_records.loc[label, 'test_score'] = metrics.roc_auc_score(y_test, holdout_proba)
    print('Best params: {}, cv_score: {}, train_score: {}, test_score: {}'.format(*word_mnb_records.loc[label]))

    # Refit the pipeline with the selected settings on all n_train rows, then
    # score every row of full_set for this label.
    word_mnb.set_params(**word_mnb_cv.best_params_)
    print('Fitting final model and making prediction for submission on {} label'.format(label))
    word_mnb.fit(labelled_text, labelled_target)
    word_mnb_full[label] = word_mnb.predict_proba(full_set.cleaned_text)[:, 1]

Start training classifier on toxic label
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 15.5min finished


Best params: {'nb__alpha': 0.4, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001}, cv_score: 0.9550078827130518, train_score: 0.9679276328979506, test_score: 0.9596303256634855
Fitting final model and making prediction for submission on toxic label
Start training classifier on severe_toxic label
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 14.2min finished


Best params: {'nb__alpha': 0.2, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}, cv_score: 0.9789676614405037, train_score: 0.9883241955519528, test_score: 0.9772268899559129
Fitting final model and making prediction for submission on severe_toxic label
Start training classifier on obscene label
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 13.9min finished


Best params: {'nb__alpha': 0.6, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}, cv_score: 0.9657177130254938, train_score: 0.9733249942525654, test_score: 0.9642367549101658
Fitting final model and making prediction for submission on obscene label
Start training classifier on threat label
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 13.9min finished


Best params: {'nb__alpha': 0.2, 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.003}, cv_score: 0.9697533098478018, train_score: 0.9853950030058413, test_score: 0.9615744851084782
Fitting final model and making prediction for submission on threat label
Start training classifier on insult label
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 13.7min finished


Best params: {'nb__alpha': 0.4, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}, cv_score: 0.9612750237601925, train_score: 0.9712145235929304, test_score: 0.9640399284301722
Fitting final model and making prediction for submission on insult label
Start training classifier on identity_hate label
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 13.6min finished


Best params: {'nb__alpha': 0.2, 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.0003}, cv_score: 0.9625447197861673, train_score: 0.9818428366846956, test_score: 0.9723479872212811
Fitting final model and making prediction for submission on identity_hate label
Wall time: 1h 34min 16s


In [34]:
# Selected hyper-parameters (one label per line), then the per-label scores
# and the mean of each score column.
word_mnb_scores = word_mnb_records.iloc[:, 1:]
print(*word_mnb_records['best_params'], sep='\n')
print(word_mnb_scores)
print(word_mnb_scores.mean())

{'nb__alpha': 0.4, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001}
{'nb__alpha': 0.2, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}
{'nb__alpha': 0.6, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}
{'nb__alpha': 0.2, 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.003}
{'nb__alpha': 0.4, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}
{'nb__alpha': 0.2, 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.0003}
               cv_score train_score test_score
toxic          0.955008    0.967928    0.95963
severe_toxic   0.978968    0.988324   0.977227
obscene        0.965718    0.973325   0.964237
threat         0.969753    0.985395   0.961574
insult         0.961275    0.971215    0.96404
identity_hate  0.962545    0.981843   0.972348
cv_score       0.965544
train_score    0.978005
test_score     0.966509
dtype: float64


{'nb__alpha': 0.4, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001}
{'nb__alpha': 0.2, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}
{'nb__alpha': 0.6, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}
{'nb__alpha': 0.2, 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.003}
{'nb__alpha': 0.4, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}
{'nb__alpha': 0.2, 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.0003}
               cv_score train_score test_score
toxic          0.955008    0.967928    0.95963
severe_toxic   0.978968    0.988324   0.977227
obscene        0.965718    0.973325   0.964237
threat         0.969753    0.985395   0.961574
insult         0.961275    0.971215    0.96404
identity_hate  0.962545    0.981843   0.972348
cv_score       0.965544
train_score    0.978005
test_score     0.966509

{'nb__alpha': 0.4, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001}
{'nb__alpha': 0.2, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}
{'nb__alpha': 0.5, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}
{'nb__alpha': 0.2, 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.003}
{'nb__alpha': 0.4, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}
{'nb__alpha': 0.2, 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.0003}
               cv_score train_score test_score
toxic          0.955008    0.967928    0.95963
severe_toxic   0.978968    0.988324   0.977227
obscene         0.96566    0.973624   0.964084
threat         0.969753    0.985395   0.961574
insult         0.961275    0.971215    0.96404
identity_hate  0.962545    0.981843   0.972348
cv_score       0.965535
train_score    0.978055
test_score     0.966484


In [36]:
# The score table keeps its label index as the first CSV column; the
# prediction table is written without its row index.
word_mnb_records.to_csv('data/word_mnb_records.csv', index=True)
word_mnb_full.to_csv('data/word_mnb_full.csv', index=False)

                  'tfidf__max_df': [0.4, 0.3, 0.2, 0.15],\
                  'tfidf__min_df': [0.0001, 0.0003, 0.001, 0.003],\
                  'nb__alpha': [0.6, 0.5, 0.4, 0.2, 0.1]
                  

- **Create the train (out-of-fold predictions) and test sets for stacking**

In [11]:
%%time

# Build the stacking inputs for the word-level MNB model:
#   * word_mnb_stack_train - out-of-fold predicted probabilities for the first
#     n_train rows of full_set, one column per label;
#   * word_mnb_stack_test  - the previously saved full-data predictions for the
#     rows after n_train.
#
# Imported under the module name so the legacy `StratifiedKFold` bound by the
# top-of-notebook `sklearn.cross_validation` import is not silently rebound for
# other cells. `sklearn.cross_validation` was removed in scikit-learn 0.20.
from sklearn import model_selection

word_mnb_stack_train = pd.DataFrame(np.zeros((n_train, len(labels))), columns=labels)
# Select columns by name so the test frame has the same column order as the
# train frame even if `labels` is ordered differently from the saved CSV.
word_mnb_stack_test = pd.read_csv('data/word_mnb_full.csv')[labels].iloc[n_train:]
# The records CSV was written with its label index as the first column; restore
# it so parameters are looked up by label name rather than by row position.
word_mnb_records = pd.read_csv('data/word_mnb_records.csv', index_col=0)

stack_text = full_set.cleaned_text.iloc[:n_train]
for j, label in enumerate(labels):
    print('Start generating out-of-sample predictions on {} label for stacking'.format(label))
    stack_target = full_set[label].iloc[:n_train]
    skf = model_selection.StratifiedKFold(n_splits=5)
    word_mnb = Pipeline([('tfidf', TfidfVectorizer()), ('nb', MultinomialNB())])
    # best_params was stored as str(dict); literal_eval turns it back into a dict.
    word_mnb.set_params(**ast.literal_eval(word_mnb_records.loc[label, 'best_params']))
    # fit_idx / holdout_idx are positional index arrays. They are deliberately
    # NOT named train / test, which would overwrite the DataFrames loaded earlier.
    for i, (fit_idx, holdout_idx) in enumerate(skf.split(stack_text, stack_target)):
        print("Fold", i+1)
        word_mnb.fit(stack_text.iloc[fit_idx], stack_target.iloc[fit_idx])
        # Each row is predicted by a model that never saw it during fitting.
        word_mnb_stack_train.iloc[holdout_idx, j] = word_mnb.predict_proba(stack_text.iloc[holdout_idx])[:, 1]

Start generating out-of-sample predictions on identity_hate label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Start generating out-of-sample predictions on insult label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Start generating out-of-sample predictions on obscene label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Start generating out-of-sample predictions on severe_toxic label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Start generating out-of-sample predictions on threat label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Start generating out-of-sample predictions on toxic label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Wall time: 3min 16s


In [14]:
word_mnb_stack_train

Unnamed: 0,identity_hate,insult,obscene,severe_toxic,threat,toxic
0,0.000439,0.002457,0.005074,0.000160,0.000229,0.008053
1,0.000743,0.002486,0.004467,0.000587,0.000495,0.009322
2,0.000044,0.014916,0.014376,0.000809,0.000012,0.056350
3,0.000007,0.000025,0.000162,0.000015,0.000013,0.000105
4,0.002124,0.059278,0.060850,0.001994,0.000976,0.108738
...,...,...,...,...,...,...
159566,0.000026,0.000743,0.002093,0.000220,0.000029,0.004163
159567,0.002106,0.122075,0.101095,0.003875,0.004834,0.433457
159568,0.012858,0.056633,0.068233,0.000805,0.005588,0.084456
159569,0.000147,0.006671,0.006905,0.000442,0.000060,0.019894


In [15]:
# Persist both stacking frames; the row index is written too (pandas default).
word_mnb_stack_test.to_csv('data/word_mnb_stack_test.csv', index=True)
word_mnb_stack_train.to_csv('data/word_mnb_stack_train.csv', index=True)

## Multinomial Naive Bayes (MNB) model with character-level TF-IDF features, all labels (very time-consuming to fine-tune)

In [45]:
%%time

# Score sheet: one row per label with the winning grid-search settings and the
# ROC-AUC from cross-validation, on the fitting split and on the hold-out split.
char_mnb_records = pd.DataFrame(
    index=labels,
    columns=['best_params', 'cv_score', 'train_score', 'test_score'],
)
# One probability column per label, covering every row of full_set.
char_mnb_full = pd.DataFrame(index=full_set.index, columns=labels)

for label in labels:
    print('Start training classifier on {} label'.format(label))

    # Work on the first n_train rows of full_set and carve out a stratified
    # 20% hold-out with a fixed seed.
    labelled_text = full_set.cleaned_text[:n_train]
    labelled_target = full_set[label][:n_train]
    x_train, x_test, y_train, y_test, train_idx, test_idx = train_test_split(
        labelled_text,
        labelled_target,
        np.arange(n_train),
        test_size=0.2,
        random_state=2018,
        stratify=labelled_target,
    )

    # Character 1-5-gram TF-IDF capped at 40000 features; only max_df and the
    # NB smoothing term vary in this grid (3-fold CV, scored by ROC-AUC).
    char_mnb = Pipeline([('tfidf', TfidfVectorizer()), ('nb', MultinomialNB())])
    parameters = {
        'tfidf__analyzer': ['char'],
        'tfidf__max_df': [0.15, 0.12],
        'tfidf__min_df': [0.0003],
        'tfidf__ngram_range': [(1,5)],
        'tfidf__max_features': [40000],
        'nb__alpha': [0.4, 0.2],
    }
    char_mnb_cv = GridSearchCV(
        char_mnb,
        parameters,
        n_jobs=-1,
        scoring='roc_auc',
        verbose=1,
        cv=3,
    )
    char_mnb_cv.fit(x_train, y_train)

    # Record the search result and the ROC-AUC of the selected model on both splits.
    fit_split_proba = char_mnb_cv.predict_proba(x_train)[:, 1]
    holdout_proba = char_mnb_cv.predict_proba(x_test)[:, 1]
    char_mnb_records.loc[label, 'best_params'] = str(char_mnb_cv.best_params_)
    char_mnb_records.loc[label, 'cv_score'] = char_mnb_cv.best_score_
    char_mnb_records.loc[label, 'train_score'] = metrics.roc_auc_score(y_train, fit_split_proba)
    char_mnb_records.loc[label, 'test_score'] = metrics.roc_auc_score(y_test, holdout_proba)
    print('Best params: {}, cv_score: {}, train_score: {}, test_score: {}'.format(*char_mnb_records.loc[label]))

    # Refit the pipeline with the selected settings on all n_train rows, then
    # score every row of full_set for this label.
    char_mnb.set_params(**char_mnb_cv.best_params_)
    print('Fitting final model and making prediction for submission on {} label'.format(label))
    char_mnb.fit(labelled_text, labelled_target)
    char_mnb_full[label] = char_mnb.predict_proba(full_set.cleaned_text)[:, 1]

Start training classifier on toxic label
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 19.2min finished


Best params: {'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9438663617735209, train_score: 0.9513621686238408, test_score: 0.947215629356778
Fitting final model and making prediction for submission on toxic label
Start training classifier on severe_toxic label
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 18.0min finished


Best params: {'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9810017332462886, train_score: 0.9877322633982899, test_score: 0.9796894551550314
Fitting final model and making prediction for submission on severe_toxic label
Start training classifier on obscene label
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 21.0min finished


Best params: {'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9616746624720707, train_score: 0.9677143459412112, test_score: 0.9615631971260908
Fitting final model and making prediction for submission on obscene label
Start training classifier on threat label
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 21.0min finished


Best params: {'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.12, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9491728837908645, train_score: 0.9793309125622283, test_score: 0.9514403409388522
Fitting final model and making prediction for submission on threat label
Start training classifier on insult label
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 18.0min finished


Best params: {'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.12, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9555089673517058, train_score: 0.9635760462895477, test_score: 0.9627183245963734
Fitting final model and making prediction for submission on insult label
Start training classifier on identity_hate label
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 19.2min finished


Best params: {'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.12, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9625542433104942, train_score: 0.9789429647042994, test_score: 0.970552315777182
Fitting final model and making prediction for submission on identity_hate label
Wall time: 2h 58min 1s


In [46]:
# Selected hyper-parameters (one label per line), then the per-label scores
# and the mean of each score column.
char_mnb_scores = char_mnb_records.iloc[:, 1:]
print(*char_mnb_records['best_params'], sep='\n')
print(char_mnb_scores)
print(char_mnb_scores.mean())

{'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.12, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.12, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.12, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
               cv_score train_score test_score
toxic          0.943866    0.

{'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.12, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.12, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.12, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
               cv_score train_score test_score
toxic          0.943866    0.951362   0.947216
severe_toxic   0.981002    0.987732   0.979689
obscene        0.961675    0.967714   0.961563
threat         0.949173    0.979331    0.95144
insult         0.955509    0.963576   0.962718
identity_hate  0.962554    0.978943   0.970552
cv_score       0.958963
train_score    0.971443
test_score     0.962197

{'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
               cv_score train_score test_score
toxic          0.943866    0.951362   0.947216
severe_toxic   0.981002    0.987732   0.979689
obscene        0.961675    0.967714   0.961563
threat         0.948283    0.978424   0.950496
insult         0.955456    0.963371   0.962704
identity_hate  0.962063    0.978343   0.970227
cv_score       0.958724
train_score    0.971158
test_score     0.961983

{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
               cv_score train_score test_score
toxic          0.943409    0.956741   0.948965
severe_toxic   0.973072    0.985497   0.973623
obscene        0.958114    0.969775    0.95988
threat         0.912432    0.962793   0.919866
insult         0.952491    0.966795   0.961544
identity_hate  0.940987    0.970586   0.954784
cv_score       0.946751
train_score    0.968698
test_score     0.953111

              'tfidf__analyzer': ['char'],\
              'tfidf__max_df': [0.3, 0.2, 0.15],\
              'tfidf__min_df': [0.0001, 0.0003],\
              'tfidf__ngram_range': [(1,5)],\
              'tfidf__max_features': [40000],\
              'nb__alpha': [0.6, 0.4, 0.2]}

In [None]:
# Write under data/, which is where the stacking cell reads
# 'data/char_mnb_full.csv' and 'data/char_mnb_records.csv' from, and where the
# word-level model saves its equivalents. Previously these went to the working
# directory, so a fresh run could not find them.
# The prediction table is saved without its row index; the score table keeps
# its label index as the first CSV column.
char_mnb_full.to_csv('data/char_mnb_full.csv',index=False)
char_mnb_records.to_csv('data/char_mnb_records.csv',index=True)

- **Create the train (out-of-fold predictions) and test sets for stacking**

In [16]:
%%time

# Build the stacking inputs for the character-level MNB model:
#   * char_mnb_stack_train - out-of-fold predicted probabilities for the first
#     n_train rows of full_set, one column per label;
#   * char_mnb_stack_test  - the previously saved full-data predictions for the
#     rows after n_train.
#
# Imported under the module name so the legacy `StratifiedKFold` bound by the
# top-of-notebook `sklearn.cross_validation` import is not silently rebound for
# other cells. `sklearn.cross_validation` was removed in scikit-learn 0.20.
from sklearn import model_selection

char_mnb_stack_train = pd.DataFrame(np.zeros((n_train, len(labels))), columns=labels)
# Select columns by name so the test frame has the same column order as the
# train frame even if `labels` is ordered differently from the saved CSV.
char_mnb_stack_test = pd.read_csv('data/char_mnb_full.csv')[labels].iloc[n_train:]
# The records CSV is expected to carry the label index as its first column
# (written with index=True); restore it so parameters are looked up by label
# name rather than by row position.
char_mnb_records = pd.read_csv('data/char_mnb_records.csv', index_col=0)

stack_text = full_set.cleaned_text.iloc[:n_train]
for j, label in enumerate(labels):
    print('Start generating out-of-sample predictions on {} label for stacking'.format(label))
    stack_target = full_set[label].iloc[:n_train]
    skf = model_selection.StratifiedKFold(n_splits=5)
    char_mnb = Pipeline([('tfidf', TfidfVectorizer()), ('nb', MultinomialNB())])
    # best_params was stored as str(dict); literal_eval turns it back into a dict.
    char_mnb.set_params(**ast.literal_eval(char_mnb_records.loc[label, 'best_params']))
    # fit_idx / holdout_idx are positional index arrays. They are deliberately
    # NOT named train / test, which would overwrite the DataFrames loaded earlier.
    for i, (fit_idx, holdout_idx) in enumerate(skf.split(stack_text, stack_target)):
        print("Fold", i+1)
        char_mnb.fit(stack_text.iloc[fit_idx], stack_target.iloc[fit_idx])
        # Each row is predicted by a model that never saw it during fitting.
        char_mnb_stack_train.iloc[holdout_idx, j] = char_mnb.predict_proba(stack_text.iloc[holdout_idx])[:, 1]

Start generating out-of-sample predictions on identity_hate label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Start generating out-of-sample predictions on insult label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Start generating out-of-sample predictions on obscene label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Start generating out-of-sample predictions on severe_toxic label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Start generating out-of-sample predictions on threat label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Start generating out-of-sample predictions on toxic label for stacking
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Wall time: 1h 6min 18s


In [27]:
char_mnb_stack_train

Unnamed: 0,identity_hate,insult,obscene,severe_toxic,threat,toxic
0,3.463650e-06,5.032522e-04,7.684948e-04,2.750926e-06,9.642752e-07,0.006465
1,8.732348e-05,9.451995e-05,2.477398e-04,6.093440e-05,5.210060e-05,0.000321
2,3.064627e-07,2.758274e-04,3.027401e-04,1.384921e-07,1.548660e-08,0.003166
3,9.817968e-09,4.821126e-08,1.888385e-07,4.137550e-10,1.730474e-09,0.000001
4,7.578316e-04,9.839470e-03,1.483689e-02,2.402789e-04,4.698432e-05,0.027643
...,...,...,...,...,...,...
159566,7.541756e-08,2.362018e-06,5.580502e-06,8.939188e-09,1.014323e-08,0.000044
159567,1.355518e-03,2.828723e-01,2.167588e-01,4.550665e-03,9.553775e-04,0.860932
159568,4.134106e-03,2.610219e-02,2.081535e-02,3.141605e-04,2.410661e-04,0.050555
159569,1.334546e-06,2.575844e-04,1.845466e-04,4.589477e-07,2.747028e-07,0.001428


In [28]:
# Persist both stacking frames; the row index is written too (pandas default).
char_mnb_stack_test.to_csv('data/char_mnb_stack_test.csv', index=True)
char_mnb_stack_train.to_csv('data/char_mnb_stack_train.csv', index=True)