In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.pipeline import Pipeline, FeatureUnion

# Keep printed DataFrames short: show at most 10 rows before pandas truncates the display.
pd.options.display.max_rows = 10

## Modeling

## Multinomial naive Bayes (MNB) with word-level TF-IDF, all labels

In [None]:
# Load the cleaned dataset used by every modeling cell below.
# NOTE(review): unpickling can execute arbitrary code -- only load a file produced by this project.
# NOTE(review): the modeling cells also read `labels` and `n_train`, which are not defined in any
# visible cell; define them before running those cells so Restart & Run All works.
full_set = pd.read_pickle('full_cleaned.pkl')

# Fail fast here rather than after a long grid search: the training loops read this column.
assert 'cleaned_text' in full_set.columns, "full_cleaned.pkl is missing the 'cleaned_text' column"

In [33]:
%%time

# Tune a word-level TF-IDF + multinomial naive Bayes pipeline separately for each label.
# Per label: hold out a stratified 20% split, grid-search the remaining 80% (5-fold CV, ROC AUC),
# record CV/train/test scores, then refit the winning configuration on every row selected by
# [:n_train] and store the positive-class probability for every row of full_set.
# NOTE(review): n_train is presumably the number of labelled rows at the top of full_set -- confirm.
word_mnb_records = pd.DataFrame(index=labels, columns=['best_params', 'cv_score', 'train_score', 'test_score'])
word_mnb_full = pd.DataFrame(index=full_set.index, columns=labels)

# The text column is the same for every label, so slice it once outside the loop.
train_text = full_set.cleaned_text[:n_train]

for label in labels:
    print('Start training classifier on {} label'.format(label))
    train_target = full_set[label][:n_train]
    x_train, x_test, y_train, y_test = train_test_split(
        train_text,
        train_target,
        test_size=0.2,
        random_state=2018,
        stratify=train_target,
    )

    word_mnb = Pipeline([('tfidf', TfidfVectorizer()), ('nb', MultinomialNB())])
    parameters = {
        'tfidf__max_df': [0.4, 0.2, 0.15],
        'tfidf__min_df': [0.0001, 0.0003, 0.003],
        'nb__alpha': [0.6, 0.5, 0.4, 0.2],
    }
    word_mnb_cv = GridSearchCV(word_mnb, parameters, n_jobs=-1, scoring='roc_auc', verbose=1, cv=5)
    word_mnb_cv.fit(x_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    train_proba = word_mnb_cv.predict_proba(x_train)[:, 1]
    test_proba = word_mnb_cv.predict_proba(x_test)[:, 1]
    word_mnb_records.loc[label, 'best_params'] = str(word_mnb_cv.best_params_)
    word_mnb_records.loc[label, 'cv_score'] = word_mnb_cv.best_score_
    word_mnb_records.loc[label, 'train_score'] = metrics.roc_auc_score(y_train, train_proba)
    word_mnb_records.loc[label, 'test_score'] = metrics.roc_auc_score(y_test, test_proba)
    print('Best params: {}, cv_score: {}, train_score: {}, test_score: {}'.format(*word_mnb_records.loc[label]))

    # Refit with the selected hyper-parameters on the held-out rows as well before predicting.
    word_mnb.set_params(**word_mnb_cv.best_params_)
    print('Fitting final model and making prediction for submission on {} label'.format(label))
    word_mnb.fit(train_text, train_target)
    word_mnb_full[label] = word_mnb.predict_proba(full_set.cleaned_text)[:, 1]

Start training classifier on toxic label
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 15.5min finished


Best params: {'nb__alpha': 0.4, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001}, cv_score: 0.9550078827130518, train_score: 0.9679276328979506, test_score: 0.9596303256634855
Fitting final model and making prediction for submission on toxic label
Start training classifier on severe_toxic label
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 14.2min finished


Best params: {'nb__alpha': 0.2, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}, cv_score: 0.9789676614405037, train_score: 0.9883241955519528, test_score: 0.9772268899559129
Fitting final model and making prediction for submission on severe_toxic label
Start training classifier on obscene label
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 13.9min finished


Best params: {'nb__alpha': 0.6, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}, cv_score: 0.9657177130254938, train_score: 0.9733249942525654, test_score: 0.9642367549101658
Fitting final model and making prediction for submission on obscene label
Start training classifier on threat label
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 13.9min finished


Best params: {'nb__alpha': 0.2, 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.003}, cv_score: 0.9697533098478018, train_score: 0.9853950030058413, test_score: 0.9615744851084782
Fitting final model and making prediction for submission on threat label
Start training classifier on insult label
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 13.7min finished


Best params: {'nb__alpha': 0.4, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}, cv_score: 0.9612750237601925, train_score: 0.9712145235929304, test_score: 0.9640399284301722
Fitting final model and making prediction for submission on insult label
Start training classifier on identity_hate label
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed: 13.6min finished


Best params: {'nb__alpha': 0.2, 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.0003}, cv_score: 0.9625447197861673, train_score: 0.9818428366846956, test_score: 0.9723479872212811
Fitting final model and making prediction for submission on identity_hate label
Wall time: 1h 34min 16s


In [34]:
# Report the word-level search: best parameters per label (one per line),
# then the score table and the mean of each score column.
word_score_table = word_mnb_records.iloc[:, 1:]  # every column after best_params
print('\n'.join(str(best) for best in word_mnb_records.best_params))
print(word_score_table)
print(word_score_table.mean())

{'nb__alpha': 0.4, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001}
{'nb__alpha': 0.2, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}
{'nb__alpha': 0.6, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}
{'nb__alpha': 0.2, 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.003}
{'nb__alpha': 0.4, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}
{'nb__alpha': 0.2, 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.0003}
               cv_score train_score test_score
toxic          0.955008    0.967928    0.95963
severe_toxic   0.978968    0.988324   0.977227
obscene        0.965718    0.973325   0.964237
threat         0.969753    0.985395   0.961574
insult         0.961275    0.971215    0.96404
identity_hate  0.962545    0.981843   0.972348
cv_score       0.965544
train_score    0.978005
test_score     0.966509
dtype: float64


{'nb__alpha': 0.4, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0001}
{'nb__alpha': 0.2, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}
{'nb__alpha': 0.5, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}
{'nb__alpha': 0.2, 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.003}
{'nb__alpha': 0.4, 'tfidf__max_df': 0.2, 'tfidf__min_df': 0.0003}
{'nb__alpha': 0.2, 'tfidf__max_df': 0.4, 'tfidf__min_df': 0.0003}
               cv_score train_score test_score
toxic          0.955008    0.967928    0.95963
severe_toxic   0.978968    0.988324   0.977227
obscene         0.96566    0.973624   0.964084
threat         0.969753    0.985395   0.961574
insult         0.961275    0.971215    0.96404
identity_hate  0.962545    0.981843   0.972348
cv_score       0.965535
train_score    0.978055
test_score     0.966484
dtype: float64

In [36]:
# Persist the word-level results.
# Predictions are written without the index; the per-label records keep it (the label names).
word_mnb_full.to_csv(path_or_buf='word_mnb_full.csv', index=False)
word_mnb_records.to_csv(path_or_buf='word_mnb_records.csv', index=True)

                  'tfidf__max_df': [0.4, 0.3, 0.2, 0.15],\
                  'tfidf__min_df': [0.0001, 0.0003, 0.001, 0.003],\
                  'nb__alpha': [0.6, 0.5, 0.4, 0.2, 0.1]
                  

## Multinomial naive Bayes (MNB) with character-level TF-IDF, all labels (very time-consuming to fine-tune)

In [45]:
%%time

# Tune a character-level TF-IDF (1- to 5-grams, capped at 40000 features) + multinomial naive
# Bayes pipeline separately for each label.
# Per label: hold out a stratified 20% split, grid-search the remaining 80% (3-fold CV, ROC AUC),
# record CV/train/test scores, then refit the winning configuration on every row selected by
# [:n_train] and store the positive-class probability for every row of full_set.
# NOTE(review): n_train is presumably the number of labelled rows at the top of full_set -- confirm.
char_mnb_records = pd.DataFrame(index=labels, columns=['best_params', 'cv_score', 'train_score', 'test_score'])
char_mnb_full = pd.DataFrame(index=full_set.index, columns=labels)

# The text column is the same for every label, so slice it once outside the loop.
train_text = full_set.cleaned_text[:n_train]

for label in labels:
    print('Start training classifier on {} label'.format(label))
    train_target = full_set[label][:n_train]
    x_train, x_test, y_train, y_test = train_test_split(
        train_text,
        train_target,
        test_size=0.2,
        random_state=2018,
        stratify=train_target,
    )

    char_mnb = Pipeline([('tfidf', TfidfVectorizer()), ('nb', MultinomialNB())])
    parameters = {
        'tfidf__analyzer': ['char'],
        'tfidf__max_df': [0.15, 0.12],
        'tfidf__min_df': [0.0003],
        'tfidf__ngram_range': [(1, 5)],
        'tfidf__max_features': [40000],
        'nb__alpha': [0.4, 0.2],
    }
    char_mnb_cv = GridSearchCV(char_mnb, parameters, n_jobs=-1, scoring='roc_auc', verbose=1, cv=3)
    char_mnb_cv.fit(x_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    train_proba = char_mnb_cv.predict_proba(x_train)[:, 1]
    test_proba = char_mnb_cv.predict_proba(x_test)[:, 1]
    char_mnb_records.loc[label, 'best_params'] = str(char_mnb_cv.best_params_)
    char_mnb_records.loc[label, 'cv_score'] = char_mnb_cv.best_score_
    char_mnb_records.loc[label, 'train_score'] = metrics.roc_auc_score(y_train, train_proba)
    char_mnb_records.loc[label, 'test_score'] = metrics.roc_auc_score(y_test, test_proba)
    print('Best params: {}, cv_score: {}, train_score: {}, test_score: {}'.format(*char_mnb_records.loc[label]))

    # Refit with the selected hyper-parameters on the held-out rows as well before predicting.
    char_mnb.set_params(**char_mnb_cv.best_params_)
    print('Fitting final model and making prediction for submission on {} label'.format(label))
    char_mnb.fit(train_text, train_target)
    char_mnb_full[label] = char_mnb.predict_proba(full_set.cleaned_text)[:, 1]

Start training classifier on toxic label
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 19.2min finished


Best params: {'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9438663617735209, train_score: 0.9513621686238408, test_score: 0.947215629356778
Fitting final model and making prediction for submission on toxic label
Start training classifier on severe_toxic label
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 18.0min finished


Best params: {'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9810017332462886, train_score: 0.9877322633982899, test_score: 0.9796894551550314
Fitting final model and making prediction for submission on severe_toxic label
Start training classifier on obscene label
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 21.0min finished


Best params: {'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9616746624720707, train_score: 0.9677143459412112, test_score: 0.9615631971260908
Fitting final model and making prediction for submission on obscene label
Start training classifier on threat label
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 21.0min finished


Best params: {'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.12, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9491728837908645, train_score: 0.9793309125622283, test_score: 0.9514403409388522
Fitting final model and making prediction for submission on threat label
Start training classifier on insult label
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 18.0min finished


Best params: {'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.12, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9555089673517058, train_score: 0.9635760462895477, test_score: 0.9627183245963734
Fitting final model and making prediction for submission on insult label
Start training classifier on identity_hate label
Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 19.2min finished


Best params: {'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.12, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}, cv_score: 0.9625542433104942, train_score: 0.9789429647042994, test_score: 0.970552315777182
Fitting final model and making prediction for submission on identity_hate label
Wall time: 2h 58min 1s


In [46]:
# Report the character-level search: best parameters per label (one per line),
# then the score table and the mean of each score column.
char_score_table = char_mnb_records.iloc[:, 1:]  # every column after best_params
print('\n'.join(str(best) for best in char_mnb_records.best_params))
print(char_score_table)
print(char_score_table.mean())

{'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.12, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.12, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.12, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
               cv_score train_score test_score
toxic          0.943866    0.

{'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.4, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__max_features': 40000, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
               cv_score train_score test_score
toxic          0.943866    0.951362   0.947216
severe_toxic   0.981002    0.987732   0.979689
obscene        0.961675    0.967714   0.961563
threat         0.948283    0.978424   0.950496
insult         0.955456    0.963371   0.962704
identity_hate  0.962063    0.978343   0.970227
cv_score       0.958724
train_score    0.971158
test_score     0.961983

{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
{'nb__alpha': 0.2, 'tfidf__analyzer': 'char', 'tfidf__max_df': 0.15, 'tfidf__min_df': 0.0003, 'tfidf__ngram_range': (1, 5)}
               cv_score train_score test_score
toxic          0.943409    0.956741   0.948965
severe_toxic   0.973072    0.985497   0.973623
obscene        0.958114    0.969775    0.95988
threat         0.912432    0.962793   0.919866
insult         0.952491    0.966795   0.961544
identity_hate  0.940987    0.970586   0.954784
cv_score       0.946751
train_score    0.968698
test_score     0.953111

              'tfidf__analyzer': ['char'],\
              'tfidf__max_df': [0.3, 0.2, 0.15],\
              'tfidf__min_df': [0.0001, 0.0003],\
              'tfidf__ngram_range': [(1,5)],\
              'tfidf__max_features': [40000],\
              'nb__alpha': [0.6, 0.4, 0.2]}

In [None]:
# Persist the character-level results.
# Predictions are written without the index; the per-label records keep it (the label names).
char_mnb_full.to_csv(path_or_buf='char_mnb_full.csv', index=False)
char_mnb_records.to_csv(path_or_buf='char_mnb_records.csv', index=True)