In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import log_loss
import numpy as np
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, RandomizedSearchCV

from gc import collect
from utils_toxic import feat_engine, clean
from collections import defaultdict

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
TARGET = ['identity_hate', 'insult', 'obscene','severe_toxic', 'threat', 'toxic']

In [3]:
data =  pd.read_csv('data/train.csv', sep=',', index_col='id')
data = feat_engine(data)
data.comment_text = data.comment_text.apply(lambda x :clean(x))
print(data.shape)
data.head()

(159571, 26)


Unnamed: 0_level_0,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,count_sent,count_word,count_unique_word,...,word_unique_percent,punct_percent,ip,count_ip,link,count_links,article_id,article_id_flag,username,count_usernames
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000997932d777bf,explanationwhy edit make username hardcore met...,0,0,0,0,0,0,2,43,41,...,95.348837,23.255814,[89.205.38.27],1,[],0,[],0,[],0
000103f0d9cfb60f,d'aww ! match background colour I am seemingly...,0,0,0,0,0,0,1,17,17,...,100.0,70.588235,[],0,[],0,[],0,[],0
000113f07ec002fd,"hey man , I am really try edit war . it is guy...",0,0,0,0,0,0,1,42,39,...,92.857143,14.285714,[],0,[],0,[],0,[],0
0001b41b1c6bb37e,""" morei cannot make real suggestions improveme...",0,0,0,0,0,0,5,113,82,...,72.566372,18.584071,[],0,[],0,[],0,[],0
0001d958c54c6e35,", sir , hero . chance remember page that is ?",0,0,0,0,0,0,1,13,13,...,100.0,38.461538,[],0,[],0,[],0,[],0


In [16]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import MultiTaskLasso
from sklearn.linear_model import SGDClassifier
#from sklearn.linear_model import RidgeClassifierCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import PassiveAggressiveClassifier

In [24]:
import time

In [29]:
def multilabel_model(model, data, target):
    tot_score = []
    for tar in list(target.columns):
        score = cross_val_score(model, data, target[tar].values.ravel(), cv = 5, scoring='roc_auc')
        tot_score.append(np.mean(score))
    return np.mean(tot_score)

def get_best_estimator(models, data, target, name_step, best_cv_score):
    best_estimator = None
    best_score = .5
    for name, model in models.items():
        start = time.time()
        try:
            score = cross_val_score(model, data, target, n_jobs=6, cv = 5, verbose=0, scoring='roc_auc')
        except:
            #print(name, 'mlabel cv is not working')
            score = multilabel_model(model, data, target)
        if np.mean(score)>best_score:
            best_score = np.mean(score)
            best_estimator = name
        finish = time.time()-start
        print(name, np.mean(score), finish, 's')
    best_cv_score[name_step].append(best_score)
    best_cv_score[name_step].append(best_estimator)
    return best_cv_score

In [7]:
tfidf = TfidfVectorizer(decode_error='ignore', 
                norm='l2', 
                ngram_range=(3, 1), 
                sublinear_tf=True, 
                max_df=0.99, 
                use_idf=True, 
                smooth_idf=True, 
                min_df=150,
                lowercase=True)
tfidf_data = tfidf.fit_transform(data.loc[:, 'comment_text'])
tfidf_data

<159571x3291 sparse matrix of type '<class 'numpy.float64'>'
	with 3642414 stored elements in Compressed Sparse Row format>

In [32]:
models = {'LR':LogisticRegression(), 
                    'LR_L1':LogisticRegression(penalty='l1', n_jobs=6),
                    'LR_C0001':LogisticRegression(C = .0001), 
                    #'MLP':MLPClassifier(),
                    #'ETrees':ExtraTreesClassifier(), 
                    #'RF':RandomForestClassifier(),
                    'Ridge':RidgeClassifier(),
                    'Ridge_C0001':RidgeClassifier(alpha=.0001),
                    #'MLasso':MultiTaskLasso(),
                    'SGD':SGDClassifier(),
                    'NB':BernoulliNB(),
                    'PAC':PassiveAggressiveClassifier()}

In [30]:
best_cv_score = defaultdict(list)
get_best_estimator(modles, tfidf_data, data[TARGET],'temp', best_cv_score)

NB 0.920980504117 5.348769664764404 s
PAC 0.944168133716 8.898405075073242 s
LR 0.973023052454 38.10983943939209 s
LR_C0001 0.816360856701 12.447779417037964 s
Ridge 0.96199127877 58.43095254898071 s
SGD 0.959024730014 8.695740222930908 s
LR_L1 0.968399083975 54.30026650428772 s


KeyboardInterrupt: 

In [35]:
import xgboost as xgb

In [36]:
pipe = Pipeline([('tfidf', TfidfVectorizer()), ('models', xgb.XGBClassifier())])

In [62]:
space = {
 'models':[value for key, value  in models.items()],
 #'lg__C': [.0001, .001, .01, .1, 1.0, 10, 100, 1000],
 #'lg__class_weight': [{1:2, 0:1},{1:3, 0:1}, {1:4, 0:1}, 'balanced', None],
 #'lg__fit_intercept': [True,False],
 #'lg__max_iter': [100, 200, 300, 500, 1000],
 #'lg__penalty': ['l2', 'l1'],
 #'lg__random_state': [234],
 #'lg__tol': [0.0001, .001, .01],
 #'lg__warm_start': [False,True],
 'tfidf__analyzer': ['word'],#, 'char'
# 'tfidf__binary': [False,True],
 'tfidf__decode_error': ['ignore'],
 #'tfidf__input': 'content',
 'tfidf__lowercase': [False,True],
 'tfidf__max_df': list(np.linspace(0.8, 1.0, 20)),
# 'tfidf__max_features': [10000, None], may be it's false
# 'tfidf__min_df': [1, 2],
 'tfidf__ngram_range': [(1, 1), (2, 1), (3, 1)], #(3, 3), (4, 1) , (3, 2)
 'tfidf__norm': ['l2','l1', None],
 'tfidf__smooth_idf': [False,True],
 #'tfidf__stop_words': None,
 #'tfidf__strip_accents': None,
 'tfidf__sublinear_tf': [False,True],
 #'tfidf__token_pattern': '(?u)\\b\\w\\w+\\b',
 'tfidf__use_idf': [False,True]
 #'tfidf__vocabulary': None
        }

In [63]:
import time

In [64]:
time.strftime('%H:%M')

'01:35'

In [None]:
%%time
score = []
rnd = RandomizedSearchCV(pipe, param_distributions=space, n_iter=200, scoring='roc_auc', 
                       cv = 5, verbose=1, random_state=234, n_jobs = 6)
for i, tar in enumerate(TARGET):
    print(tar, time.strftime('%H:%M'))
    score.append(rnd.fit(data.loc[:, 'comment_text'], data.loc[:, tar].values))
    pd.DataFrame(score[i].cv_results_).to_csv('metrics/grid_search_v2'+tar+'_.csv', sep=';', index=False)

identity_hate 01:35
Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  5.7min
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed: 24.9min
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed: 54.6min
Process ForkPoolWorker-240:
Process ForkPoolWorker-242:
Process ForkPoolWorker-241:
Process ForkPoolWorker-239:
Process ForkPoolWorker-233:
Process ForkPoolWorker-243:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/anaconda/envs/py35/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/anaconda/envs/py35/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/anaconda/envs/py35/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda/envs/py35/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/anaconda/envs/py35/lib/python3.5/site-packages/sklearn/externals/joblib/pool.

TypeError: catching classes that do not inherit from BaseException is not allowed