In [1]:
import os

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
# Pretty display for notebooks
%matplotlib inline

In [2]:
# Label columns; one binary classifier is trained per entry.
class_names = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

# Sparse feature matrices saved to disk in .npz format.
train_features = sparse.load_npz("temp_res/train_features.npz")
test_features = sparse.load_npz("temp_res/test_features.npz")

# Raw tables: missing values become a single space, and each frame is cut to
# the number of rows in its feature matrix so rows and features stay aligned.
train = pd.read_csv('data/train.csv').fillna(' ').iloc[:train_features.shape[0]]
test = pd.read_csv('data/test.csv').fillna(' ').iloc[:test_features.shape[0]]

In [3]:
if __name__ == '__main__':
    # For every label in `class_names`: grid-search a LogisticRegression with
    # ROC-AUC scoring, report the best CV score and parameters, and store the
    # predicted probability (column 1 of predict_proba) in the submission frame.
    # NOTE(review): column 1 is the positive class only if labels are 0/1 - confirm.
    print("begin training")

    # Create the output directory before the (very long) search starts, so a
    # missing folder cannot make the final to_csv fail after all the training.
    submission_path = 'subm/bench_submission.csv'
    os.makedirs(os.path.dirname(submission_path), exist_ok=True)

    # Search configuration is identical for every label, so build it once.
    # The fixed random_state makes the splitter yield the same splits each time.
    cv_sets = ShuffleSplit(n_splits=3, test_size=0.20, random_state=0)
    parameters = {
        "C": [0.1, 1, 5],
        "tol": [1e-3, 1e-4, 1e-2],
        "solver": ["sag", "saga"],
        "max_iter": [200]
    }

    scores = []
    submission = pd.DataFrame.from_dict({'id': test['id']})
    for class_name in class_names:
        train_target = train[class_name]
        clf = LogisticRegression(random_state=0)
        grid = GridSearchCV(clf, param_grid=parameters, scoring='roc_auc', cv=cv_sets, verbose=10, n_jobs=2)
        grid.fit(train_features, train_target)

        # With the default refit=True, best_estimator_ has already been refitted
        # on the full training data; fitting it again would only repeat that work.
        best_clf = grid.best_estimator_
        scores.append(grid.best_score_)
        print('CV score for class {} is {}'.format(class_name, grid.best_score_))
        print("best params are:")
        print(grid.best_params_)

        # Predict once and reuse the result for both the diagnostics and the
        # submission column.
        res = best_clf.predict_proba(test_features)
        print(type(res))
        print(res.shape)
        submission[class_name] = res[:, 1]

    # Mean of the per-label best CV scores.
    print('Total CV score is {}'.format(np.mean(scores)))
    submission.to_csv(submission_path, index=False)


begin training
Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   21.9s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   48.4s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  1.6min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  2.8min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:  4.1min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  5.7min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 13.2min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 16.5min
[Parallel(n_jobs=2)]: Done  54 out of  54 | elapsed: 26.7min finished


CV score for class toxic is 0.9798134326926518
best params are:
{'C': 1, 'max_iter': 200, 'solver': 'sag', 'tol': 0.01}
<class 'numpy.ndarray'>
(153164, 2)
Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   26.9s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   49.8s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  1.5min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  4.6min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:  7.0min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  9.7min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 18.7min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 25.3min
[Parallel(n_jobs=2)]: Done  54 out of  54 | elapsed: 35.1min finished


CV score for class severe_toxic is 0.9891195702371048
best params are:
{'C': 1, 'max_iter': 200, 'solver': 'sag', 'tol': 0.001}
<class 'numpy.ndarray'>
(153164, 2)
Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   20.4s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   43.9s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  1.4min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  2.5min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:  3.7min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  5.0min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 12.2min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 17.9min
[Parallel(n_jobs=2)]: Done  54 out of  54 | elapsed: 27.3min finished


CV score for class obscene is 0.9906336983821018
best params are:
{'C': 1, 'max_iter': 200, 'solver': 'sag', 'tol': 0.001}
<class 'numpy.ndarray'>
(153164, 2)
Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   17.9s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   42.3s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  1.3min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  5.1min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:  8.0min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed: 10.2min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 18.7min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 29.0min
[Parallel(n_jobs=2)]: Done  54 out of  54 | elapsed: 39.3min finished


CV score for class threat is 0.9906024809967433
best params are:
{'C': 5, 'max_iter': 200, 'solver': 'sag', 'tol': 0.01}
<class 'numpy.ndarray'>
(153164, 2)
Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   23.0s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   47.7s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  1.5min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  3.3min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:  4.8min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  6.3min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 14.3min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 21.5min
[Parallel(n_jobs=2)]: Done  54 out of  54 | elapsed: 31.6min finished


CV score for class insult is 0.9824663797255095
best params are:
{'C': 1, 'max_iter': 200, 'solver': 'sag', 'tol': 0.01}
<class 'numpy.ndarray'>
(153164, 2)
Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   19.9s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   45.3s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  1.4min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  3.4min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:  4.8min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  6.5min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed: 15.7min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 25.5min
[Parallel(n_jobs=2)]: Done  54 out of  54 | elapsed: 35.9min finished


CV score for class identity_hate is 0.9859580174845588
best params are:
{'C': 1, 'max_iter': 200, 'solver': 'sag', 'tol': 0.0001}
<class 'numpy.ndarray'>
(153164, 2)
Total CV score is 0.9864322632531116
