In [1]:
import os

from tqdm import tqdm_notebook as tqdm
import glob
import numpy as np
import pandas as pd

In [2]:
from sklearn.metrics import roc_auc_score

def multi_roc_auc_score(y_true, y_pred):
    """Return the ROC AUC averaged over the columns of two 2-D arrays.

    Column ``i`` of ``y_true`` is scored against column ``i`` of ``y_pred``
    with ``roc_auc_score``; the unweighted mean of those per-column scores
    is returned.  Both inputs must have the same shape and support
    ``[:, i]`` indexing.
    """
    assert y_true.shape == y_pred.shape
    per_column_auc = [
        roc_auc_score(y_true[:, col], y_pred[:, col])
        for col in range(y_true.shape[1])
    ]
    return np.array(per_column_auc).mean()

def read_predictions(prediction_dir, mode='valid', valid_columns=None):
    """Load and column-stack the prediction files stored under ``prediction_dir``.

    Every file matching ``<prediction_dir>/<mode>/*`` is read as CSV in sorted
    path order, its ``id`` column is dropped, and the remaining columns of all
    files are stacked horizontally into a single feature matrix.

    Parameters
    ----------
    prediction_dir : str
        Directory containing the ``valid`` / ``test`` sub-directories together
        with ``valid_split.csv`` and ``sample_submission.csv``.
    mode : {'valid', 'test'}
        Which sub-directory of prediction files to load.
    valid_columns : list of str, optional
        Columns of ``valid_split.csv`` returned as targets.  Required when
        ``mode == 'valid'``; ignored otherwise.

    Returns
    -------
    (X, y) when ``mode == 'valid'``
        ``X`` is the stacked prediction matrix and ``y`` the values of
        ``valid_columns`` read from ``valid_split.csv``.
    (X, sample_submission) when ``mode == 'test'``
        ``sample_submission`` is the DataFrame read from
        ``sample_submission.csv``.

    Raises
    ------
    NotImplementedError
        If ``mode`` is neither ``'valid'`` nor ``'test'``.
    ValueError
        If ``mode == 'valid'`` without ``valid_columns``, or if no prediction
        files are found for ``mode``.
    """
    # Validate the arguments before touching the disk so a bad call fails
    # immediately and with the intended exception.
    if mode not in ('valid', 'test'):
        raise NotImplementedError("unsupported mode: {!r}".format(mode))
    if mode == 'valid' and valid_columns is None:
        raise ValueError("valid_columns is required when mode='valid'")

    pattern = os.path.join(prediction_dir, mode, '*')
    # Sorted so the column order of X is the same on every run.
    filepaths = sorted(glob.glob(pattern))
    if not filepaths:
        raise ValueError('no prediction files match {}'.format(pattern))

    predictions = [
        pd.read_csv(filepath).drop('id', axis=1).values
        for filepath in filepaths
    ]
    X = np.hstack(predictions)

    # Only read the companion file that the requested mode actually returns.
    if mode == 'valid':
        valid_labels = pd.read_csv(os.path.join(prediction_dir, 'valid_split.csv'))
        return X, valid_labels[valid_columns].values

    sample_submission = pd.read_csv(os.path.join(prediction_dir, 'sample_submission.csv'))
    return X, sample_submission

In [6]:
# Columns of valid_split.csv used as targets; the same columns are filled in
# on the submission frame.
LABEL_COLUMNS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Directory passed to read_predictions as `prediction_dir`.
SINGLE_DIR = '/mnt/ml-team/minerva/toxic/single_model_predictions_newest'

# Where the final submission CSV is written.
ENSEMBLE_SUBMISSION_PATH = '/mnt/ml-team/minerva/toxic/catboost_submission.csv'

# Validation Set Level 1 Predictions

In [7]:
# Features: the column-stacked prediction files found under SINGLE_DIR/valid.
# Targets: the LABEL_COLUMNS of valid_split.csv.
X_valid, y_valid_multilabel = read_predictions(
    SINGLE_DIR,
    mode='valid',
    valid_columns=LABEL_COLUMNS,
)

# Test Set Level 1 Predictions

In [8]:
# In 'test' mode the second return value is the frame read from
# sample_submission.csv; it is reused later as the submission template.
X_test, sample_submission = read_predictions(
    SINGLE_DIR,
    mode='test',
)

# Ensemble Training

In [19]:
! pip install catboost

Collecting catboost
  Downloading catboost-0.6.2-cp27-none-manylinux1_x86_64.whl (25.4MB)
[K    100% |████████████████████████████████| 25.4MB 64kB/s eta 0:00:0111
Installing collected packages: catboost
[31mException:
Traceback (most recent call last):
  File "/usr/local/lib/python2.7/dist-packages/pip/basecommand.py", line 215, in main
    status = self.run(options, args)
  File "/usr/local/lib/python2.7/dist-packages/pip/commands/install.py", line 342, in run
    prefix=options.prefix_path,
  File "/usr/local/lib/python2.7/dist-packages/pip/req/req_set.py", line 784, in install
    **kwargs
  File "/usr/local/lib/python2.7/dist-packages/pip/req/req_install.py", line 851, in install
    self.move_wheel_files(self.source_dir, root=root, prefix=prefix)
  File "/usr/local/lib/python2.7/dist-packages/pip/req/req_install.py", line 1064, in move_wheel_files
    isolated=self.isolated,
  File "/usr/local/lib/python2.7/dist-packages/pip/wheel.py", line 345, in move_wheel_files
    clobber(

In [None]:
# Alternative CatBoost hyper-parameters kept for reference.  This cell used to
# hold only a bare keyword-argument fragment, which is not valid Python and
# aborts a "Restart & Run All"; wrapping the values in a dict keeps them
# without affecting any other cell.
# NOTE(review): presumably an earlier configuration than the one hard-coded in
# fit_cv -- confirm whether it is still needed.
CATBOOST_ALT_PARAMS = dict(
    iterations=200,
    learning_rate=0.05,
    depth=3,
    l2_leaf_reg=1,
    border_count=200,
    verbose=False,
)

In [None]:
from sklearn.cross_validation import ShuffleSplit
from catboost import CatBoostClassifier


def fit_cv(X, y, n_splits=5):
    """Fit one CatBoost classifier per label column on repeated shuffle splits.

    For each of ``n_splits`` random splits (20% of the rows held out,
    ``random_state=0``) a separate ``CatBoostClassifier`` is trained for every
    column of ``y``.  The held-out rows are then scored with
    ``multi_roc_auc_score`` and the score is printed.

    Parameters
    ----------
    X : array indexable by integer index arrays
        Feature matrix, one row per sample.
    y : 2-D array
        Targets, one column per label; the number of classifiers per split is
        taken from ``y.shape[1]``.
    n_splits : int
        Number of shuffle splits.

    Returns
    -------
    scores : list
        One ``multi_roc_auc_score`` value per split.
    estimators : list of list
        ``estimators[k][i]`` is the classifier fitted on split ``k`` for label
        column ``i``.
    """
    # Derive the label count from the data instead of assuming six columns.
    n_labels = y.shape[1]

    # ShuffleSplit comes from ``sklearn.cross_validation`` in this notebook:
    # sample count is the first argument, ``n_iter`` sets the number of
    # splits, and the object is iterated directly.
    cv = ShuffleSplit(X.shape[0], n_iter=n_splits, test_size=0.2, random_state=0)

    estimators, scores = [], []
    for train, valid in cv:
        X_train_, y_train_ = X[train], y[train]
        X_valid_, y_valid_ = X[valid], y[valid]

        estimators_fold = []
        for i in tqdm(range(n_labels)):
            estimator = CatBoostClassifier(iterations=500,
                                           learning_rate=0.05,
                                           depth=3,
                                           l2_leaf_reg=5,
                                           rsm=0.2,
                                           model_size_reg=2.0,
                                           border_count=200,
                                           verbose=False)
            estimator.fit(X_train_, y_train_[:, i])
            estimators_fold.append(estimator)
        estimators.append(estimators_fold)

        # Stack the per-label predict_proba outputs along a new axis 1 and
        # keep index 1 of the last axis (assumed to be the positive class --
        # TODO confirm), giving one column per label.
        y_valid_pred = np.stack(
            [estimator.predict_proba(X_valid_) for estimator in estimators_fold],
            axis=1)[..., 1]
        score = multi_roc_auc_score(y_valid_, y_valid_pred)
        print(score)
        scores.append(score)
    return scores, estimators

# Train on the validation-set predictions: `scores` receives one value per
# split and `estimators` the fitted per-label classifiers of every split.
scores, estimators = fit_cv(
    X_valid,
    y_valid_multilabel,
)

In [None]:
# Summarise the per-split scores returned by fit_cv.
score_mean = np.mean(scores)
score_std = np.std(scores)
print('score average {}\nscore std {}'.format(score_mean, score_std))

# Ensemble Prediction

In [59]:
# Predict the test set with every split's classifiers, then average the
# splits element-wise into a single prediction matrix.
fold_predictions = []
for fold_estimators in estimators:
    label_probabilities = [model.predict_proba(X_test) for model in fold_estimators]
    # Stack per-label outputs along a new axis 1 and keep index 1 of the last
    # axis (assumed to be the positive class -- TODO confirm).
    fold_predictions.append(np.stack(label_probabilities, axis=1)[..., 1])
y_bagged = np.mean(np.stack(fold_predictions), axis=0)

# Submission

In [60]:
# Work on a copy so the `sample_submission` template loaded earlier is not
# mutated in place by the column assignment below.
submission = sample_submission.copy()
submission[LABEL_COLUMNS] = y_bagged # this gets 0.9849 on LB
# `index` is documented as a bool; pass False explicitly rather than relying
# on None being falsy.
submission.to_csv(ENSEMBLE_SUBMISSION_PATH, index=False)
submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.994446,0.297564,0.969432,0.022415,0.903344,0.41295
1,0000247867823ef7,0.001143,2.9e-05,0.000275,4.9e-05,0.000134,4.8e-05
2,00013b17ad220c46,0.002666,4.9e-05,0.000533,6.1e-05,0.000559,9.1e-05
3,00017563c3f7919a,0.00032,2.4e-05,8.6e-05,3.5e-05,6.2e-05,5e-05
4,00017695ad8997eb,0.002441,2.8e-05,0.000304,3.9e-05,0.000206,7.1e-05
