In [None]:
import os

from tqdm import tqdm_notebook as tqdm
import glob
import numpy as np
import pandas as pd

In [None]:
from sklearn.metrics import roc_auc_score

def multi_roc_auc_score(y_true, y_pred):
    """Return the unweighted mean of the per-column ROC AUC scores.

    Both arguments are indexed as 2-D arrays (``arr[:, i]``) and must have
    identical shapes; column ``i`` of ``y_true`` is scored against column
    ``i`` of ``y_pred`` with ``roc_auc_score``.

    Raises
    ------
    AssertionError
        If the two arrays do not have the same shape.
    """
    assert y_true.shape == y_pred.shape
    n_columns = y_true.shape[1]
    # One AUC per label column, averaged with equal weight.
    per_column_auc = [
        roc_auc_score(y_true[:, col], y_pred[:, col])
        for col in range(n_columns)
    ]
    return np.array(per_column_auc).mean()

def read_predictions(prediction_dir, mode='valid', valid_columns=None, stacking_mode='flat'):
    """Load per-model prediction CSVs and stack them into one feature array.

    Every file matching ``<prediction_dir>/<mode>/*`` is read as CSV in
    sorted filename order, its ``id`` column is dropped, and the remaining
    columns are combined according to ``stacking_mode``.

    Parameters
    ----------
    prediction_dir : str
        Directory containing one sub-directory per mode, plus
        ``valid_split.csv`` (read when ``mode='valid'``) and
        ``sample_submission.csv`` (read when ``mode='test'``).
    mode : {'valid', 'test'}
        Which sub-directory of prediction files to load.
    valid_columns : list of str, optional
        Columns of ``valid_split.csv`` returned as targets. Required when
        ``mode='valid'``; ignored otherwise.
    stacking_mode : {'flat', 'rnn'}
        ``'flat'`` concatenates the files side by side with ``np.hstack``;
        ``'rnn'`` stacks them along a new third axis with ``np.stack``,
        giving ``(rows, columns, files)``.

    Returns
    -------
    tuple
        ``(X, y)`` for ``mode='valid'``, where ``y`` holds the values of
        ``valid_columns``; ``(X, sample_submission)`` for ``mode='test'``,
        where ``sample_submission`` is the loaded DataFrame.

    Raises
    ------
    NotImplementedError
        If ``mode`` or ``stacking_mode`` is not one of the supported options.
    ValueError
        If ``valid_columns`` is missing in valid mode, or if no prediction
        files are found for ``mode``.
    """
    # Validate the options up front so a typo fails immediately with a clear
    # error instead of surfacing as an obscure numpy failure after file I/O.
    if mode not in ('valid', 'test'):
        raise NotImplementedError("""only mode options 'valid' and 'test' are supported""")
    if stacking_mode not in ('flat', 'rnn'):
        raise NotImplementedError("""only stacking_mode options 'flat' and 'rnn' are supported""")
    if mode == 'valid' and valid_columns is None:
        raise ValueError("valid_columns must be provided when mode='valid'")

    # Sorted so the column/file order of X is stable between the valid and
    # test calls (assuming both sub-directories use the same file names).
    filepaths = sorted(glob.glob(os.path.join(prediction_dir, mode, '*')))
    if not filepaths:
        raise ValueError(
            'no prediction files found in {}'.format(os.path.join(prediction_dir, mode)))

    # drop() returns a new frame; nothing is mutated in place.
    predictions = [pd.read_csv(filepath).drop('id', axis=1).values
                   for filepath in filepaths]

    if stacking_mode == 'flat':
        X = np.hstack(predictions)
    else:
        X = np.stack(predictions, axis=2)

    # Only read the auxiliary file the requested mode actually needs.
    if mode == 'valid':
        valid_labels = pd.read_csv(os.path.join(prediction_dir, 'valid_split.csv'))
        y = valid_labels[valid_columns].values
        return X, y

    sample_submission = pd.read_csv(os.path.join(prediction_dir, 'sample_submission.csv'))
    return X, sample_submission

In [None]:
# Target label columns; passed to read_predictions() as `valid_columns` to
# select the ground-truth columns of valid_split.csv.
LABEL_COLUMNS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Directory with the per-model prediction CSVs consumed by read_predictions().
SINGLE_DIR = '/public/toxic_comments/single_model_predictions_20180226'

# NOTE(review): presumably the destination of the ensemble submission file;
# it is not referenced in the cells shown here -- confirm against later cells.
ENSEMBLE_SUBMISSION_PATH = '/output/catboost_submission.csv'

# Validation Set Level 1 Predictions

In [None]:
# Stack every per-model validation prediction file into one feature matrix
# and pull the matching ground-truth label columns.
X_valid, y_valid_multilabel = read_predictions(
    prediction_dir=SINGLE_DIR,
    mode='valid',
    valid_columns=LABEL_COLUMNS,
)

# Test Set Level 1 Predictions

In [None]:
# Stack every per-model test prediction file; the sample submission frame
# is returned alongside the features.
X_test, sample_submission = read_predictions(
    prediction_dir=SINGLE_DIR,
    mode='test',
)

# Ensemble Training

In [None]:
! pip install catboost

In [None]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; ShuffleSplit
# lives in `sklearn.model_selection` (available since 0.18).
from sklearn.model_selection import ShuffleSplit
from catboost import CatBoostClassifier


def fit_cv(X, y, n_splits=5):
    """Fit one CatBoost classifier per label column on repeated random splits.

    For each of ``n_splits`` shuffle splits (20% of the rows held out,
    ``random_state=0``) a separate ``CatBoostClassifier`` is trained for
    every column of ``y``. The held-out rows are then scored with
    ``multi_roc_auc_score`` and the score is printed.

    Parameters
    ----------
    X : array
        Feature array, indexed by row with integer index arrays.
    y : array
        2-D label array with one column per label; rows align with ``X``.
    n_splits : int, default 5
        Number of random train/validation splits.

    Returns
    -------
    scores : list
        One mean ROC AUC per split.
    estimators : list of list
        ``estimators[k][i]`` is the classifier fitted on split ``k`` for
        label column ``i``.
    """
    estimators, scores = [], []
    # Derive the label count from the data instead of hard-coding 6.
    n_labels = y.shape[1]
    splitter = ShuffleSplit(n_splits=n_splits, test_size=0.2, random_state=0)

    for train, valid in splitter.split(X):
        X_train_, y_train_ = X[train], y[train]
        X_valid_, y_valid_ = X[valid], y[valid]

        estimators_fold = []
        for i in tqdm(range(n_labels)):
            estimator = CatBoostClassifier(iterations=500,
                                           learning_rate=0.05,
                                           depth=3,
                                           l2_leaf_reg=5,
                                           rsm=0.2,
                                           model_size_reg=2.0,
                                           border_count=200,
                                           verbose=False)
            estimator.fit(X_train_, y_train_[:, i])
            estimators_fold.append(estimator)
        estimators.append(estimators_fold)

        # Column 1 of predict_proba is used as the score for each label
        # (taken to be the positive-class probability).
        y_valid_pred = np.column_stack(
            [estimator.predict_proba(X_valid_)[:, 1] for estimator in estimators_fold])
        score = multi_roc_auc_score(y_valid_, y_valid_pred)
        print(score)
        scores.append(score)
    return scores, estimators

# Train the per-label ensemble on the stacked validation predictions.
scores, estimators = fit_cv(X=X_valid, y=y_valid_multilabel)