In [None]:
import os

from tqdm import tqdm_notebook as tqdm
import numpy as np
import pandas as pd

In [None]:
from sklearn.metrics import roc_auc_score

def multi_roc_auc_score(y_true, y_pred):
    """Return the unweighted mean of the per-column ROC AUC scores.

    Args:
        y_true: 2-D array of ground-truth labels, one column per label.
        y_pred: 2-D array of predicted scores with the same shape as `y_true`.

    Returns:
        The mean over columns of `roc_auc_score(y_true[:, i], y_pred[:, i])`.

    Raises:
        AssertionError: if the two arrays do not have the same shape.
    """
    assert y_true.shape == y_pred.shape
    n_columns = y_true.shape[1]
    # Score every label column on its own, then average the column scores.
    per_column_auc = [
        roc_auc_score(y_true[:, col], y_pred[:, col])
        for col in range(n_columns)
    ]
    return np.array(per_column_auc).mean()

In [None]:
# Names of the target columns, in the order used for labels and for the submission.
LABEL_COLUMNS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Root directory holding the single-model prediction files.
SINGLE_DIR = '/public/toxic_comments/single_model_predictions'


def _valid_path(filename):
    """Build the path of `filename` inside the 'valid' sub-directory of SINGLE_DIR."""
    return os.path.join(SINGLE_DIR, 'valid', filename)


def _test_path(filename):
    """Build the path of `filename` inside the 'test' sub-directory of SINGLE_DIR."""
    return os.path.join(SINGLE_DIR, 'test', filename)


# Per-model prediction files for the validation split.
BWL_VALID_PATH = _valid_path('bad_word_logreg.csv')
BWCL_VALID_PATH = _valid_path('logreg_bad_word_count.csv')
CVDCNN_VALID_PATH = _valid_path('char_vdcnn.csv')
CLOGREG_VALID_PATH = _valid_path('logreg_count.csv')
GDPCNN_VALID_PATH = _valid_path('glove_dpcnn.csv')
GLSTM_VALID_PATH = _valid_path('glove_lstm.csv')
GSCNN_VALID_PATH = _valid_path('glove_scnn.csv')
TFIDIF_LOGREG_VALID_PATH = _valid_path('logreg_tfidf.csv')
WLSTM_VALID_PATH = _valid_path('word_lstm.csv')

# Labels file for the validation split.
VALID_LABELS_PATH = _valid_path('valid_split.csv')

# Per-model prediction files for the test set (same file names as above).
BWL_TEST_PATH = _test_path('bad_word_logreg.csv')
BWCL_TEST_PATH = _test_path('logreg_bad_word_count.csv')
CVDCNN_TEST_PATH = _test_path('char_vdcnn.csv')
CLOGREG_TEST_PATH = _test_path('logreg_count.csv')
GDPCNN_TEST_PATH = _test_path('glove_dpcnn.csv')
GLSTM_TEST_PATH = _test_path('glove_lstm.csv')
GSCNN_TEST_PATH = _test_path('glove_scnn.csv')
TFIDIF_LOGREG_TEST_PATH = _test_path('logreg_tfidf.csv')
WLSTM_TEST_PATH = _test_path('word_lstm.csv')

# Submission template that is read, and the location the ensemble submission is written to.
SAMPLE_SUBMISSION_PATH = _test_path('sample_submission.csv')
ENSEMBLE_SUBMISSION_PATH = '/output/submission.csv'

# Validation Set Level 1 Predictions

In [None]:
# Read every single-model prediction file for the validation split,
# one DataFrame per model, in the same order as the path constants.
(
    bad_word_logreg_valid,
    bad_word_count_logreg_valid,
    char_vdcnn_valid,
    count_logreg_valid,
    glove_dpcnn_valid,
    glove_lstm_valid,
    glove_scnn_valid,
    tfidf_logreg_valid,
    word_lstm_valid,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        BWL_VALID_PATH,
        BWCL_VALID_PATH,
        CVDCNN_VALID_PATH,
        CLOGREG_VALID_PATH,
        GDPCNN_VALID_PATH,
        GLSTM_VALID_PATH,
        GSCNN_VALID_PATH,
        TFIDIF_LOGREG_VALID_PATH,
        WLSTM_VALID_PATH,
    )
]

# Frame read from the validation labels file.
labels_valid = pd.read_csv(VALID_LABELS_PATH)

In [None]:
# Level-1 prediction frames whose non-id columns become the ensemble's input features.
valid_prediction_frames = [
    bad_word_logreg_valid,
    bad_word_count_logreg_valid,
    char_vdcnn_valid,
    count_logreg_valid,
    glove_dpcnn_valid,
    glove_lstm_valid,
    glove_scnn_valid,
    tfidf_logreg_valid,
    word_lstm_valid,
]

# Drop the 'id' column from each frame and place the remaining columns side by side.
# NOTE(review): presumably every file lists the same ids in the same row order — confirm.
X_valid = np.hstack([frame.drop('id', axis=1) for frame in valid_prediction_frames])

# Target matrix: one column per entry of LABEL_COLUMNS.
y_valid_multilabel = labels_valid[LABEL_COLUMNS].values

# Test Set Level 1 Predictions

In [None]:
# Read every single-model prediction file for the test set,
# one DataFrame per model, in the same order as the path constants.
(
    bad_word_logreg_test,
    bad_word_count_logreg_test,
    char_vdcnn_test,
    count_logreg_test,
    glove_dpcnn_test,
    glove_lstm_test,
    glove_scnn_test,
    tfidf_logreg_test,
    word_lstm_test,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        BWL_TEST_PATH,
        BWCL_TEST_PATH,
        CVDCNN_TEST_PATH,
        CLOGREG_TEST_PATH,
        GDPCNN_TEST_PATH,
        GLSTM_TEST_PATH,
        GSCNN_TEST_PATH,
        TFIDIF_LOGREG_TEST_PATH,
        WLSTM_TEST_PATH,
    )
]

In [None]:
# Level-1 prediction frames for the test set, in the same model order as the validation features.
test_prediction_frames = [
    bad_word_logreg_test,
    bad_word_count_logreg_test,
    char_vdcnn_test,
    count_logreg_test,
    glove_dpcnn_test,
    glove_lstm_test,
    glove_scnn_test,
    tfidf_logreg_test,
    word_lstm_test,
]

# Drop the 'id' column from each frame and place the remaining columns side by side.
# NOTE(review): presumably every file lists the same ids in the same row order — confirm.
X_test = np.hstack([frame.drop('id', axis=1) for frame in test_prediction_frames])

# Ensemble Training

In [None]:
! pip install catboost

In [None]:
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier

def fit_cv(X,y,n_splits=10):
    """Cross-validate one CatBoost classifier per label column.

    For each KFold split, a separate `CatBoostClassifier` is fitted on the
    training part for every column of `y`, and the fold is scored on its
    held-out part with `multi_roc_auc_score`.

    Args:
        X: 2-D feature array, indexable by integer index arrays (`X[train]`).
        y: 2-D label array with one column per label, indexable the same way.
        n_splits: number of folds passed to `KFold`.

    Returns:
        A tuple `(scores, estimators)`:
        - `scores`: list with one `multi_roc_auc_score` value per fold.
        - `estimators`: list with one entry per fold; each entry is the list of
          fitted classifiers for that fold, ordered like the columns of `y`.
    """
    # Derive the label count from the data instead of hard-coding 6, so the
    # function works for any number of label columns.
    n_labels = y.shape[1]

    estimators, scores = [], []
    kf = KFold(n_splits=n_splits)
    for train, valid in kf.split(X):
        X_train_, y_train_ = X[train], y[train]
        X_valid_, y_valid_ = X[valid], y[valid]

        # One independent binary classifier per label column.
        estimators_fold = []
        for i in tqdm(range(n_labels)):
            estimator = CatBoostClassifier(iterations=500,
                                           learning_rate=0.02,
                                           depth=2,
                                           verbose=False)
            estimator.fit(X_train_, y_train_[:, i])
            estimators_fold.append(estimator)
        estimators.append(estimators_fold)

        # Column 1 of predict_proba is taken as the score of each label;
        # stacking along axis 1 yields an array shaped like y_valid_.
        y_valid_pred = np.stack(
            [estimator.predict_proba(X_valid_)[:, 1] for estimator in estimators_fold],
            axis=1,
        )
        scores.append(multi_roc_auc_score(y_valid_, y_valid_pred))
    return scores, estimators

# Cross-validated fit on the validation-split features; keeps the per-fold
# scores and the fitted estimators of every fold.
scores, estimators = fit_cv(X=X_valid, y=y_valid_multilabel)

In [None]:
# Summarise the per-fold scores by their mean and standard deviation.
score_mean = np.mean(scores)
score_std = np.std(scores)
print('score average {}\nscore std {}'.format(score_mean, score_std))

# Ensemble Prediction

In [None]:
# For every fold, predict the test set with each of that fold's per-label
# estimators and keep column 1 of predict_proba, giving one
# (n_rows, n_labels) array per fold.
fold_test_predictions = [
    np.stack(
        [label_model.predict_proba(X_test) for label_model in fold_models],
        axis=1,
    )[..., 1]
    for fold_models in estimators
]

# Bagging: average the per-fold predictions element-wise.
y_bagged = np.mean(np.stack(fold_test_predictions), axis=0)

# Submission

In [None]:
# Start from the sample submission and overwrite its label columns with the
# bagged ensemble predictions.
# NOTE(review): assumes the rows of y_bagged are in the same order as the rows
# of the sample submission — confirm.
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
submission[LABEL_COLUMNS] = y_bagged # this gets 0.9849 on LB
# `index` is documented as a bool; pass False explicitly rather than relying on
# None being falsy, so the row index is not written to the file.
submission.to_csv(ENSEMBLE_SUBMISSION_PATH, index=False)
submission.head()