In [None]:
import os

import numpy as np
import pandas as pd
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

In [None]:
def multi_log_loss(y_true, y_pred):
    """Return the unweighted mean of the per-column log losses.

    Each column is scored independently with ``log_loss`` and the resulting
    column losses are averaged.

    Parameters
    ----------
    y_true : array-like, 2-D
        Ground-truth values, one column per label.
    y_pred : array-like, 2-D
        Predicted values; must have exactly the same shape as ``y_true``.

    Returns
    -------
    numpy.float64
        Mean of the column-wise log losses.

    Raises
    ------
    AssertionError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    # Positional slicing (``[:, i]``) only works on ndarrays; coercing here
    # lets callers pass DataFrames or nested lists as well. ndarrays pass
    # through np.asarray unchanged.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    assert y_true.shape == y_pred.shape

    column_losses = [
        log_loss(y_true[:, column], y_pred[:, column])
        for column in range(y_true.shape[1])
    ]
    return np.array(column_losses).mean()

In [None]:
# Target columns, in the order used for the label matrix and the submission.
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Root directory holding the single-model (level-1) prediction files.
# Set the TOXIC_SINGLE_DIR environment variable to run against another
# location; when it is unset the original hard-coded path is used.
SINGLE_DIR = os.environ.get('TOXIC_SINGLE_DIR',
                            '/public/toxic_comments/single_model_predictions')

# Validation-split predictions, one CSV per level-1 model.
# NOTE(review): the trailing numbers look like each model's validation loss -
# confirm what metric they are before relying on them.
BWCL_VALID_PATH = os.path.join(SINGLE_DIR, 'valid', 'logreg_bad_word_count.csv') # 0.0669
CVDCNN_VALID_PATH = os.path.join(SINGLE_DIR, 'valid', 'char_vdcnn.csv') # 0.0435
CLOGREG_VALID_PATH = os.path.join(SINGLE_DIR, 'valid', 'logreg_count.csv') # 0.126
GDPCNN_VALID_PATH = os.path.join(SINGLE_DIR, 'valid', 'glove_dpcnn.csv') # 0.0422
GLSTM_VALID_PATH = os.path.join(SINGLE_DIR, 'valid', 'glove_lstm.csv') # 0.0417
GSCNN_VALID_PATH = os.path.join(SINGLE_DIR, 'valid', 'glove_scnn.csv') # 0.0427
TFIDIF_LOGREG_VALID_PATH = os.path.join(SINGLE_DIR, 'valid', 'logreg_tfidf.csv') # 0.0459
WLSTM_VALID_PATH = os.path.join(SINGLE_DIR, 'valid', 'word_lstm.csv') # 0.0486

# File from which the validation label columns are read.
VALID_LABELS_PATH = os.path.join(SINGLE_DIR, 'valid', 'valid_split.csv')

# Test-set predictions: same file names as above, under the 'test' directory.
BWCL_TEST_PATH = os.path.join(SINGLE_DIR, 'test', 'logreg_bad_word_count.csv')
CVDCNN_TEST_PATH = os.path.join(SINGLE_DIR, 'test', 'char_vdcnn.csv')
CLOGREG_TEST_PATH = os.path.join(SINGLE_DIR, 'test', 'logreg_count.csv')
GDPCNN_TEST_PATH = os.path.join(SINGLE_DIR, 'test', 'glove_dpcnn.csv')
GLSTM_TEST_PATH = os.path.join(SINGLE_DIR, 'test', 'glove_lstm.csv')
GSCNN_TEST_PATH = os.path.join(SINGLE_DIR, 'test', 'glove_scnn.csv')
TFIDIF_LOGREG_TEST_PATH = os.path.join(SINGLE_DIR, 'test', 'logreg_tfidf.csv')
WLSTM_TEST_PATH = os.path.join(SINGLE_DIR, 'test', 'word_lstm.csv')

# Submission template that is read, and the path the ensemble output is written to.
SAMPLE_SUBMISSION_PATH = os.path.join(SINGLE_DIR, 'test', 'sample_submission.csv')
ENSEMBLE_SUBMISSION_PATH = os.path.join(SINGLE_DIR, 'submission.csv')

# Validation Set Level 1 Predictions

In [None]:
# Read every level-1 model's validation predictions; the files are read in
# the order listed and bound to the matching name on the left.
(
    bad_word_count_logreg_valid,
    char_vdcnn_valid,
    count_logreg_valid,
    glove_dpcnn_valid,
    glove_lstm_valid,
    glove_scnn_valid,
    tfidf_logreg_valid,
    word_lstm_valid,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        BWCL_VALID_PATH,
        CVDCNN_VALID_PATH,
        CLOGREG_VALID_PATH,
        GDPCNN_VALID_PATH,
        GLSTM_VALID_PATH,
        GSCNN_VALID_PATH,
        TFIDIF_LOGREG_VALID_PATH,
        WLSTM_VALID_PATH,
    )
)

# Frame from which the validation label columns are taken.
labels_valid = pd.read_csv(VALID_LABELS_PATH)

In [None]:
# Preview the first five rows of one prediction frame to inspect its columns.
bad_word_count_logreg_valid.head(n=5)

In [None]:
# Level-1 prediction frames, in the fixed order that defines the column
# layout of the stacked feature matrix.
valid_prediction_frames = [
    bad_word_count_logreg_valid,
    char_vdcnn_valid,
    count_logreg_valid,
    glove_dpcnn_valid,
    glove_lstm_valid,
    glove_scnn_valid,
    tfidf_logreg_valid,
    word_lstm_valid,
]

# np.hstack pairs rows purely by position, so fail loudly if any model's file
# lists different ids (or the same ids in another order) than the first one.
valid_ids = valid_prediction_frames[0]['id'].values
for frame_position, frame in enumerate(valid_prediction_frames):
    if not np.array_equal(frame['id'].values, valid_ids):
        raise ValueError(
            'validation prediction frame #{} is not row-aligned with the '
            'first frame'.format(frame_position))

# The labels must describe the same rows: compare ids when the label frame
# has an 'id' column, otherwise at least require the same number of rows.
if 'id' in labels_valid.columns:
    labels_row_aligned = np.array_equal(labels_valid['id'].values, valid_ids)
else:
    labels_row_aligned = len(labels_valid) == len(valid_ids)
if not labels_row_aligned:
    raise ValueError(
        'validation labels are not row-aligned with the level-1 predictions')

# One row per example; columns are every model's non-id columns, side by side.
X_valid = np.hstack([frame.drop('id', axis=1) for frame in valid_prediction_frames])

y_valid_multilabel = labels_valid[LABEL_COLUMNS].values

In [None]:
# Display the (rows, columns) shape of the stacked validation matrix.
np.shape(X_valid)

# Test Set Level 1 Predictions

In [None]:
# Read every level-1 model's test-set predictions; the files are read in the
# order listed and bound to the matching name on the left.
(
    bad_word_count_logreg_test,
    char_vdcnn_test,
    count_logreg_test,
    glove_dpcnn_test,
    glove_lstm_test,
    glove_scnn_test,
    tfidf_logreg_test,
    word_lstm_test,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        BWCL_TEST_PATH,
        CVDCNN_TEST_PATH,
        CLOGREG_TEST_PATH,
        GDPCNN_TEST_PATH,
        GLSTM_TEST_PATH,
        GSCNN_TEST_PATH,
        TFIDIF_LOGREG_TEST_PATH,
        WLSTM_TEST_PATH,
    )
)

In [None]:
# Level-1 test prediction frames, listed in the same model order that was
# used to build X_valid so the feature columns line up with the training data.
test_prediction_frames = [
    bad_word_count_logreg_test,
    char_vdcnn_test,
    count_logreg_test,
    glove_dpcnn_test,
    glove_lstm_test,
    glove_scnn_test,
    tfidf_logreg_test,
    word_lstm_test,
]

# np.hstack pairs rows purely by position, so fail loudly if any model's file
# lists different ids (or the same ids in another order) than the first one.
test_ids = test_prediction_frames[0]['id'].values
for frame_position, frame in enumerate(test_prediction_frames):
    if not np.array_equal(frame['id'].values, test_ids):
        raise ValueError(
            'test prediction frame #{} is not row-aligned with the '
            'first frame'.format(frame_position))

# One row per example; columns are every model's non-id columns, side by side.
X_test = np.hstack([frame.drop('id', axis=1) for frame in test_prediction_frames])

# Ensemble Training

In [None]:
# Fit one independent binary logistic-regression meta-model per label column.
# Iterating over the columns of the label matrix (instead of a hard-coded
# count of 6) keeps the loop in step with LABEL_COLUMNS if labels change.
# NOTE(review): the meta-models are fitted on the whole validation split and
# nothing in this notebook scores them on held-out data - confirm that is intended.
estimators = []
for y_valid_one_label in y_valid_multilabel.T:
    estimator = LogisticRegression()
    estimator.fit(X_valid, y_valid_one_label)
    estimators.append(estimator)

# Ensemble Prediction

In [None]:
# For each fitted per-label model keep column 1 of predict_proba (the
# probability of the second entry in the model's classes_), then place the
# per-label vectors side by side: one row per test example, one column per model.
y_test_pred = np.stack(
    [fitted_model.predict_proba(X_test)[:, 1] for fitted_model in estimators],
    axis=1,
)

# Submission

In [None]:
# Start from the sample submission so the output keeps its columns and ids.
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)

# Predictions are assigned by position, so refuse to write a file whose ids
# are in a different order than the level-1 test predictions they came from.
if 'id' in submission.columns and not np.array_equal(
        submission['id'].values, bad_word_count_logreg_test['id'].values):
    raise ValueError(
        'sample submission ids are not row-aligned with the test predictions')

submission[LABEL_COLUMNS] = y_test_pred
# index=False is the documented way to omit the row index from the CSV.
submission.to_csv(ENSEMBLE_SUBMISSION_PATH, index=False)
submission.head()