In [101]:
from tqdm import tqdm
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

In [102]:
IN_TEST = Path('./data/jigsaw_toxic_comments/merged_test.csv')

MODEL_OUTPUT_DIR = Path('./data/model_outputs')

OUT_CNN = MODEL_OUTPUT_DIR / 'pred_cnn.csv'
OUT_LSTM = MODEL_OUTPUT_DIR / 'pred_lstm.csv'
OUT_LOGREG = MODEL_OUTPUT_DIR / 'pred_logreg.csv'
OUT_NB_SVM = MODEL_OUTPUT_DIR / 'pred_nb_svm.csv'

In [103]:
TEST = pd.read_csv(IN_TEST)

PRED_CNN = pd.read_csv(OUT_CNN)
PRED_LTSM = pd.read_csv(OUT_LSTM)
PRED_LOGREG = pd.read_csv(OUT_LOGREG)
PRED_NB_SVM = pd.read_csv(OUT_NB_SVM)

LABEL_COLS = TEST.columns[2:].to_list()
N_TE = len(TEST)

MODEL_OUT_PROB = {
    'CNN':    PRED_CNN, 
    'LSTM':   PRED_LTSM, 
    'LOGREG': PRED_LOGREG, 
    'NB_SVM': PRED_NB_SVM
}

MODEL_OUT_PROB = {k: v[LABEL_COLS].to_numpy() for k, v in MODEL_OUT_PROB.items()}

In [104]:
def prob_to_class(prob):
    return 1 if prob >= 0.5 else 0

def compute_ensemble_pred(idx = 0, col = 0):
    prob_sum = 0

    for model in MODEL_OUT_PROB.values():
        prob_sum += model[idx][col]

    return prob_sum

In [105]:
bagged_pred = np.zeros((len(PRED_CNN), len(LABEL_COLS)))

for c_idx, col in enumerate(LABEL_COLS):
    for x_idx in tqdm(range(len(TEST.id))):
        bagged_pred[x_idx, c_idx] = compute_ensemble_pred(x_idx, c_idx)

100%|██████████| 63978/63978 [00:00<00:00, 1155506.10it/s]
100%|██████████| 63978/63978 [00:00<00:00, 1176670.25it/s]
100%|██████████| 63978/63978 [00:00<00:00, 1181119.14it/s]
100%|██████████| 63978/63978 [00:00<00:00, 1158984.60it/s]
100%|██████████| 63978/63978 [00:00<00:00, 1143179.86it/s]
100%|██████████| 63978/63978 [00:00<00:00, 1146437.48it/s]


In [106]:
def compute_test_auc(pred):
    print(f'{k}: {roc_auc_score(TEST[LABEL_COLS].values, v)}')

print("AUC Scores")
for k, v in MODEL_OUT_PROB.items():
    print(f'{k}: {roc_auc_score(TEST[LABEL_COLS].values, v)}')
print(roc_auc_score(TEST[LABEL_COLS].values, bagged_pred)) 

0.9237296784887028

In [107]:
bagged_pred[0]

array([0., 0., 0., 0., 0., 0.])