In [1]:
from tqdm import tqdm
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

In [2]:
IN_TEST = Path('./data/jigsaw_toxic_comments/test.csv')

MODEL_OUTPUT_DIR = Path('./data/model_outputs/full/')

OUT_CNN = MODEL_OUTPUT_DIR / 'pred_cnn.csv'
OUT_LSTM = MODEL_OUTPUT_DIR / 'pred_lstm.csv'
OUT_LOGREG = MODEL_OUTPUT_DIR / 'pred_logreg.csv'
OUT_NB_SVM = MODEL_OUTPUT_DIR / 'pred_nb_svm.csv'
OUT_BERT = MODEL_OUTPUT_DIR / 'pred_bert.csv'

In [3]:
TEST = pd.read_csv(IN_TEST)
LABEL_COLS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

MODEL_OUT_PROB = {
    'CNN':    OUT_CNN, 
    'LSTM':   OUT_LSTM, 
    'LOGREG': OUT_LOGREG, 
    # 'NB_SVM': OUT_NB_SVM,
    'BERT':   OUT_BERT
}

MODEL_OUT_PROB = {
    k: pd.read_csv(v)[LABEL_COLS].to_numpy() for k, v in MODEL_OUT_PROB.items()
}

N_TE = len(TEST)

In [4]:
def prob_to_class(prob):
    return 1 if prob >= 0.5 else 0

def compute_ensemble_pred(idx = 0, col = 0):
    prob_sum = 0

    for model in MODEL_OUT_PROB.values():
        prob_sum += model[idx][col]

    return prob_sum / len(MODEL_OUT_PROB)

In [5]:
bagged_pred = np.zeros((len(TEST), len(LABEL_COLS)))

for c_idx, col in enumerate(LABEL_COLS):
    for x_idx in tqdm(range(len(TEST.id))):
        bagged_pred[x_idx, c_idx] = compute_ensemble_pred(x_idx, c_idx)

100%|██████████| 153164/153164 [00:00<00:00, 1077480.29it/s]
100%|██████████| 153164/153164 [00:00<00:00, 1171345.75it/s]
100%|██████████| 153164/153164 [00:00<00:00, 1222534.83it/s]
100%|██████████| 153164/153164 [00:00<00:00, 1241429.88it/s]
100%|██████████| 153164/153164 [00:00<00:00, 1239461.04it/s]
100%|██████████| 153164/153164 [00:00<00:00, 1240435.10it/s]


In [6]:
pd.merge(
    left=TEST.id, 
    right=pd.DataFrame(bagged_pred, columns=LABEL_COLS), 
    how='outer', 
    left_index=True,
    right_index=True
).head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.998422,0.608779,0.989709,0.232253,0.950925,0.45137
1,0000247867823ef7,0.002211,0.00073,0.00055,0.000239,0.004273,0.002416
2,00013b17ad220c46,0.007113,0.001608,0.005682,0.000313,0.00223,0.000937
3,00017563c3f7919a,0.002408,0.003137,0.001147,0.00059,0.001676,0.000292
4,00017695ad8997eb,0.011703,0.000932,0.004295,0.000647,0.005174,0.000488


In [7]:
pd.merge(
    left=TEST.id, 
    right=pd.DataFrame(bagged_pred, columns=LABEL_COLS), 
    how='outer', 
    left_index=True,
    right_index=True
).to_csv('temp.csv', index=False)

In [8]:
def compute_test_auc(pred):
    return roc_auc_score(TEST[LABEL_COLS].values, pred)

scores = pd.DataFrame(columns=['Model', 'AUC Score'])
scores.set_index('Model', inplace=True)

for k, v in MODEL_OUT_PROB.items():
    scores.loc[k] = [compute_test_auc(v)]

scores.loc['ENSEMBLE'] = [compute_test_auc(bagged_pred)]

scores *= 100
print(scores.to_markdown())

KeyError: "None of [Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',\n       'identity_hate'],\n      dtype='object')] are in the [columns]"