In [26]:
import pandas as pd
from pathlib import Path
from collections import defaultdict
from sklearn.metrics import roc_curve, accuracy_score, auc, f1_score, recall_score, precision_score

In [27]:
IN_TEST = Path('./data/jigsaw_toxic_comments/test.csv')

MODEL_OUTPUT_DIR = Path('./data/model_outputs/full/')

OUT_CNN = MODEL_OUTPUT_DIR / 'pred_cnn.csv'
OUT_LSTM = MODEL_OUTPUT_DIR / 'pred_lstm.csv'
OUT_LOGREG = MODEL_OUTPUT_DIR / 'pred_logreg.csv'
OUT_NB_SVM = MODEL_OUTPUT_DIR / 'pred_nb_svm.csv'
OUT_BERT = MODEL_OUTPUT_DIR / 'pred_bert.csv'
OUT_ENSEM_AVG = MODEL_OUTPUT_DIR / 'pred_ensem_avg.csv'

In [28]:
TEST = pd.read_csv(IN_TEST)
LABEL_COLS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

MODEL_OUT_PROB = {
    'CNN':                OUT_CNN, 
    'LSTM':               OUT_LSTM, 
    'LOGREG':             OUT_LOGREG, 
    'NB_SVM':             OUT_NB_SVM,
    'BERT':               OUT_BERT,
    'ENSEMBLE_AVERAGING': OUT_ENSEM_AVG
}

MODEL_OUT_PROB = {
    k: pd.read_csv(v)[LABEL_COLS] for k, v in MODEL_OUT_PROB.items()
}

N_TE = len(TEST)

In [29]:
for model in MODEL_OUT_PROB.values():
    model.insert(0, 'id', TEST.id)

TEST = pd.merge(left=pd.read_csv(IN_TEST), right=pd.read_csv(Path('./data/jigsaw_toxic_comments/test_labels.csv')), left_on='id', right_on='id')
TEST = TEST[TEST.toxic != -1]

MODEL_OUT_PROB = {
    k: v[v.id.isin(TEST.id)] for k, v in MODEL_OUT_PROB.items()
}

In [30]:
v = MODEL_OUT_PROB['CNN']
v

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
5,0001ea8717f6de06,0.000058,2.783060e-09,0.000015,9.913385e-08,3.294415e-06,1.019644e-07
7,000247e83dcc1211,0.061447,4.050740e-06,0.001258,7.408154e-05,2.917345e-03,1.290226e-04
11,0002f87b16116a7f,0.016210,5.565348e-05,0.003242,2.949872e-04,3.047977e-03,4.290054e-04
13,0003e1cccfd5a40a,0.007068,1.208248e-06,0.000634,3.170063e-05,5.884999e-04,5.931168e-05
14,00059ace3e3e9a53,0.000009,2.981097e-10,0.000004,7.004811e-09,7.122055e-07,9.064986e-09
...,...,...,...,...,...,...,...
153150,fff8f64043129fa2,0.000119,1.130469e-08,0.000036,2.012253e-07,1.092839e-05,3.026531e-07
153151,fff9d70fe0722906,0.954490,5.183419e-03,0.518844,7.832063e-04,5.646287e-01,7.741437e-03
153154,fffa8a11c4378854,0.942306,4.343872e-02,0.256488,8.719553e-02,5.757344e-01,2.896482e-01
153155,fffac2a094c8e0e2,0.989430,1.584197e-01,0.913899,2.562255e-02,7.776085e-01,8.603556e-02


In [31]:
model_scores = defaultdict(None)

for model in MODEL_OUT_PROB:
    scores = defaultdict(list)
    y_pred_prob = MODEL_OUT_PROB[model]

    for col in LABEL_COLS:
        y_pred = y_pred_prob[col].apply(lambda x: 1 if x >= 0.5 else 0)
        fpr, tpr, _ = roc_curve(TEST[col].values, y_pred_prob[col])
        scores['train_accuracy'] += accuracy_score(y_pred, TEST[col].values),
        scores['test_accuracy'] += accuracy_score(y_pred, TEST[col].values),
        scores['auc_score'] += auc(fpr, tpr),
        scores['f1_score'] += f1_score(TEST[col].values, y_pred, average="weighted"),
        scores['precision'] += precision_score(TEST[col].values, y_pred, average="weighted"),
        scores['recall'] += recall_score(TEST[col].values, y_pred, average="weighted"),

    scores = pd.DataFrame.from_dict(scores)
    scores['label']= LABEL_COLS
    scores.set_index('label', inplace=True)
    model_scores[model] = scores

  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
print(pd.DataFrame(
    [(k, v.to_json(indent=2)) for k, v in model_scores.items()],
    columns=['model', 'scores']
).set_index('model').to_json(indent=2))

{
  "scores":{
    "CNN":"{\n  \"train_accuracy\":{\n    \"toxic\":0.9113132639,\n    \"severe_toxic\":0.9922629654,\n    \"obscene\":0.9555003282,\n    \"threat\":0.9967019913,\n    \"insult\":0.9494201132,\n    \"identity_hate\":0.9897933665\n  },\n  \"test_accuracy\":{\n    \"toxic\":0.9113132639,\n    \"severe_toxic\":0.9922629654,\n    \"obscene\":0.9555003282,\n    \"threat\":0.9967019913,\n    \"insult\":0.9494201132,\n    \"identity_hate\":0.9897933665\n  },\n  \"auc_score\":{\n    \"toxic\":0.9444422262,\n    \"severe_toxic\":0.9806841113,\n    \"obscene\":0.9561881565,\n    \"threat\":0.9475504237,\n    \"insult\":0.9475378204,\n    \"identity_hate\":0.949589927\n  },\n  \"f1_score\":{\n    \"toxic\":0.9193164848,\n    \"severe_toxic\":0.9925845109,\n    \"obscene\":0.9572671936,\n    \"threat\":0.9950557107,\n    \"insult\":0.9521981373,\n    \"identity_hate\":0.9862379036\n  },\n  \"precision\":{\n    \"toxic\":0.9342240867,\n    \"severe_toxic\":0.992937487,\n    \"obscene