In [25]:
from tqdm import tqdm
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [26]:
# Labelled Jigsaw toxic-comment CSVs.
IN_TRAIN = Path('./data/jigsaw_toxic_comments/train.csv')
IN_TEST = Path('./data/jigsaw_toxic_comments/merged_test.csv')

# Per-model prediction files that the stacking meta-classifier is fitted on
# (paired with the held-out validation labels further down the notebook).
MODEL_OUT_VA_DIR = Path('./data/model_outputs/stack/')
OUT_VA_CNN = MODEL_OUT_VA_DIR.joinpath('pred_cnn.csv')
OUT_VA_LSTM = MODEL_OUT_VA_DIR.joinpath('pred_lstm.csv')
OUT_VA_LOGREG = MODEL_OUT_VA_DIR.joinpath('pred_logreg.csv')
OUT_VA_NB_SVM = MODEL_OUT_VA_DIR.joinpath('pred_nb_svm.csv')
# OUT_VA_BERT = MODEL_OUT_VA_DIR.joinpath('pred_bert.csv')

# Per-model prediction files for the test set; these are what gets scored
# and what the fitted meta-classifier predicts from.
MODEL_OUT_DIR = Path('./data/model_outputs/truncated/')
OUT_CNN = MODEL_OUT_DIR.joinpath('pred_cnn.csv')
OUT_LSTM = MODEL_OUT_DIR.joinpath('pred_lstm.csv')
OUT_LOGREG = MODEL_OUT_DIR.joinpath('pred_logreg.csv')
OUT_NB_SVM = MODEL_OUT_DIR.joinpath('pred_nb_svm.csv')
# OUT_BERT = MODEL_OUT_DIR.joinpath('pred_bert.csv')

In [27]:
# Rows of train.csv whose index label is <= TRAIN_LAST_INDEX stay in TRAIN;
# the remaining rows form VALID, whose labels are the targets the stacking
# meta-classifier is fitted on.
TRAIN_LAST_INDEX = 100_000

_train_all = pd.read_csv(IN_TRAIN)
_in_train = _train_all.index <= TRAIN_LAST_INDEX
TRAIN, VALID = _train_all[_in_train], _train_all[~_in_train]
TEST = pd.read_csv(IN_TEST)

LABEL_COLS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


def _load_label_probs(csv_path):
    """Read one prediction CSV and return its LABEL_COLS columns as a 2-D array.

    Columns are selected by name, so the returned array always follows the
    order of LABEL_COLS regardless of the column order in the file.
    """
    return pd.read_csv(csv_path)[LABEL_COLS].to_numpy()


# Prediction files, keyed by model name. Both dicts must list the same models
# in the same order: downstream code stacks their columns as features.
_VA_PRED_FILES = {
    'CNN':    OUT_VA_CNN,
    'LSTM':   OUT_VA_LSTM,
    'LOGREG': OUT_VA_LOGREG,
    'NB_SVM': OUT_VA_NB_SVM,
    # 'BERT':   OUT_VA_BERT
}

_TE_PRED_FILES = {
    'CNN':    OUT_CNN,
    'LSTM':   OUT_LSTM,
    'LOGREG': OUT_LOGREG,
    'NB_SVM': OUT_NB_SVM,
    # 'BERT':   OUT_BERT
}

# Model name -> array of predictions, one column per LABEL_COLS entry.
MODEL_OUT_VA_PROB = {
    name: _load_label_probs(path) for name, path in _VA_PRED_FILES.items()
}

MODEL_OUT_PROB = {
    name: _load_label_probs(path) for name, path in _TE_PRED_FILES.items()
}

N_TE = len(TEST)

# Fail fast if a prediction file is not row-aligned with the labels it will
# be paired with; otherwise the error surfaces much later, inside fit/scoring.
for _name, _probs in MODEL_OUT_VA_PROB.items():
    assert len(_probs) == len(VALID), (
        f'{_name}: {len(_probs)} validation predictions for {len(VALID)} VALID rows'
    )
for _name, _probs in MODEL_OUT_PROB.items():
    assert len(_probs) == N_TE, (
        f'{_name}: {len(_probs)} test predictions for {N_TE} TEST rows'
    )

In [28]:
# Stacking: for every label, fit a random-forest meta-classifier whose
# features are the base models' predictions for that label, then use it to
# predict the positive-class probability on the test set.

# Fixed seed so that Restart & Run All reproduces the same ENSEMBLE score.
STACK_SEED = 42

# One shared, ordered list of model names drives BOTH feature matrices, so
# the feature columns seen at fit time and at predict time cannot drift
# apart. A model missing from MODEL_OUT_PROB raises KeyError instead of
# silently misaligning features.
stack_models = list(MODEL_OUT_VA_PROB)

stacked_pred = np.zeros((len(TEST), len(LABEL_COLS)))
for c_idx, col in enumerate(LABEL_COLS):
    train_stack = np.column_stack([
        MODEL_OUT_VA_PROB[name][:, c_idx] for name in stack_models
    ])

    # NOTE(review): with max_leaf_nodes=4 a tree has at most 3 splits, so
    # max_depth=10 can never be the binding limit. Both values are kept
    # as-is so the model is unchanged; confirm which limit was intended.
    clf = RandomForestClassifier(
        n_estimators=500,
        max_depth=10,
        max_leaf_nodes=4,
        random_state=STACK_SEED,
    )
    clf.fit(train_stack, VALID[col].to_numpy())

    test_stack = np.column_stack([
        MODEL_OUT_PROB[name][:, c_idx] for name in stack_models
    ])
    # Column 1 of predict_proba is taken as the positive class; this assumes
    # both classes occur in VALID[col].
    stacked_pred[:, c_idx] = clf.predict_proba(test_stack)[:, 1]

In [29]:
def compute_test_auc(pred):
    """Return the ROC AUC of ``pred`` against the TEST labels.

    ``pred`` is scored against ``TEST[LABEL_COLS]`` with ``roc_auc_score``
    and its default arguments, so it is expected to hold one row per TEST
    row and one column per entry of LABEL_COLS.
    """
    y_true = TEST[LABEL_COLS].values
    auc = roc_auc_score(y_true, pred)
    return auc

# Test-set AUC for every base model, followed by the stacked ensemble.
# Collected in a plain dict first (insertion order = row order) and turned
# into a frame once, instead of growing an empty DataFrame row by row.
auc_by_model = {
    name: compute_test_auc(probs) for name, probs in MODEL_OUT_PROB.items()
}
auc_by_model['ENSEMBLE'] = compute_test_auc(stacked_pred)

scores = (
    pd.Series(auc_by_model, name='AUC Score')
    .rename_axis('Model')
    .mul(100)  # report AUC as a percentage
    .to_frame()
)
print(scores.to_markdown())

| Model    |   AUC Score |
|:---------|------------:|
| CNN      |     95.2601 |
| LSTM     |     97.2482 |
| LOGREG   |     97.9942 |
| NB_SVM   |     97.6329 |
| ENSEMBLE |     97.4374 |
