In [1]:
import sys
from pathlib import Path
# Make the parent of the current working directory importable. The `config`
# import below presumably relies on this (config.py is not shown here — confirm),
# so these two lines must stay ahead of it.
ROOT_DIR = Path().resolve().parents[0]
sys.path.append(str(ROOT_DIR))
import config as cfg

from datasets import load_from_disk
import pandas as pd
import os
from transformers import AutoModelForSequenceClassification, Trainer
import numpy as np
import pickle
from sklearn.metrics import roc_auc_score

# Run identifier: selects the `run_<N_RUN>` checkpoint and results sub-directories used below.
N_RUN = 3

# Load data

In [2]:
# Load the tokenized test split from the on-disk location configured in cfg
ds_test_tokenized = load_from_disk(dataset_path=cfg.PATH_DS_TEST_TOKENIZED)

In [3]:
# Read the test-set label table (one row per comment id) and display it
df_test_labels = pd.read_csv(filepath_or_buffer=cfg.PATH_DF_TEST_LABELS)
df_test_labels

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,-1,-1,-1,-1,-1,-1
153160,fffd7a9a6eb32c16,-1,-1,-1,-1,-1,-1
153161,fffda9e8d6fafa9e,-1,-1,-1,-1,-1,-1
153162,fffe8f1340a79fc2,-1,-1,-1,-1,-1,-1


# Predictions

In [4]:
def get_predictions(n_run, ds_test_tokenized):
    """Predict on the test set with every fold model of one run and ensemble them.

    For each fold in ``range(cfg.N_FOLDS)`` the final checkpoint is loaded from
    ``<cfg.PATH_CHECKPOINTS>/<cfg.MODEL_BASE>/run_<n_run>/fold_<fold_id>/model_final``,
    wrapped in a ``Trainer`` and used to predict on ``ds_test_tokenized``.

    Args:
        n_run: Identifier of the training run whose checkpoints are loaded. It is
            also the top-level key (``"run_<n_run>"``) of the returned dictionary.
        ds_test_tokenized: Tokenized test dataset handed to ``Trainer.predict``.

    Returns:
        A tuple ``(preds_folds, probs)`` where

        * ``preds_folds`` maps ``"run_<n_run>"`` -> ``"fold_<i>"`` -> a dict with
          the keys ``"model"``, ``"trainer"`` and ``"predictions"``;
        * ``probs`` is the element-wise sigmoid of the fold-averaged predictions
          (the raw predictions are treated as logits).
    """
    # Derive the key once from the *argument*. The previous version mixed the
    # parameter with the notebook-level N_RUN constant, which raised KeyError
    # whenever the two differed.
    run_key = f"run_{n_run}"
    fold_keys = [f"fold_{fold_id}" for fold_id in range(cfg.N_FOLDS)]

    preds_folds = {
        run_key: {
            fold_key: {"model": None, "trainer": None, "predictions": None}
            for fold_key in fold_keys
        }
    }

    for fold_id, fold_key in enumerate(fold_keys):
        print(f"Processing fold {fold_id}...")
        path_model_trained = os.path.join(
            cfg.PATH_CHECKPOINTS, cfg.MODEL_BASE, run_key, fold_key, "model_final"
        )
        # NOTE: the device is hard-coded, so a CUDA-capable GPU is required.
        model = AutoModelForSequenceClassification.from_pretrained(path_model_trained).to("cuda")
        model.eval()
        trainer = Trainer(model=model, compute_metrics=None)
        preds = trainer.predict(ds_test_tokenized).predictions

        # Model and trainer are kept alongside the predictions because callers
        # receive (and may persist) the full dictionary.
        fold_entry = preds_folds[run_key][fold_key]
        fold_entry["model"] = model
        fold_entry["trainer"] = trainer
        fold_entry["predictions"] = preds

    # Ensemble: average the fold outputs first, then squash with a sigmoid.
    predictions_list = [preds_folds[run_key][fold_key]["predictions"] for fold_key in fold_keys]
    logits_mean = np.mean(predictions_list, axis=0)
    probs = 1 / (1 + np.exp(-logits_mean))

    return preds_folds, probs

In [5]:
# Run all fold models of run N_RUN on the test set; keep both the per-fold
# results dictionary and the ensembled probabilities
preds_folds, probs = get_predictions(
    n_run=N_RUN,
    ds_test_tokenized=ds_test_tokenized,
)

Processing fold 0...


Processing fold 1...


Processing fold 2...


Processing fold 3...


Processing fold 4...


In [7]:
# Persist the per-fold results dictionary to the configured path using pickle
with open(cfg.PATH_PREDICTIONS_DICT, mode="wb") as fh_out:
    pickle.dump(preds_folds, fh_out)

In [8]:
# Create a DataFrame with the predictions
# The ids are attached row by row (aligned on the default integer index), so the
# prediction matrix must contain exactly one row per test id. Without this check
# a mismatch would silently produce missing or dropped ids in the submission.
assert len(probs) == len(df_test_labels), (
    f"Row mismatch: {len(probs)} predictions vs {len(df_test_labels)} test ids"
)
df_submission = pd.DataFrame(probs, columns=cfg.LABEL_COLS)
df_submission.insert(0, "id", df_test_labels["id"])

# Write the submission under <PATH_RESULTS>/run_<N_RUN>/submission/.
# exist_ok=True already makes makedirs a no-op when the directory exists,
# so no separate existence check is needed.
path_df_submission_dir = os.path.join(cfg.PATH_RESULTS, f"run_{N_RUN}", "submission")
os.makedirs(path_df_submission_dir, exist_ok=True)
df_submission.to_csv(os.path.join(path_df_submission_dir, "submission.csv"), index=False)
df_submission

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.994318,0.500406,0.977775,0.097314,0.944305,0.476001
1,0000247867823ef7,0.000193,0.000017,0.000069,0.000032,0.000052,0.000044
2,00013b17ad220c46,0.000213,0.000017,0.000074,0.000031,0.000054,0.000043
3,00017563c3f7919a,0.000183,0.000018,0.000070,0.000033,0.000053,0.000046
4,00017695ad8997eb,0.000412,0.000013,0.000071,0.000028,0.000054,0.000042
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.699546,0.002302,0.434600,0.001474,0.071186,0.001950
153160,fffd7a9a6eb32c16,0.001863,0.000016,0.000159,0.000040,0.000122,0.000062
153161,fffda9e8d6fafa9e,0.000234,0.000014,0.000070,0.000026,0.000049,0.000038
153162,fffe8f1340a79fc2,0.000382,0.000013,0.000078,0.000027,0.000057,0.000043


In [9]:
# Names the ground-truth label columns will carry after the merge below
# (each configured label name with the "_true" suffix)
label_cols_true = [
    f"{label_name}_true"
    for label_name in cfg.LABEL_COLS
]

In [10]:
# Join predictions and reference labels on "id" (prediction columns get the
# "_pred" suffix, label columns "_true"), then keep only the rows in which
# none of the true-label columns equals -1
df_check_scores = (
    df_submission
    .merge(df_test_labels, on="id", suffixes=("_pred", "_true"))
    .loc[lambda merged: merged[label_cols_true].ne(-1).all(axis=1)]
)
df_check_scores

Unnamed: 0,id,toxic_pred,severe_toxic_pred,obscene_pred,threat_pred,insult_pred,identity_hate_pred,toxic_true,severe_toxic_true,obscene_true,threat_true,insult_true,identity_hate_true
5,0001ea8717f6de06,0.000209,0.000017,0.000070,0.000032,0.000053,0.000044,0,0,0,0,0,0
7,000247e83dcc1211,0.464235,0.000438,0.012883,0.001908,0.030001,0.003084,0,0,0,0,0,0
11,0002f87b16116a7f,0.252631,0.000331,0.042167,0.000652,0.019577,0.001029,0,0,0,0,0,0
13,0003e1cccfd5a40a,0.000204,0.000016,0.000070,0.000029,0.000049,0.000042,0,0,0,0,0,0
14,00059ace3e3e9a53,0.000204,0.000016,0.000072,0.000030,0.000052,0.000042,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
153150,fff8f64043129fa2,0.000226,0.000014,0.000071,0.000026,0.000050,0.000039,0,0,0,0,0,0
153151,fff9d70fe0722906,0.907779,0.006441,0.680416,0.002616,0.393006,0.006616,0,0,0,0,0,0
153154,fffa8a11c4378854,0.858760,0.012444,0.050141,0.056486,0.282410,0.226051,0,0,0,0,0,0
153155,fffac2a094c8e0e2,0.993864,0.500819,0.975915,0.093700,0.947789,0.509864,1,0,1,0,1,0


In [11]:
# ROC AUC of every label: true column vs. predicted probability column
auc_scores = {
    label: roc_auc_score(
        df_check_scores[f"{label}_true"],
        df_check_scores[f"{label}_pred"],
    )
    for label in cfg.LABEL_COLS
}

In [12]:
# Unweighted average of the per-label AUCs, reported together with the breakdown
per_label_values = list(auc_scores.values())
mean_auc = np.mean(per_label_values)
print(
    f"AUC scores per label: {auc_scores}",
    f"Mean column-wise ROC AUC: {mean_auc:.6f}",
    sep="\n",
)

AUC scores per label: {'toxic': 0.9755441187716771, 'severe_toxic': 0.9905827257183124, 'obscene': 0.9833410058507415, 'threat': 0.9836843062461478, 'insult': 0.9827046995335034, 'identity_hate': 0.9880494324480514}
Mean column-wise ROC AUC: 0.983984
