In [35]:
import os
import pandas as pd
import numpy as np
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from sklearn.metrics import roc_auc_score
from datasets import Dataset, load_from_disk

# --- Configuration -----------------------------------------------------------
# Fixed sequence length passed to the tokenizer (used for padding and truncation).
MAX_LENGTH = 256
# Hub identifier of the base checkpoint; also used as a sub-directory name below.
MODEL_BASE = "microsoft/deberta-v3-base"

# Locations of intermediate and final artifacts, relative to the working directory.
PATH_DATA_PROCESSED = "data/processed_data/" + MODEL_BASE
PATH_DS_TEST_TOKENIZED = os.path.join(PATH_DATA_PROCESSED, "ds_test_tokenized")
PATH_PREDICTIONS_DICT = "results/predictions/predictions.pkl"
PATH_DF_SUBMISSION = "results/submission/submission.csv"


In [2]:
# Load the unlabeled test comments.
# The path is relative to the project root, like every other path in this
# notebook (processed data, checkpoints, results, test labels), so the notebook
# no longer depends on one user's absolute home directory.
df_test = pd.read_csv("data/original_data/test.csv")
df_test

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the..."


In [3]:
# Load the tokenizer associated with the base checkpoint named by MODEL_BASE.
tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE)



In [4]:
def tokenize(batch):
    """Tokenize the ``comment_text`` field of a batch.

    The module-level ``tokenizer`` is called with truncation enabled and
    ``padding="max_length"``, using ``MAX_LENGTH`` as the target length.

    Args:
        batch: Mapping with a ``"comment_text"`` entry holding the texts.

    Returns:
        The tokenizer's output for the batch.
    """
    texts = batch["comment_text"]
    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    return encoded

In [5]:
# Wrap the test frame in a Dataset, tokenize it in batches (dropping the raw
# text column afterwards) and persist the tokenized result to disk.
ds_test_raw = Dataset.from_pandas(df_test)
ds_test_tokenized = ds_test_raw.map(
    tokenize,
    batched=True,
    remove_columns=["comment_text"],
)
ds_test_tokenized.save_to_disk(PATH_DS_TEST_TOKENIZED)
ds_test_tokenized

Map:   0%|          | 0/153164 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/153164 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 153164
})

In [6]:
# Number of cross-validation folds; one fine-tuned checkpoint is loaded per fold.
N_FOLDS = 5

# Per-fold container: the loaded model, the Trainer wrapping it, and the raw
# predictions (logits) that the Trainer produced on the test set.
preds_folds = {fold_id: {"model": None, "trainer": None, "predictions": None} for fold_id in range(N_FOLDS)}

for fold_id in range(N_FOLDS):
    print(f"Processing fold {fold_id}...")
    path_model_trained = os.path.join("checkpoints", MODEL_BASE, f"fold_{fold_id}", "model_final")
    # No explicit .to("cuda"): Trainer moves the model to the device chosen by
    # its TrainingArguments (GPU when one is available), so a hard-coded device
    # is redundant and only makes the cell crash on CPU-only machines.
    model = AutoModelForSequenceClassification.from_pretrained(path_model_trained)
    model.eval()
    trainer = Trainer(model=model, compute_metrics=None)
    preds = trainer.predict(ds_test_tokenized).predictions
    preds_folds[fold_id]["model"] = model
    preds_folds[fold_id]["trainer"] = trainer
    preds_folds[fold_id]["predictions"] = preds

# Ensemble: average the fold logits element-wise, then squash with a sigmoid to
# obtain one independent probability per label.
predictions_list = [preds_folds[fold_id]["predictions"] for fold_id in range(N_FOLDS)]
logits_mean = np.mean(predictions_list, axis=0)
probs = 1 / (1 + np.exp(-logits_mean))

Processing fold 0...


Processing fold 1...


Processing fold 2...


Processing fold 3...


Processing fold 4...


In [36]:
# Persist only the per-fold prediction arrays.
# preds_folds also holds the live model and Trainer objects; pickling those
# would write every fold's full model into a file meant for predictions and tie
# the pickle to the current device and library versions. The saved dict keeps
# the same keys as preds_folds, with "model" and "trainer" left as None.
preds_to_save = {
    fold_id: {"model": None, "trainer": None, "predictions": fold["predictions"]}
    for fold_id, fold in preds_folds.items()
}

# Create the output directory on first run so open() does not fail.
predictions_dir = os.path.dirname(PATH_PREDICTIONS_DICT)
if predictions_dir:
    os.makedirs(predictions_dir, exist_ok=True)

with open(PATH_PREDICTIONS_DICT, "wb") as f:
    pickle.dump(preds_to_save, f)

In [31]:
# Build the submission frame: the id column first, then one probability column
# per label, in the order given by label_cols.
label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Fail loudly if predictions and test rows do not line up one-to-one; otherwise
# the id column could end up misaligned or filled with NaN without any error.
assert probs.shape == (len(df_test), len(label_cols)), (
    f"probs has shape {probs.shape}, expected {(len(df_test), len(label_cols))}"
)

df_submission = pd.DataFrame(probs, columns=label_cols)
# Assign ids positionally (row i of probs belongs to row i of df_test) instead
# of relying on index alignment between the two frames.
df_submission.insert(0, "id", df_test["id"].to_numpy())

# Create the output directory on first run so to_csv() does not fail.
submission_dir = os.path.dirname(PATH_DF_SUBMISSION)
if submission_dir:
    os.makedirs(submission_dir, exist_ok=True)

df_submission.to_csv(PATH_DF_SUBMISSION, index=False)
df_submission

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.993235,0.463939,0.969114,0.102317,0.951213,0.673025
1,0000247867823ef7,0.000118,0.000015,0.000054,0.000019,0.000041,0.000028
2,00013b17ad220c46,0.000121,0.000016,0.000056,0.000020,0.000041,0.000030
3,00017563c3f7919a,0.000118,0.000014,0.000053,0.000018,0.000041,0.000027
4,00017695ad8997eb,0.000148,0.000012,0.000048,0.000017,0.000036,0.000024
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,0.831779,0.001356,0.413287,0.000689,0.091312,0.001200
153160,fffd7a9a6eb32c16,0.000268,0.000010,0.000057,0.000017,0.000038,0.000027
153161,fffda9e8d6fafa9e,0.000134,0.000013,0.000054,0.000017,0.000038,0.000026
153162,fffe8f1340a79fc2,0.000149,0.000012,0.000052,0.000018,0.000038,0.000026


In [8]:
# Load the ground-truth labels for the test set. Rows carrying -1 in a label
# column are filtered out further down before any score is computed.
df_test_labels = pd.read_csv("data/original_data/test_labels.csv")
df_test_labels

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,-1,-1,-1,-1,-1,-1
153160,fffd7a9a6eb32c16,-1,-1,-1,-1,-1,-1
153161,fffda9e8d6fafa9e,-1,-1,-1,-1,-1,-1
153162,fffe8f1340a79fc2,-1,-1,-1,-1,-1,-1


In [26]:
# Pair every prediction with its ground-truth row by id; overlapping label
# columns are disambiguated with the "_pred" / "_true" suffixes.
label_cols_true = [f"{col}_true" for col in label_cols]
df_check_score = df_submission.merge(df_test_labels, on="id", suffixes=("_pred", "_true"))

# Keep only rows where none of the true labels equals -1.
is_scored = df_check_score[label_cols_true].ne(-1).all(axis=1)
df_check_score = df_check_score.loc[is_scored]
df_check_score

Unnamed: 0,id,toxic_pred,severe_toxic_pred,obscene_pred,threat_pred,insult_pred,identity_hate_pred,toxic_true,severe_toxic_true,obscene_true,threat_true,insult_true,identity_hate_true
5,0001ea8717f6de06,0.000127,0.000014,0.000051,0.000019,0.000040,0.000027,0,0,0,0,0,0
7,000247e83dcc1211,0.517046,0.000174,0.007884,0.000695,0.021622,0.000729,0,0,0,0,0,0
11,0002f87b16116a7f,0.314195,0.000312,0.055958,0.000391,0.017917,0.001357,0,0,0,0,0,0
13,0003e1cccfd5a40a,0.000123,0.000014,0.000054,0.000019,0.000039,0.000028,0,0,0,0,0,0
14,00059ace3e3e9a53,0.000119,0.000015,0.000054,0.000018,0.000041,0.000027,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
153150,fff8f64043129fa2,0.000131,0.000012,0.000056,0.000017,0.000039,0.000025,0,0,0,0,0,0
153151,fff9d70fe0722906,0.897605,0.003960,0.700326,0.001129,0.341232,0.004817,0,0,0,0,0,0
153154,fffa8a11c4378854,0.948748,0.023928,0.146038,0.045124,0.497114,0.558814,0,0,0,0,0,0
153155,fffac2a094c8e0e2,0.993961,0.446017,0.964453,0.083660,0.945908,0.723239,1,0,1,0,1,0


In [27]:
# One ROC AUC per label, computed on the rows kept in df_check_score.
auc_scores = {
    col: roc_auc_score(df_check_score[f"{col}_true"], df_check_score[f"{col}_pred"])
    for col in label_cols
}

In [28]:
# Summarize: unweighted average of the per-label AUC values.
per_label_auc = list(auc_scores.values())
mean_auc = np.mean(per_label_auc)

print(f"AUC scores per label: {auc_scores}")
print(f"Mean column-wise ROC AUC: {mean_auc:.6f}")

AUC scores per label: {'toxic': 0.9758748902813065, 'severe_toxic': 0.9907877782521548, 'obscene': 0.9827022702932348, 'threat': 0.9912962899513386, 'insult': 0.9836326817941821, 'identity_hate': 0.9901016734408703}
Mean column-wise ROC AUC: 0.985733
