# Jigsaw Unintended Bias in Toxicity Classification

1. Import dependencies

In [1]:
import random
import time

import numpy as np
import torch
import os
import pandas as pd

from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
)

from src.data.kaggle import submit_competition
from src.model.metrics import compute_metrics
from src.data.toxicity import (
    load_toxicity_dataset,
    TOXICITY_LABEL_TO_ID,
    TOXICITY_ID_TO_LABEL,
)
from src.util.torch_device import resolve_torch_device
from src.definitions import MODELS_FOLDER, EXTERNAL_DATA_FOLDER, PROCESSED_DATA_FOLDER, SUBMITIONS_FOLDER

2. Prepare environment (seeds, device, paths, hyperparameters)

In [2]:
# Run-wide configuration: RNG seeds, compute device, competition paths and
# fine-tuning hyperparameters. Every later cell reads these names.
random_seed = 42

# Seed each RNG used by the run so repeated executions are comparable.
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Set before the device is resolved so the value is already in the environment
# when the backend is first touched.
# NOTE(review): presumably only relevant on Apple-silicon (MPS) hosts - confirm.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

device = resolve_torch_device()

# Single source of truth for the competition slug; the submission path is
# derived from it instead of repeating the literal.
competition = "jigsaw-unintended-bias-in-toxicity-classification"
submition_path = SUBMITIONS_FOLDER / competition / "submission.csv"

model_checkpoint = "distilbert/distilbert-base-uncased"
num_epochs = 3
learning_rate = 2e-5

# Unix timestamp (whole seconds) taken when this cell runs.
epoch_time = int(time.time())

# Last expression: display the resolved device.
device

device(type='cuda')

3. Load dataset

In [3]:
# Tokenizer matching the base checkpoint; it is handed to the dataset loader
# and reused later by the collator and the Trainer.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Project helper that builds the dataset from the external/processed data
# folders, the tokenizer and the seed.
ds = load_toxicity_dataset(
    EXTERNAL_DATA_FOLDER,
    PROCESSED_DATA_FOLDER,
    tokenizer,
    random_seed,
)

3. Prepare model

In [None]:
# Load the base checkpoint with a classification head sized to the toxicity
# label set, wiring both label<->id mappings into the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(TOXICITY_LABEL_TO_ID),
    label2id=TOXICITY_LABEL_TO_ID,
    id2label=TOXICITY_ID_TO_LABEL,
)

# Move the weights onto the device resolved in the setup cell.
model = model.to(device)

In [5]:
# Pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Directory that receives the per-epoch checkpoints.
checkpoint_dir = (
    MODELS_FOLDER / "jigsaw-unintended-bias-in-toxicity-classification-checkpoint"
)

training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    seed=random_seed,
    # Optimisation
    learning_rate=learning_rate,
    weight_decay=0.01,
    num_train_epochs=num_epochs,
    # Batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    auto_find_batch_size=True,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the highest f1.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# The "test" split of `ds` is used as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

4. Train model

In [None]:
# Fine-tune the model. Per `training_args`, evaluation and checkpointing run
# once per epoch and the best-f1 checkpoint is loaded back when training ends.
trainer.train()

5. Evaluate

In [6]:
# Score the current model on the Trainer's eval_dataset (ds["test"]) and keep
# the returned metrics for display in the next cell.
evaluation_feedback = trainer.evaluate()

In [7]:
# Show the metrics returned by trainer.evaluate().
evaluation_feedback

{'eval_loss': 0.11260996013879776,
 'eval_model_preparation_time': 0.0015,
 'eval_precision': 0.8762262180281618,
 'eval_recall': 0.8106681253394749,
 'eval_f1': 0.8398366489224725,
 'eval_runtime': 196.6295,
 'eval_samples_per_second': 917.909,
 'eval_steps_per_second': 57.372}

6. Save weights

In [None]:
# Persist the fine-tuned weights and the tokenizer side by side so the
# directory can be reloaded with `from_pretrained`.
final_model_dir = MODELS_FOLDER / "jigsaw-unintended-bias-in-toxicity-classification"

trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

7. Predict test data

In [4]:
# Reload the fine-tuned model and its tokenizer from the saved directory, so
# prediction can run without re-training in this session.
saved_model_dir = MODELS_FOLDER / "jigsaw-unintended-bias-in-toxicity-classification"

model = AutoModelForSequenceClassification.from_pretrained(saved_model_dir)
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)

In [None]:
# Competition files live together under the external data folder.
competition_data_dir = (
    EXTERNAL_DATA_FOLDER / "jigsaw-unintended-bias-in-toxicity-classification"
)

# Comments to score, and the template that fixes the submission layout.
test = pd.read_csv(competition_data_dir / "test.csv")
submission = pd.read_csv(competition_data_dir / "sample_submission.csv")

In [None]:
# Inference pipeline around the fine-tuned model. Passing `device` runs the
# forward passes on the device resolved in the setup cell; without it the
# pipeline falls back to its default device.
predictor = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

comments = test["comment_text"].tolist()

# truncation=True clips comments that tokenize past the model's maximum input
# length (they would otherwise fail inside the model); batch_size groups
# comments per forward pass instead of scoring them one at a time.
predictions = predictor(comments, truncation=True, batch_size=32)

# One record per comment, with "label" and "score" columns.
prediction_df = pd.DataFrame.from_records(predictions)

In [None]:
# Turn predicted label names into their numeric ids and write the submission.
predicted_ids = prediction_df["label"].map(TOXICITY_LABEL_TO_ID)

# Fail loudly instead of writing a corrupt file: the assignment below aligns
# on the row index, and unmapped labels would silently become NaN.
assert len(predicted_ids) == len(submission), "prediction/submission row count mismatch"
assert predicted_ids.notna().all(), "pipeline returned a label missing from TOXICITY_LABEL_TO_ID"

submission["prediction"] = predicted_ids

submition_path.parent.mkdir(parents=True, exist_ok=True)

# index=False writes the same CSV as indexing by "id" first, but keeps
# `submission` unchanged in shape so the cell can be re-run safely.
submission.to_csv(submition_path, index=False)

In [None]:
# Submission note recording the checkpoint and the hyperparameters used.
message = f"[ {model_checkpoint} ] {num_epochs} epochs with {learning_rate} learning rate"

# Send the generated CSV to the competition via the project's Kaggle helper.
submit_competition(
    submition_path,
    message,
    competition,
)