# T5 model

In this notebook I apply the code from the lab 5 class to fine-tune a T5 model for the text detoxification task.


In [None]:
%%capture

# installing huggingface libraries for dataset, models and metrics
!pip install datasets transformers[sentencepiece] sacrebleu tensorboardX transformers[torch]

!pip install numpy==1.24.3

In [None]:
# Necessary inputs
import warnings

from datasets import load_dataset, load_metric
import transformers
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

warnings.filterwarnings('ignore')

In [None]:
# selecting model checkpoint: name of the pretrained checkpoint that both the
# tokenizer and the seq2seq model are loaded from further down
model_checkpoint = "t5-small"

In [None]:
# setting random seed for transformers library
transformers.set_seed(42)

# Load the BLEU metric (sacrebleu implementation); compute_metrics uses it later
# NOTE(review): load_metric may be deprecated or removed in newer `datasets`
# releases - confirm against the installed version
metric = load_metric("sacrebleu")

# Data mapping

We have to load the raw CSV files (train and validation splits) into Hugging Face datasets.


In [None]:
from datasets import DatasetDict, Dataset

# read the interim CSV splits into one DatasetDict, keyed by split name
raw_datasets = load_dataset(
    "csv",
    data_files=dict(
        train="../data/interim/train.csv",
        validation="../data/interim/val.csv",
    ),
)

# show the resulting splits, their columns and row counts
raw_datasets

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'reference', 'translation', 'similarity', 'lenght_diff', 'ref_tox', 'trn_tox'],
        num_rows: 195705
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'reference', 'translation', 'similarity', 'lenght_diff', 'ref_tox', 'trn_tox'],
        num_rows: 48927
    })
})

# Tokenizer setup

In [None]:
from transformers import AutoTokenizer

# we will use autotokenizer for this purpose: it loads the tokenizer that
# belongs to the checkpoint named in model_checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# prefix for model input; it is concatenated directly in front of every
# sentence (for training and for inference alike), so it has to end with a
# space to keep the instruction separated from the first word of the sentence
prefix = "paraphrase toxic sentences: "

In [None]:
max_input_length = 128
max_target_length = 128


def preprocess_function(df):
    """Tokenize one batch of examples for seq2seq training.

    Args:
        df: batch with a "reference" column (source sentences) and a
            "translation" column (target sentences).

    Returns:
        The tokenizer output for the prefixed source sentences, with the
        target token ids stored under the "labels" key.
    """
    # every source sentence gets the task prefix put in front of it
    prompts = []
    for sentence in df["reference"]:
        prompts.append(prefix + sentence)
    encoded = tokenizer(prompts, max_length=max_input_length, truncation=True)

    # the targets are tokenized separately, with their own length limit
    target_texts = list(df["translation"])
    encoded_targets = tokenizer(
        target_texts, max_length=max_target_length, truncation=True
    )

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

# Train dataset is limited to 5000 entries

In [None]:
# split train dataset into train and test; the explicit seed makes the split
# reproducible across kernel restarts
train_dataset = raw_datasets["train"].train_test_split(test_size=0.1, seed=42)
val_dataset = raw_datasets["validation"]

# Take the subsets first and tokenize afterwards: for a given split the same
# rows are selected as with map-then-select, but only the examples that are
# actually used get tokenized.
tokenized_train = (
    train_dataset["train"].select(range(5000)).map(preprocess_function, batched=True)
)
tokenized_test = (
    train_dataset["test"].select(range(500)).map(preprocess_function, batched=True)
)
tokenized_val = val_dataset.select(range(500)).map(preprocess_function, batched=True)

Map:   0%|          | 0/176134 [00:00<?, ? examples/s]

Map:   0%|          | 0/19571 [00:00<?, ? examples/s]

## Fine-tuning the model

In [None]:
from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

# load the pretrained seq2seq model for the selected checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
import torch

# defining the parameters for training
batch_size = 32
# last path component of the checkpoint name, used to name the output directory
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-toxicity",
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,  # evaluation generates text for the BLEU metric
    # fp16 mixed precision needs a CUDA device; requesting it unconditionally
    # makes the arguments fail to construct on a CPU-only machine
    fp16=torch.cuda.is_available(),
    report_to="tensorboard",
)

In [None]:
# instead of writing collate_fn function we will use DataCollatorForSeq2Seq
# similarly it implements the batch creation for training
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import numpy as np

# simple postprocessing for text
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: decoded prediction strings.
        labels: decoded reference strings.

    Returns:
        A tuple ``(preds, labels)`` where ``preds`` is a list of stripped
        strings and ``labels`` is a list of one-element lists, each holding
        one stripped reference.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    # each reference is wrapped in its own list (one reference per prediction)
    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

# compute metrics function to pass to trainer
def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: pair ``(preds, labels)`` of token-id arrays; ``preds`` may
            also be a tuple whose first element holds the token ids.

    Returns:
        Dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"``
        (mean number of non-padding tokens per prediction), both rounded to
        four decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and cannot be decoded. It can show up in
    # the predictions as well as in the labels, so replace it in both.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # counted after the -100 replacement, so padded positions are not
    # mistaken for generated tokens
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# instead of writing train loop we will use Seq2SeqTrainer;
# every argument is passed by keyword so the wiring is explicit
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# run the fine-tuning; the per-epoch metrics appear in the cell output
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,1.967156,19.7612,13.426
2,No log,1.858222,20.5497,13.412
3,No log,1.821089,20.9454,13.45
4,2.141800,1.799814,21.5974,13.436
5,2.141800,1.785505,21.9361,13.454
6,2.141800,1.776397,22.3949,13.476
7,1.907800,1.770325,22.5139,13.516
8,1.907800,1.766371,22.5923,13.54
9,1.907800,1.764367,22.5376,13.5
10,1.869500,1.76321,22.5392,13.5


TrainOutput(global_step=1570, training_loss=1.9670868017111613, metrics={'train_runtime': 257.6275, 'train_samples_per_second': 194.079, 'train_steps_per_second': 6.094, 'total_flos': 709288316633088.0, 'train_loss': 1.9670868017111613, 'epoch': 10.0})

In [None]:
def translate(model, inference_request, tokenizer=tokenizer, **generate_kwargs):
    """Generate a paraphrase for a single request and print it.

    Args:
        model: seq2seq model exposing ``generate``.
        inference_request: input text, passed to the tokenizer as is (the
            caller is responsible for adding the task prefix).
        tokenizer: tokenizer used to encode the request and decode the output.
        **generate_kwargs: optional extra keyword arguments forwarded to
            ``model.generate`` (for example ``max_new_tokens``).
    """
    input_ids = tokenizer(inference_request, return_tensors="pt").input_ids
    # keep the inputs on the same device as the model, so the call also works
    # when the model has not been moved to the CPU first
    input_ids = input_ids.to(model.device)
    outputs = model.generate(input_ids=input_ids, **generate_kwargs)
    # temperature is a generation option, not a decoding one, so it is no
    # longer handed to decode()
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# build the request the same way the training inputs are built:
# task prefix followed by the toxic sentence
inference_request = (
    prefix + "I'll I'll fling myself on my knees and let him kick the fuck out of me."
)
# the model is moved to the CPU before generating
translate(model.to("cpu"), inference_request, tokenizer)

I'll be kicking myself on my knees and let him kick me out of me


# Conclusion:

We can observe that even on a limited dataset of toxic sentences paired with their non-toxic paraphrases, the T5 model is able to learn: the validation loss decreases and the BLEU score increases over the epochs.

To improve the results, we could use a more elaborate score that includes meaning preservation and a toxicity penalty, but I am satisfied with the results obtained.


In [None]:
# saving model: writes the trainer's current model to the "best" directory
# NOTE(review): the training arguments do not set load_best_model_at_end, so
# this is presumably the model from the end of training rather than the best
# checkpoint - confirm
trainer.save_model("best")