# T5-small model fine-tuning

In [1]:
# If PyTorch is not installed yet, install it with the command generated for your platform at
# https://pytorch.org/get-started/locally/
# Example: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

In [1]:
import warnings

import numpy as np
import pandas as pd
import transformers
from datasets import load_metric
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, \
    Seq2SeqTrainer

# Silence every Python warning for the rest of the notebook session.
warnings.simplefilter('ignore')
# Fix the random seed through the transformers helper so runs are repeatable.
transformers.set_seed(seed=42)

In [2]:
# Load the combined parallel corpus: tab-separated, with the first row as header.
df = pd.read_csv(
    '../data/interim/combined.tsv',
    header=0,
    sep='\t',
)
# Preview the first rows to sanity-check the columns.
df.head()

Unnamed: 0,toxic-en,neutral-en
0,"if Alkar floods her with her mental waste, it ...","If Alkar is flooding her with psychic waste, t..."
1,you're becoming disgusting.,Now you're getting nasty.
2,"well, we can spare your life.","Well, we could spare your life, for one."
3,"monkey, you have to wake up.","Ah! Monkey, you've got to snap out of it."
4,I have orders to kill her.,I've got orders to put her down.


In [3]:
# Target proportions of the train / validation / test splits.
train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1

# Draw the training rows first; validation and test are carved out of what is left.
train_data = df.sample(frac=train_ratio, random_state=42)
holdout_data = df.drop(train_data.index)

# Fraction of the held-out rows that goes to validation; the remainder is the test set.
val_share = val_ratio / (val_ratio + test_ratio)
val_data = holdout_data.sample(frac=val_share, random_state=42)
test_data = holdout_data.drop(val_data.index)

In [4]:
from datasets import Dataset


def create_dataset(df: pd.DataFrame):
    """Wrap the sentence pairs of ``df`` in a ``Dataset`` with one ``translation`` column.

    Each row of ``df`` becomes a dict with the keys ``"toxic-en"`` and
    ``"neutral-en"``, taken from the columns of the same names.

    :param df: frame that has ``toxic-en`` and ``neutral-en`` columns.
    :return: the object returned by ``Dataset.from_dict`` for those records.
    """
    pairs = []
    for toxic_text, neutral_text in zip(df["toxic-en"], df["neutral-en"]):
        pairs.append({"toxic-en": toxic_text, "neutral-en": neutral_text})
    return Dataset.from_dict({"translation": pairs})

In [5]:
from datasets import DatasetDict

# Convert each pandas split into a Dataset and bundle them under their split names.
raw_datasets = DatasetDict(
    {
        split_name: create_dataset(split_frame)
        for split_name, split_frame in (
            ("train", train_data),
            ("validation", val_data),
            ("test", test_data),
        )
    }
)
# Show the bundle (features and row counts per split).
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 504985
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 63123
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 63123
    })
})

In [7]:
# Language pair: these are the keys of every record in the "translation" column.
source_lang, target_lang = "toxic-en", "neutral-en"

# Pretrained checkpoint to fine-tune and the task prefix prepended to each source sentence.
# NOTE(review): the prefix has no trailing space, so it is glued directly to the
# first word of the input when concatenated — confirm this is intended.
model_checkpoint = "t5-small"
prefix = "paraphrase:"

# Token limits used for truncation of the inputs and of the targets.
max_input_length = max_target_length = 128

# Output folder; any "/" in the checkpoint name is replaced so it stays a single path component.
out_dir = f"../models/{model_checkpoint.replace('/', '-')}-finetuned-{source_lang}-to-{target_lang}"

In [8]:
# Pretrained seq2seq model, loaded from safetensors weights.
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_checkpoint,
    use_safetensors=True,
)
# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
# SacreBLEU metric used by compute_metrics during evaluation.
# NOTE(review): `datasets.load_metric` is deprecated in recent `datasets`
# releases — confirm the pinned version still provides it.
metric = load_metric(path="sacrebleu")

In [9]:
def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for seq2seq training.

    The source sentence of every record in ``examples["translation"]`` is
    prefixed with ``prefix`` and tokenized as model input; the target
    sentence is tokenized as the label sequence. Both sides are truncated
    to ``max_input_length`` / ``max_target_length`` tokens respectively.

    :param examples: batch (as passed by ``map(..., batched=True)``) whose
        ``"translation"`` entry is a list of dicts keyed by ``source_lang``
        and ``target_lang``.
    :return: the tokenizer output for the inputs, with an extra ``"labels"``
        entry holding the target token ids.
    """
    pairs = examples["translation"]
    inputs = [prefix + pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Pass the targets through `text_target` so the tokenizer applies its
    # target-side processing. For T5 this yields the same ids as before, but it
    # keeps the labels correct if `model_checkpoint` is switched to a model
    # whose tokenizer treats source and target text differently.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [10]:
# Tokenize every split; batched=True hands preprocess_function whole batches of examples.
tokenized_datasets = raw_datasets.map(
    function=preprocess_function,
    batched=True,
)

Map:   0%|          | 0/504985 [00:00<?, ? examples/s]

Map:   0%|          | 0/63123 [00:00<?, ? examples/s]

Map:   0%|          | 0/63123 [00:00<?, ? examples/s]

## Fine-tuning the model

In [8]:
# Per-device batch size shared by training and evaluation.
batch_size = 32
train_args = Seq2SeqTrainingArguments(
    output_dir=out_dir,
    # Evaluate once per epoch and checkpoint on the same schedule: the two
    # strategies must match for `load_best_model_at_end` to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # The final model is later saved under ".../best", so make sure the trainer
    # really holds the best checkpoint (highest validation BLEU) when training
    # ends instead of simply the last one.
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=1e-4,
    weight_decay=0.01,
    num_train_epochs=3,
    # Generate sequences during evaluation so compute_metrics can score BLEU.
    predict_with_generate=True,
    # NOTE(review): T5 checkpoints are reported to be numerically fragile in
    # fp16 — watch the loss for NaN/inf, or confirm bf16 is not an option.
    fp16=True,
    report_to=['tensorboard'],
    disable_tqdm=False,
)

In [9]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape the texts for the BLEU metric.

    :param preds: iterable of decoded prediction strings.
    :param labels: iterable of decoded reference strings.
    :return: tuple ``(preds, labels)`` where ``preds`` is a list of stripped
        strings and ``labels`` is a list of one-element lists, each holding
        the stripped reference for the corresponding prediction.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    # Each reference is wrapped in its own list: one reference per prediction.
    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    :param eval_preds: pair ``(preds, labels)`` of token-id arrays; ``preds``
        may also be a tuple, in which case its first element is used.
    :return: dict with ``"bleu"`` (the metric's ``score``) and ``"gen_len"``
        (mean number of non-pad tokens per prediction), both rounded to
        4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is not a valid token id and cannot be decoded. Labels use it for
    # ignored positions, and gathered predictions may be padded with it as well
    # (the official run_translation example guards against both), so map it to
    # the pad token on both sides before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Count real tokens only; with -100 already mapped to the pad id above,
    # padding no longer inflates the generated length.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [13]:
# Assemble the trainer: model, hyper-parameters, tokenized splits, batching and metrics.
trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [14]:
# Run the fine-tuning loop configured by `train_args`; the cell displays the returned TrainOutput.
trainer.train()

  0%|          | 0/47343 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.2192, 'learning_rate': 9.894598990347043e-05, 'epoch': 0.03}
{'loss': 2.0449, 'learning_rate': 9.788986756225841e-05, 'epoch': 0.06}
{'loss': 2.0169, 'learning_rate': 9.683374522104641e-05, 'epoch': 0.1}
{'loss': 1.969, 'learning_rate': 9.577762287983441e-05, 'epoch': 0.13}
{'loss': 1.9762, 'learning_rate': 9.47215005386224e-05, 'epoch': 0.16}
{'loss': 1.9564, 'learning_rate': 9.366749044209282e-05, 'epoch': 0.19}
{'loss': 1.937, 'learning_rate': 9.261136810088082e-05, 'epoch': 0.22}
{'loss': 1.9188, 'learning_rate': 9.15552457596688e-05, 'epoch': 0.25}
{'loss': 1.9095, 'learning_rate': 9.04991234184568e-05, 'epoch': 0.29}
{'loss': 1.901, 'learning_rate': 8.94430010772448e-05, 'epoch': 0.32}
{'loss': 1.8929, 'learning_rate': 8.838687873603278e-05, 'epoch': 0.35}
{'loss': 1.8925, 'learning_rate': 8.733075639482078e-05, 'epoch': 0.38}
{'loss': 1.8788, 'learning_rate': 8.627463405360878e-05, 'epoch': 0.41}
{'loss': 1.8793, 'learning_rate': 8.522062395707919e-05, 'epoch': 0.44}


  0%|          | 0/1973 [00:00<?, ?it/s]

{'eval_loss': 1.6821682453155518, 'eval_bleu': 25.4631, 'eval_gen_len': 13.2798, 'eval_runtime': 747.6179, 'eval_samples_per_second': 84.432, 'eval_steps_per_second': 2.639, 'epoch': 1.0}
{'loss': 1.8111, 'learning_rate': 6.621675854931035e-05, 'epoch': 1.01}
{'loss': 1.7867, 'learning_rate': 6.516063620809835e-05, 'epoch': 1.05}
{'loss': 1.7855, 'learning_rate': 6.410451386688633e-05, 'epoch': 1.08}
{'loss': 1.796, 'learning_rate': 6.304839152567433e-05, 'epoch': 1.11}
{'loss': 1.7897, 'learning_rate': 6.199226918446233e-05, 'epoch': 1.14}
{'loss': 1.7808, 'learning_rate': 6.093614684325033e-05, 'epoch': 1.17}
{'loss': 1.7904, 'learning_rate': 5.988002450203832e-05, 'epoch': 1.2}
{'loss': 1.7815, 'learning_rate': 5.882601440550874e-05, 'epoch': 1.24}
{'loss': 1.7708, 'learning_rate': 5.777200430897915e-05, 'epoch': 1.27}
{'loss': 1.7781, 'learning_rate': 5.671588196776715e-05, 'epoch': 1.3}
{'loss': 1.7776, 'learning_rate': 5.565975962655514e-05, 'epoch': 1.33}
{'loss': 1.7857, 'learn

  0%|          | 0/1973 [00:00<?, ?it/s]

{'eval_loss': 1.6408618688583374, 'eval_bleu': 25.806, 'eval_gen_len': 13.2742, 'eval_runtime': 734.3411, 'eval_samples_per_second': 85.959, 'eval_steps_per_second': 2.687, 'epoch': 2.0}
{'loss': 1.7492, 'learning_rate': 3.243351709862071e-05, 'epoch': 2.03}
{'loss': 1.7426, 'learning_rate': 3.13773947574087e-05, 'epoch': 2.06}
{'loss': 1.739, 'learning_rate': 3.0321272416196693e-05, 'epoch': 2.09}
{'loss': 1.7325, 'learning_rate': 2.9265150074984688e-05, 'epoch': 2.12}
{'loss': 1.7447, 'learning_rate': 2.820902773377268e-05, 'epoch': 2.15}
{'loss': 1.7442, 'learning_rate': 2.7155017637243103e-05, 'epoch': 2.19}
{'loss': 1.7358, 'learning_rate': 2.6098895296031095e-05, 'epoch': 2.22}
{'loss': 1.7287, 'learning_rate': 2.504277295481909e-05, 'epoch': 2.25}
{'loss': 1.7394, 'learning_rate': 2.398665061360708e-05, 'epoch': 2.28}
{'loss': 1.7397, 'learning_rate': 2.29326405170775e-05, 'epoch': 2.31}
{'loss': 1.738, 'learning_rate': 2.1878630420547918e-05, 'epoch': 2.34}
{'loss': 1.7448, 'le

  0%|          | 0/1973 [00:00<?, ?it/s]

{'eval_loss': 1.6303457021713257, 'eval_bleu': 25.9715, 'eval_gen_len': 13.2756, 'eval_runtime': 704.7056, 'eval_samples_per_second': 89.574, 'eval_steps_per_second': 2.8, 'epoch': 3.0}
{'train_runtime': 8295.7779, 'train_samples_per_second': 182.618, 'train_steps_per_second': 5.707, 'train_loss': 1.7999640508434622, 'epoch': 3.0}


TrainOutput(global_step=47343, training_loss=1.7999640508434622, metrics={'train_runtime': 8295.7779, 'train_samples_per_second': 182.618, 'train_steps_per_second': 5.707, 'train_loss': 1.7999640508434622, 'epoch': 3.0})

In [15]:
# Write the model currently held by the trainer to the "best" sub-folder of out_dir.
# NOTE(review): the name "best" is only accurate if the training arguments reload
# the best checkpoint at the end of training — confirm against the
# Seq2SeqTrainingArguments cell.
trainer.save_model(f'{out_dir}/best')