In [2]:
# Necessary imports
import warnings

from datasets import load_dataset, load_metric, concatenate_datasets
import transformers
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

# Suppress every warning raised through the `warnings` module for the rest of
# the session. Note that this also hides deprecation warnings.
warnings.filterwarnings("ignore")

In [3]:
# Pretrained checkpoint to fine-tune. The same name is used below to load the
# tokenizer and the seq2seq model, and to derive the output directory name.
model_checkpoint = "t5-small"

In [4]:
# Fix the random seed through the transformers helper before any training.
transformers.set_seed(42)

# Load the dataset identified by "dataset". Later cells read its "train",
# "validation" and "test" splits and its "source"/"target" columns.
raw_datasets = load_dataset("dataset")
# SacreBLEU metric, used by compute_metrics during evaluation.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package - confirm against the installed version.
metric = load_metric("sacrebleu")

In [7]:
from transformers import AutoTokenizer

# AutoTokenizer picks the tokenizer class that belongs to the checkpoint
# (the training log further down reports a T5TokenizerFast).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [8]:
# Task prefix prepended to every toxic sentence before tokenization.
# It has no trailing space, so it is concatenated directly onto the text.
prefix = "make sentence non-toxic:"

In [9]:
# Token limits applied (with truncation) to inputs and targets in preprocess_function.
max_input_length = 256
max_target_length = 256
# Dataset column names: the toxic sentence is the model input and the
# non-toxic sentence provides the labels.
toxic = "source"
non_toxic = "target"


def preprocess_function(examples):
    """Tokenize a batch of toxic/non-toxic sentence pairs for seq2seq training.

    Args:
        examples: A batch mapping column names to lists of sentences; the
            ``toxic`` and ``non_toxic`` columns are read.

    Returns:
        The tokenizer output for the prefixed toxic sentences, with the token
        ids of the non-toxic sentences stored under ``"labels"``.
    """
    # Empty/missing sources become a single space (without the task prefix).
    sources = []
    for sentence in examples[toxic]:
        sources.append(prefix + sentence if sentence else " ")

    # Empty/missing targets are likewise replaced by a single space.
    references = [sentence or " " for sentence in examples[non_toxic]]

    encoded = tokenizer(sources, max_length=max_input_length, truncation=True)

    # The targets go through the same tokenizer; only their ids are kept.
    encoded["labels"] = tokenizer(
        references, max_length=max_target_length, truncation=True
    )["input_ids"]
    return encoded

In [11]:
# Run preprocess_function over the dataset in batches, adding the tokenizer
# outputs and the "labels" field to the examples.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

## Fine-tuning the model

In [12]:
from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

# Load the pretrained seq2seq model for the chosen checkpoint; this is the
# model instance passed to the trainer below.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [13]:
# defining the parameters for training
batch_size = 32  # used for both the per-device train and eval batch sizes
model_name = model_checkpoint.split("/")[-1]  # keep only the part after the last "/" of the checkpoint id
# NOTE(review): no save strategy or `load_best_model_at_end` is set here, so the
# model held by the trainer after training is presumably the last-epoch one
# rather than the best-scoring one - confirm before treating a saved copy as "best".
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-detoxify",  # output directory
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,  # cap on the number of checkpoints kept on disk
    num_train_epochs=10,
    predict_with_generate=True,  # evaluation uses generate(), so compute_metrics receives generated token ids
    fp16=True,  # mixed precision; NOTE(review): presumably requires a CUDA device - confirm
    report_to="tensorboard",
)

In [14]:
# Instead of writing a custom collate_fn we use DataCollatorForSeq2Seq,
# which similarly takes care of building the batches for training.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [15]:
import numpy as np


# simple postprocessing for text
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: ``preds`` is a list of stripped strings
        and ``labels`` is a list of single-element lists, each holding one
        stripped reference.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    # Each reference is wrapped in its own list (one reference per prediction).
    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels


# compute metrics function to pass to trainer
def compute_metrics(eval_preds):
    """Compute SacreBLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays as handed
            over by the trainer. ``predictions`` may itself be a tuple, in
            which case only its first element is used.

    Returns:
        dict with ``"bleu"`` (the SacreBLEU score) and ``"gen_len"`` (mean
        number of non-padding tokens per prediction), rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index used to pad labels, and predictions can be
    # padded with it as well when batches are gathered. It is not a valid
    # token id, so map it to the pad token before decoding either array.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Lengths are counted after the -100 replacement so that padding is never
    # mistaken for generated tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [16]:
# Instead of writing the training loop by hand we use Seq2SeqTrainer.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],  # split scored by compute_metrics at each evaluation
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [17]:
# Run fine-tuning; the per-epoch loss, BLEU and generation length are shown in the table below.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,2.279511,22.5242,11.731
2,2.621800,2.186291,23.352,11.853
3,2.621800,2.142628,23.6672,11.838
4,2.302300,2.118445,23.8465,11.835
5,2.302300,2.101738,23.96,11.846
6,2.230200,2.090894,23.9772,11.798
7,2.230200,2.083018,23.9237,11.756
8,2.186100,2.077272,24.0021,11.745
9,2.157300,2.074191,24.0206,11.763
10,2.157300,2.073121,24.0197,11.755


TrainOutput(global_step=2820, training_loss=2.2831285841921543, metrics={'train_runtime': 438.0722, 'train_samples_per_second': 205.446, 'train_steps_per_second': 6.437, 'total_flos': 1062270308450304.0, 'train_loss': 2.2831285841921543, 'epoch': 10.0})

In [18]:
# Save the trainer's current model into the "best" directory.
# NOTE(review): nothing visible in this notebook selects the best checkpoint,
# so this is presumably the final-epoch model - confirm.
trainer.save_model("best")

In [19]:
# Reload the model saved in the "best" directory and put it in eval mode for inference.
model = AutoModelForSeq2SeqLM.from_pretrained("best")
model.eval()
# NOTE(review): turning off the key/value cache presumably only slows down
# generate() without changing its output - confirm this is intentional.
model.config.use_cache = False

In [20]:
import torch
import pandas
from tqdm import tqdm

In [21]:
def test(model, tokenizer=tokenizer, batch_size=100, max_length=max_input_length):
    """Generate detoxified text for the test split and return it as a table.

    Args:
        model: Seq2seq model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode the inputs and decode the outputs.
        batch_size: Number of test sentences processed per ``generate`` call.
        max_length: Maximum number of input tokens; longer inputs are
            truncated. Defaults to the limit used during preprocessing so that
            inference sees the same truncation as training.

    Returns:
        pandas.DataFrame with a ``"source"`` column (the original test
        sentences) and a ``"target"`` column (the model outputs).
    """
    test_data = raw_datasets["test"]
    model_res = []

    for start in tqdm(range(0, len(test_data), batch_size)):
        batch = test_data[start : start + batch_size]
        # Build inputs exactly like preprocess_function does, including the
        # single-space fallback for empty/missing sentences.
        input_texts = [prefix + line if line else " " for line in batch[toxic]]

        # Keep the whole encoding (not just input_ids) and move it to the
        # model's device so padded positions are masked explicitly.
        encoded = tokenizer(
            input_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
        ).to(model.device)
        outputs = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
        )

        model_res.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

    return pd.DataFrame({"source": test_data[toxic], "target": model_res})

In [None]:
# Run inference over the test split and preview the first rows.
res = test(model, tokenizer)
res.head()

In [None]:
# Write the source/prediction pairs to CSV (pandas also writes the index column by default).
res.to_csv("t5-detoxify.csv")