In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# !wget https://github.com/skoltech-nlp/detox/releases/download/emnlp2021/filtered_paranmt.zip

In [3]:
# !unzip filtered_paranmt.zip

In [5]:
# Load the tab-separated paraphrase table and preview its first rows.
data = pd.read_csv("../data/filtered.tsv", sep="\t")
data.head()

Unnamed: 0.1,Unnamed: 0,reference,translation,similarity,lenght_diff,ref_tox,trn_tox
0,0,"If Alkar is flooding her with psychic waste, t...","if Alkar floods her with her mental waste, it ...",0.785171,0.010309,0.014195,0.981983
1,1,Now you're getting nasty.,you're becoming disgusting.,0.749687,0.071429,0.065473,0.999039
2,2,"Well, we could spare your life, for one.","well, we can spare your life.",0.919051,0.268293,0.213313,0.985068
3,3,"Ah! Monkey, you've got to snap out of it.","monkey, you have to wake up.",0.664333,0.309524,0.053362,0.994215
4,4,I've got orders to put her down.,I have orders to kill her.,0.726639,0.181818,0.009402,0.999348


In [6]:
# Keep a random 18% of the rows; the fixed random_state makes the draw repeatable.
# NOTE(review): `data` is overwritten with its own subsample, so re-running this
# cell shrinks the already-reduced frame again.
data = data.sample(frac=0.18, random_state=42)

In [7]:
# Remove the leftover "Unnamed: 0" column from the frame.
data = data.drop(columns="Unnamed: 0")

In [8]:
import torch
from torch.utils.data import Dataset


class ToxicityDataset(Dataset):
    """Torch dataset over a paraphrase table that carries toxicity scores.

    Every item tokenizes the ``reference`` column as the model input. The
    content of ``labels`` depends on ``task``:

    * ``"classification"``: a scalar long tensor, 1 when the row's
      ``ref_tox`` score is at least ``tox_threshold`` and 0 otherwise.
    * ``"generation"``: token ids of the ``translation`` column, with padded
      positions replaced by ``IGNORE_INDEX``.

    Args:
        data: pandas DataFrame with ``reference``, ``translation`` and
            ``ref_tox`` columns. Rows are addressed by position (``iloc``).
        tokenizer: Callable used as
            ``tokenizer(text, padding=..., max_length=..., truncation=...,
            return_tensors="pt")`` that returns a mapping with ``input_ids``
            and ``attention_mask`` tensors (assumed to carry a leading batch
            dimension of 1, as Hugging Face tokenizers do -- TODO confirm for
            custom tokenizers).
        max_length: Length every sequence is padded / truncated to.
        task: ``"classification"`` or ``"generation"``.
        tox_threshold: Score at or above which a reference counts as toxic
            in the classification task.

    Raises:
        ValueError: If ``task`` is not one of the supported values.
    """

    SUPPORTED_TASKS = ("classification", "generation")
    # Label value skipped by torch's cross-entropy loss (its default
    # ``ignore_index``); used so padding does not contribute to the loss.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, max_length, task="classification", tox_threshold=0.5):
        if task not in self.SUPPORTED_TASKS:
            raise ValueError(
                f"Unsupported task {task!r}; expected one of {self.SUPPORTED_TASKS}"
            )
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.task = task
        self.tox_threshold = tox_threshold

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` padded / truncated to ``max_length`` as torch tensors."""
        return self.tokenizer(
            text,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """Return the ``input_ids`` / ``attention_mask`` / ``labels`` dict for one row."""
        # One positional lookup; the row is reused for every column read below.
        row = self.data.iloc[index]

        source = self._encode(row["reference"])
        # squeeze(0) removes only the batch dimension, so a length-1 sequence
        # is not collapsed into a 0-d tensor.
        item = {
            "input_ids": source["input_ids"].squeeze(0),
            "attention_mask": source["attention_mask"].squeeze(0),
        }

        if self.task == "classification":
            # The score is a float in the table; casting it straight to long
            # would truncate every value below 1.0 to class 0, so binarize
            # against the threshold instead.
            is_toxic = int(row["ref_tox"] >= self.tox_threshold)
            item["labels"] = torch.tensor(is_toxic, dtype=torch.long)
            return item

        if self.task == "generation":
            # NOTE(review): the pairs are used as reference -> translation
            # regardless of which side has the higher toxicity score --
            # confirm this is the intended direction.
            target = self._encode(row["translation"])
            labels = target["input_ids"].squeeze(0).clone()
            # Mask padded target positions so the loss ignores them.
            labels[target["attention_mask"].squeeze(0) == 0] = self.IGNORE_INDEX
            item["labels"] = labels
            return item

        # Reached only if ``task`` was changed after construction.
        raise ValueError(
            f"Unsupported task {self.task!r}; expected one of {self.SUPPORTED_TASKS}"
        )

In [None]:
# !pip install transformers
# !pip install datasets
# !pip install SentencePiece
# !pip install transformers[torch]

In [9]:
import pandas as pd
import torch
from torch.utils.data import random_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config, Trainer, TrainingArguments, T5ForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Tokenizer for the pretrained T5 checkpoint, and the sampled frame wrapped for seq2seq training.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
dataset = ToxicityDataset(data, tokenizer, max_length=120, task="generation")
# Seed the split explicitly: without a generator the 70/20/10 train/test/validation
# partition would differ on every kernel run.
train_dataset, test_dataset, validate_dataset = random_split(
    dataset,
    [0.7, 0.2, 0.1],
    generator=torch.Generator().manual_seed(42),
)

Downloading (…)ve/main/spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 1.55MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.39M/1.39M [00:00<00:00, 2.04MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.21k/1.21k [00:00<?, ?B/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.mo

In [11]:
# Checkpoint name used for the tokenizer here and reused by the model loaders in later cells.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
# Direct model construction is disabled in this cell:
# model = T5ForConditionalGeneration.from_pretrained(model_name, config=T5Config())

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# Evaluation, checkpointing and logging all run on the same 2500-step cadence.
training_args = TrainingArguments(
    output_dir="./t5_toxicity_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=20,
    evaluation_strategy="steps",
    logging_steps=2500,
    eval_steps=2500,
    save_steps=2500,
)

In [13]:
def model_init():
    """Load and return a new pretrained T5 conditional-generation model for ``model_name``."""
    fresh_model = T5ForConditionalGeneration.from_pretrained(model_name)
    return fresh_model

In [14]:
from transformers import Trainer, TrainingArguments

# Build the trainer; the model is supplied through the model_init factory
# rather than as a ready-made instance.
trainer = Trainer(
    args=training_args,
    model_init=model_init,
    eval_dataset=validate_dataset,
    train_dataset=train_dataset,
)

# Fine-tuning is left disabled in this cell.
# trainer.train()


Downloading model.safetensors: 100%|██████████| 892M/892M [02:19<00:00, 6.41MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<?, ?B/s] 


In [None]:
# model_dir = "./saved_model"
# trainer.save_model(model_dir)

In [15]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

In [16]:
# Tokenizer comes from the base checkpoint name; model weights come from the local directory.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained("../models/")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [37]:
# Example sentence to run through the loaded model. Named `input_text` rather
# than `input` so the builtin input() is not shadowed for the rest of the session.
input_text = "I've got orders to put her down"
tokens = tokenizer(
    input_text,
    padding="max_length",
    max_length=50,
    truncation=True,
    return_tensors="pt",
)

# Generate a single output sequence of at most 50 tokens.
out = model.generate(
    input_ids=tokens["input_ids"],
    attention_mask=tokens["attention_mask"],
    max_length=50,
    num_return_sequences=1,
)

In [38]:
# Turn every generated id sequence back into text, dropping special tokens.
output = tokenizer.batch_decode(
    out,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)
output

['I have orders to kill her.']

  0%|          | 33/14445 [05:26<8:18:42,  2.08s/it]  

KeyboardInterrupt: 