In [1]:
from datasets import load_dataset, concatenate_datasets

from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [2]:
# ---------- Preprocessing Functions ----------
def preprocess_paradetox(example):
    return {
        "input_text": "detoxify: " + example["toxic_sentence"],
        "target_text": example["neutral_sentence"]
    }

def preprocess_snlp(example):
    return {
        "input_text": "detoxify: " + example["en_toxic_comment"],
        "target_text": example["en_neutral_comment"]
    }

def clean_columns(dataset):
    return dataset.remove_columns(
        [col for col in dataset.column_names if col not in ["input_text", "target_text"]]
    )

Map: 100%|██████████| 400/400 [00:00<00:00, 27019.50 examples/s]


{'toxic_sentence': 'then all of a sudden i see her , shes now got the big phony tits and everything .', 'neutral_sentence': 'All of a sudden i see her, she is all grown up.', 'input_text': 'detoxify: then all of a sudden i see her , shes now got the big phony tits and everything .', 'target_text': 'All of a sudden i see her, she is all grown up.'}


In [3]:
# ---------- Load and Process Datasets ----------
en_data = load_dataset("textdetox/multilingual_paradetox", split="en")
en2_data = load_dataset("s-nlp/paradetox", split="train")  # assuming single split

formatted = clean_columns(en_data.map(preprocess_paradetox))
formatted2 = clean_columns(en2_data.map(preprocess_snlp))

# ---------- Combine Datasets ----------
combined_dataset = concatenate_datasets([formatted, formatted2])


{'toxic_sentence': 'then all of a sudden i see her , shes now got the big phony tits and everything .', 'neutral_sentence': 'All of a sudden i see her, she is all grown up.', 'input_text': 'detoxify: then all of a sudden i see her , shes now got the big phony tits and everything .', 'target_text': 'All of a sudden i see her, she is all grown up.'}


In [3]:
def tokenize(examples):
    # Tokenize inputs
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=128,
        padding="max_length",
        truncation=True
    )

    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["target_text"],
            max_length=128,
            padding="max_length",
            truncation=True
        )

    # Mask padding tokens in labels
    labels["input_ids"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label_seq]
        for label_seq in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [4]:
# Tokenize and split
tokenized_dataset = combined_dataset.map(tokenize, batched=True)

# Optional: filter out empty label examples
tokenized_dataset = tokenized_dataset.filter(lambda x: any(label != -100 for label in x["labels"]))

# Split into train/test
split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

NameError: name 'combined_dataset' is not defined

In [None]:
from transformers import DataCollatorForSeq2Seq
from transformers import EarlyStoppingCallback
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer


data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments
args = Seq2SeqTrainingArguments(
    output_dir="./mt5-detox-en",
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    evaluation_strategy="epoch",  
    logging_strategy="steps",
    logging_steps=50,
    learning_rate=3e-5,
    num_train_epochs=3,
    save_strategy="epoch",  # May also need to be removed if very old
    save_total_limit=2,
    logging_dir="./logs_en",
    report_to="none",
    fp16=torch.cuda.is_available()
)

# Trainer setup
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    tokenizer=tokenizer
)

# Train the model
trainer.train()


  trainer = Seq2SeqTrainer(


Step,Training Loss
100,0.0


TrainOutput(global_step=176, training_loss=0.0, metrics={'train_runtime': 1124.2355, 'train_samples_per_second': 2.562, 'train_steps_per_second': 0.157, 'total_flos': 827341830881280.0, 'train_loss': 0.0, 'epoch': 7.666666666666667})

In [None]:
# Prepare input
input_text = "You are ugly"
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(device)

# Generate detoxified output
output = model.generate(
    **inputs,
    max_length=50,
    num_beams=4,                      # optional: beam search improves fluency
    early_stopping=True,              # stop generation when EOS is reached
    decoder_start_token_id=tokenizer.pad_token_id  # force decoder to start properly
)

# Decode the generated output
decoded = tokenizer.decode(output[0])
print("Detoxified:", decoded)


NameError: name 'tokenizer' is not defined