In [None]:
from datasets import load_dataset
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch

# Load English dataset
en_data = load_dataset("textdetox/multilingual_paradetox", split="en")

def preprocess(example):
    return {
        "input_text": "detoxify: " + example["toxic_sentence"],
        "target_text": example["neutral_sentence"]
    }

In [None]:
# Apply preprocessing
formatted = en_data.map(preprocess)

# Use the correct tokenizer class for mT5
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-base", legacy=False)
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")

# Send model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Map: 100%|██████████| 400/400 [00:00<00:00, 27019.50 examples/s]


{'toxic_sentence': 'then all of a sudden i see her , shes now got the big phony tits and everything .', 'neutral_sentence': 'All of a sudden i see her, she is all grown up.', 'input_text': 'detoxify: then all of a sudden i see her , shes now got the big phony tits and everything .', 'target_text': 'All of a sudden i see her, she is all grown up.'}


In [None]:
print(formatted[0])

{'toxic_sentence': 'then all of a sudden i see her , shes now got the big phony tits and everything .', 'neutral_sentence': 'All of a sudden i see her, she is all grown up.', 'input_text': 'detoxify: then all of a sudden i see her , shes now got the big phony tits and everything .', 'target_text': 'All of a sudden i see her, she is all grown up.'}


In [67]:
def tokenize(examples): 
    # Tokenize input and target
    inputs = tokenizer(
        examples["input_text"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )
    targets = tokenizer(
        examples["target_text"],
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    # Mask padding tokens in labels
    targets["input_ids"] = [
        [token if token != tokenizer.pad_token_id else -100 for token in label]
        for label in targets["input_ids"]
    ]

    # Attach labels to inputs
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized = formatted.map(tokenize, batched=True)


# Split into training and validation sets
split_dataset = tokenized.train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

Map: 100%|██████████| 400/400 [00:00<00:00, 2222.20 examples/s]


In [75]:
sample = tokenized[6]
print("Input:", tokenizer.decode(sample["input_ids"], skip_special_tokens=True))
print("Target:", tokenizer.decode([l if l != -100 else tokenizer.pad_token_id for l in sample["labels"]], skip_special_tokens=True))


Input: detoxify: who really gives a crap about this ?
Target: Who really care about this?


In [76]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments
args = Seq2SeqTrainingArguments(
    output_dir="./mt5-detox-en",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    num_train_epochs=8,
    logging_steps=100,
    fp16=True,
    save_strategy="epoch",
    logging_dir="./logs_en",
    report_to="none"
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

  trainer = Seq2SeqTrainer(


Step,Training Loss
100,0.0


TrainOutput(global_step=176, training_loss=0.0, metrics={'train_runtime': 1124.2355, 'train_samples_per_second': 2.562, 'train_steps_per_second': 0.157, 'total_flos': 827341830881280.0, 'train_loss': 0.0, 'epoch': 7.666666666666667})

In [1]:
# Prepare input
input_text = "You are ugly"
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(device)

# Generate detoxified output
output = model.generate(
    **inputs,
    max_length=50,
    num_beams=4,                      # optional: beam search improves fluency
    early_stopping=True,              # stop generation when EOS is reached
    decoder_start_token_id=tokenizer.pad_token_id  # force decoder to start properly
)

# Decode the generated output
decoded = tokenizer.decode(output[0])
print("Detoxified:", decoded)


NameError: name 'tokenizer' is not defined