In [None]:
from datasets import load_dataset, concatenate_datasets

from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Load tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ---------- Preprocessing Functions ----------
def preprocess_paradetox(example):
    return {
        "input_text": "detoxify: " + example["toxic_sentence"],
        "target_text": example["neutral_sentence"]
    }

def preprocess_snlp(example):
    return {
        "input_text": "detoxify: " + example["en_toxic_comment"],
        "target_text": example["en_neutral_comment"]
    }

def clean_columns(dataset):
    return dataset.remove_columns(
        [col for col in dataset.column_names if col not in ["input_text", "target_text"]]
    )

# ---------- Load and Process Datasets ----------
en_data = load_dataset("textdetox/multilingual_paradetox", split="en")
en2_data = load_dataset("s-nlp/paradetox", split="train")  # assuming single split

formatted = clean_columns(en_data.map(preprocess_paradetox))
formatted2 = clean_columns(en2_data.map(preprocess_snlp))

# ---------- Combine Datasets ----------
combined_dataset = concatenate_datasets([formatted, formatted2])


def tokenize(examples):
    # Tokenize inputs
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=128,
        padding="max_length",
        truncation=True
    )

    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["target_text"],
            max_length=128,
            padding="max_length",
            truncation=True
        )

    # Mask padding tokens in labels
    labels["input_ids"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label_seq]
        for label_seq in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize dataset
tokenized_dataset = combined_dataset.map(tokenize, batched=True)

# Optional: filter out empty label examples
tokenized_dataset = tokenized_dataset.filter(lambda x: any(label != -100 for label in x["labels"]))

# Split into train/test
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
tokenized_dataset = tokenized_dataset.shuffle(seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [26]:
print("Train dataset size:", len(train_dataset))

Train dataset size: 18129


In [None]:
from transformers import (
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
    Seq2SeqTrainingArguments,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Seq2SeqTrainer
)
import torch
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

# Collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training Arguments (faster logging & efficient eval)
training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5-detox-en-finetuned",
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    logging_strategy="steps",
    logging_steps=100,                   # ⬅ log less frequently
    eval_strategy="epoch",        # ⬅ eval only once per epoch
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=3e-5,
    num_train_epochs=3,
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_dir="./logs_en",
    report_to="none"
)

# ✅ Custom Trainer with fast generation & partial toxicity scoring
class ToxicityPenaltyTrainer(Seq2SeqTrainer):
    def __init__(self, *args, lambda_penalty=0.8, **kwargs):
        super().__init__(*args, **kwargs)
        self.lambda_penalty = lambda_penalty
        self.tox_tokenizer = AutoTokenizer.from_pretrained("unitary/toxic-bert")
        self.tox_model = AutoModelForSequenceClassification.from_pretrained("unitary/toxic-bert").to(self.model.device)
        self.tox_model.eval()

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Standard generation loss
        outputs = model(**inputs)
        generation_loss = outputs.loss

        # Use only the first 1–2 samples to compute toxicity
        sample_input_ids = inputs["input_ids"][:2]
        sample_attention_mask = inputs["attention_mask"][:2]

        with torch.no_grad():
            # Fast generation with greedy decoding and reduced length
            generated_ids = model.generate(
                input_ids=sample_input_ids,
                attention_mask=sample_attention_mask,
                max_length=20,
                num_beams=1,
                do_sample=False,
                early_stopping=True,
                decoder_start_token_id=self.tokenizer.pad_token_id
            )

            # Decode and score toxicity
            decoded_texts = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            tox_inputs = self.tox_tokenizer(decoded_texts, return_tensors="pt", truncation=True, padding=True).to(self.model.device)
            tox_logits = self.tox_model(**tox_inputs).logits
            tox_probs = torch.sigmoid(tox_logits[:, 0])
            penalty = tox_probs.mean()

        # Final loss
        total_loss = generation_loss + self.lambda_penalty * penalty
        return (total_loss, outputs) if return_outputs else total_loss

# ✅ Instantiate and train
trainer = ToxicityPenaltyTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    lambda_penalty=0.8,  # Adjust penalty weight
)

# 🔁 Start training
trainer.train()


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Epoch,Training Loss,Validation Loss
1,1.4934,1.361239
2,1.2727,1.225731


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

TrainOutput(global_step=3399, training_loss=1.4724562180045775, metrics={'train_runtime': 1938.5674, 'train_samples_per_second': 28.055, 'train_steps_per_second': 1.753, 'total_flos': 1839092069302272.0, 'train_loss': 1.4724562180045775, 'epoch': 2.9977944419938245})

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Path to your saved model
checkpoint_path = "mt5-detox-en-2/checkpoint-3399"

# Load model and tokenizer
tokenizer = T5Tokenizer.from_pretrained(checkpoint_path)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)

# Send to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [13]:
# Prepare input
input_text = "You are fucking ugly"
inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(device)

# Generate detoxified output
output = model.generate(
    **inputs,
    max_length=50,
    num_beams=4,                      # optional: beam search improves fluency
    early_stopping=True,              # stop generation when EOS is reached
    decoder_start_token_id=tokenizer.pad_token_id  # force decoder to start properly
)

# Decode the generated output
decoded = tokenizer.decode(output[0])
print("Detoxified:", decoded)

def generate_detoxified(text):
    input_text = "detoxify (en): " + text
    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True).to(model.device)
    output_ids = model.generate(input_ids, max_length=128)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(generate_detoxified("You are the worst piece of garbage."))
print(generate_detoxified("Nobody likes you, idiot."))

Detoxified: <pad> You are fucking ugly</s>
detoxify (en): You are the worst piece of garbage.
: "I'm not a person who likes you, idiot.


In [14]:
from torch.utils.data import DataLoader
from tqdm import tqdm

# ✅ Make sure you use the raw (non-tokenized) dataset
# If you accidentally removed input_text/target_text earlier, re-load or cache it

# 🔧 Batch size for faster inference (tune based on your GPU)
BATCH_SIZE = 16

# Collate function for batching text
def collate_fn(batch):
    texts = [ex["input_text"] for ex in batch]
    return tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)

# DataLoader
loader = DataLoader(eval_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

# Storage for results
detoxified_outputs = []
input_texts = []
reference_texts = []

# Run generation
model.eval()
for i, batch in enumerate(tqdm(loader)):
    batch = {k: v.to(model.device) for k, v in batch.items()}
    outputs = model.generate(
        **batch,
        max_length=50,
        num_beams=4,
        early_stopping=True,
        decoder_start_token_id=tokenizer.pad_token_id
    )
    
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    detoxified_outputs.extend(decoded)

    # Save corresponding original and reference text
    for j in range(len(decoded)):
        example = eval_dataset[i * BATCH_SIZE + j]
        input_texts.append(example["input_text"])
        reference_texts.append(example["target_text"])


100%|██████████| 126/126 [00:56<00:00,  2.21it/s]


In [15]:
import json

# Save generated outputs and references to file
with open("generated_results_fine.json", "w", encoding="utf-8") as f:
    json.dump({
        "input_texts": input_texts,
        "reference_texts": reference_texts,
        "detoxified_outputs": detoxified_outputs
    }, f, ensure_ascii=False, indent=2)

In [1]:
import json
with open("generated_results_fine.json", "r", encoding="utf-8") as f:
    data = json.load(f)
    input_texts = data["input_texts"]
    reference_texts = data["reference_texts"]
    detoxified_outputs = data["detoxified_outputs"]

## BLEU Evaluation Score

In [4]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
nltk.download("punkt")

smooth_fn = SmoothingFunction().method4

bleu_scores = []

for hyp, ref in zip(detoxified_outputs, reference_texts):
    hyp_tokens = nltk.word_tokenize(hyp)
    ref_tokens = nltk.word_tokenize(ref)
    
    score = sentence_bleu(
        [ref_tokens],
        hyp_tokens,
        smoothing_function=smooth_fn
    )
    bleu_scores.append(score)

avg_bleu = np.mean(bleu_scores)
print(f"Average BLEU Score: {avg_bleu:.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jonaz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\jonaz/nltk_data'
    - 'c:\\Users\\jonaz\\git\\Maria_stuff\\Maria_code\\.venv\\nltk_data'
    - 'c:\\Users\\jonaz\\git\\Maria_stuff\\Maria_code\\.venv\\share\\nltk_data'
    - 'c:\\Users\\jonaz\\git\\Maria_stuff\\Maria_code\\.venv\\lib\\nltk_data'
    - 'C:\\Users\\jonaz\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


## Toxicity Score Evaluation

In [3]:
from detoxify import Detoxify

# Load Detoxify model
toxicity_model = Detoxify('unbiased')

# Run toxicity prediction
toxicity_scores = toxicity_model.predict(input_texts)
detoxified_scores = toxicity_model.predict(detoxified_outputs)

# Print sample comparisons
for i in range(3):
    print(f"Original:    {input_texts[i]}")
    print(f"Detoxified:  {detoxified_outputs[i]}")
    print(f"Toxicity Before: {toxicity_scores['toxicity'][i]:.2f}")
    print(f"Toxicity After:  {detoxified_scores['toxicity'][i]:.2f}")
    print("-" * 50)


import numpy as np
import matplotlib.pyplot as plt

# Calculate average toxicity
avg_toxicity_before = np.mean(toxicity_scores['toxicity'])
avg_toxicity_after = np.mean(detoxified_scores['toxicity'])

print(f"Average Toxicity Before: {avg_toxicity_before:.2f}")
print(f"Average Toxicity After: {avg_toxicity_after:.2f}")

# Plotting the distributions
plt.hist(toxicity_scores['toxicity'], bins=20, alpha=0.5, label='Before Detox')
plt.hist(detoxified_scores['toxicity'], bins=20, alpha=0.5, label='After Detox')
plt.xlabel('Toxicity Score')
plt.ylabel('Frequency')
plt.title('Toxicity Score Distribution')
plt.legend()
plt.show()


KeyboardInterrupt: 

In [40]:
from scipy.stats import ttest_rel

# Example: compare two arrays of toxicity scores
before = np.array(toxicity_scores['toxicity'])
after = np.array(detoxified_scores['toxicity'])

t_stat, p_value = ttest_rel(before, after)
print(f"t = {t_stat:.3f}, p = {p_value:.5f}")

t = 83.803, p = 0.00000
