# Processing Data

## Load tokenizer

In [3]:
from datasets import load_dataset
import json

from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")


# ---------- Preprocesing Functions ----------
def preprocess_paradetox_multilingual(example):
    return {
        "input_text": "detoxify: " + example["toxic_sentence"],
        "target_text": example["neutral_sentence"]
    }

def preprocess_paradetox(example):
    return {
        "input_text": "detoxify: " + example["en_toxic_comment"],
        "target_text": example["en_neutral_comment"]
    }

def clean_columns(dataset):
    return dataset.remove_columns(
        [col for col in dataset.column_names if col not in ["input_text", "target_text"]]
    )


def load_json_results(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)


def save_variables_to_json(filename, **variables):
    """
    Saves given variables to a JSON file with their variable names as keys.

    Parameters:
    - filename (str): The name of the JSON file to write to.
    - **variables: Arbitrary keyword arguments representing variable names and their values.
    """
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(variables, f, ensure_ascii=False, indent=2)

## Load and Process Datasets

In [8]:
# ---------- Load and Process Datasets ----------
baseline_test_data_en = load_dataset("textdetox/multilingual_paradetox", split="en")
baseline_test_data_de = load_dataset("textdetox/multilingual_paradetox", split="de")
baseline_train_data = load_dataset("s-nlp/paradetox", split="train") 

In [15]:
save_variables_to_json(
    "baseline_test_data_en.json",
    toxic_texts=baseline_test_data_en["toxic_sentence"],
    human_texts=baseline_test_data_en["neutral_sentence"],
)

## Reformat data

In [16]:
import json

# Load the original test data
with open("baseline_test_data_en.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Reformat into list of {"input_text": ..., "target_text": ...}
reformatted = [
    {"input_text": toxic.strip(), "target_text": neutral.strip()}
    for toxic, neutral in zip(test_data["toxic_texts"], test_data["human_texts"])
]

# Save the reformatted structure
with open("baseline_test_data_en2.json", "w", encoding="utf-8") as f:
    json.dump(reformatted, f, indent=2, ensure_ascii=False)

print("✅ Saved as 'baseline_test_data_en2.json'")

✅ Saved as 'baseline_test_data_en2.json'


## format datasets remove emoji

In [17]:
import re

baseline_formatted_train = clean_columns(baseline_train_data.map(preprocess_paradetox))
baseline_formatted_en = clean_columns(baseline_test_data_en.map(preprocess_paradetox_multilingual))
baseline_formatted_de = clean_columns(baseline_test_data_de.map(preprocess_paradetox_multilingual))


def remove_emojis(text):
    # This regex pattern matches a wide range of emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
        "\U0001F1E0-\U0001F1FF"  # Flags
        "\U00002700-\U000027BF"  # Dingbats
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U00002600-\U000026FF"  # Miscellaneous Symbols
        "\U0001F700-\U0001F77F"  # Alchemical Symbols
        "\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

def clean_emoji_batch(batch):
    batch["input_text"] = remove_emojis(batch["input_text"])
    batch["target_text"] = remove_emojis(batch["target_text"])
    return batch

baseline_formatted_train = baseline_formatted_train.map(clean_emoji_batch)
baseline_formatted_en = baseline_formatted_en.map(clean_emoji_batch)
baseline_formatted_de = baseline_formatted_de.map(clean_emoji_batch)


In [18]:
print("Test dataset en size:", len(baseline_formatted_en))
print("Test dataset de size:", len(baseline_formatted_de))
print("Train dataset size:", len(baseline_formatted_train))

print("Test en dataset columns:", baseline_formatted_en.column_names)
print("Test de dataset columns:", baseline_formatted_de.column_names)
print("Train dataset columns:", baseline_formatted_train.column_names)

Test dataset en size: 400
Test dataset de size: 400
Train dataset size: 19744
Test en dataset columns: ['input_text', 'target_text']
Test de dataset columns: ['input_text', 'target_text']
Train dataset columns: ['input_text', 'target_text']


## Save datasets to json

In [20]:
save_variables_to_json(
    "baseline_formatted_de.json",
    input_texts=baseline_formatted_de["input_text"],
    reference_texts=baseline_formatted_de["target_text"],
)

save_variables_to_json(
    "baseline_formatted_en.json",
    input_texts=baseline_formatted_en["input_text"],
    reference_texts=baseline_formatted_en["target_text"],
)

save_variables_to_json(
    "baseline_formatted_train.json",
    input_texts=baseline_formatted_train["input_text"],
    reference_texts=baseline_formatted_train["target_text"],
)

## Load Dataset

In [2]:
import json
from datasets import Dataset

def load_formatted_json(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Convert keys to the structure used by your pipeline
    input_texts = data["input_texts"]
    target_texts = data["reference_texts"]

    return Dataset.from_dict({
        "input_text": input_texts,
        "target_text": target_texts
    })

baseline_formatted_train = load_formatted_json("baseline_formatted_train.json")
baseline_formatted_de = load_formatted_json("baseline_formatted_de.json")
baseline_formatted_en = load_formatted_json("baseline_formatted_en.json")

In [3]:


def tokenize(examples):
    # Tokenize inputs
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=128,
        padding="max_length",
        truncation=True
    )

    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["target_text"],
            max_length=128,
            padding="max_length",
            truncation=True
        )

    # Mask padding tokens in labels
    labels["input_ids"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label_seq]
        for label_seq in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize dataset
baseline_tokenized_dataset = baseline_formatted_train.map(tokenize, batched=True)
baseline_tokenized_dataset = baseline_tokenized_dataset.filter(lambda x: any(label != -100 for label in x["labels"]))

baseline_tokenized_eval = baseline_formatted_en.map(tokenize, batched=True)
baseline_tokenized_eval = baseline_tokenized_eval.filter(lambda x: any(label != -100 for label in x["labels"]))

# Assign to eval_dataset
baseline_eval_dataset = baseline_tokenized_eval
baseline_train_dataset = baseline_tokenized_dataset

Map:   0%|          | 0/19744 [00:00<?, ? examples/s]



Filter:   0%|          | 0/19744 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/400 [00:00<?, ? examples/s]

In [4]:
print("Train dataset size:", len(baseline_train_dataset))
print("Eval dataset size:", len(baseline_eval_dataset))

print(type(baseline_eval_dataset))
print(baseline_eval_dataset[1])

print(type(baseline_train_dataset))
print(baseline_train_dataset[1])

Train dataset size: 19744
Eval dataset size: 400
<class 'datasets.arrow_dataset.Dataset'>
{'input_text': "detoxify: My page should be protected first so that worthless scum like you can't keep vandalizing it.", 'target_text': "My page should be protected first so that unpleasant people like you can't keep vandalizing it.", 'input_ids': [16379, 4921, 10, 499, 543, 225, 36, 5046, 166, 78, 24, 1494, 924, 3, 7, 6361, 114, 25, 54, 31, 17, 453, 4049, 26, 138, 2610, 34, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# Models

In [5]:
print("Train dataset columns:", baseline_train_dataset.column_names)

Train dataset columns: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels']


## Baseline transformer model

In [6]:
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    EarlyStoppingCallback,
)
import torch
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

# Load model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Set up data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5-detox-en-baseline",
    overwrite_output_dir=True,
    per_device_train_batch_size=32,
    gradient_accumulation_steps=2,
    learning_rate=3e-5,
    num_train_epochs=3,
    fp16=torch.cuda.is_available(),
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    logging_dir="./logs_en_base",
    report_to="none",
)

# Trainer setup
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=baseline_train_dataset,
    eval_dataset=baseline_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# Train
trainer.train()


  0%|          | 0/924 [00:00<?, ?it/s]

{'loss': 1.6668, 'grad_norm': 1.6632328033447266, 'learning_rate': 2.6785714285714288e-05, 'epoch': 0.32}
{'loss': 1.3592, 'grad_norm': 1.3904495239257812, 'learning_rate': 2.357142857142857e-05, 'epoch': 0.65}
{'loss': 1.3035, 'grad_norm': 1.6799440383911133, 'learning_rate': 2.0324675324675325e-05, 'epoch': 0.97}


  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 1.2165433168411255, 'eval_runtime': 1.007, 'eval_samples_per_second': 397.204, 'eval_steps_per_second': 49.651, 'epoch': 1.0}
{'loss': 1.2252, 'grad_norm': 1.0598976612091064, 'learning_rate': 1.707792207792208e-05, 'epoch': 1.3}
{'loss': 1.2129, 'grad_norm': 1.1750245094299316, 'learning_rate': 1.3863636363636363e-05, 'epoch': 1.62}
{'loss': 1.2106, 'grad_norm': 1.0239146947860718, 'learning_rate': 1.0616883116883117e-05, 'epoch': 1.94}


  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 1.1702256202697754, 'eval_runtime': 0.9806, 'eval_samples_per_second': 407.899, 'eval_steps_per_second': 50.987, 'epoch': 2.0}
{'loss': 1.1834, 'grad_norm': 1.2349812984466553, 'learning_rate': 7.370129870129871e-06, 'epoch': 2.27}
{'loss': 1.1753, 'grad_norm': 1.143837332725525, 'learning_rate': 4.123376623376624e-06, 'epoch': 2.59}
{'loss': 1.1738, 'grad_norm': 1.0734165906906128, 'learning_rate': 8.766233766233766e-07, 'epoch': 2.92}


  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 1.1581618785858154, 'eval_runtime': 0.9392, 'eval_samples_per_second': 425.885, 'eval_steps_per_second': 53.236, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'train_runtime': 297.4868, 'train_samples_per_second': 199.108, 'train_steps_per_second': 3.106, 'train_loss': 1.2762004575688086, 'epoch': 3.0}


TrainOutput(global_step=924, training_loss=1.2762004575688086, metrics={'train_runtime': 297.4868, 'train_samples_per_second': 199.108, 'train_steps_per_second': 3.106, 'total_flos': 2000893192962048.0, 'train_loss': 1.2762004575688086, 'epoch': 2.995137763371151})

## Load model

In [8]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Path to your saved model
checkpoint_path = "mt5-detox-en-baseline/checkpoint-3702"

# Load model and tokenizer
tokenizer = T5Tokenizer.from_pretrained(checkpoint_path)
model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)

# Send to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

## German to English Translation

In [9]:
from transformers import MarianMTModel, MarianTokenizer
from tqdm import tqdm
import json

# Load German-to-English model
model_name = "Helsinki-NLP/opus-mt-de-en"
tokenizer_mt = MarianTokenizer.from_pretrained(model_name)
model_mt = MarianMTModel.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# Use formatted_de directly (already loaded as a Dataset)
toxic_de = [text.replace("detoxify: ", "") for text in baseline_formatted_de["input_text"]]
neutral_de = list(baseline_formatted_de["target_text"])

# Helper to batch translate a list of strings
def batch_translate(texts, batch_size=8):
    translations = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]
        inputs = tokenizer_mt(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to(model_mt.device)
        outputs = model_mt.generate(**inputs, max_length=128)
        decoded = tokenizer_mt.batch_decode(outputs, skip_special_tokens=True)
        translations.extend(decoded)
    return translations

# Translate both
toxic_en = batch_translate(toxic_de)
neutral_en = batch_translate(neutral_de)

# Reconstruct translated dataset
translated_data = [
    {"input_text": "detoxify: " + toxic, "target_text": neutral}
    for toxic, neutral in zip(toxic_en, neutral_en)
]

# Save to file
with open("baseline_translated_de_to_en_pairs.json", "w", encoding="utf-8") as f:
    json.dump(translated_data, f, indent=2, ensure_ascii=False)

print("✅ Translated baseline_dataset saved.")

100%|██████████| 50/50 [00:25<00:00,  1.94it/s]
100%|██████████| 50/50 [00:23<00:00,  2.16it/s]

✅ Translated baseline_dataset saved.





## Load German translated to english Dataset

In [10]:
from datasets import Dataset
import json

# Load raw list of strings
with open("baseline_translated_de_to_en_pairs.json", "r", encoding="utf-8") as f:
    raw_texts = json.load(f)

print("Raw texts loaded:", (raw_texts))

# Wrap each string in a dictionary
data = [
    {
        "input_text": entry.get("input_text", ""),
        "target_text": entry.get("target_text", "")
    }
    for entry in raw_texts
]

baseline_eval_dataset_de = data

print(type(baseline_eval_dataset_de))         
print(type(baseline_eval_dataset_de[0]))      

<class 'list'>
<class 'dict'>


# Detoxification of Dataset

In [11]:
import json
from torch.utils.data import DataLoader
from transformers import T5Tokenizer
from tqdm import tqdm

def evaluate_and_save_detoxification(
    model,
    dataset,
    output_path,
    tokenizer=None,
    batch_size=12,
    lang="en"
):
    """
    Evaluates a detoxification model and saves results to a JSON file.
    
    Args:
        model: Trained T5 detoxification model.
        dataset: HuggingFace Dataset with "input_text" and "target_text".
        output_path: Path to save the JSON results.
        tokenizer: Optional tokenizer, default is T5.
        batch_size: Batch size for generation.
        lang: Language tag for progress bar.
    
    Returns:
        None
    """
    tokenizer = tokenizer or T5Tokenizer.from_pretrained("t5-small")

    def collate_fn(batch):
        texts = [ex["input_text"] for ex in batch]
        return tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=128)

    loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn)

    input_texts = []
    reference_texts = []
    detoxified_outputs = []

    model.eval()
    for i, batch in enumerate(tqdm(loader, desc=f"Generating Detoxified ({lang})")):
        batch = {k: v.to(model.device) for k, v in batch.items()}
        outputs = model.generate(
            **batch,
            max_length=50,
            num_beams=4,
            early_stopping=True,
            decoder_start_token_id=tokenizer.pad_token_id
        )

        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        detoxified_outputs.extend(decoded)

        for j in range(len(decoded)):
            example = dataset[i * batch_size + j]
            input_texts.append(example["input_text"])
            reference_texts.append(example.get("target_text", ""))

    # Save results to JSON
    result = {
        "input_texts": input_texts,
        "reference_texts": reference_texts,
        "detoxified_outputs": detoxified_outputs
    }
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    print(f"✅ Saved baseline_detoxified outputs to {output_path}")


## Detoxify English

In [12]:
# For English
evaluate_and_save_detoxification(
    model=model,
    dataset=baseline_eval_dataset,
    output_path="baseline_detoxified_outputs_en.json",
    tokenizer=tokenizer,
    lang="en"
)

Generating Detoxified (en): 100%|██████████| 34/34 [00:14<00:00,  2.32it/s]

✅ Saved baseline_detoxified outputs to baseline_detoxified_outputs_en.json





## Detoxify German translated to english English

In [35]:
# For German
evaluate_and_save_detoxification(
    model=model,
    dataset=baseline_eval_dataset_de,  # or eval_dataset
    output_path="baseline_detoxified_outputs_de.json",
    tokenizer=tokenizer,
    lang="de"
)

Generating Detoxified (de): 100%|██████████| 34/34 [00:25<00:00,  1.32it/s]

✅ Saved baseline_detoxified outputs to baseline_detoxified_outputs_de.json





## Translation English to German

In [8]:
import json
from transformers import MarianMTModel, MarianTokenizer
from tqdm import tqdm
import torch

# Load the English-to-German translation model and tokenizer
model_name = "Helsinki-NLP/opus-mt-en-de"
tokenizer_mt = MarianTokenizer.from_pretrained(model_name)
model_mt = MarianMTModel.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# Load the English dataset
with open("baseline_detoxified_outputs_de.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)

# Extract and clean the input and target texts
toxic_en = [text.replace("detoxify: ", "") for text in data_en["input_texts"]]
neutral_en = data_en["detoxified_outputs"]

# Helper function to batch translate a list of strings
def batch_translate(texts, batch_size=8):
    translations = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Translating"):
        batch = texts[i:i + batch_size]
        inputs = tokenizer_mt(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to(model_mt.device)
        outputs = model_mt.generate(**inputs, max_length=128)
        decoded = tokenizer_mt.batch_decode(outputs, skip_special_tokens=True)
        translations.extend(decoded)
    return translations

# Translate both toxic and neutral texts
toxic_de = batch_translate(toxic_en)
neutral_de = batch_translate(neutral_en)

# Reconstruct the translated dataset without the "detoxify: " prefix
translated_data = [
    {"input_text": toxic, "target_text": neutral}
    for toxic, neutral in zip(toxic_de, neutral_de)
]

# Save the translated dataset to a JSON file
with open("baseline_evaluation_dataset_german.json", "w", encoding="utf-8") as f:
    json.dump(translated_data, f, indent=2, ensure_ascii=False)

print("✅ Translated dataset saved as 'baseline_evaluation_dataset_german.json'.")

Translating: 100%|██████████| 50/50 [00:24<00:00,  2.05it/s]
Translating: 100%|██████████| 50/50 [00:20<00:00,  2.43it/s]

✅ Translated dataset saved as 'baseline_evaluation_dataset_german.json'.





# Data Evaluation

## Style Transfer Accuracy - Toxicity Level Evaluation

### from backtranslation baseline

In [6]:
import csv

with open("backtranslation_output_en.tsv", "r", encoding="utf-8") as tsv_file:
    reader = csv.reader(tsv_file, delimiter="\t")
    headers = next(reader)
    print(headers)

['toxic_sentence', 'neutral_sentence', 'lang']


In [7]:
import csv
import json

# Step 1: Read the TSV file
tsv_path = "backtranslation_output_en.tsv"
json_path = "converted_backtranslation_en.json"

data = []
with open(tsv_path, "r", encoding="utf-8") as tsv_file:
    reader = csv.DictReader(tsv_file, delimiter="\t")
    for row in reader:
        # Rename fields to match the target format
        entry = {
            "input_text": row.get("toxic_sentence", ""),
            "target_text": row.get("neutral_sentence", "")
        }
        data.append(entry)

# Step 2: Save to JSON
with open(json_path, "w", encoding="utf-8") as json_file:
    json.dump(data, json_file, indent=2, ensure_ascii=False)

print(f"Converted {len(data)} entries to JSON format.")

Converted 400 entries to JSON format.


In [9]:
import csv
import json

# Step 1: Read the TSV file
tsv_path = "backtranslation_output_de.tsv"
json_path = "converted_backtranslation_de.json"

data = []
with open(tsv_path, "r", encoding="utf-8") as tsv_file:
    reader = csv.DictReader(tsv_file, delimiter="\t")
    for row in reader:
        # Rename fields to match the target format
        entry = {
            "input_text": row.get("toxic_sentence", ""),
            "target_text": row.get("neutral_sentence", "")
        }
        data.append(entry)

# Step 2: Save to JSON
with open(json_path, "w", encoding="utf-8") as json_file:
    json.dump(data, json_file, indent=2, ensure_ascii=False)

print(f"Converted {len(data)} entries to JSON format.")

Converted 400 entries to JSON format.


In [11]:
def convert_tsv_to_json(tsv_path: str, json_path: str):
    data = []
    with open(tsv_path, "r", encoding="utf-8") as tsv_file:
        reader = csv.DictReader(tsv_file, delimiter="\t")
        for row in reader:
            entry = {
                "input_text": row.get("toxic_sentence", ""),
                "target_text": row.get("neutral_sentence", "")
            }
            data.append(entry)

    with open(json_path, "w", encoding="utf-8") as json_file:
        json.dump(data, json_file, indent=2, ensure_ascii=False)

    print(f"Converted {len(data)} entries to JSON format.")

In [13]:
convert_tsv_to_json("refined_output_en.tsv", "refined_backtranslation_en.json")

Converted 400 entries to JSON format.


### reformatting detoxified outputs

In [4]:
import json

# Load the detoxified results
with open("baseline_detoxified_outputs_en.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract and clean input and output texts
input_texts = [text.replace("detoxify: ", "").strip() for text in data["input_texts"]]
detoxified_outputs = data["detoxified_outputs"]

# Create the reformatted structure
reformatted = [
    {"input_text": input_text, "target_text": detoxified}
    for input_text, detoxified in zip(input_texts, detoxified_outputs)
]

# Save to file
with open("baseline_evaluation_dataset_english.json", "w", encoding="utf-8") as f:
    json.dump(reformatted, f, indent=2, ensure_ascii=False)

print("✅ Saved as 'baseline_evaluation_dataset_english.json'")


✅ Saved as 'baseline_evaluation_dataset_english.json'


In [2]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('textdetox/xlmr-large-toxicity-classifier-v2')
model = AutoModelForSequenceClassification.from_pretrained('textdetox/xlmr-large-toxicity-classifier-v2')
model.to("cuda")  # or .to("cuda") if using GPU

def evaluate_toxicity_from_list_of_dicts(file_path: str, text_key: str = "target_text"):
    with open(file_path, "r", encoding="utf-8") as f:
        dataset = json.load(f)

    texts = [entry[text_key] for entry in dataset]
    print(f"✅ Loaded {len(texts)} texts from {file_path}")

    batch_size = 16
    toxic_count = 0
    non_toxic_count = 0

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        encodings = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512)
        encodings = {k: v.to(model.device) for k, v in encodings.items()}

        with torch.no_grad():
            outputs = model(**encodings)

        predictions = torch.argmax(outputs.logits, dim=1)

        for label in predictions:
            if label.item() == 0:
                non_toxic_count += 1
            else:
                toxic_count += 1

    total = toxic_count + non_toxic_count
    print("\n--- Toxicity Evaluation Summary ---")
    print(f"Total texts evaluated: {total}")
    print(f"Non-toxic: {non_toxic_count} ({(non_toxic_count / total) * 100:.2f}%)")
    print(f"Toxic: {toxic_count} ({(toxic_count / total) * 100:.2f}%)")


In [14]:
evaluate_toxicity_from_list_of_dicts("refined_backtranslation_en.json", text_key="target_text")

✅ Loaded 400 texts from refined_backtranslation_en.json

--- Toxicity Evaluation Summary ---
Total texts evaluated: 400
Non-toxic: 322 (80.50%)
Toxic: 78 (19.50%)


In [10]:
evaluate_toxicity_from_list_of_dicts("converted_backtranslation_de.json", text_key="target_text")

✅ Loaded 400 texts from converted_backtranslation_de.json

--- Toxicity Evaluation Summary ---
Total texts evaluated: 400
Non-toxic: 317 (79.25%)
Toxic: 83 (20.75%)


In [8]:
evaluate_toxicity_from_list_of_dicts("converted_backtranslation_en.json", text_key="target_text")

✅ Loaded 400 texts from converted_backtranslation_en.json

--- Toxicity Evaluation Summary ---
Total texts evaluated: 400
Non-toxic: 323 (80.75%)
Toxic: 77 (19.25%)


In [3]:
evaluate_toxicity_from_list_of_dicts("baseline_evaluation_dataset_german.json", text_key="target_text")
evaluate_toxicity_from_list_of_dicts("baseline_evaluation_dataset_english.json", text_key="target_text")

✅ Loaded 400 texts from baseline_evaluation_dataset_german.json

--- Toxicity Evaluation Summary ---
Total texts evaluated: 400
Non-toxic: 224 (56.00%)
Toxic: 176 (44.00%)
✅ Loaded 400 texts from baseline_evaluation_dataset_english.json

--- Toxicity Evaluation Summary ---
Total texts evaluated: 400
Non-toxic: 272 (68.00%)
Toxic: 128 (32.00%)


## Comparison toxicity before

In [19]:
evaluate_toxicity_from_list_of_dicts("baseline_evaluation_dataset_german.json", text_key="input_text")
evaluate_toxicity_from_list_of_dicts("baseline_evaluation_dataset_english.json", text_key="input_text")

✅ Loaded 400 texts from baseline_evaluation_dataset_german.json

--- Toxicity Evaluation Summary ---
Total texts evaluated: 400
Non-toxic: 138 (34.50%)
Toxic: 262 (65.50%)
✅ Loaded 400 texts from baseline_evaluation_dataset_english.json

--- Toxicity Evaluation Summary ---
Total texts evaluated: 400
Non-toxic: 31 (7.75%)
Toxic: 369 (92.25%)


## Content preservation: LaBSE

In [20]:
import json
import torch
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Load LaBSE model once
labse_model = SentenceTransformer("sentence-transformers/LaBSE")

def compute_labse_similarity_from_file(filepath, input_key="input_text", target_key="target_text", sample_size=None, batch_size=16):
    with open(filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    if sample_size:
        data = data[:sample_size]

    input_texts = [entry[input_key] for entry in data]
    target_texts = [entry[target_key] for entry in data]

    similarities = []

    for i in tqdm(range(0, len(input_texts), batch_size), desc="Computing LaBSE Similarities"):
        batch_input = input_texts[i:i+batch_size]
        batch_target = target_texts[i:i+batch_size]

        emb_input = labse_model.encode(batch_input, convert_to_tensor=True)
        emb_target = labse_model.encode(batch_target, convert_to_tensor=True)

        sim = torch.nn.functional.cosine_similarity(emb_input, emb_target).cpu().numpy()
        similarities.extend(sim)

    avg_sim = float(np.mean(similarities))
    print(f"✅ {filepath} — Content Preservation (LaBSE): {avg_sim:.4f}")
    return avg_sim, similarities


In [21]:
compute_labse_similarity_from_file("baseline_evaluation_dataset_english.json", sample_size=100)
compute_labse_similarity_from_file("baseline_evaluation_dataset_german.json", sample_size=100)


Computing LaBSE Similarities: 100%|██████████| 7/7 [00:00<00:00, 30.27it/s]


✅ baseline_evaluation_dataset_english.json — Content Preservation (LaBSE): 0.9089


Computing LaBSE Similarities: 100%|██████████| 7/7 [00:00<00:00, 31.28it/s]

✅ baseline_evaluation_dataset_german.json — Content Preservation (LaBSE): 0.9001





(0.9000511765480042,
 [np.float32(0.83013844),
  np.float32(0.8651395),
  np.float32(0.9469372),
  np.float32(0.8971749),
  np.float32(0.89105636),
  np.float32(0.9737712),
  np.float32(0.9504341),
  np.float32(0.7288624),
  np.float32(0.7319522),
  np.float32(0.780321),
  np.float32(0.9135866),
  np.float32(0.88522166),
  np.float32(0.98429847),
  np.float32(0.9063605),
  np.float32(0.9041235),
  np.float32(0.9013325),
  np.float32(0.8859218),
  np.float32(0.5979222),
  np.float32(0.91307706),
  np.float32(0.9871712),
  np.float32(0.86609435),
  np.float32(0.91994584),
  np.float32(0.9342329),
  np.float32(0.8082813),
  np.float32(0.8509568),
  np.float32(0.8449616),
  np.float32(0.97551537),
  np.float32(0.83295226),
  np.float32(0.9817647),
  np.float32(0.97704446),
  np.float32(0.7917104),
  np.float32(0.7877049),
  np.float32(0.9449831),
  np.float32(0.93468344),
  np.float32(0.9247023),
  np.float32(0.9761138),
  np.float32(0.85103923),
  np.float32(0.91977817),
  np.float32(1.0)

## Fluency Evaluation XCOMET-lite in different file

In [None]:
import json
from typing import List, Dict

def prepare_xcomet_input(src_mt_path: str, ref_path: str) -> List[Dict[str, str]]:
    # Load source and MT data
    with open(src_mt_path, "r", encoding="utf-8") as f:
        src_mt_data = json.load(f)  # List of dicts

    # Load reference data
    with open(ref_path, "r", encoding="utf-8") as f:
        ref_data = json.load(f)  # List of dicts

    # Sanity check
    assert len(src_mt_data) == len(ref_data), "Input lengths do not match"

    # Build XCOMET input
    xcomet_input = []
    for src_mt, ref in zip(src_mt_data, ref_data):
        xcomet_input.append({
            "src": src_mt["input_text"],
            "mt": src_mt["target_text"],
            "ref": ref["target_text"]
        })

    return xcomet_input

# Example usage
xcomet_data = prepare_xcomet_input("baseline_evaluation_dataset_english.json", "baseline_test_data_en2.json")

# Save to file
with open("baseline_xcomet_input_data.json", "w", encoding="utf-8") as out_file:
    json.dump(xcomet_data, out_file, indent=2)
