In [None]:
import pandas as pd
import torch

from torch.utils.data import Dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import(
    accuracy_score, 
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report
)
import numpy as np


# Seed


In [None]:
seed = 10

# Seed the global RNGs right away. The classification head created later by
# `from_pretrained` is randomly initialised, and the seed passed to
# `TrainingArguments` is only applied when training starts, so without this
# the initial head weights would differ between runs.
torch.manual_seed(seed)
np.random.seed(seed)

# Load Dataset

In [None]:
# Load the sentence-pair dataset (path is relative to the notebook's working directory).
df = pd.read_csv("datasets/dataset.csv")

# Fail fast with a clear message: later cells read these three columns, and a
# missing column or empty cell would otherwise only surface mid-training.
required_columns = ["english", "german", "label"]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"datasets/dataset.csv is missing required columns: {missing_columns}")
if df[required_columns].isna().any().any():
    raise ValueError("datasets/dataset.csv contains empty english/german/label cells")

In [None]:
# Preview the first rows (head() shows 5 rows by default).
df.head()

# Load pre-trained model 

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"CUDA Available: {torch.cuda.is_available()}")
# Only query the device name when CUDA exists: get_device_name raises on
# CPU-only machines, which would defeat the CPU fallback above.
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU: none (running on CPU)")

# model path
model_path = "bert-base-multilingual-cased"

# model tokenizer
tokenizer = BertTokenizer.from_pretrained(model_path)

# load model with binary classification head
model = BertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    id2label={0: "neutral", 1: "biased"},
    label2id={"neutral": 0, "biased": 1}
)
model.to(device)

# Set trainable parameters


- "transfer learning". we leave the base model parameters frozen, only train a classification head that we add on top
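- training only the classification head might result in a rigid model
- so we also unfreeze the top of the network (encoder layers 10 and 11, the pooler, and the classifier head), which keeps the computational cost down while retaining flexibility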

In [None]:
# Name fragments of the parameters that stay trainable; every parameter whose
# name contains none of them is frozen.
trainable = ["encoder.layer.10", "encoder.layer.11", "pooler", "classifier"]
for param_name, weights in model.named_parameters():
    keep_trainable = False
    for fragment in trainable:
        if fragment in param_name:
            keep_trainable = True
            break
    weights.requires_grad = keep_trainable

# log param counts
trainable_params = 0
for weights in model.parameters():
    if weights.requires_grad:
        trainable_params += weights.numel()
print(f"Trainable params: {trainable_params}")

# Data pre-processing

- PyTorch models need input data in a specific format
- BiasDataset class turns each row from df into tokenized input tensors for BERT

In [None]:
class BiasDataset(Dataset):
    """Torch dataset that tokenizes (english, german) sentence pairs on demand.

    Every row of the dataframe must provide the columns ``english``,
    ``german`` and ``label``.

    Args:
        dataframe: pandas DataFrame holding one sentence pair per row.
        tokenizer: callable accepting ``text``/``text_pair`` plus padding and
            truncation options and returning a mapping of tensors
            (e.g. a ``BertTokenizer``).
        max_length: length each encoded pair is padded/truncated to.
            Defaults to 128, the value that was previously hard-coded.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (rows) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the sentence pair at position ``idx``.

        Returns:
            dict: the tokenizer's tensors with the leading batch dimension
            removed, plus ``labels`` holding the row's integer label.
        """
        # Positional lookup done once instead of once per column.
        row = self.data.iloc[idx]
        label = int(row["label"])

        encoded = self.tokenizer(
            text=row["english"],
            text_pair=row["german"],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
            return_overflowing_tokens=False
        )

        # The tokenizer returns tensors shaped (1, ...); drop that batch axis
        # so each item is a single sample.
        item = {key: val.squeeze(0) for key, val in encoded.items()}
        item["labels"] = torch.tensor(label, dtype=torch.long)
        return item

- with `return_tensors="pt"` the tokenizer returns tensors with a leading batch dimension of size 1
- `squeeze(0)` removes that leading dimension, so each item is a single, un-batched sample

## Train test split

In [None]:
# Stratified 80/10/10 split: hold out 20% of the rows first, then halve that
# holdout into validation and test sets. Both splits reuse the same seed.
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=seed, stratify=df["label"]
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=seed, stratify=temp_df["label"]
)

## Create Dataset Objects

In [None]:
# Wrap each split in a BiasDataset that shares the same tokenizer.
train_dataset, val_dataset, test_dataset = (
    BiasDataset(split_df, tokenizer) for split_df in (train_df, val_df, test_df)
)

# Define evaluation metrics


- **`compute_metrics` function** is called by the `Trainer` during evaluation to score the model's predictions.

- No manual evaluation loop is needed:  
  - The `Trainer` switches the model to evaluation mode (no training or dropout), loops through the batches and moves inputs and labels to the device (CPU/GPU).

- Input: `eval_pred`  
  - The model outputs (logits) for the evaluated samples.  
  - The true labels for the same samples.

- Inside the function:  
  - Select the predicted class with the highest score (`argmax`).  
  - Compare the predictions with the true labels.

- Metrics:  
  - Calculate **accuracy**: percentage of correct predictions.  
  - Calculate **precision**: correct biased predictions / all biased predictions made.  
  - Calculate **recall**: correct biased predictions / all actual biased samples.  
  - Calculate **f1-score**: balance between precision and recall.

- Return all four metrics as a dictionary.


In [None]:
def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy, precision, recall and F1.

    Args:
        eval_pred: pair of (logits, labels); the predicted class is the
            argmax of the logits along axis 1.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision/recall/F1 use binary averaging and report 0 instead of
        warning when a denominator is zero.
    """
    scores, y_true = eval_pred
    y_pred = scores.argmax(axis=1)

    prf = precision_recall_fscore_support(
        y_true, y_pred, average="binary", zero_division=0
    )

    metrics = {"accuracy": accuracy_score(y_true, y_pred)}
    # prf is (precision, recall, f1, support); support is not reported.
    metrics.update(zip(("precision", "recall", "f1"), prf[:3]))
    return metrics

# Training

## Training Parameters

In [None]:
# hyperparameters
lr = 2e-5
batch_size = 8
num_epochs = 4

training_args = TrainingArguments(
    # output location and reproducibility
    output_dir="./model_output",
    seed=seed,
    # optimisation settings
    learning_rate=lr,
    warmup_ratio=0.1,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate, log and checkpoint once per epoch
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # keep the checkpoint with the highest F1 as the final model
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

## Run trainer

In [None]:
# Wire everything together: the model, its training configuration, the
# train/validation datasets and the metric function used at evaluation time.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
print("Starting training...")

# Surface a short message before the traceback, then re-raise so the
# notebook still stops on failure.
try:
    train_results = trainer.train()
except Exception as e:
    print("Training failed:", e)
    raise

print("Training complete. Saving model...")

# Save into the directory configured in TrainingArguments rather than
# repeating the path literal, so the two locations cannot drift apart.
trainer.save_model(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)

# Evaluate

In [None]:
print("Evaluating model...")

# Validation split first; the metric keys carry the "eval_" prefix.
val_results = trainer.evaluate(val_dataset)
print("Validation F1:", round(val_results["eval_f1"], 3))

# Then the held-out test split, reported the same way.
test_results = trainer.evaluate(test_dataset)
print("Test F1:", round(test_results["eval_f1"], 3))

# Test cases

In [None]:
# Reload the fine-tuned tokenizer and model from disk, move the model to the
# active device and switch it to inference mode.
model_path = "./model_output"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path).to(device)
model.eval()

In [None]:
# Define test cases (English, German) with expected bias labels
test_cases = [
    ("The nurse is kind", "Die Krankenschwester ist freundlich", 1),  # Gendered (biased)
    ("The nurse is kind", "Die Pflegekraft ist freundlich", 0),       # Neutral
    ("The doctor is strong", "Der Arzt ist stark", 1),                # Gendered
    ("Today the weather is beautiful", "Heute ist das Wetter schön", 0),  # Neutral
    # NOTE(review): the German is gendered, but the English source already
    # states the gender, so this is presumably labelled neutral (0) on
    # purpose - confirm against the labelling guidelines.
    ("The woman is a coder", "Die Frau ist eine Programmiererin", 0),
]

# Prepare results table
results = []

# Run predictions, one sentence pair at a time
for eng, de, true_label in test_cases:
    # Tokenize the pair with the same padding/truncation settings used in training
    inputs = tokenizer(
        eng, de,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    ).to(device)

    # Predict without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)
        # Take the argmax over the class dimension explicitly instead of
        # over the flattened logits, which is only right for a batch of one.
        pred_label = torch.argmax(outputs.logits, dim=1)[0].item()
        prob = torch.softmax(outputs.logits, dim=1)[0].cpu().numpy()

    results.append({
        "English": eng,
        "German": de,
        "True Label": true_label,
        "Predicted Label": pred_label,
        "Neutral Prob": prob[0],
        "Biased Prob": prob[1],
        "Correct": true_label == pred_label
    })

# Display as formatted table (previously the header was printed but the
# table itself was never shown)
results_df = pd.DataFrame(results)
print("\nBias Detection Test Cases:")
print(results_df.to_string(index=False))

# Calculate accuracy as the share of test cases predicted correctly
accuracy = results_df["Correct"].mean()
print(f"\nModel Accuracy on Test Sentences: {accuracy:.1%}")