In [1]:
import pandas as pd
import torch

from torch.utils.data import Dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import(
    accuracy_score, 
    precision_recall_fscore_support,
)
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


# Seed


In [2]:
seed = 10

# Load Dataset

In [3]:
df = pd.read_csv("datasets/mgente_lardelli_equal.csv")

In [4]:
df.head(5)

Unnamed: 0,english,german,label
0,The extremists are diligent.,Die Extremistinnen sind fleißig.,1
1,They may also fall under the competence of my ...,"Für diese dürfte mein Kollege, Herr Barnier, e...",0
2,We believe it is absolutely fundamental that t...,Unserer Ansicht nach sollte der geplante Forts...,1
3,The supremacists are diligent.,Die Suprematistinnen sind fleißig.,1
4,What the fishermen are asking is that the Comm...,"Die Fischer bitten darum, dass die Kommission ...",1


# Load pre-trained model 

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU: None (using CPU)")
# model path
model_path = "bert-base-multilingual-cased"

# model tokenizer
tokenizer = BertTokenizer.from_pretrained(model_path)

# load model with binary classification head
model = BertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    id2label={0: "neutral", 1: "biased"},
    label2id={"neutral": 0, "biased": 1}
)
model.to(device)

CUDA Available: True
GPU: NVIDIA GeForce RTX 3070 Laptop GPU


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

# Set trainable parameters


- "transfer learning". we leave the base model parameters frozen, only train a classification head that we add on top
- might result in rigid model
- unfreeze final four layers, keeping computational cost down but keep flexibility

In [6]:
trainable = ["encoder.layer.10", "encoder.layer.11", "pooler", "classifier"]
for name, param in model.named_parameters():
    param.requires_grad = any(layer in name for layer in trainable)

# log param counts
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable params: {trainable_params}")

Trainable params: 14767874


# Data pre-processing

- PyTorch models need input data in a specific format
- BiasDataset class turns each row from df into tokenized input tensors for BERT

In [7]:
class BiasDataset(Dataset):
    # store df and tokenizer
    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer = tokenizer

    # how many samples in dataset
    def __len__(self):
        return len(self.data)

    # runs every time model needs one item from dataset
    # grabs english and german sentence, tokenizes them as a pair, applied padding, trunc and max_length, converts into pytorch tensors, returns a dict
    def __getitem__(self, idx):
        english = self.data.iloc[idx]["english"]
        german = self.data.iloc[idx]["german"]
        label = int(self.data.iloc[idx]["label"])

        encoded = self.tokenizer(
            text=english,
            text_pair=german,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt",
            return_overflowing_tokens=False
        )

        item = {key: val.squeeze(0) for key, val in encoded.items()}
        item["labels"] = torch.tensor(label)
        return item

- tokenizer gives tensors with a first size of 1 (a batch)
- squeeze(0) removes that first size, making single samples

## Train test split

In [8]:
train_df, temp_df = train_test_split(
    df, 
    test_size=0.2, 
    stratify=df["label"], 
    random_state=seed
)
val_df, test_df = train_test_split(
    temp_df, 
    test_size=0.5, 
    stratify=temp_df["label"], 
    random_state=seed
)

## Create Dataset Objects

In [9]:
train_dataset = BiasDataset(train_df, tokenizer)  
val_dataset = BiasDataset(val_df, tokenizer)    
test_dataset = BiasDataset(test_df, tokenizer) 

# Define evaluation metrics


- **`evaluate` function** runs the model on test data to check performance.

- `model.eval()`  
  - Sets the model to evaluation mode (no training or dropout).

- Initialize empty lists:  
  - `all_labels` to save true labels.  
  - `all_preds` to save predicted labels.

- Loop through batches in `dataloader`:  
  - Move inputs and labels to device (CPU/GPU).  
  - Get model outputs (logits).  
  - Select predicted class with highest score (`argmax`).  
  - Add true labels and predictions to lists.

- After the loop:  
  - Calculate **accuracy**: percentage of correct predictions.  
  - Calculate **precision**: correct biased predictions / all biased predictions made.  
  - Calculate **recall**: correct biased predictions / all actual biased samples.  
  - Calculate **f1-score**: balance between precision and recall.

- Return all four metrics.


In [10]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary", zero_division=0
    )
    accuracy = accuracy_score(labels, predictions)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Training

## Training Parameters

In [11]:
# hyperparameters
lr = 2e-5
batch_size = 8
num_epochs = 10

training_args = TrainingArguments(
    seed = seed,
    output_dir="./model_output",       
    num_train_epochs=num_epochs,   
    per_device_train_batch_size=batch_size, 
    per_device_eval_batch_size=batch_size,   
    learning_rate=lr,             
    warmup_ratio=0.1,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",       
    load_best_model_at_end=True,  
    metric_for_best_model="f1",  
    greater_is_better=True   
)

## Run trainer

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  
    compute_metrics=compute_metrics,
)

In [13]:
print("Starting training...")

try:
    train_results = trainer.train()
except Exception as e:
    print("Training failed:", e)
    raise

print("Training complete. Saving model...")

trainer.save_model("./model_output")
tokenizer.save_pretrained("./model_output")

Starting training...


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6154,0.476623,0.74,0.6875,0.88,0.77193
2,0.5122,0.444573,0.69,0.75,0.57,0.647727
3,0.4747,0.412363,0.725,0.71028,0.76,0.7343
4,0.4318,0.403388,0.76,0.720339,0.85,0.779817
5,0.3978,0.396663,0.77,0.759615,0.79,0.77451
6,0.3516,0.404289,0.805,0.808081,0.8,0.80402
7,0.3006,0.387555,0.815,0.811881,0.82,0.81592
8,0.2988,0.397147,0.81,0.792453,0.84,0.815534
9,0.2797,0.399194,0.82,0.807692,0.84,0.823529
10,0.2699,0.409241,0.83,0.84375,0.81,0.826531


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Training complete. Saving model...


('./model_output\\tokenizer_config.json',
 './model_output\\special_tokens_map.json',
 './model_output\\vocab.txt',
 './model_output\\added_tokens.json')

# Evaluate

In [14]:
print("Evaluating model...")

# Evaluate on validation set
val_results = trainer.evaluate(eval_dataset=val_dataset)
print("Validation F1:", round(val_results["eval_f1"], 3))

# Evaluate on test set
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test F1:", round(test_results["eval_f1"], 3))

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Evaluating model...


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Validation F1: 0.827


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Test F1: 0.76


# Test cases

In [15]:
model_path = "./model_output"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [None]:
# Define test cases (English, German) with expected bias labels
test_cases = [
    ("The nurse is kind", "Die Krankenschwester ist freundlich", 1),  # Gendered (biased)
    ("The nurse is kind", "Die Pflegekraft ist freundlich", 0),       # Neutral
    ("The doctor is strong", "Der Arzt ist stark", 1),                # Gendered
    ("Today the weather is beautiful", "Heute ist das Wetter schön", 0),  # Neutral
    ("The woman is a coder", "Die Frau ist eine Programmiererin", 0)  # Gendered
]

# Prepare results table
results = []

# Run predictions
for eng, de, true_label in test_cases:
    # Tokenize
    inputs = tokenizer(
        eng, de,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt"
    ).to(device)
    
    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
        pred_label = torch.argmax(outputs.logits).item()
        prob = torch.softmax(outputs.logits, dim=1)[0].cpu().numpy()
    
    results.append({
        "English": eng,
        "German": de,
        "True Label": true_label,
        "Predicted Label": pred_label,
        "Neutral Prob": prob[0],
        "Biased Prob": prob[1],
        "Correct": true_label == pred_label
    })

# Display as formatted table
results_df = pd.DataFrame(results)
print("\nBias Detection Test Cases:")
print(results_df.to_string(index=False))  # This prints the whole table cleanly

# Calculate accuracy
accuracy = results_df["Correct"].mean()
print(f"\nModel Accuracy on Test Sentences: {accuracy:.1%}")



Bias Detection Test Cases:
                       English                              German  True Label  Predicted Label  Neutral Prob  Biased Prob  Correct
             The nurse is kind Die Krankenschwester ist freundlich           1                1      0.003684     0.996316     True
             The nurse is kind      Die Pflegekraft ist freundlich           0                1      0.131893     0.868107    False
          The doctor is strong                  Der Arzt ist stark           1                1      0.003683     0.996317     True
Today the weather is beautiful          Heute ist das Wetter schön           0                1      0.011891     0.988109    False
          The woman is a coder   Die Frau ist eine Programmiererin           0                1      0.001469     0.998531    False

Model Accuracy on Test Sentences: 40.0%
