In [1]:
import pandas as pd
import torch

from torch.utils.data import Dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    set_seed,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import(
    accuracy_score, 
    precision_recall_fscore_support,
)
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


# Seed


In [2]:
seed = 10
set_seed(10)

# Load Dataset

In [3]:
df = pd.read_csv("datasets/dataset.csv")

In [4]:
df.head(5)

Unnamed: 0,english,german,label
0,In my capacity as draftsperson of the Committe...,Als Berichterstatterin des Petitionsausschusse...,1
1,The recipients are responsible.,Die Rezipierenden sind verantwortlich.,0
2,They never pay any attention to me.,Nie zollten sie mir eine Aufmerksamkeit.,0
3,"Deputy Mrs Izquierdo Rojo, I have outlined the...",Ich habe das Parlamentsmitglied Izquierdo Rojo...,1
4,I want to address Commissioner Verheugen perso...,Ich möchte mich mit einigen Bemerkungen an das...,1


# Load pre-trained model 

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    print("GPU: None (using CPU)")
# model path
model_path = "bert-base-multilingual-cased"

# model tokenizer
tokenizer = BertTokenizer.from_pretrained(model_path)

# load model with binary classification head
model = BertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    id2label={0: "neutral", 1: "biased"},
    label2id={"neutral": 0, "biased": 1}
)
model.to(device)

CUDA Available: True
GPU: NVIDIA GeForce RTX 3070 Laptop GPU


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

# Set trainable parameters


- "transfer learning". we leave the base model parameters frozen, only train a classification head that we add on top
- might result in rigid model
- unfreeze final four layers, keeping computational cost down but keep flexibility

In [6]:
trainable = ["encoder.layer.10", "encoder.layer.11", "pooler", "classifier"]
for name, param in model.named_parameters():
    param.requires_grad = any(layer in name for layer in trainable)

# log param counts
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable params: {trainable_params}")

Trainable params: 14767874


# Data pre-processing

- PyTorch models need input data in a specific format
- BiasDataset class turns each row from df into tokenized input tensors for BERT

In [7]:
class BiasDataset(Dataset):
    # store df and tokenizer
    def __init__(self, dataframe, tokenizer):
        self.data = dataframe
        self.tokenizer = tokenizer

    # how many samples in dataset
    def __len__(self):
        return len(self.data)

    # runs every time model needs one item from dataset
    # grabs english and german sentence, tokenizes them as a pair, applied padding, trunc and max_length, converts into pytorch tensors, returns a dict
    def __getitem__(self, idx):
        english = self.data.iloc[idx]["english"]
        german = self.data.iloc[idx]["german"]
        label = int(self.data.iloc[idx]["label"])

        encoded = self.tokenizer(
            text=english,
            text_pair=german,
            padding="max_length",
            truncation=True,
            max_length=256,
            return_tensors="pt",
            return_overflowing_tokens=False
        )

        item = {key: val.squeeze(0) for key, val in encoded.items()}
        item["labels"] = torch.tensor(label)
        return item

- tokenizer gives tensors with a first size of 1 (a batch)
- squeeze(0) removes that first size, making single samples

## Train test split

In [8]:
train_df, temp_df = train_test_split(
    df, 
    test_size=0.2, 
    stratify=df["label"], 
    random_state=seed
)
val_df, test_df = train_test_split(
    temp_df, 
    test_size=0.5, 
    stratify=temp_df["label"], 
    random_state=seed
)

## Create Dataset Objects

In [9]:
train_dataset = BiasDataset(train_df, tokenizer)  
val_dataset = BiasDataset(val_df, tokenizer)    
test_dataset = BiasDataset(test_df, tokenizer) 

# Define evaluation metrics


- **`evaluate` function** runs the model on test data to check performance.

- `model.eval()`  
  - Sets the model to evaluation mode (no training or dropout).

- Initialize empty lists:  
  - `all_labels` to save true labels.  
  - `all_preds` to save predicted labels.

- Loop through batches in `dataloader`:  
  - Move inputs and labels to device (CPU/GPU).  
  - Get model outputs (logits).  
  - Select predicted class with highest score (`argmax`).  
  - Add true labels and predictions to lists.

- After the loop:  
  - Calculate **accuracy**: percentage of correct predictions.  
  - Calculate **precision**: correct biased predictions / all biased predictions made.  
  - Calculate **recall**: correct biased predictions / all actual biased samples.  
  - Calculate **f1-score**: balance between precision and recall.

- Return all four metrics.


In [10]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary", zero_division=0
    )
    accuracy = accuracy_score(labels, predictions)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Training

## Training Parameters

In [11]:
# hyperparameters
lr = 2e-5
batch_size = 16
num_epochs = 8

training_args = TrainingArguments(
    seed = seed,
    output_dir="./model_output",       
    num_train_epochs=num_epochs,   
    per_device_train_batch_size=batch_size, 
    per_device_eval_batch_size=batch_size,   
    learning_rate=lr,             
    warmup_ratio=0.1,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",       
    load_best_model_at_end=True,  
    metric_for_best_model="f1",  
    greater_is_better=True   
)

## Run trainer

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [13]:
print("Starting training...")

try:
    train_results = trainer.train()
except Exception as e:
    print("Training failed:", e)
    raise

print("Training complete. Saving model...")

trainer.save_model("./model_output")
tokenizer.save_pretrained("./model_output")

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.6112,0.474895,0.707692,0.662921,0.561905,0.608247
2,0.4508,0.399995,0.769231,0.693966,0.766667,0.728507
3,0.3627,0.322704,0.844231,0.789238,0.838095,0.812933
4,0.2805,0.269606,0.878846,0.818182,0.9,0.857143
5,0.233,0.248411,0.878846,0.851675,0.847619,0.849642
6,0.1968,0.235468,0.880769,0.866337,0.833333,0.849515


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Training complete. Saving model...


('./model_output\\tokenizer_config.json',
 './model_output\\special_tokens_map.json',
 './model_output\\vocab.txt',
 './model_output\\added_tokens.json')

# Evaluate

In [14]:
print("Evaluating model...")

# Evaluate on validation set
val_results = trainer.evaluate(eval_dataset=val_dataset)
print("Validation F1:", round(val_results["eval_f1"], 3))

# Evaluate on test set
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test F1:", round(test_results["eval_f1"], 3))

Evaluating model...


Validation F1: 0.857
Test F1: 0.825


# Test cases

In [15]:
model_path = "./model_output"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [None]:
# Define test cases (English, German) with expected bias labels
test_cases = [
    ("The nurse is kind", "Die Krankenschwester ist freundlich", 1),  
    ("The nurse is sweet", "Die Pflegekraft ist freundlich", 0),       
    ("The doctor is strong", "Der Arzt ist stark", 1),               
    ("Today the weather is beautiful", "Heute ist das Wetter schön", 0), 
    ("The woman is a coder", "Die Frau ist eine Programmiererin", 0), 
    ("The teachers are not working today", "Die Lehrenden arbeiten heute nicht", 0), 
    ("The advisors are not working today", "Die Berater*innen arbeiten heute nicht", 0), 
    ("The teachers are smart", "Die Lehrer sind schlau", 1), 
    ("I am fine", "Mir geht es gut",0),
    ("I am helping the cleaner", "Ich helfe der Putzfrau",1),
    ("I am helping the cleaner", "Ich helfe der Reinigungskraft",0)
]

# Prepare results table
results = []

# Run predictions
for eng, de, true_label in test_cases:
    # Tokenize
    inputs = tokenizer(
        eng, de,
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt"
    ).to(device)
    
    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
        pred_label = torch.argmax(outputs.logits).item()
        prob = torch.softmax(outputs.logits, dim=1)[0].cpu().numpy()
    
    results.append({
        "English": eng,
        "German": de,
        "True Label": true_label,
        "Predicted Label": pred_label,
        "Neutral Prob": prob[0],
        "Biased Prob": prob[1],
        "Correct": true_label == pred_label
    })

# Display as formatted table
results_df = pd.DataFrame(results)
print("\nBias Detection Test Cases:")
print(results_df.to_string(index=False))  # This prints the whole table cleanly

# Calculate accuracy
accuracy = results_df["Correct"].mean()
print(f"\nModel Accuracy on Test Sentences: {accuracy:.1%}")



Bias Detection Test Cases:
                           English                                 German  True Label  Predicted Label  Neutral Prob  Biased Prob  Correct
                 The nurse is kind    Die Krankenschwester ist freundlich           1                1      0.284066     0.715934     True
                The nurse is sweet         Die Pflegekraft ist freundlich           0                1      0.142957     0.857043    False
              The doctor is strong                     Der Arzt ist stark           1                1      0.139930     0.860070     True
    Today the weather is beautiful             Heute ist das Wetter schön           0                0      0.999195     0.000805     True
              The woman is a coder      Die Frau ist eine Programmiererin           0                0      0.886548     0.113452     True
The teachers are not working today     Die Lehrenden arbeiten heute nicht           0                0      0.705959     0.294041     True