In [1]:
# standard libraries
import os
import random

# data handling
import pandas as pd
import numpy as np

# torch and dataset utils
import torch
from torch.utils.data import Dataset, DataLoader

# transformers library
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)

# evaluation and data split
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report
)

  from .autonotebook import tqdm as notebook_tqdm


# Seed


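`set_seed` seeds Python's `random`, NumPy (`np.random.seed`) and PyTorch (`torch.manual_seed` plus the CUDA generators), and switches cuDNN to deterministic mode so that runs are repeatable.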

In [2]:
seed = 10

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy's
            global RNG and PyTorch's CPU and CUDA generators.

    Side effects:
        Switches cuDNN to deterministic mode, disables cuDNN benchmarking and
        writes the seed to the ``PYTHONHASHSEED`` environment variable.
    """
    random.seed(seed)                  # Python built-in RNG
    np.random.seed(seed)               # NumPy global RNG
    torch.manual_seed(seed)            # PyTorch RNG (CPU)
    torch.cuda.manual_seed(seed)       # PyTorch RNG (current GPU)
    torch.cuda.manual_seed_all(seed)   # PyTorch RNG (every GPU)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # this presumably only affects child processes — confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)

# Seed right away so everything below (including the randomly initialised
# classification head created when the model is loaded) is reproducible.
set_seed(seed)

# Load Dataset

In [3]:
# Read the labelled English–German sentence pairs and preview the first five rows.
df = pd.read_csv("datasets/dataset_g.csv")
df.head()

Unnamed: 0,english,german,label
0,The student representatives are responsible.,die Schülervertreter*innen sind verantwortlich.,0
1,"I also call upon the Ministers, Heads of State...",Ebenso appelliere ich an die Minister sowie an...,1
2,The deputies are responsible.,Die Stellvertreterinnen sind verantwortlich.,1
3,"I do not know whether, as one of the speakers ...","Ich weiß nicht, ob ich wie zuvor gesagt wurde,...",0
4,The reader is responsible.,Die Leserin ist verantwortlich.,1


# Load pre-trained model 

In [4]:
# Pick the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

# Report which hardware will be used.
print(f"CUDA Available: {cuda_available}")
print(f"GPU: {torch.cuda.get_device_name(0)}" if cuda_available else "GPU: None (using CPU)")

CUDA Available: True
GPU: NVIDIA GeForce RTX 3070 Laptop GPU


## Load Tokenizer

In [5]:
# Checkpoint identifier shared by the tokenizer and the model.
model_path = "bert-base-multilingual-cased"

# Load the tokenizer that belongs to that checkpoint.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_path)

## Load Model and Send to Device

In [6]:
# Label vocabulary for the two-class head; label2id is the exact inverse of id2label.
id2label = {0: "neutral", 1: "biased"}
label2id = {label: idx for idx, label in id2label.items()}

# Load the checkpoint with a sequence-classification head on top.
model = BertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

# Move the model to the selected device (the returned module is the cell output).
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

# Set trainable parameters


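- Classic "transfer learning" keeps all base-model parameters frozen and trains only the classification head that is added on top.
- That can result in a rigid model.
- Instead, encoder layers 0–7 are frozen and the final four encoder layers (8–11) stay trainable, which keeps the computational cost down while retaining flexibility. Parameters outside `bert.encoder.layer.*` (e.g. the embeddings and the classification head) are not frozen by the code below, so they are trained as well.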

In [7]:
# Freeze encoder layers 0-7. Only parameters whose name starts with the encoder
# layer prefix are considered; everything else keeps its requires_grad setting.
last_frozen_layer = 7
layer_prefix = "bert.encoder.layer."

for param_name, parameter in model.named_parameters():
    if not param_name.startswith(layer_prefix):
        continue
    # The layer index is the name component directly after the prefix.
    layer_index = int(param_name[len(layer_prefix):].split(".", 1)[0])
    if layer_index <= last_frozen_layer:
        parameter.requires_grad = False

# Count and report the parameters that still receive gradients.
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable params: {trainable_params}")

Trainable params: 121152002


# Data pre-processing

- PyTorch models need input data in a specific format
- BiasDataset class turns each row from df into tokenized input tensors for BERT

In [8]:
# custom dataset for bias detection
class BiasDataset(Dataset):
    """Dataset that encodes English–German sentence pairs for the classifier.

    Every row of the dataframe must provide the columns ``english``,
    ``german`` and ``label``. Rows are tokenized lazily, one at a time, when
    they are indexed.
    """

    def __init__(self, dataframe, tokenizer, max_length=256):
        """Store the data source and tokenization settings.

        Args:
            dataframe: Table with ``english``, ``german`` and ``label`` columns.
            tokenizer: Callable accepting ``text``, ``text_pair``, ``padding``,
                ``truncation``, ``max_length`` and ``return_tensors`` and
                returning a mapping of tensors with a leading batch dimension.
            max_length: Length every encoded pair is padded/truncated to.
                Defaults to 256, the value that was previously hard-coded.
        """
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples (rows of the dataframe)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded sample at positional index ``idx``.

        Returns:
            dict with the tokenizer's output tensors (batch dimension removed)
            plus the class label tensor under the key ``labels``.
        """
        # Fetch the row once instead of rebuilding it for every column access.
        row = self.data.iloc[idx]
        english = row["english"]
        german = row["german"]
        label = int(row["label"])

        # tokenize EN-DE sentence pair
        encoded = self.tokenizer(
            text=english,
            text_pair=german,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # return_tensors="pt" yields a leading batch dimension of size 1;
        # squeeze(0) drops it so each item is a single, unbatched sample
        item = {key: val.squeeze(0) for key, val in encoded.items()}
        # add label tensor to the dict under key 'labels'
        item["labels"] = torch.tensor(label)
        return item


- With `return_tensors="pt"` the tokenizer returns tensors with a leading batch dimension of size 1.
- `squeeze(0)` removes that leading dimension so that each item is a single, unbatched sample.

## Train test split

In [9]:
# First cut: 80% train / 20% hold-out, stratified on the label column.
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=seed)

# Second cut: halve the hold-out into validation and test (10% of the full
# data each), again stratified on the label column.
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df["label"], random_state=seed)


## Create Dataset Objects

In [10]:
# Wrap each split in a BiasDataset that shares the same tokenizer.
train_dataset, val_dataset, test_dataset = (
    BiasDataset(split_df, tokenizer) for split_df in (train_df, val_df, test_df)
)

# Evaluation Metrics Calculation for Classification

The `compute_metrics` function calculates evaluation metrics for classification results.

### Inputs
- `eval_pred`: A tuple containing two elements:
  - `logits`: The raw model outputs (before softmax).
  - `labels`: The true class labels.

### Process
1. Get predicted classes by taking the index with the highest logit value.
2. Calculate precision, recall, F1 score, and support for each class separately.
3. Prepare a dictionary of metrics including:
   - Precision, recall, F1, and support for each class.
   - Macro averages (mean) for precision, recall, and F1.
   - Overall accuracy.

The confusion matrix and the detailed classification report are not produced inside `compute_metrics`; they are computed and printed for the validation and test sets in the Evaluate section.

### Output
- Returns a dictionary with all the above metrics, suitable for logging or further analysis.

### Notes
- `average=None` means metrics are computed separately for each class.
- `zero_division=0` sets a metric to 0 (instead of emitting a warning) when its denominator is zero, e.g. when a class has no predicted samples.


In [11]:
def compute_metrics(eval_pred):
    """Compute per-class and macro-averaged classification metrics.

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is indexed as
            (sample, class) — the prediction is the argmax over axis 1 —
            and ``labels`` holds the true class ids.

    Returns:
        dict with ``precision_class_{i}``, ``recall_class_{i}``,
        ``f1_class_{i}`` and ``support_class_{i}`` for every class id ``i``
        the model can output, plus ``precision_macro``, ``recall_macro``,
        ``f1_macro`` and ``accuracy``.
    """
    logits, labels = eval_pred
    predictions = logits.argmax(axis=1)

    # Pin the label set to every class the model can output. Without it,
    # scikit-learn only reports classes that occur in labels/predictions, so
    # position i in the returned arrays would not necessarily be class i.
    class_ids = list(range(logits.shape[1]))

    precision, recall, f1, support = precision_recall_fscore_support(
        labels, predictions, labels=class_ids, average=None, zero_division=0
    )

    metrics = {
        # per-class
        **{f"precision_class_{i}": precision[i] for i in class_ids},
        **{f"recall_class_{i}":    recall[i]    for i in class_ids},
        **{f"f1_class_{i}":        f1[i]        for i in class_ids},
        **{f"support_class_{i}":   support[i]   for i in class_ids},
        # overall
        "precision_macro": np.mean(precision),
        "recall_macro":    np.mean(recall),
        "f1_macro":        np.mean(f1),
        "accuracy":        (predictions == labels).mean(),
    }

    return metrics

# Training

## Training Parameters

In [15]:
# Tunable hyperparameters.
lr = 2e-5
batch_size = 16
num_epochs = 8

# Directory that receives checkpoints and the final model.
output_dir = "./model_output_dataset_g"

training_args = TrainingArguments(
    output_dir=output_dir,
    seed=seed,
    # optimisation schedule
    learning_rate=lr,
    warmup_ratio=0.1,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate, log and checkpoint once per epoch
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # model selection: a higher macro F1 on the evaluation set is better
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_macro",
    greater_is_better=True,
)

## Run trainer

In [16]:
# Early-stopping callback configured with a patience of 2.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Train on the training split, evaluate on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping],
)

In [17]:
# Run fine-tuning, then persist the model and tokenizer to `output_dir`.
print("Starting training...")

# Report the failure in the cell output, then re-raise so the notebook stops
# here instead of saving a model that was not trained.
try:
    train_results = trainer.train()
except Exception as e:
    print("Training failed:", e)
    raise

print("Training complete. Saving model...")

# Save the model and tokenizer side by side so both can later be reloaded from
# the same directory. The tuple returned by save_pretrained is the cell output.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

Starting training...


Epoch,Training Loss,Validation Loss,Precision Class 0,Precision Class 1,Recall Class 0,Recall Class 1,F1 Class 0,F1 Class 1,Support Class 0,Support Class 1,Precision Macro,Recall Macro,F1 Macro,Accuracy
1,0.2965,0.30956,0.903614,0.842767,0.857143,0.893333,0.879765,0.867314,175,150,0.873191,0.875238,0.87354,0.873846
2,0.1938,0.194982,0.963415,0.89441,0.902857,0.96,0.932153,0.926045,175,150,0.928912,0.931429,0.929099,0.929231
3,0.0961,0.138072,0.95,0.972414,0.977143,0.94,0.96338,0.955932,175,150,0.961207,0.958571,0.959656,0.96
4,0.057,0.178723,0.988095,0.942675,0.948571,0.986667,0.96793,0.964169,175,150,0.965385,0.967619,0.96605,0.966154
5,0.0379,0.185302,0.988095,0.942675,0.948571,0.986667,0.96793,0.964169,175,150,0.965385,0.967619,0.96605,0.966154
6,0.0126,0.136498,0.977011,0.966887,0.971429,0.973333,0.974212,0.9701,175,150,0.971949,0.972381,0.972156,0.972308
7,0.0103,0.144654,0.988166,0.948718,0.954286,0.986667,0.97093,0.96732,175,150,0.968442,0.970476,0.969125,0.969231
8,0.0062,0.124712,0.982558,0.960784,0.965714,0.98,0.974063,0.970297,175,150,0.971671,0.972857,0.97218,0.972308


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Training complete. Saving model...


('./model_output_dataset_g\\tokenizer_config.json',
 './model_output_dataset_g\\special_tokens_map.json',
 './model_output_dataset_g\\vocab.txt',
 './model_output_dataset_g\\added_tokens.json')

# Evaluate

In [20]:
def evaluate_split(split_name, dataset):
    """Evaluate the trainer's model on one split and print its report.

    Prints the macro F1, the confusion matrix and the classification report,
    each prefixed with ``split_name``.

    Args:
        split_name: Label used in the printed headings (e.g. "Validation").
        dataset: Dataset handed to ``trainer.evaluate`` and ``trainer.predict``.

    Returns:
        Tuple ``(results, preds_output, preds)``: the metrics dict returned by
        ``trainer.evaluate``, the raw return value of ``trainer.predict`` and
        the argmax class predictions.
    """
    # metrics only
    results = trainer.evaluate(eval_dataset=dataset)
    print(f"{split_name} F1:", round(results["eval_f1_macro"], 3))

    # raw logits and labels, needed for the confusion matrix and report
    preds_output = trainer.predict(dataset)
    preds = np.argmax(preds_output.predictions, axis=1)
    labels = preds_output.label_ids

    print(f"{split_name} Confusion Matrix:\n", confusion_matrix(labels, preds))
    print(
        f"\n{split_name} Classification Report:\n",
        classification_report(labels, preds, zero_division=0, digits=4),
    )
    return results, preds_output, preds


print("Evaluating model...")

# validation split
val_results, val_preds_output, val_preds = evaluate_split("Validation", val_dataset)
val_logits = val_preds_output.predictions
val_labels = val_preds_output.label_ids

# held-out test split
test_results, test_preds_output, test_preds = evaluate_split("Test", test_dataset)
test_logits = test_preds_output.predictions
test_labels = test_preds_output.label_ids


Evaluating model...
Validation F1: 0.972
Validation Confusion Matrix:
 [[169   6]
 [  3 147]]

Validation Classification Report:
               precision    recall  f1-score   support

           0     0.9826    0.9657    0.9741       175
           1     0.9608    0.9800    0.9703       150

    accuracy                         0.9723       325
   macro avg     0.9717    0.9729    0.9722       325
weighted avg     0.9725    0.9723    0.9723       325



Test F1: 0.972
Test Confusion Matrix:
 [[167   8]
 [  1 149]]

Test Classification Report:
               precision    recall  f1-score   support

           0     0.9940    0.9543    0.9738       175
           1     0.9490    0.9933    0.9707       150

    accuracy                         0.9723       325
   macro avg     0.9715    0.9738    0.9722       325
weighted avg     0.9733    0.9723    0.9723       325



# Test cases

## Load trained model

In [21]:
# Reload the tokenizer and fine-tuned weights from the directory written after training.
model_path = output_dir
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path).to(device)

# Switch to inference mode (the returned module is the cell output).
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [22]:
# shorten text helper function
def shorten(text, max_len=40):
    """Truncate ``text`` so the result is at most ``max_len`` characters long.

    Args:
        text: String to shorten.
        max_len: Maximum length of the returned string (default 40).

    Returns:
        ``text`` unchanged when it already fits. Otherwise the leading
        characters followed by ``"..."`` (total length ``max_len``), or a bare
        prefix of ``text`` when ``max_len`` is too small to hold the ellipsis.
    """
    if len(text) <= max_len:
        return text
    if max_len < 3:
        # No room for "...": a negative slice bound here would keep almost the
        # whole string and exceed max_len, so return a plain prefix instead.
        return text[:max(max_len, 0)]
    return text[:max_len - 3] + "..."

## My Test Cases

In [None]:
# Hand-written probe cases. Each entry is (english, german, label) where the
# label follows the model's mapping: 0 = neutral, 1 = biased.
test_cases = [
    # neutral
    ("Today the weather is beautiful", "Heute ist das Wetter schön", 0), 
    ("I like this guy.", "Ich mag diesen Kerl.", 0),
    ("How are you?", "Wie geht es dir?", 0),
    ("BERT is a transformers model pretrained on a large corpus of multilingual data in a self-supervised fashion.","BERT ist ein Transformer-Modell, das auf selbstüberwachte Weise auf einem großen Korpus mehrsprachiger Daten vorab trainiert wird.",0),
    ("Knowledge is love and light and vision.","Wissen ist Liebe und Licht und Vision.",0),

    # neutral with gendered subjects
    ("My mother is an engineer.", "Meine Mutter ist Ingenieurin.", 0),
    ("Is this boy a good artist?", "Ist dieser Junge ein guter Künstler?", 0),
    ("I am living with my sister, who is also my best friend", "Ich lebe mit meiner Schwester, die auch meine beste Freundin ist", 0),
    ("My father was an excellent cook.", "Mein Vater war ein ausgezeichneter Koch.", 0),
    ("The girls went hiking.", "Die Mädchen gingen wandern.", 0),


    # biased
    ("Do you like our maths teacher?", "Mögen Sie unsere Mathelehrerin?", 1),
    ("The doctor was late to work today.", "Der Arzt kam heute zu spät zur Arbeit.", 1),
    ("Tomorrow the students are leaving for a class trip.", "Morgen gehen die Studenten zu einer Klassenfahrt.", 1),
    ("This nurse does not work hard.", "Diese Krankenschwester arbeitet nicht hart.", 1),
    ("Athletes earn a lot of money.", "Sportler verdienen viel Geld.", 1),

    # gender-fair language (GFL) variants of the same sentence; only the last
    # two (a single gendered plural form) are labelled biased
    ("Engineers are kind and hard-working", "Ingenieur*innen sind lieb und fleißig", 0),
    ("Engineers are kind and hard-working", "Ingenieure und Ingenieurinnen sind lieb und fleißig", 0),
    ("Teachers are kind and hard-working", "Lehrende sind lieb und fleißig", 0),
    ("Teachers are kind and hard-working", "Lehrer:innen sind lieb und fleißig", 0),
    ("Teachers are kind and hard-working", "Lehrerinnen und Lehrer sind lieb und fleißig", 0),
    ("Teachers are kind and hard-working", "Lehrer sind lieb und fleißig", 1),
    ("Teachers are kind and hard-working", "Lehrerinnen sind lieb und fleißig", 1),

    # sentences from a Morgan Stanley job posting
    ("We’re seeking someone to join our team Office 365 squads to lead the design, development, and integration of Gen AI apps and integration using Microsoft Copilot Studio.","Wir suchen jemanden für unser Office 365-Team, der die Konzeption, Entwicklung und Integration von Gen AI-Apps und die Integration mithilfe von Microsoft Copilot Studio leitet.",0),
    ("The ideal candidate should have a solid technical foundation with a focus on Custom agent development and Copilot integrations, strategic thinking, excellent communication skills, and the ability to collaborate within a global team.", "Der ideale Kandidat sollte über solide technische Grundlagen mit Schwerpunkt auf der Entwicklung kundenspezifischer Agenten und Copilot-Integrationen, strategisches Denken, ausgezeichnete Kommunikationsfähigkeiten und die Fähigkeit zur Zusammenarbeit in einem globalen Team verfügen.", 1),
    ("In the Technology division, we leverage innovation to build the connections and capabilities that power our Firm, enabling our clients and colleagues to redefine markets and shape the future of our communities.", "Im Bereich Technologie nutzen wir Innovationen, um die Verbindungen und Fähigkeiten aufzubauen, die unser Unternehmen voranbringen, und unseren Kunden und Kollegen zu ermöglichen, Märkte neu zu definieren und die Zukunft unserer Gemeinschaften zu gestalten.",1),
    ("This is a Lead Workplace Engineering position at VP level, which is part of the job family responsible for managing and optimizing the technical environment and end-user experience across various workplace technologies, ensuring seamless operations and user satisfaction across the organization.","Dies ist eine Position als Lead Workplace Engineering auf VP-Ebene, die Teil der Jobfamilie ist, die für die Verwaltung und Optimierung der technischen Umgebung und der Endbenutzererfahrung für verschiedene Arbeitsplatztechnologien verantwortlich ist und einen reibungslosen Betrieb sowie die Zufriedenheit der Benutzer im gesamten Unternehmen sicherstellt.",1),
]


In [28]:
# Turn the (english, german, label) tuples into a table with named columns.
# NOTE(review): this rebinds `test_df` / `test_dataset`, which earlier held the
# held-out test split — re-running the evaluation cell afterwards would score
# these hand-written cases instead.
case_columns = ["english", "german", "label"]
test_df = pd.DataFrame(data=test_cases, columns=case_columns)

# Wrap the table so the cases are tokenized exactly like the training data.
test_dataset = BiasDataset(dataframe=test_df, tokenizer=tokenizer)

## Run Inference on each Test Case

In [29]:
results = []

# Score the cases one at a time; gradients are not needed for inference.
with torch.no_grad():
    for case_idx in range(len(test_dataset)):
        sample = test_dataset[case_idx]

        # Build a batch of one on the model's device, leaving the label out.
        batch = {}
        for field, tensor in sample.items():
            if field == "labels":
                continue
            batch[field] = tensor.unsqueeze(0).to(device)

        case_logits = model(**batch).logits
        predicted = torch.argmax(case_logits, dim=1).item()
        class_probs = torch.softmax(case_logits, dim=1)[0].cpu().numpy()

        # Pair the prediction with the original text and expected label.
        source_row = test_df.iloc[case_idx]
        expected = source_row["label"]
        results.append({
            "english": source_row["english"],
            "german": source_row["german"],
            "true_label": expected,
            "predicted_label": predicted,
            "neutral_prob": class_probs[0],
            "biased_prob": class_probs[1],
            "correct": expected == predicted
        })


## Display Results as Table

In [30]:
# One row per test case, columns taken from the keys of each result dict.
results_df = pd.DataFrame(data=results)

# Full plain-text dump of the table, without the index column.
print("\nBias detection test results:")
table_text = results_df.to_string(index=False)
print(table_text)

# Share of cases where the prediction matched the expected label.
accuracy = results_df["correct"].mean()
print(f"\nModel accuracy on test cases: {accuracy:.1%}")



Bias detection test results:
                                                                                                                                                                                                                                 english                                                                                                                                                                                                                                                                                      german  true_label  predicted_label  neutral_prob  biased_prob  correct
                                                                                                                                                                                                          Today the weather is beautiful                                                                                                                                                            