In [1]:
import pandas as pd
import torch
import re

from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    get_scheduler,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the prepared sentence-pair data. Frames split from this one are read
# by BiasDataset through the columns "english", "german" and "label".
df = pd.read_csv("datasets/mgente_transformed.csv")

In [None]:
class BiasDataset(Dataset):
    """Torch dataset that serves tokenized English/German sentence pairs.

    Each row of ``dataframe`` must provide the columns ``"english"``,
    ``"german"`` and ``"label"``. The two sentences are handed to the
    tokenizer as a pair (first and second positional argument).

    Args:
        dataframe: Table holding the sentence pairs and their labels.
        tokenizer: Callable tokenizer invoked as ``tokenizer(english, german, ...)``.
        max_length: Length every encoded pair is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (sentence pairs) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded pair at positional index ``idx`` plus its label.

        The tokenizer returns tensors with a leading batch dimension of 1
        (``return_tensors="pt"``); that dimension is squeezed away so the
        DataLoader can stack the items itself.
        """
        # Positional lookup, so the frame's index labels do not matter.
        row = self.data.iloc[idx]

        encoded = self.tokenizer(
            row["english"],
            row["german"],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
            return_overflowing_tokens=False
        )

        item = {}
        for key, val in encoded.items():
            item[key] = val.squeeze(0)
        item["labels"] = torch.tensor(int(row["label"]))
        return item


SyntaxError: invalid syntax. Perhaps you forgot a comma? (2678737701.py, line 21)

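- encodes the English and German sentences together as one BERT sentence pair (`tokenizer(english, german)`), not as a manually concatenated string
- tokenizes with padding to `max_length` and truncation
- returns a dict of tensors: the tokenizer outputs plus `labels`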

In [None]:
# Single source of truth for the checkpoint, so the tokenizer and the model
# can never be loaded from two different names by accident.
MODEL_NAME = "bert-base-multilingual-cased"

# Prefer the GPU when one is visible to torch, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# num_labels=2 attaches a two-class classification head.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=2
)
model.to(device)

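- uses the GPU if available, else falls back to the CPU
- loads mBERT (`bert-base-multilingual-cased`) and its tokenizer
- `num_labels=2` specifies binary classification
- `model.to(device)` moves the model to the selected device so training runs there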

In [None]:
# Name fragments of the parameters that remain trainable; every parameter
# whose name contains none of them is frozen.
trainable = ["encoder.layer.10", "encoder.layer.11", "pooler", "classifier"]
for name, param in model.named_parameters():
    param.requires_grad_(any(layer in name for layer in trainable))

BERT-base (and therefore mBERT) has 12 transformer layers, indexed 0–11. If you unfreeze all of them, every weight is updated during training. This:
- Takes longer
- Requires more GPU memory
- Can overwrite useful knowledge from pretraining

By freezing most layers and training only the top ones (here `encoder.layer.10`, `encoder.layer.11`, the pooler and the classifier), you:
- Keep the general language knowledge learned during pretraining
- Only train the parts that matter most for gender bias detection

What stays trainable:
- the last two transformer layers (10 and 11)
- the pooler, which produces the [CLS] sentence embedding
- the classification head



In [None]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=10)
train_dataset = BiasDataset(train_df, tokenizer)
val_dataset = BiasDataset(val_df, tokenizer)
# Training batches are shuffled, and an incomplete final batch is dropped so
# every training batch has exactly batch_size examples.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, drop_last=True)
# Validation requests no shuffling and keeps every example.
val_loader = DataLoader(val_dataset, batch_size=4)

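- `drop_last=True` discards the final, incomplete training batch when the dataset size is not divisible by `batch_size`
- this keeps every training batch the same size, so an undersized last batch cannot cause issues in the layers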

In [None]:
# Schedule length: one optimizer step per training batch, for every epoch.
num_epochs = 4
num_training_steps = len(train_loader) * num_epochs
# The first 10% of all steps are used to warm the learning rate up.
num_warmup_steps = int(0.1 * num_training_steps)

optimizer = AdamW(params=model.parameters(), lr=2e-5)

# Linear schedule: warm-up followed by a linear decay over the remaining steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# Training Loop

In [None]:
# Train for num_epochs epochs. After EVERY epoch the model is validated and
# the checkpoint is saved whenever the validation loss improves.
#
# The validation/saving code used to live in a separate cell outside the
# epoch loop, so it ran only once after training had finished and the
# "best" checkpoint was always just the final model. It is part of the loop
# body now, which is what `epoch`, `best_val_loss` and the per-epoch print
# were written for.
best_val_loss = float("inf")

for epoch in range(num_epochs):
    # ---------------- Training ----------------
    model.train()
    total_loss = 0
    num_batches = 0
    all_preds = []
    all_labels = []

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch contains "labels", so the model returns the loss as well.
        outputs = model(**batch)
        loss = outputs.loss

        total_loss += loss.item()
        num_batches += 1

        # Track running training accuracy from the same forward pass.
        preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(batch["labels"].cpu().tolist())

        loss.backward()
        # Clip the global gradient norm before the update to limit step size.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    avg_train_loss = total_loss / num_batches
    train_acc = accuracy_score(all_labels, all_preds)
    print(f"Epoch {epoch+1}: Avg Train Loss = {avg_train_loss:.4f} | Train Acc = {train_acc:.4f}")

    # ---------------- Validation + Saving ----------------
    model.eval()
    val_loss = 0
    val_batches = 0
    val_preds = []
    val_labels = []

    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            val_batches += 1

            preds = torch.argmax(outputs.logits, dim=1)
            val_preds.extend(preds.cpu().tolist())
            val_labels.extend(batch["labels"].cpu().tolist())

    avg_val_loss = val_loss / val_batches
    val_acc = accuracy_score(val_labels, val_preds)
    print(f"Epoch {epoch+1}: Avg Val Loss = {avg_val_loss:.4f} | Val Acc = {val_acc:.4f}")

    # Save best model: keep only the checkpoint with the lowest validation loss.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        model.save_pretrained("./model_output")
        tokenizer.save_pretrained("./model_output")
        print("✅ Saved best model.")

    # No explicit switch back to train mode is needed here: the next epoch
    # starts with model.train(), and after the last epoch the model is
    # deliberately left in eval mode.


In [None]:
# Reload the checkpoint written during training, together with its tokenizer.
model_path = "./model_output"
tokenizer = BertTokenizer.from_pretrained(model_path)
# Module.to() returns the module itself, so loading and moving can be chained.
model = BertForSequenceClassification.from_pretrained(model_path).to(device)
# Inference only from here on: switch to evaluation mode.
model.eval()

- loads the saved model weights
- loads the saved tokenizer
- moves the model to the CPU or GPU and switches it to evaluation mode

In [None]:
# Rebuild the validation dataset/loader around the tokenizer that was just
# reloaded from ./model_output, so evaluation uses the saved tokenizer.
val_dataset = BiasDataset(val_df, tokenizer)
val_loader = DataLoader(val_dataset, batch_size=4)

In [None]:
def evaluate_model(model, data_loader, device=None):
    """Compute accuracy, precision, recall and F1 of ``model`` on ``data_loader``.

    Args:
        model: Classifier whose output exposes ``.logits``.
        data_loader: Iterable of batches; each batch is a dict of tensors that
            contains a ``"labels"`` entry next to the model inputs.
        device: Device the batches are moved to. Defaults to the device the
            model's parameters live on, so the function does not depend on a
            notebook-level ``device`` variable.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``. Precision, recall and F1
        are computed with ``average="binary"`` (positive class = label 1).
    """
    if device is None:
        device = next(model.parameters()).device

    all_preds = []
    all_labels = []

    # Evaluate with dropout disabled no matter what mode the caller left the
    # model in, and restore that mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in data_loader:
                labels = batch["labels"].to(device)
                # Labels are withheld from the forward pass: only logits are needed.
                inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
                logits = model(**inputs).logits
                preds = torch.argmax(logits, dim=1)

                all_preds.extend(preds.cpu().tolist())
                all_labels.extend(labels.cpu().tolist())
    finally:
        model.train(was_training)

    accuracy = accuracy_score(all_labels, all_preds)
    # zero_division=0 yields the same 0.0 value as the default when a metric is
    # undefined (e.g. no positive predictions), without the warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="binary", zero_division=0
    )
    return accuracy, precision, recall, f1


In [None]:
# Score the reloaded checkpoint on the validation split and report each
# metric with four decimal places.
accuracy, precision, recall, f1 = evaluate_model(model, val_loader)
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


In [None]:
def predict_bias(english, german, threshold=0.7):
    """
    Predict if the English-German pair is gender biased.

    The two sentences are passed to the tokenizer as a sentence pair, exactly
    as BiasDataset does during training. Joining them into one string with a
    literal " [SEP] " would give the model different segment ids than it was
    trained on.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        english: English source sentence.
        german: German translation.
        threshold: Minimum probability of class 1 required to report "Biased".

    Returns:
        ``(label, confidence)``: label is 'Biased' or 'Neutral'; confidence is
        the model's probability for the class named by the returned label.
        "Biased" is reported only when class 1 is the most likely class AND
        its probability is at least ``threshold``.
    """
    inputs = tokenizer(
        english,
        german,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        # Single example in the batch: take its row of class probabilities.
        probs = torch.softmax(outputs.logits, dim=1)[0]

    pred = int(torch.argmax(probs).item())
    biased_prob = probs[1].item()

    if pred == 1 and biased_prob >= threshold:
        return "Biased", biased_prob
    # Report the Neutral-class probability here. Previously a below-threshold
    # class-1 prediction returned "Neutral" with the Biased-class probability.
    return "Neutral", probs[0].item()


# Example with multiple sentences per input text: two parallel texts (English
# and German), each written as two sentences joined by a single space.
long_english_text = (
    "The nurse is kind and helpful. "
    "The doctor is very experienced."
)
long_german_text = (
    "Die Krankenschwester ist freundlich und hilfsbereit. "
    "Der Arzt ist sehr erfahren."
)

def simple_sent_tokenize(text):
    """Split ``text`` into sentences with a simple punctuation rule.

    A split happens at any run of whitespace that directly follows ".", "!"
    or "?"; the punctuation stays attached to its sentence. Leading and
    trailing whitespace is stripped first and empty pieces are discarded.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
    return list(filter(None, pieces))

english_sentences = simple_sent_tokenize(long_english_text)
german_sentences = simple_sent_tokenize(long_german_text)

# Sentences are paired by position. zip() stops at the shorter list, so a
# different sentence count would silently drop trailing sentences (and may
# mean the pairs are misaligned). Make that visible instead of hiding it.
if len(english_sentences) != len(german_sentences):
    print(
        f"Warning: sentence counts differ (EN={len(english_sentences)}, "
        f"DE={len(german_sentences)}); only the first "
        f"{min(len(english_sentences), len(german_sentences))} pairs are checked "
        "and the pairing may be misaligned.\n"
    )

print("Checking long text sentence by sentence:\n")
for en_sent, de_sent in zip(english_sentences, german_sentences):
    label, conf = predict_bias(en_sent, de_sent)
    print(f"EN: {en_sent}")
    print(f"DE: {de_sent}")
    print(f"Prediction: {label} (Confidence: {conf:.2f})")
    print("-" * 50)


# Original simple test examples: (English, German) pairs. The first two share
# the same English sentence and differ only in the German wording.
examples = [
    ("The nurse is kind", "Die Krankenschwester ist freundlich"),
    ("The nurse is kind", "Die Pflegekraft ist freundlich"),
    ("The doctor is strong", "Der Arzt ist stark"),
]

print("\nSimple example tests:\n")
# Classify each pair and print the label with its confidence (two decimals).
for en, de in examples:
    label, conf = predict_bias(en, de)
    print(f"EN: {en}")
    print(f"DE: {de}")
    print(f"Prediction: {label} (Confidence: {conf:.2f})")
    print("-" * 50)