# 02 — Monolingual Fine-Tuning (Baselines)

Fine-tune a language-specific BERT-style model on each language's own sentiment dataset.

| Language | Model | Dataset |
|----------|-------|--------|
| English | `bert-base-uncased` | IMDB |
| Dutch | `GroNLP/bert-base-dutch-cased` (BERTje) | DBRD |
| French | `almanach/camembert-base` | Allocine |

**Prerequisite:** Run `01_data_exploration.ipynb` first to create preprocessed data in `data/`.

In [1]:
import os
import json
import numpy as np
import torch
from datasets import load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
import evaluate

# --- Run configuration --------------------------------------------------------
SEED, MAX_LENGTH = 42, 256                   # training seed; tokenizer max_length
DATA_DIR, MODEL_DIR = "./data", "./models"   # preprocessed-data root; model output root
os.makedirs(MODEL_DIR, exist_ok=True)

# --- Environment report -------------------------------------------------------
print(
    f"PyTorch: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

PyTorch: 2.10.0
CUDA available: False


## Shared Utilities

In [2]:
import os
import shutil
# Set TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): presumably this silences the Hugging Face tokenizers
# parallelism/fork warning — confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from safetensors import safe_open
from safetensors.torch import save_file as safetensors_save_file

# Load the four `evaluate` metrics once; compute_metrics reads them as globals.
accuracy_metric, f1_metric, precision_metric, recall_metric = (
    evaluate.load(metric_id)
    for metric_id in ("accuracy", "f1", "precision", "recall")
)


def compute_metrics(eval_pred):
    """Score predictions with accuracy, F1, precision and recall.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict: The outputs of the four module-level ``evaluate`` metrics merged
        into one mapping, applied in the order accuracy, f1, precision, recall.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    scores = {}
    for metric in (accuracy_metric, f1_metric, precision_metric, recall_metric):
        scores.update(metric.compute(predictions=predicted, references=labels))
    return scores


def tokenize_dataset(dataset, tokenizer, max_length=MAX_LENGTH):
    """Tokenize the ``text`` column of a dataset and return it in torch format.

    Args:
        dataset: Dataset exposing ``map`` and ``set_format`` and containing
            ``text`` and ``language`` columns (both are dropped from the result).
        tokenizer: Callable applied to each batch of texts.
        max_length: Length every example is truncated and padded to.

    Returns:
        The mapped dataset with its format set to ``"torch"``.
    """
    encode_kwargs = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }

    def _encode_batch(batch):
        return tokenizer(batch["text"], **encode_kwargs)

    encoded = dataset.map(
        _encode_batch,
        batched=True,
        remove_columns=["text", "language"],
    )
    encoded.set_format("torch")
    return encoded


def get_training_args(output_dir, num_train_samples, num_epochs=3):
    """Build the ``TrainingArguments`` shared by every fine-tuning run.

    Args:
        output_dir: Directory passed through as ``output_dir``.
        num_train_samples: Number of training examples; used only to size the
            warmup.
        num_epochs: Number of training epochs (default 3).

    Returns:
        TrainingArguments: Per-device train batch size 8 with 4 gradient
        accumulation steps, learning rate 2e-5, evaluation and saving once per
        epoch, and best-model selection on ``f1``.
    """
    per_device_batch = 8
    accumulation_steps = 4

    # Warmup = 10% of (samples / effective batch * epochs), truncated to int.
    samples_per_update = per_device_batch * accumulation_steps
    warmup = int(num_train_samples / samples_per_update * num_epochs * 0.1)

    batching = {
        "num_train_epochs": num_epochs,
        "per_device_train_batch_size": per_device_batch,
        "per_device_eval_batch_size": 16,
        "gradient_accumulation_steps": accumulation_steps,
        "fp16": torch.cuda.is_available(),  # mixed precision only when CUDA is present
        "gradient_checkpointing": True,
    }
    optimisation = {
        "learning_rate": 2e-5,
        "weight_decay": 0.01,
        "warmup_steps": warmup,
        "lr_scheduler_type": "linear",
        "max_grad_norm": 1.0,
    }
    checkpointing = {
        "eval_strategy": "epoch",
        "save_strategy": "epoch",
        "load_best_model_at_end": True,
        "metric_for_best_model": "f1",
        "greater_is_better": True,
    }
    runtime = {
        "logging_steps": 100,
        "report_to": "none",
        "seed": SEED,
        "dataloader_num_workers": 0,
        "dataloader_pin_memory": False,
    }
    return TrainingArguments(
        output_dir=output_dir,
        **batching,
        **optimisation,
        **checkpointing,
        **runtime,
    )


def _patch_safetensors_inplace(path):
    """Rename gamma→weight and beta→bias keys in a safetensors file, in place.

    Only keys ending in ``.gamma`` or ``.beta`` are renamed (to ``.weight`` and
    ``.bias`` respectively); every other key and the file metadata are kept.
    The key list is inspected first, so a file that needs no renaming is
    neither loaded tensor-by-tensor nor rewritten.

    Args:
        path: Path to a ``.safetensors`` file. If no file exists at ``path``
            the call is a no-op, so callers may invoke it unconditionally.

    Returns:
        None.
    """
    if not os.path.isfile(path):
        return  # nothing saved at this location — nothing to patch

    suffix_map = ((".gamma", ".weight"), (".beta", ".bias"))

    def _renamed(key):
        for legacy, modern in suffix_map:
            if key.endswith(legacy):
                return key[: -len(legacy)] + modern
        return key

    with safe_open(path, framework="pt", device="cpu") as f:
        keys = list(f.keys())
        if all(_renamed(key) == key for key in keys):
            return  # already clean; no tensors were fetched
        metadata = f.metadata() or {}
        new_tensors = {_renamed(key): f.get_tensor(key) for key in keys}

    # Write to a sibling temp file and swap it in, so `path` is only replaced
    # once the new file has been written completely.
    tmp = path + ".tmp"
    try:
        safetensors_save_file(new_tensors, tmp, metadata=metadata)
        os.replace(tmp, path)
    finally:
        # After a successful replace the temp file is gone; this only removes
        # a leftover from a failed save.
        if os.path.exists(tmp):
            os.remove(tmp)


class CleanCheckpointTrainer(Trainer):
    """Trainer subclass that patches gamma/beta→weight/bias in saved
    checkpoints immediately after each save, so load_best_model_at_end never
    encounters the legacy key names."""

    def _save_checkpoint(self, model, trial, *args, **kwargs):
        """Save a checkpoint through ``Trainer``, then patch the saved weights.

        Any extra positional or keyword arguments are forwarded unchanged, so
        the override does not break when the installed transformers release
        passes more than ``(model, trial)`` to this hook.
        NOTE(review): some 4.x releases appear to pass ``metrics=...`` — confirm
        against the versions this notebook must support.
        """
        super()._save_checkpoint(model, trial, *args, **kwargs)
        # Walk everything under the directory reported by _get_output_dir and
        # patch each model.safetensors found there.
        # NOTE(review): this looks like the run's output directory (parent of
        # all checkpoint folders) rather than only the newest checkpoint —
        # confirm; older checkpoints are revisited on every save.
        search_root = self._get_output_dir(trial=trial)
        for root, _dirs, files in os.walk(search_root):
            if "model.safetensors" in files:
                _patch_safetensors_inplace(os.path.join(root, "model.safetensors"))


def train_and_evaluate(model_name, dataset, output_dir):
    """Fine-tune a model and evaluate on test set. Returns metrics dict.

    Args:
        model_name: Model identifier passed to ``from_pretrained`` for both the
            tokenizer and the 2-label sequence-classification model.
        dataset: Mapping with ``"train"``, ``"validation"`` and ``"test"``
            splits; each is tokenized with ``tokenize_dataset``.
        output_dir: Directory used for training checkpoints and for the final
            saved model and tokenizer.

    Returns:
        dict: Output of ``trainer.evaluate`` on the test split, with keys
        prefixed by ``test_``.
    """
    # Function-scope import keeps this utilities cell self-contained.
    from transformers import set_seed

    print(f"\n{'='*60}")
    print(f"Training: {model_name}")
    print(f"Output: {output_dir}")
    print(f"{'='*60}")

    # Seed BEFORE building the model: the classification head is newly
    # initialised by from_pretrained, and SEED otherwise only reaches the
    # TrainingArguments created afterwards, leaving that initialisation
    # different on every run.
    set_seed(SEED)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=2
    )

    tok_train = tokenize_dataset(dataset["train"], tokenizer)
    tok_val = tokenize_dataset(dataset["validation"], tokenizer)
    tok_test = tokenize_dataset(dataset["test"], tokenizer)

    trainer = CleanCheckpointTrainer(
        model=model,
        args=get_training_args(output_dir, num_train_samples=len(dataset["train"])),
        train_dataset=tok_train,
        eval_dataset=tok_val,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Evaluate on test set
    test_results = trainer.evaluate(tok_test, metric_key_prefix="test")
    print(f"\nTest results for {model_name}:")
    for k, v in test_results.items():
        if k.startswith("test_"):
            print(f"  {k}: {v:.4f}")

    # Save best model and tokenizer (also patch in case of gamma/beta).
    # The patch is skipped when no single-file model.safetensors was written,
    # so a different save layout cannot fail the run after training finished.
    trainer.save_model(output_dir)
    weights_path = os.path.join(output_dir, "model.safetensors")
    if os.path.isfile(weights_path):
        _patch_safetensors_inplace(weights_path)
    tokenizer.save_pretrained(output_dir)

    return test_results

## 1. Load Preprocessed Data

In [3]:
# Load the preprocessed splits from DATA_DIR/<language code> (order: en, fr, nl).
en_data, fr_data, nl_data = (
    load_from_disk(os.path.join(DATA_DIR, lang_code))
    for lang_code in ("en", "fr", "nl")
)

print(
    f"English (IMDB): {en_data}",
    f"French (Allocine): {fr_data}",
    f"Dutch (DBRD): {nl_data}",
    sep="\n",
)

English (IMDB): DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 22500
    })
    validation: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 25000
    })
})
French (Allocine): DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 25000
    })
    validation: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 20000
    })
})
Dutch (DBRD): DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 18028
    })
    validation: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'language'],
        num_row

## 2. Fine-Tune English BERT on IMDB

In [4]:
# Fine-tune bert-base-uncased on the English splits. The model and tokenizer
# are saved under MODEL_DIR/bert-en; en_results holds the returned test metrics.
en_results = train_and_evaluate(
    model_name="bert-base-uncased",
    dataset=en_data,
    output_dir=os.path.join(MODEL_DIR, "bert-en"),
)


Training: bert-base-uncased
Output: ./models/bert-en




Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.898226,0.237704,0.9092,0.906004,0.939056,0.8752
2,0.639724,0.215556,0.9196,0.92008,0.914625,0.9256
3,0.377703,0.278952,0.9156,0.916103,0.910672,0.9216


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


Test results for bert-base-uncased:
  test_loss: 0.2183
  test_accuracy: 0.9201
  test_f1: 0.9208
  test_precision: 0.9129
  test_recall: 0.9288
  test_runtime: 541.0830
  test_samples_per_second: 46.2040
  test_steps_per_second: 2.8890


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

## 3. Fine-Tune BERTje on DBRD (Dutch)

In [5]:
# Fine-tune BERTje on the Dutch splits. The model and tokenizer are saved
# under MODEL_DIR/bertje-nl; nl_results holds the returned test metrics.
nl_results = train_and_evaluate(
    model_name="GroNLP/bert-base-dutch-cased",
    dataset=nl_data,
    output_dir=os.path.join(MODEL_DIR, "bertje-nl"),
)


Training: GroNLP/bert-base-dutch-cased
Output: ./models/bertje-nl


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: GroNLP/bert-base-dutch-cased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
bert.embeddings.position_ids               | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
bert.pooler.dense.weight                   | MISSING    | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 
bert.pooler.dense.bias                     | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the ch

Map:   0%|          | 0/2224 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.265838,0.287828,0.8685,0.880291,0.807853,0.967
2,0.870295,0.263755,0.8885,0.888108,0.891239,0.885
3,0.424629,0.323283,0.8885,0.888221,0.890452,0.886


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


Test results for GroNLP/bert-base-dutch-cased:
  test_loss: 0.3510
  test_accuracy: 0.8790
  test_f1: 0.8790
  test_precision: 0.8794
  test_recall: 0.8786
  test_runtime: 48.2686
  test_samples_per_second: 46.0760
  test_steps_per_second: 2.8800


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

## 4. Fine-Tune CamemBERT on Allocine (French)

In [7]:
# Fine-tune CamemBERT on the French splits. The model and tokenizer are saved
# under MODEL_DIR/camembert-fr; fr_results holds the returned test metrics.
fr_results = train_and_evaluate(
    model_name="almanach/camembert-base",
    dataset=fr_data,
    output_dir=os.path.join(MODEL_DIR, "camembert-fr"),
)


Training: almanach/camembert-base
Output: ./models/camembert-fr


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mCamembertForSequenceClassification LOAD REPORT[0m from: almanach/camembert-base
Key                         | Status     | 
----------------------------+------------+-
lm_head.layer_norm.weight   | UNEXPECTED | 
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
classifier.dense.bias       | MISSING    | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.619556,0.16675,0.94705,0.948081,0.912084,0.987036
2,0.451051,0.118489,0.96385,0.963301,0.958001,0.968661
3,0.252441,0.142429,0.9625,0.962209,0.95005,0.974684


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]


Test results for almanach/camembert-base:
  test_loss: 0.1151
  test_accuracy: 0.9639
  test_f1: 0.9625
  test_precision: 0.9594
  test_recall: 0.9656
  test_runtime: 434.0888
  test_samples_per_second: 46.0740
  test_steps_per_second: 2.8800


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

## 5. Monolingual Results Summary

In [8]:
import pandas as pd

# Destination of the summary table; defined once so the save call and the
# confirmation message cannot drift apart.
RESULTS_CSV = "./results/monolingual_results.csv"

# One row per baseline, built from the test metrics returned by
# train_and_evaluate for each language.
results_summary = pd.DataFrame([
    {
        "Model": model_label,
        "Language": language,
        "Dataset": dataset_name,
        "Accuracy": metrics["test_accuracy"],
        "F1": metrics["test_f1"],
        "Precision": metrics["test_precision"],
        "Recall": metrics["test_recall"],
    }
    for model_label, language, dataset_name, metrics in (
        ("bert-base-uncased", "English", "IMDB", en_results),
        ("BERTje", "Dutch", "DBRD", nl_results),
        ("CamemBERT", "French", "Allocine", fr_results),
    )
])

print("Monolingual Baselines (In-Language Performance)")
print("=" * 60)
print(results_summary.to_string(index=False, float_format="{:.4f}".format))

# Save results. to_csv does not create missing parent directories, so make
# sure the target folder exists; otherwise the save fails only after all
# three training runs have completed.
os.makedirs(os.path.dirname(RESULTS_CSV), exist_ok=True)
results_summary.to_csv(RESULTS_CSV, index=False)
print(f"\nResults saved to {RESULTS_CSV}")

Monolingual Baselines (In-Language Performance)
            Model Language  Dataset  Accuracy     F1  Precision  Recall
bert-base-uncased  English     IMDB    0.9201 0.9208     0.9129  0.9288
           BERTje    Dutch     DBRD    0.8790 0.8790     0.8794  0.8786
        CamemBERT   French Allocine    0.9639 0.9625     0.9594  0.9656

Results saved to ./results/monolingual_results.csv
