# 02 — Monolingual Fine-Tuning (Baselines)

Fine-tune each language-specific BERT-style model on sentiment data in its own language.

| Language | Model | Dataset |
|----------|-------|--------|
| English | `bert-base-uncased` | IMDB |
| Dutch | `GroNLP/bert-base-dutch-cased` (BERTje) | DBRD |
| French | `almanach/camembert-base` | Allocine |

**Prerequisite:** Run `01_data_exploration.ipynb` first to create preprocessed data in `data/`.

In [1]:
import json
import os

import numpy as np
import torch
import evaluate
from datasets import load_from_disk
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)

# Notebook-wide configuration.
SEED = 42               # forwarded to TrainingArguments(seed=...)
MAX_LENGTH = 256        # default token length used by tokenize_dataset
DATA_DIR = "./data"     # preprocessed splits are read from here
MODEL_DIR = "./models"  # parent directory for fine-tuned model outputs
os.makedirs(MODEL_DIR, exist_ok=True)

# Record the runtime environment alongside the saved outputs.
print(
    f"PyTorch: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

PyTorch: 2.10.0
CUDA available: False


## Shared Utilities

In [2]:
import os
# Turn TOKENIZERS_PARALLELISM off before any tokenizer is created below
# (presumably to avoid the tokenizers parallelism/fork warning -- confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# One evaluator per reported metric, loaded once and reused by compute_metrics.
accuracy_metric, f1_metric, precision_metric, recall_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "f1", "precision", "recall")
)


def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` evaluation pair into a flat metrics dict.

    The predicted class is the argmax over the last axis of the logits. The
    outputs of the accuracy, F1, precision and recall evaluators are merged in
    that order, so a later evaluator wins if two of them share a key.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    scores = {}
    for metric in (accuracy_metric, f1_metric, precision_metric, recall_metric):
        scores.update(metric.compute(predictions=predicted, references=labels))
    return scores


def tokenize_dataset(dataset, tokenizer, max_length=MAX_LENGTH):
    """Encode the ``text`` column of ``dataset`` and return it in torch format.

    Every example is truncated and padded to ``max_length`` tokens. The
    ``text`` and ``language`` columns are removed from the returned dataset.
    """
    encode_kwargs = {
        "truncation": True,
        "padding": "max_length",
        "max_length": max_length,
    }

    def _encode_batch(batch):
        return tokenizer(batch["text"], **encode_kwargs)

    encoded = dataset.map(
        _encode_batch,
        batched=True,
        remove_columns=["text", "language"],
    )
    encoded.set_format("torch")
    return encoded


def get_training_args(output_dir, num_epochs=3):
    """Build the ``TrainingArguments`` shared by all fine-tuning runs.

    Args:
        output_dir: Passed through as ``TrainingArguments.output_dir``.
        num_epochs: Value for ``num_train_epochs`` (default 3).

    Returns:
        A configured ``TrainingArguments`` instance.
    """
    batching = {
        "per_device_train_batch_size": 8,
        "per_device_eval_batch_size": 16,
        # 8 samples x 4 accumulation steps per optimizer update on each device.
        "gradient_accumulation_steps": 4,
        "dataloader_num_workers": 0,
        "dataloader_pin_memory": False,
    }
    memory = {
        # Mixed precision is requested only when a CUDA device is present.
        "fp16": torch.cuda.is_available(),
        "gradient_checkpointing": True,
    }
    optimization = {
        "learning_rate": 2e-5,
        "weight_decay": 0.01,
        "warmup_ratio": 0.1,
        "lr_scheduler_type": "linear",
        "max_grad_norm": 1.0,
    }
    checkpointing = {
        # Evaluate and save once per epoch; the best checkpoint is selected by
        # the "f1" metric, where higher is better.
        "eval_strategy": "epoch",
        "save_strategy": "epoch",
        "load_best_model_at_end": True,
        "metric_for_best_model": "f1",
        "greater_is_better": True,
    }
    return TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        logging_steps=100,
        report_to="none",
        seed=SEED,
        **batching,
        **memory,
        **optimization,
        **checkpointing,
    )


def train_and_evaluate(model_name, dataset, output_dir):
    """Fine-tune a model and evaluate on test set. Returns metrics dict.

    Args:
        model_name: Checkpoint identifier handed to ``from_pretrained`` for
            both the tokenizer and the sequence-classification model.
        dataset: Mapping with ``"train"``, ``"validation"`` and ``"test"``
            splits; each split is tokenized with ``tokenize_dataset``.
        output_dir: Directory used for training checkpoints and for the final
            saved model and tokenizer.

    Returns:
        The dict returned by ``trainer.evaluate`` on the test split, with
        metric keys prefixed by ``"test_"``.
    """
    print(f"\n{'='*60}")
    print(f"Training: {model_name}")
    print(f"Output: {output_dir}")
    print(f"{'='*60}")

    training_args = get_training_args(output_dir)

    # Seed *before* building the model: the classification head is newly
    # initialised (random weights), and Trainer only applies its seed after
    # the model already exists. Without this, the head differs on every run.
    set_seed(training_args.seed)

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=2
    )

    tok_train = tokenize_dataset(dataset["train"], tokenizer)
    tok_val = tokenize_dataset(dataset["validation"], tokenizer)
    tok_test = tokenize_dataset(dataset["test"], tokenizer)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tok_train,
        eval_dataset=tok_val,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Evaluate on test set
    test_results = trainer.evaluate(tok_test, metric_key_prefix="test")
    print(f"\nTest results for {model_name}:")
    for k, v in test_results.items():
        if k.startswith("test_"):
            print(f"  {k}: {v:.4f}")

    # Save best model and tokenizer (the tokenizer is not passed to Trainer,
    # so it has to be written out explicitly next to the model).
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    return test_results

## 1. Load Preprocessed Data

In [3]:
# Load the three preprocessed DatasetDicts from DATA_DIR (en, then fr, then nl).
en_data, fr_data, nl_data = (
    load_from_disk(os.path.join(DATA_DIR, lang_code))
    for lang_code in ("en", "fr", "nl")
)

# Show the split layout of each corpus.
for heading, corpus in (
    ("English (IMDB)", en_data),
    ("French (Allocine)", fr_data),
    ("Dutch (DBRD)", nl_data),
):
    print(f"{heading}: {corpus}")

English (IMDB): DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 22500
    })
    validation: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 2500
    })
    test: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 25000
    })
})
French (Allocine): DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 25000
    })
    validation: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 20000
    })
    test: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 20000
    })
})
Dutch (DBRD): DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 18028
    })
    validation: Dataset({
        features: ['text', 'label', 'language'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'language'],
        num_row

## 2. Fine-Tune English BERT on IMDB

In [None]:
# English baseline: bert-base-uncased on the IMDB splits, saved under MODEL_DIR/bert-en.
en_model_dir = os.path.join(MODEL_DIR, "bert-en")
en_results = train_and_evaluate("bert-base-uncased", en_data, en_model_dir)


Training: bert-base-uncased
Output: ./models/bert-en


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss


## 3. Fine-Tune BERTje on DBRD (Dutch)

In [None]:
# Dutch baseline: BERTje on the DBRD splits, saved under MODEL_DIR/bertje-nl.
nl_model_dir = os.path.join(MODEL_DIR, "bertje-nl")
nl_results = train_and_evaluate("GroNLP/bert-base-dutch-cased", nl_data, nl_model_dir)

## 4. Fine-Tune CamemBERT on Allocine (French)

In [None]:
# French baseline: CamemBERT on the Allocine splits, saved under MODEL_DIR/camembert-fr.
fr_model_dir = os.path.join(MODEL_DIR, "camembert-fr")
fr_results = train_and_evaluate("almanach/camembert-base", fr_data, fr_model_dir)

## 5. Monolingual Results Summary

In [None]:
import pandas as pd

# (display name, language, dataset, test-metrics dict) for each baseline run.
baseline_runs = [
    ("bert-base-uncased", "English", "IMDB", en_results),
    ("BERTje", "Dutch", "DBRD", nl_results),
    ("CamemBERT", "French", "Allocine", fr_results),
]

# One row per baseline; the metric keys carry the "test_" prefix that
# train_and_evaluate requests from trainer.evaluate.
results_summary = pd.DataFrame([
    {
        "Model": model_label,
        "Language": language,
        "Dataset": corpus,
        "Accuracy": metrics["test_accuracy"],
        "F1": metrics["test_f1"],
        "Precision": metrics["test_precision"],
        "Recall": metrics["test_recall"],
    }
    for model_label, language, corpus, metrics in baseline_runs
])

print("Monolingual Baselines (In-Language Performance)")
print("=" * 60)
print(results_summary.to_string(index=False, float_format="{:.4f}".format))

# Save results. The notebook only creates ./models up front, so make sure the
# results directory exists; otherwise to_csv fails after all training is done.
results_path = "./results/monolingual_results.csv"
os.makedirs(os.path.dirname(results_path), exist_ok=True)
results_summary.to_csv(results_path, index=False)
print(f"\nResults saved to {results_path}")