# Multi-Stage Job Advertisement Analysis — Training the BERT Zone Identification Model

[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mansamoussa/llm-skill-extractor/blob/main/notebooks/02_train_bert.ipynb)

---

### Objective
Train a **multilingual BERT token classification model** that predicts zone labels for each token in a job advertisement, using the preprocessed datasets generated in *01_data_preparation.ipynb*.

This notebook will:
1. Load:
   - The preprocessed `train_dataset` and `test_dataset`
   - The `id2label.json` and `label2id.json` mappings  
2. Initialize a `bert-base-multilingual-cased` model for token classification  
3. Configure and run the full training loop:
   - Optimizer (AdamW)
   - Learning rate scheduler  
   - Weighted loss function to handle class imbalance  
   - Periodic validation  
4. Save artifacts:
   - The best-performing model checkpoint (`best_model.pt`)
   - TensorBoard logs for visualization  
5. Evaluate model performance using **seqeval** metrics:
   - Precision  
   - Recall  
   - F1-score  

### Input Data
- `processed_data/train_dataset.pt` — tokenized, labeled training chunks  
- `processed_data/test_dataset.pt` — tokenized, labeled evaluation chunks  
- `model/id2label.json` — mapping from label IDs → label names  
- `model/label2id.json` — mapping from label names → label IDs  

### Output
- **`model/best_model.pt`** — best model checkpoint based on validation loss  
- **TensorBoard logs** stored under `logs/train/`  
- **Evaluation results** including seqeval classification report

In [1]:
!pip install -q transformers seqeval tensorboard

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


In [2]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader

from transformers import (
    BertForTokenClassification,
    BertTokenizerFast,
    get_linear_schedule_with_warmup
)

import json
from pathlib import Path
from sklearn.utils.class_weight import compute_class_weight
from seqeval.metrics import classification_report, f1_score
import numpy as np

from torch.utils.tensorboard import SummaryWriter

In [3]:
# Mount Google Drive so the project folder is reachable from the Colab filesystem.
from google.colab import drive

drive.mount(mountpoint="/content/drive")

Mounted at /content/drive


In [4]:
import os

# Root of the shared project folder on the mounted Drive.
PROJECT_ROOT = "/content/drive/MyDrive/GroupWork_GEN03"

# Locations of the inputs read by this notebook and of the checkpoint it writes.
train_dataset_path = f"{PROJECT_ROOT}/processed_data/train_dataset.pt"
test_dataset_path  = f"{PROJECT_ROOT}/processed_data/test_dataset.pt"
id2label_path      = f"{PROJECT_ROOT}/model/id2label.json"
label2id_path      = f"{PROJECT_ROOT}/model/label2id.json"
model_save_path    = f"{PROJECT_ROOT}/model/best_model.pt"

# Only the input files are checked; the checkpoint does not exist before training.
paths = {
    "train_dataset.pt": train_dataset_path,
    "test_dataset.pt": test_dataset_path,
    "id2label.json": id2label_path,
    "label2id.json": label2id_path,
}

# Names of every required input that is not present on disk.
missing = [name for name in paths if not os.path.exists(paths[name])]

if not missing:
    print("✅ All required files found.")
else:
    # Fail early with one bullet per missing file.
    raise FileNotFoundError(
        "❌ Missing required input files:\n" +
        "\n".join(f" - {name}" for name in missing) +
        "\n\nPlease verify where Notebook 01 has exported."
    )

✅ All required files found.


In [5]:
# --- For PyTorch 2.6 unpickling TensorDataset ---
import torch
from torch.utils.data import TensorDataset

torch.serialization.add_safe_globals([TensorDataset])
# -----------------------------------------------------

# Load both dataset objects. weights_only=False performs a full unpickle,
# NOTE(review): which is only safe for files this project produced itself.
train_dataset, test_dataset = (
    torch.load(dataset_path, weights_only=False)
    for dataset_path in (train_dataset_path, test_dataset_path)
)

# JSON object keys are always strings, e.g. {"0": "O"}.
id2label_raw = json.loads(Path(id2label_path).read_text())

# Re-key by integer class id: {"0": "O"} becomes {0: "O"}.
id2label = dict((int(raw_id), name) for raw_id, name in id2label_raw.items())

# The inverse mapping is derived from id2label: {"O": 0, ...}.
label2id = {name: class_id for class_id, name in id2label.items()}

num_labels = len(label2id)

# Display the mappings and the class count as the cell output.
id2label, label2id, num_labels

({0: 'O',
  1: 'Fähigkeiten und Inhalte',
  2: 'Abschlüsse',
  3: 'Anstellung',
  4: 'Erfahrung',
  5: 'Challenges',
  6: 'Bewerbungsprozess',
  7: 'Firmenbeschreibung',
  8: 'Benefits',
  9: 'Arbeitsumfeld',
  10: 'Firmenkundenbeschreibung'},
 {'O': 0,
  'Fähigkeiten und Inhalte': 1,
  'Abschlüsse': 2,
  'Anstellung': 3,
  'Erfahrung': 4,
  'Challenges': 5,
  'Bewerbungsprozess': 6,
  'Firmenbeschreibung': 7,
  'Benefits': 8,
  'Arbeitsumfeld': 9,
  'Firmenkundenbeschreibung': 10},
 11)

In [None]:
# The tokenizer and the model are loaded from the same pretrained checkpoint.
pretrained_name = "bert-base-multilingual-cased"

tokenizer = BertTokenizerFast.from_pretrained(pretrained_name)

# Token-classification head sized to the zone label set loaded above.
model = BertForTokenClassification.from_pretrained(
    pretrained_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

device

In [7]:
batch_size = 8

# Training batches are reshuffled every epoch; evaluation keeps the dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Number of batches per epoch for each split.
(len(train_loader), len(test_loader))

(374, 94)

In [8]:
# Sanity check: push a single training batch through the model before training.
batch = next(iter(train_loader))

# Each batch is unpacked in the order (input_ids, labels, attention_mask).
input_ids, labels, attention_mask = (tensor.to(device) for tensor in batch)

print("input_ids:", input_ids.shape)
print("attention_mask:", attention_mask.shape)
print("labels:", labels.shape)

# No gradients are needed; this only confirms that the forward pass and the
# model's built-in loss computation run on this batch.
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

sanity_loss = outputs.loss.item()
print("Forward pass OK — loss:", sanity_loss)


input_ids: torch.Size([8, 510])
attention_mask: torch.Size([8, 510])
labels: torch.Size([8, 510])
Forward pass OK — loss: 2.388327121734619


In [None]:
# Gather every label id of the training set as one flat array. Whole batches are
# concatenated instead of extending a Python list element by element.
all_labels = np.concatenate(
    [batch_labels.view(-1).numpy() for _, batch_labels, _ in train_loader]
)

# "balanced" weights are inversely proportional to class frequency. The classes
# are passed sorted by id so that class_weights[i] is the weight of class id i,
# which is the layout nn.CrossEntropyLoss(weight=...) expects.
class_ids = np.array(sorted(label2id.values()))
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=class_ids,
    y=all_labels
)

# Move the weights to the model's device so they can be used by the loss.
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)
class_weights

In [None]:
# Training hyperparameters.
epochs = 5
learning_rate = 3e-5

optimizer = AdamW(model.parameters(), lr=learning_rate)

# The scheduler is stepped once per batch, so its horizon is batches x epochs;
# the first 10% of those steps are used for linear warmup.
num_training_steps = len(train_loader) * epochs
num_warmup_steps = int(0.1 * num_training_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

# Class-weighted cross-entropy; positions labeled "O" are excluded from the loss.
loss_fn = nn.CrossEntropyLoss(weight=class_weights, ignore_index=label2id["O"])

# Keep the TensorBoard logs inside the project folder, next to the other
# artifacts, instead of a path relative to the current working directory.
writer = SummaryWriter(log_dir=f"{PROJECT_ROOT}/logs/train")

In [None]:
best_val_loss = float("inf")
save_path = "../model/best_model.pt"

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_loader):
        input_ids, labels, attention_mask = [b.to(device) for b in batch]

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    writer.add_scalar("Loss/train", avg_train_loss, epoch)

    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}")

    # ---------- VALIDATION ----------
    model.eval()
    val_loss = 0

    with torch.no_grad():
        for batch in test_loader:
            input_ids, labels, attention_mask = [b.to(device) for b in batch]
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            val_loss += outputs.loss.item()

    avg_val_loss = val_loss / len(test_loader)
    writer.add_scalar("Loss/val", avg_val_loss, epoch)

    print(f"Validation Loss: {avg_val_loss:.4f}")

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), save_path)
        print("✓ Saved new best model")

writer.close()

In [None]:
model.load_state_dict(torch.load("../model/best_model.pt"))
model.eval()
print("Loaded best_model.pt")

In [None]:
def _to_iob_tags(zone_names):
    """Convert a sequence of zone names into IOB2 tags for seqeval.

    seqeval reads the first character of a tag as its B/I/O marker, so bare
    zone names such as "Benefits" would be misparsed. A zone name that differs
    from its predecessor opens a span ("B-"), a repeated one continues it
    ("I-"), and "O" stays "O".
    """
    tags = []
    previous = "O"
    for zone in zone_names:
        if zone == "O":
            tags.append("O")
        elif zone == previous:
            tags.append(f"I-{zone}")
        else:
            tags.append(f"B-{zone}")
        previous = zone
    return tags


true_labels = []
pred_labels = []

for batch in test_loader:
    input_ids, labels, attention_mask = [b.to(device) for b in batch]

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Most likely class id for every token position.
    preds = torch.argmax(outputs.logits, dim=-1)

    rows = zip(labels.tolist(), preds.tolist(), attention_mask.tolist())
    for true_ids, pred_ids, mask in rows:
        # Score only positions the model attended to, so padding is left out.
        # "O" positions are kept so that spans separated by "O" are not fused.
        # id2label is keyed by int class id.
        true_seq = [id2label[t] for t, m in zip(true_ids, mask) if m]
        pred_seq = [id2label[p] for p, m in zip(pred_ids, mask) if m]

        if true_seq:
            true_labels.append(_to_iob_tags(true_seq))
            pred_labels.append(_to_iob_tags(pred_seq))

In [None]:
# Overall span-level F1 first, then the per-zone precision/recall/F1 table.
overall_f1 = f1_score(true_labels, pred_labels)
print("F1 Score:", overall_f1)
print()
zone_report = classification_report(true_labels, pred_labels)
print(zone_report)