In [1]:
!pip install -q -U optuna transformers[torch] accelerate bitsandbytes scikit-learn seaborn plotly

from google.colab import drive
drive.mount('/content/drive')

DRIVE_PATH = "/content/drive/MyDrive/data"
import os
os.makedirs(DRIVE_PATH, exist_ok=True)
os.chdir(DRIVE_PATH)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m44.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m159.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m152.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m148.4 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


In [2]:
import pandas as pd
import torch
import transformers
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    IntervalStrategy
)
from transformers import DataCollatorWithPadding
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import logging
import sys
import os
import gc
import copy
import json
from pathlib import Path
from typing import Dict, List, Tuple, Optional
import optuna
from tqdm import tqdm

# Logging
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
logging.basicConfig(
    level=logging.INFO,
    format="%(message)s",
    handlers=[logging.StreamHandler(sys.stdout)]
)
warnings.filterwarnings("ignore")
transformers.logging.set_verbosity_error()

# Config class

In [45]:
# ===================== CONFIGURATION =====================
class Config:
    """Central configuration for the training pipeline"""

    # Data paths
    MAIN_DATA_FILE = "match_sentences.csv"
    PRIORITY_SCORES_PATH = "intent_description.json"
    FINAL_TEST_SET_PATH = "final_test_set.csv"
    FINAL_TRAINVAL_SET_PATH = "final_trainval_set.csv"

    # Column names
    RAW_TEXT_COLUMN = "text"
    LABEL_COLUMN = "intent"
    TEST_SIZE = 0.15
    SEED = 42

    # Model configuration
    MODEL_NAME = 'dbmdz/bert-base-italian-xxl-cased'
    MAX_LEN = 128

    # Training hyperparameters (will be optimized)
    BATCH_SIZE = 32
    EPOCHS = 15
    LEARNING_RATE = 2e-5
    WEIGHT_DECAY = 0.01
    WARMUP_RATIO = 0.1
    GRADIENT_CLIP = 1.0

    # Advanced training features
    USE_CLASS_WEIGHTS = True
    USE_PRIORITY_SCORES = False
    USE_MIXED_PRECISION = True
    USE_FOCAL_LOSS = True
    USE_LABEL_SMOOTHING = True
    USE_RDROP = False

    # Loss function parameters
    EARLY_STOPPING_PATIENCE = 10
    ALPHA = 0.5  # Balance between class weights and priority
    FOCAL_GAMMA = 2.0
    LABEL_SMOOTHING = 0.05

    # Cross-validation and hyperparameter tuning
    K_FOLDS = 5
    OPTUNA_N_TRIALS = 30
    OPTUNA_TIMEOUT = None  # Optional timeout in seconds
    USE_PRUNING = True

    # INFERENCE PARAMETERS
    TEMPERATURE = 0.8
    CONFIDENCE_THRESHOLD = 0.5

    # Output paths
    MODEL_SAVE_PATH = "italian_intent_model"
    PLOT_SAVE_PATH = "training_plots"
    CHECKPOINT_DIR = "checkpoints"
    BEST_PARAMS_PATH = "best_hyperparameters.json"

    def __repr__(self):
        return f"Config(LR={self.LEARNING_RATE:.2e}, BS={self.BATCH_SIZE}, WD={self.WEIGHT_DECAY:.2f})"

    def to_dict(self):
        """Convert config to dictionary for saving"""
        return {k: v for k, v in self.__dict__.items() if not k.startswith('_')}

    def save(self, path: str):
        """Save configuration to JSON"""
        with open(path, 'w') as f:
            json.dump(self.to_dict(), f, indent=2)

# Dataset class

In [4]:
# ===================== DATASET =====================
class IntentDataset(Dataset):
    """
    Dataset that holds raw text and tokenizes on-the-fly.
    """
    def __init__(self, texts: List[str], labels: List[int], tokenizer: AutoTokenizer, max_len: int):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self) -> int:
        return len(self.texts)

    def __getitem__(self, idx: int) -> Dict:
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize only this specific sample
        # padding=False. DataCollator will pad the batch later.
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_len,
            padding=False,
            return_tensors=None # standard lists, not tensor
        )

        item = {
            'input_ids': encoding['input_ids'],
            'attention_mask': encoding['attention_mask'],
            'labels': label
        }

        if 'token_type_ids' in encoding:
            item['token_type_ids'] = encoding['token_type_ids']

        return item

# Loss related functions

In [5]:
# ===================== LOSS FUNCTIONS =====================
class FocalLoss(torch.nn.Module):
    """
    Focal Loss for multi-class classification with priority weighting.

    Features:
    - Focuses on hard examples (reduces loss for easy examples)
    - Supports class weighting (alpha)
    - Optional label smoothing for regularization

    Paper: "Focal Loss for Dense Object Detection" (Lin et al., 2017)
    """
    def __init__(
        self,
        alpha: Optional[torch.Tensor] = None,
        gamma: float = 2.0,
        reduction: str = 'mean',
        label_smoothing: float = 0.0
    ):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction
        self.label_smoothing = label_smoothing

    def forward(self, inputs: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
        """
        Args:
            inputs: Logits [batch_size, num_classes]
            targets: True labels [batch_size]
        """
        # Compute cross-entropy with label smoothing
        ce_loss = torch.nn.functional.cross_entropy(
            inputs,
            targets,
            reduction='none',
            label_smoothing=self.label_smoothing
        )

        # Get probabilities
        p = torch.nn.functional.softmax(inputs, dim=1)
        p_t = p.gather(1, targets.view(-1, 1)).squeeze(1)

        # Focal weight: (1 - p_t)^gamma
        focal_weight = (1 - p_t) ** self.gamma

        # Apply class weights (alpha)
        if self.alpha is not None:
            if self.alpha.device != inputs.device:
                self.alpha = self.alpha.to(inputs.device)
            alpha_t = self.alpha.gather(0, targets)
            focal_loss = alpha_t * focal_weight * ce_loss
        else:
            focal_loss = focal_weight * ce_loss

        # Apply reduction
        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

In [6]:
# ===================== TRAINER =====================
class FocalTrainer(Trainer):
    """
    Enhanced trainer with:
    - Focal loss
    - Priority weighting
    - Label smoothing
    """
    def __init__(
        self,
        loss_weights: Optional[torch.Tensor] = None,
        gamma: float = 2.0,
        label_smoothing: float = 0.0,
        use_focal_loss: bool = True,
        *args,
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        self.loss_weights = loss_weights
        self.gamma = gamma
        self.label_smoothing = label_smoothing
        self.use_focal_loss = use_focal_loss

        # Initialize loss functions
        if self.use_focal_loss:
            self.loss_fn = FocalLoss(
                alpha=loss_weights,
                gamma=gamma,
                reduction='mean',
                label_smoothing=label_smoothing
            )
            logging.info(f"Using Focal Loss (γ={gamma}, smoothing={label_smoothing})")
        else:
            self.loss_fn = None
            logging.info(f"Using Weighted CrossEntropy")

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.get("labels")

        # Standard forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Compute base loss
        if self.use_focal_loss and self.loss_fn is not None:
            loss = self.loss_fn(logits, labels)
        else:
            if self.loss_weights is not None:
                if self.loss_weights.device != logits.device:
                    self.loss_weights = self.loss_weights.to(logits.device)
                loss_fct = torch.nn.CrossEntropyLoss(
                    weight=self.loss_weights,
                    label_smoothing=self.label_smoothing
                )
            else:
                loss_fct = torch.nn.CrossEntropyLoss(label_smoothing=self.label_smoothing)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

In [7]:
import torch.nn.functional as F

class RDropTrainer(FocalTrainer):
    """
    Trainer with R-Drop regularization (Simultaneously minimizes CE and KL-Divergence).
    Paper: https://arxiv.org/abs/2106.14448
    """
    def __init__(self, rdrop_alpha: float = 4.0, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.rdrop_alpha = rdrop_alpha

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        if return_outputs:
            # During evaluation/inference, we don't need R-Drop
            return super().compute_loss(model, inputs, return_outputs)

        labels = inputs.get("labels")

        # Forward pass 1
        outputs1 = model(**inputs)
        logits1 = outputs1.get("logits")

        # Forward pass 2
        outputs2 = model(**inputs)
        logits2 = outputs2.get("logits")

        # We use the internal loss_fn (Focal) if available, otherwise standard CE
        if self.use_focal_loss and self.loss_fn is not None:
            loss1 = self.loss_fn(logits1, labels)
            loss2 = self.loss_fn(logits2, labels)
            ce_loss = (loss1 + loss2) / 2
        else:
            # Reconstruct standard loss function if Focal is off
            loss_fct = torch.nn.CrossEntropyLoss(
                weight=self.loss_weights,
                label_smoothing=self.label_smoothing
            )
            ce_loss = (loss_fct(logits1, labels) + loss_fct(logits2, labels)) / 2

        # KL Divergence (Consistency loss)
        # We want the predictions of Pass 1 to be close to Pass 2, and vice versa.
        # No .detach() ensures gradients flow through both to align them.
        p1 = F.log_softmax(logits1, dim=-1)
        p2 = F.log_softmax(logits2, dim=-1)

        # kl_div(input, target) -> divergence of input FROM target
        kl_loss = (
            F.kl_div(p1, p2, reduction='batchmean', log_target=True) +
            F.kl_div(p2, p1, reduction='batchmean', log_target=True)
        ) / 2

        # Final loss
        loss = ce_loss + (self.rdrop_alpha * kl_loss)

        return loss

In [8]:
# ===================== Utilities =====================
def compute_metrics(eval_pred):
    """Enhanced metrics computation"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predictions)
    f1_macro = f1_score(labels, predictions, average="macro", zero_division=0)
    f1_weighted = f1_score(labels, predictions, average="weighted", zero_division=0)

    return {
        "accuracy": accuracy,
        "f1": f1_weighted,
        "f1_macro": f1_macro
    }


def compute_loss_weights(
    labels: List[int],
    json_path: str,
    id2label: Dict[int, str],
    num_labels: int,
    use_balance: bool = True,
    use_priority: bool = True,
    alpha: float = 0.5
) -> torch.Tensor:
    """
    Computes combined weights for loss function.

    Args:
        alpha: 0=priority only, 1=balance only, 0.5=equal mix
    """
    balance_weights = np.ones(num_labels, dtype=np.float32)
    priority_weights = np.ones(num_labels, dtype=np.float32)

    # Class balance weights
    if use_balance:
        counts = np.bincount(labels, minlength=num_labels)
        safe_counts = np.where(counts == 0, 1, counts)
        total = len(labels)
        balance_weights = total / (num_labels * safe_counts.astype(np.float32))
        balance_weights = normalize_weights(balance_weights, min_val=0.5, max_val=2.0)
        logging.info(f"Class balance weights: {balance_weights.min():.3f} - {balance_weights.max():.3f}")

    # Priority weights
    if use_priority and os.path.exists(json_path):
        with open(json_path, 'r') as f:
            descriptions = json.load(f)
        priority_map = {item['intent']: item['priority'] for item in descriptions}
        priorities = np.array([priority_map.get(id2label[i], 50) for i in range(num_labels)])
        priority_weights = 0.5 + 1.5 * (priorities / 100.0)
        logging.info(f"Priority weights: {priority_weights.min():.3f} - {priority_weights.max():.3f}")

    # Combine weights
    if use_balance and use_priority:
        final_weights = alpha * balance_weights + (1 - alpha) * priority_weights
        logging.info(f"Combined weights (α={alpha:.2f})")
    elif use_balance:
        final_weights = balance_weights
    elif use_priority:
        final_weights = priority_weights
    else:
        final_weights = np.ones(num_labels, dtype=np.float32)

    # Normalize to mean=1.0 and clip extremes
    final_weights = final_weights / final_weights.mean()
    final_weights = np.clip(final_weights, 0.1, 10.0)

    logging.info(f"Final weights: {final_weights.min():.3f} - {final_weights.max():.3f}")

    return torch.tensor(final_weights, dtype=torch.float32)


def normalize_weights(weights: np.ndarray, min_val: float = 0.5, max_val: float = 2.0) -> np.ndarray:
    """Min-max normalization to [min_val, max_val]"""
    w_min, w_max = weights.min(), weights.max()
    if w_max - w_min < 1e-6:
        return np.ones_like(weights) * ((min_val + max_val) / 2)
    normalized = (weights - w_min) / (w_max - w_min)
    return min_val + (max_val - min_val) * normalized


def tokenize_data(
    tokenizer: AutoTokenizer,
    texts: List[str],
    labels: List[int],
    max_len: int
) -> Tuple[Dict, List[int]]:
    """Tokenize texts for model input"""
    encodings = tokenizer(
        texts,
        padding=False,
        truncation=True,
        max_length=max_len,
        return_tensors=None
    )
    return encodings, labels

# Data load and preparation utility

In [9]:
# ===================== Data preparation =====================
def load_and_prep_data(config: Config) -> Tuple[pd.DataFrame, pd.DataFrame, Dict, Dict, int]:
    """Load and prepare dataset with stratified split"""
    logging.info("=" * 70)
    logging.info("LOADING DATASET")
    logging.info("=" * 70)

    # Read and sanitize dataset
    df = pd.read_csv(config.MAIN_DATA_FILE)
    df = df.dropna(subset=[config.RAW_TEXT_COLUMN, config.LABEL_COLUMN])
    df[config.RAW_TEXT_COLUMN] = df[config.RAW_TEXT_COLUMN].astype(str).str.strip()
    df.reset_index(drop=True, inplace=True)
    logging.info(f"Loaded {len(df)} samples")

    # Stratified split
    df_trainval, df_test = train_test_split(
            df,
            test_size=config.TEST_SIZE,
            random_state=config.SEED,
            stratify=df[config.LABEL_COLUMN]
    )

    # Create label mappings
    target_intents = sorted(df[config.LABEL_COLUMN].unique())
    num_labels = len(target_intents)
    label2id = {label: i for i, label in enumerate(target_intents)}
    id2label = {i: label for i, label in enumerate(target_intents)}
    logging.info(f"Model will train on {num_labels} valid intents.")

    # Show distribution
    intent_counts = df[config.LABEL_COLUMN].value_counts()
    logging.info(f"\n{num_labels} intents found:")
    for intent, count in intent_counts.head(10).items():
        logging.info(f"  {intent[:50]:50s} : {count:4d} ({count/len(df)*100:5.2f}%)")

    df_trainval['labels'] = df_trainval[config.LABEL_COLUMN].map(label2id)
    df_test['labels'] = df_test[config.LABEL_COLUMN].map(label2id)

    # Save splits
    df_trainval.to_csv(config.FINAL_TRAINVAL_SET_PATH, index=False)
    df_test.to_csv(config.FINAL_TEST_SET_PATH, index=False)

    logging.info(f"\nTrain/Val: {len(df_trainval)} | Test: {len(df_test)}")

    return df_trainval, df_test, label2id, id2label, num_labels

# Training and hyperparameter optimization

In [17]:
# ===================== Training =====================
def finetune_model(
    config: Config,
    tokenizer: AutoTokenizer,
    train_texts: List[str],
    train_labels: List[int],
    val_texts: List[str],
    val_labels: List[int],
    label2id: Dict,
    id2label: Dict,
    num_labels: int,
    main_model: Optional[AutoModelForSequenceClassification] = None,
    fold_id: str = "0",
    save_model: bool = False,
    model_save_path: Optional[str] = None
) -> float:
    """Train model on a single fold"""
    logging.info(f"Training Fold {fold_id}...")

    train_dataset = IntentDataset(train_texts, train_labels, tokenizer, config.MAX_LEN)
    val_dataset = IntentDataset(val_texts, val_labels, tokenizer, config.MAX_LEN)

    if main_model is not None:
        model = copy.deepcopy(main_model)
    else:
        # Load from disk (fallback)
        model = AutoModelForSequenceClassification.from_pretrained(
            config.MODEL_NAME,
            num_labels=num_labels,
            id2label=id2label,
            label2id=label2id
        )
    model.gradient_checkpointing_enable()

    # Compute loss weights
    final_weights = None
    if config.USE_CLASS_WEIGHTS or config.USE_PRIORITY_SCORES:
        final_weights = compute_loss_weights(
            labels=train_labels,
            json_path=config.PRIORITY_SCORES_PATH,
            id2label=id2label,
            num_labels=num_labels,
            use_balance=config.USE_CLASS_WEIGHTS,
            use_priority=config.USE_PRIORITY_SCORES,
            alpha=config.ALPHA
        )

    # Training arguments
    args = TrainingArguments(
        output_dir=f"{config.CHECKPOINT_DIR}/fold_{fold_id}",
        overwrite_output_dir=True,
        eval_strategy=IntervalStrategy.EPOCH,
        save_strategy="no",
        learning_rate=config.LEARNING_RATE,
        per_device_train_batch_size=config.BATCH_SIZE,
        per_device_eval_batch_size=config.BATCH_SIZE,
        num_train_epochs=config.EPOCHS,
        weight_decay=config.WEIGHT_DECAY,
        warmup_ratio=config.WARMUP_RATIO,
        metric_for_best_model="f1",
        load_best_model_at_end=False,
        greater_is_better=True,
        fp16=config.USE_MIXED_PRECISION,
        logging_steps=50,
        report_to="none",
        disable_tqdm=False
    )

    # Select trainer class
    data_collator = DataCollatorWithPadding(tokenizer, padding=True)
    TrainerClass = RDropTrainer if config.USE_RDROP else FocalTrainer

    trainer = TrainerClass(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=config.EARLY_STOPPING_PATIENCE)],
        loss_weights=final_weights,
        gamma=config.FOCAL_GAMMA,
        label_smoothing=config.LABEL_SMOOTHING,
        use_focal_loss=config.USE_FOCAL_LOSS
        #rdrop_alpha=5.0
    )

    # Train
    trainer.train()

    # Get best F1
    log_history = trainer.state.log_history
    eval_f1_scores = [entry['eval_f1'] for entry in log_history if 'eval_f1' in entry]
    best_f1 = max(eval_f1_scores) if eval_f1_scores else trainer.evaluate()["eval_f1"]

    logging.info(f"Fold {fold_id}: Best F1 = {best_f1:.4f}")

    if save_model and model_save_path:
        trainer.save_model(model_save_path)
        tokenizer.save_pretrained(model_save_path)
        logging.info(f"Model saved to {model_save_path}")

    # Cleanup
    del model, trainer
    torch.cuda.empty_cache()
    gc.collect()

    return best_f1


In [11]:
# ===================== Optuna Hyperparameter Search =====================
def run_hp_search_optuna(
    config: Config,
    df_trainval: pd.DataFrame,
    label2id: Dict,
    id2label: Dict,
    num_labels: int,
    text_column: str
) -> Tuple[Config, pd.DataFrame]:
    """
    Optuna hyperparameter search with:
    - Pruning for efficiency
    - Better search space
    """
    logging.info("=" * 70)
    logging.info(f"OPTUNA HYPERPARAMETER SEARCH ({config.OPTUNA_N_TRIALS} trials)")
    logging.info("=" * 70)
    logging.info("Loading main Model and tokenizer into CPU memory...")

    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)

    main_model = AutoModelForSequenceClassification.from_pretrained(
        config.MODEL_NAME,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
    )

    main_model.cpu()

    # Create cached datasets
    X = df_trainval[text_column].tolist()
    y = df_trainval['labels'].tolist()

    logging.info("Assets loaded. Starting Optimization.")

    def objective(trial: optuna.Trial) -> float:
        """Objective function with expanded search space"""

        # Sample hyperparameters
        trial_config = copy.deepcopy(config)

        # Core hyperparameters
        trial_config.LEARNING_RATE = trial.suggest_float("learning_rate", 5e-6, 5e-5, log=True)
        trial_config.BATCH_SIZE = trial.suggest_categorical("batch_size", [16, 24, 32])
        trial_config.WEIGHT_DECAY = trial.suggest_float("weight_decay", 0.0, 0.15)
        trial_config.WARMUP_RATIO = trial.suggest_float("warmup_ratio", 0.05, 0.2)

        # Loss function parameters
        if config.USE_CLASS_WEIGHTS and config.USE_PRIORITY_SCORES:
            trial_config.ALPHA = trial.suggest_float("alpha", 0.2, 0.8)

        if config.USE_FOCAL_LOSS:
            trial_config.FOCAL_GAMMA = trial.suggest_float("focal_gamma", 0.5, 3.0)
            trial_config.LABEL_SMOOTHING = trial.suggest_float("label_smoothing", 0.0, 0.1)

        # K-Fold cross-validation
        skf = StratifiedKFold(n_splits=config.K_FOLDS, shuffle=True, random_state=42)
        fold_f1_scores = []

        try:
            for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
                train_texts = [X[i] for i in train_idx]
                train_labels = [y[i] for i in train_idx]
                val_texts = [X[i] for i in val_idx]
                val_labels = [y[i] for i in val_idx]

                fold_id = f"T{trial.number}_F{fold+1}"

                # Train fold
                fold_f1 = finetune_model(
                    config=trial_config,
                    tokenizer=tokenizer,
                    train_texts=train_texts,
                    train_labels=train_labels,
                    val_texts=val_texts,
                    val_labels=val_labels,
                    label2id=label2id,
                    id2label=id2label,
                    num_labels=num_labels,
                    main_model=main_model,
                    fold_id=fold_id,
                    save_model=False
                )
                fold_f1_scores.append(fold_f1)

                # Cleanup
                gc.collect()
                torch.cuda.empty_cache()

                # Optuna pruning (stop bad trials early)
                if config.USE_PRUNING:
                    current_avg_f1 = np.mean(fold_f1_scores)
                    trial.report(current_avg_f1, step=fold)
                    if trial.should_prune():
                        logging.info(f"Pruning Trial {trial.number} at Fold {fold+1}")
                        raise optuna.exceptions.TrialPruned()

        except optuna.exceptions.TrialPruned:
            raise
        except Exception as e:
            logging.error(f"Trial {trial.number} failed: {e}")
            return 0.0

        # Calculate average F1
        avg_f1 = np.mean(fold_f1_scores)
        std_f1 = np.std(fold_f1_scores)

        logging.info(f"Trial {trial.number}: F1={avg_f1:.4f} ± {std_f1:.4f}")

        return avg_f1

    # Create Optuna study with pruning
    pruner = optuna.pruners.MedianPruner(
        n_startup_trials=5,
        n_warmup_steps=2,
        interval_steps=1
    ) if config.USE_PRUNING else optuna.pruners.NopPruner()

    study = optuna.create_study(
        direction="maximize",
        pruner=pruner,
        study_name="bert_intent_classification"
    )

    # Run optimization
    study.optimize(
        objective,
        n_trials=config.OPTUNA_N_TRIALS,
        timeout=config.OPTUNA_TIMEOUT,
        show_progress_bar=True
    )

    # Report results
    logging.info(f"\n{'=' * 70}")
    logging.info("OPTIMIZATION COMPLETE")
    logging.info(f"{'=' * 70}")
    logging.info(f"Best Trial: {study.best_trial.number}")
    logging.info(f"Best F1: {study.best_value:.4f}")
    logging.info(f"Best Params:")
    for key, value in study.best_params.items():
        logging.info(f"  {key}: {value}")
    logging.info(f"{'=' * 70}\n")

    # Update config with best params
    best_config = copy.deepcopy(config)
    for param, value in study.best_params.items():
        if hasattr(best_config, param.upper()):
            setattr(best_config, param.upper(), value)
        else:
            # Handle nested parameters
            param_map = {
                "learning_rate": "LEARNING_RATE",
                "batch_size": "BATCH_SIZE",
                "weight_decay": "WEIGHT_DECAY",
                "warmup_ratio": "WARMUP_RATIO",
                "alpha": "ALPHA",
                "focal_gamma": "FOCAL_GAMMA",
                "label_smoothing": "LABEL_SMOOTHING",

            }
            if param in param_map:
                setattr(best_config, param_map[param], value)

    # Save best hyperparameters
    with open(config.BEST_PARAMS_PATH, 'w') as f:
        json.dump(study.best_params, f, indent=2)
    logging.info(f"Best params saved to {config.BEST_PARAMS_PATH}")

    # Save trials dataframe
    trials_df = study.trials_dataframe()
    trials_df.to_csv("optuna_trials.csv", index=False)
    logging.info(f"Trials saved to optuna_trials.csv")

    return best_config, trials_df


In [12]:
# ===================== Final Model Training =====================
def train_final_model(
    config: Config,
    df_trainval: pd.DataFrame,
    label2id: Dict,
    id2label: Dict,
    num_labels: int,
    text_column: str,
    model_save_suffix: str = ""
) -> str:
    """Train final model on full trainval set with best hyperparameters"""
    model_save_path = f"{config.MODEL_SAVE_PATH}{model_save_suffix}"

    logging.info("=" * 70)
    logging.info("TRAINING FINAL MODEL")
    logging.info("=" * 70)
    logging.info(f"Using optimized hyperparameters:")
    logging.info(f"  LR={config.LEARNING_RATE:.2e}, BS={config.BATCH_SIZE}, WD={config.WEIGHT_DECAY:.3f}")

    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)

    # Internal train/val split for early stopping
    train_df, val_df = train_test_split(
        df_trainval,
        test_size=0.1,
        random_state=42,
        stratify=df_trainval['labels']
    )

    train_texts = train_df[text_column].tolist()
    train_labels = train_df['labels'].tolist()
    val_texts = val_df[text_column].tolist()
    val_labels = val_df['labels'].tolist()

    best_f1 = finetune_model(
        config=config,
        tokenizer=tokenizer,
        train_texts=train_texts,
        train_labels=train_labels,
        val_texts=val_texts,
        val_labels=val_labels,
        label2id=label2id,
        id2label=id2label,
        num_labels=num_labels,
        fold_id="FINAL",
        save_model=True,
        model_save_path=model_save_path
    )

    logging.info(f"Final model training complete. Val F1: {best_f1:.4f}")
    logging.info(f"Model saved to: {model_save_path}")

    # Save configuration
    config.save(f"{model_save_path}/training_config.json")

    return model_save_path

# Inference and final eval

In [48]:
class InferencePipeline:
    """
    Optimized inference class that loads the model once and stays in memory.
    """
    def __init__(self, model_path: str, device: str = None):
        self.model_path = model_path
        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")

        logging.info(f"Loading inference model from {model_path} to {self.device}...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)

        self.model.to(self.device)
        self.model.eval()
        logging.info("Model loaded successfully.")

    # def predict(self, text: str, threshold: float = 0.7) -> Dict[str, any]:
    #     """Predict intent for a single text string."""
    #     inputs = self.tokenizer(
    #         text, return_tensors="pt", padding=True, truncation=True, max_length=128
    #     ).to(self.device)
    #
    #     with torch.no_grad():
    #         outputs = self.model(**inputs)
    #         probs = torch.nn.functional.softmax(scaled_logits, dim=-1)
    #         # probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    #         confidence, predicted_class = torch.max(probs, dim=1)
    #
    #     predicted_label = self.model.config.id2label[predicted_class.item()]
    #
    #     return {
    #         'text': text,
    #         'intent': predicted_label,
    #         'confidence': float(confidence.item()),
    #         'probabilities': probs[0].cpu().numpy()  # All class probabilities
    #     }

    def predict_batch(
        self,
        texts: List[str],
        batch_size: int = 32,
        temperature: float = 1.0,
        threshold: float = 0.7,
        return_confidences: bool = False
    ) -> Tuple[List[str], List[int], Optional[np.ndarray]]:
        """
        Batch prediction optimized for test set evaluation.

        Args:
            texts: List of input texts
            batch_size: Batch size for inference
            return_confidences: If True, also return confidence scores

        Returns:
            predicted_labels: List of predicted intent labels
            all_preds_ids: List of predicted class IDs
            confidences: (Optional) Array of confidence scores
        """
        all_preds_ids = []
        all_confidences = []
        all_probs = []

        for i in tqdm(range(0, len(texts), batch_size), desc="Inference Batch"):
            batch_texts = texts[i : i + batch_size]
            inputs = self.tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=128
            ).to(self.device)


            with torch.no_grad():
                outputs = self.model(**inputs)
                scaled_logits = outputs.logits / temperature
                probs = torch.nn.functional.softmax(scaled_logits, dim=-1)
                confidences, predictions = torch.max(probs, dim=-1)

                all_preds_ids.extend(predictions.cpu().numpy())
                all_confidences.extend(confidences.cpu().numpy())
                all_probs.extend(probs.cpu().numpy())


        predicted_labels = []
        for idx, conf in zip(all_preds_ids, all_confidences):
            if conf < threshold:
                predicted_labels.append("NOMATCH")
            else:
                predicted_labels.append(self.model.config.id2label[idx])

        if return_confidences:
            return predicted_labels, all_preds_ids, np.array(all_confidences), np.array(all_probs)
        else:
            return predicted_labels, all_preds_ids




In [49]:
# Final evaluation
def evaluate_on_test_set(
    config: Config,
    df_test: pd.DataFrame,
    model_path: str,
    label2id: Dict,
    id2label: Dict,
    num_labels: int,
    text_column: str,
    report_suffix: str = ""
):
    """
    Evaluate using the InferencePipeline class (batch mode).
    """
    logging.info("\n" + "=" * 70)
    logging.info("FINAL EVALUATION ON TEST SET")
    logging.info("=" * 70)

    pipeline = InferencePipeline(model_path=model_path)

    test_texts = df_test[text_column].tolist()
    true_labels = df_test[config.LABEL_COLUMN].tolist()

    logging.info(f"Running inference on {len(test_texts)} samples...")

    # Get predictions with confidences
    pred_labels, pred_ids, confidences, all_probs = pipeline.predict_batch(
        test_texts,
        batch_size=config.BATCH_SIZE,
        temperature=config.TEMPERATURE,
        threshold=config.CONFIDENCE_THRESHOLD,
        return_confidences=True
    )
    unique_labels = sorted(list(set(true_labels + pred_labels)))

    # Calculate metrics
    test_acc = accuracy_score(true_labels, pred_labels)
    test_f1 = f1_score(true_labels, pred_labels, average="weighted", zero_division=0)

    logging.info(f"\nTest Accuracy: {test_acc:.4f}")
    logging.info(f"Test F1 Score: {test_f1:.4f}")

    # Classification report
    target_names = [id2label[i] for i in range(num_labels)]
    print("\n" + "=" * 70)
    print("DETAILED CLASSIFICATION REPORT")
    print("=" * 70)
    print(classification_report(
        true_labels, pred_labels,
        zero_division=0
    ))

    # ============================================================
    # CONFIDENCE STATISTICS
    # ============================================================
    print("\n" + "=" * 70)
    print("CONFIDENCE STATISTICS")
    print("=" * 70)
    print(f"Mean Confidence:   {np.mean(confidences):.4f}")
    print(f"Median Confidence: {np.median(confidences):.4f}")
    print(f"Std Confidence:    {np.std(confidences):.4f}")
    print(f"Min Confidence:    {np.min(confidences):.4f}")
    print(f"Max Confidence:    {np.max(confidences):.4f}")

    # Confidence distribution
    print("\nConfidence Distribution:")
    conf_ranges = [
        ("Very High (>0.90)", np.sum(confidences > 0.90)),
        ("High (0.80-0.90)", np.sum((confidences >= 0.80) & (confidences <= 0.90))),
        ("Medium (0.70-0.80)", np.sum((confidences >= 0.70) & (confidences < 0.80))),
        ("Low (0.50-0.70)", np.sum((confidences >= 0.50) & (confidences < 0.70))),
        ("Very Low (<0.50)", np.sum(confidences < 0.50)),
    ]

    for range_name, count in conf_ranges:
        percentage = (count / len(confidences)) * 100
        print(f"  {range_name:25s}: {count:4d} ({percentage:5.1f}%)")

    # Confidence by correctness
    correct_mask = np.array([t == p for t, p in zip(true_labels, pred_labels)])
    correct_confidences = confidences[correct_mask]
    incorrect_confidences = confidences[~correct_mask]

    print("\nConfidence by Prediction Correctness:")
    print(f"  Correct predictions:   Mean={np.mean(correct_confidences):.4f}, "
          f"Median={np.median(correct_confidences):.4f}")
    if len(incorrect_confidences) > 0:
        print(f"  Incorrect predictions: Mean={np.mean(incorrect_confidences):.4f}, "
              f"Median={np.median(incorrect_confidences):.4f}")
    else:
        print(f"  Incorrect predictions: None (Perfect accuracy!)")

    print("=" * 70 + "\n")

    # ============================================================
    # SAVE DETAILED RESULTS
    # ============================================================
    results_df = pd.DataFrame({
        'raw_text': df_test[config.RAW_TEXT_COLUMN].tolist(),
        'normalized_text': test_texts,
        'true_intent': true_labels,
        'predicted_intent': pred_labels,
        'confidence': confidences,
        'correct': correct_mask
    })

    # Add top-3 predictions for each sample
    top3_intents = []
    top3_probs = []

    for probs in all_probs:
        top3_idx = np.argsort(probs)[-3:][::-1]  # Top 3 indices
        top3_intents.append([id2label[idx] for idx in top3_idx])
        top3_probs.append(probs[top3_idx].tolist())

    results_df['top3_intents'] = [str(intents) for intents in top3_intents]
    results_df['top3_probabilities'] = [str(probs) for probs in top3_probs]

    # Save results
    results_path = f"evaluation_results{report_suffix}.csv"
    results_df.to_csv(results_path, index=False, encoding='utf-8')
    logging.info(f"✓ Evaluation results saved to: {results_path}")

    # ============================================================
    # ERROR ANALYSIS
    # ============================================================
    errors = results_df[results_df['correct'] == False].copy()

    if len(errors) > 0:
        print("\n" + "=" * 70)
        print("ERROR ANALYSIS")
        print("=" * 70)
        print(f"Total Errors: {len(errors)} ({len(errors)/len(results_df)*100:.2f}%)")

        # Error patterns
        error_patterns = (
            errors.groupby(['true_intent', 'predicted_intent'])
            .size()
            .sort_values(ascending=False)
            .head(10)
        )

        print("\nTop 10 Error Patterns:")
        print("-" * 70)
        for (true_int, pred_int), count in error_patterns.items():
            percentage = (count / len(errors)) * 100
            print(f"  {true_int:40s} → {pred_int:40s}: {count:3d} ({percentage:4.1f}%)")

        # Low confidence errors
        low_conf_errors = errors[errors['confidence'] < 0.5]
        if len(low_conf_errors) > 0:
            print(f"\nLow Confidence Errors (<0.5): {len(low_conf_errors)}")
            print("  (These are expected errors - model was uncertain)")

        # High confidence errors (more concerning)
        high_conf_errors = errors[errors['confidence'] >= 0.8]
        if len(high_conf_errors) > 0:
            print(f"\n High Confidence Errors (≥0.8): {len(high_conf_errors)}")
            print("  (These are concerning - model was confident but wrong)")
            print("\n  Top 5 High-Confidence Errors:")
            for idx, row in high_conf_errors.nlargest(5, 'confidence').iterrows():
                print(f"    Confidence: {row['confidence']:.3f}")
                print(f"    Text: {row['normalized_text'][:80]}...")
                print(f"    True: {row['true_intent']}")
                print(f"    Predicted: {row['predicted_intent']}")
                print()

        # Save error details
        errors = errors.sort_values('confidence')
        errors_path = f"classification_errors{report_suffix}.csv"
        errors.to_csv(errors_path, index=False, encoding='utf-8')
        logging.info(f"✓ Error details saved to: {errors_path}")

        print("=" * 70 + "\n")
    else:
        print("\n" + "=" * 70)
        print("NO CLASSIFICATION ERRORS - PERFECT ACCURACY!")
        print("=" * 70 + "\n")

    # ============================================================
    # CONFUSION MATRIX
    # ============================================================
    cm = confusion_matrix(true_labels, pred_labels, labels=unique_labels)

    plt.figure(figsize=(20, 20))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=unique_labels,
        yticklabels=unique_labels,
        cbar_kws={'label': 'Count'}
    )
    plt.title(f'Confusion Matrix (Accuracy: {test_acc:.3f}, F1: {test_f1:.3f})')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()

    os.makedirs(config.PLOT_SAVE_PATH, exist_ok=True)
    cm_path = f"{config.PLOT_SAVE_PATH}/confusion_matrix{report_suffix}.png"
    plt.savefig(cm_path, dpi=300, bbox_inches='tight')
    plt.close()
    logging.info(f"Confusion matrix saved to: {cm_path}")

    # ============================================================
    # CONFIDENCE HISTOGRAM
    # ============================================================
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Overall confidence distribution
    axes[0].hist(confidences, bins=50, alpha=0.7, color='blue', edgecolor='black')
    axes[0].axvline(np.mean(confidences), color='red', linestyle='--',
                    label=f'Mean: {np.mean(confidences):.3f}')
    axes[0].axvline(np.median(confidences), color='green', linestyle='--',
                    label=f'Median: {np.median(confidences):.3f}')
    axes[0].set_xlabel('Confidence Score')
    axes[0].set_ylabel('Frequency')
    axes[0].set_title('Overall Confidence Distribution')
    axes[0].legend()
    axes[0].grid(alpha=0.3)

    # Confidence by correctness
    if len(incorrect_confidences) > 0:
        axes[1].hist(correct_confidences, bins=30, alpha=0.6, color='green',
                     label=f'Correct (n={len(correct_confidences)})', edgecolor='black')
        axes[1].hist(incorrect_confidences, bins=30, alpha=0.6, color='red',
                     label=f'Incorrect (n={len(incorrect_confidences)})', edgecolor='black')
    else:
        axes[1].hist(correct_confidences, bins=30, alpha=0.7, color='green',
                     label=f'All Correct (n={len(correct_confidences)})', edgecolor='black')

    axes[1].set_xlabel('Confidence Score')
    axes[1].set_ylabel('Frequency')
    axes[1].set_title('Confidence by Prediction Correctness')
    axes[1].legend()
    axes[1].grid(alpha=0.3)

    plt.tight_layout()
    conf_path = f"{config.PLOT_SAVE_PATH}/confidence_distribution{report_suffix}.png"
    plt.savefig(conf_path, dpi=300, bbox_inches='tight')
    plt.close()
    logging.info(f"Confidence distribution saved to: {conf_path}")

    return test_f1

## Ensamble training and inference

In [None]:
def train_ensemble_models(
    config: Config,
    df_trainval: pd.DataFrame,
    label2id: Dict,
    id2label: Dict,
    num_labels: int,
    text_column: str,
    num_seeds: int = 5
) -> List[str]:

    saved_model_paths = []

    logging.info("=" * 70)
    logging.info(f"TRAINING ENSEMBLE ({num_seeds} seeds)")
    logging.info("=" * 70)

    # Base tokenizer
    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)

    for i in range(num_seeds):
        seed = 42 + i
        current_suffix = f"_seed_{seed}"
        logging.info(f"--- Starting Training for Seed {seed} ---")

        # Different splits for each seed
        train_df, val_df = train_test_split(
            df_trainval,
            test_size=0.1,
            random_state=seed,
            stratify=df_trainval['labels']
        )

        train_texts = train_df[text_column].tolist()
        train_labels = train_df['labels'].tolist()
        val_texts = val_df[text_column].tolist()
        val_labels = val_df['labels'].tolist()

        # Train with current seed
        path = finetune_model(
            config=config,
            tokenizer=tokenizer,
            train_texts=train_texts,
            train_labels=train_labels,
            val_texts=val_texts,
            val_labels=val_labels,
            label2id=label2id,
            id2label=id2label,
            num_labels=num_labels,
            fold_id=f"SEED_{seed}",
            save_model=True,
            model_save_path=f"{config.MODEL_SAVE_PATH}{current_suffix}"
        )

        # Save config for current seed
        config.save(f"{config.MODEL_SAVE_PATH}{current_suffix}/training_config.json")
        saved_model_paths.append(f"{config.MODEL_SAVE_PATH}{current_suffix}")

    return saved_model_paths

In [15]:
class EnsembleInferencePipeline:
    def __init__(self, model_paths: List[str], device: str = None):
        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        self.models = []

        logging.info(f"Loading {len(model_paths)} models for ensemble...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_paths[0])

        # Load all models
        for path in model_paths:
            model = AutoModelForSequenceClassification.from_pretrained(path)
            model.to(self.device)
            model.eval()
            self.models.append(model)

        self.config = self.models[0].config
        logging.info("Ensemble loaded successfully.")

    def predict_batch(
        self,
        texts: List[str],
        batch_size: int = 32,
        temperature: float = 1.0,
        threshold: float = 0.0,
        return_confidences: bool = False
    ):
        all_preds_ids = []
        all_confidences = []
        all_probs = []

        for i in tqdm(range(0, len(texts), batch_size), desc="Ensemble Batch"):
            batch_texts = texts[i : i + batch_size]
            inputs = self.tokenizer(
                batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=128
            ).to(self.device)

            # Clean inputs if needed
            if "token_type_ids" in inputs: inputs.pop("token_type_ids")

            with torch.no_grad():
                logits_list = []
                for model in self.models:
                    outputs = model(**inputs)
                    logits_list.append(outputs.logits)

                # Average (soft voting)
                avg_logits = torch.stack(logits_list).mean(dim=0)

                # Apply temperature and softmax on the average
                scaled_logits = avg_logits / temperature
                probs = torch.nn.functional.softmax(scaled_logits, dim=-1)

                confidences, predictions = torch.max(probs, dim=-1)

                all_preds_ids.extend(predictions.cpu().numpy())
                all_confidences.extend(confidences.cpu().numpy())
                all_probs.extend(probs.cpu().numpy())

        # Thresholding
        predicted_labels = []
        for idx, conf in zip(all_preds_ids, all_confidences):
            if conf < threshold:
                predicted_labels.append("NOMATCH")
            else:
                predicted_labels.append(self.config.id2label[idx])

        if return_confidences:
            return predicted_labels, all_preds_ids, np.array(all_confidences), np.array(all_probs)
        else:
            return predicted_labels, all_preds_ids

# Contrastive Learning

In [14]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models, evaluation
from torch.utils.data import DataLoader

def train_contrastive_embeddings(
    config: Config,
    df_train: pd.DataFrame,
    text_col: str,
    label_col: str,
    output_path: str = "contrastive_bert_base"
) -> str:
    """
    Stage 1: Fine-tune the BERT body using Contrastive Learning.
    This aligns embeddings so that same-intents are close and diff-intents are far.
    """
    logging.info("="*70)
    logging.info("STAGE 1: CONTRASTIVE EMBEDDING TRAINING")
    logging.info("="*70)

    # 1. Prepare Data for Contrastive Learning
    # We create a list of InputExamples.
    # For MultipleNegativesRankingLoss, we just need pairs of (Anchor, Positive).
    # The loss function treats all other samples in the batch as Negatives (In-batch negatives).

    train_examples = []

    # Group by intent to create positive pairs
    intent_groups = df_train.groupby(label_col)[text_col].apply(list).to_dict()

    import random
    random.seed(42)

    # Generate pairs (Anchor, Positive)
    # We want to maximize the number of pairs to give the model rich signal
    for intent, texts in intent_groups.items():
        if len(texts) < 2: continue

        # Create pairs: (Text A, Text B) where both have same intent
        # We can cycle through to create robust pairs
        for i in range(len(texts)):
            anchor = texts[i]
            # Pick a random positive that isn't the anchor
            # (Or pick multiple positives per anchor)
            positive_idx = (i + 1) % len(texts)
            positive = texts[positive_idx]

            train_examples.append(InputExample(texts=[anchor, positive], label=1))

            # Augmentation: Add another pair with a different positive to robustify
            if len(texts) > 2:
                pos_idx_2 = (i + 2) % len(texts)
                train_examples.append(InputExample(texts=[anchor, texts[pos_idx_2]], label=1))

    logging.info(f"Generated {len(train_examples)} contrastive pairs.")

    # 2. Define Model Architecture for Sentence Transformers
    # We wrap your specific Italian BERT model
    word_embedding_model = models.Transformer(config.MODEL_NAME, max_seq_length=config.MAX_LEN)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # 3. Define DataLoader and Loss
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=config.BATCH_SIZE)

    # MultipleNegativesRankingLoss is incredible for this.
    # It calculates loss: -log(exp(sim(a,p)) / sum(exp(sim(a, n))))
    # Effectively maximizing sim(a,p) while minimizing sim(a, everything_else_in_batch)
    train_loss = losses.MultipleNegativesRankingLoss(model)

    # 4. Train
    # Usually 1-3 epochs is enough for contrastive fine-tuning
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=3, # Keep it short to avoid overfitting on the specifics
        warmup_steps=int(len(train_dataloader) * 0.1),
        show_progress_bar=True,
        output_path=output_path
    )

    logging.info(f"Contrastive Model saved to {output_path}")
    return output_path

In [15]:
from sentence_transformers import SentenceTransformer, InputExample, losses, models, util
from torch.utils.data import DataLoader
import torch
import random
from tqdm import tqdm

def train_with_hard_negatives(
    config: Config,
    df_train: pd.DataFrame,
    text_col: str,
    label_col: str,
    output_path: str = "contrastive_bert_hard_negatives"
) -> str:
    """
    Advanced Embedding Training:
    1. Trains a base model using MultipleNegativesRankingLoss (MNRL).
    2. Uses that base model to MINE Hard Negatives (wrong intents that look similar).
    3. Fine-tunes using TripletLoss to force those specific errors apart.
    """
    logging.info("="*70)
    logging.info("STARTING 2-STAGE CONTRASTIVE TRAINING WITH HARD NEGATIVES")
    logging.info("="*70)

    # ==========================================
    # STAGE 1: Warmup with MNRL (The Standard Approach)
    # ==========================================
    logging.info("--- Stage 1: Warmup with MultipleNegativesRankingLoss ---")

    # Define the Base Model
    word_embedding_model = models.Transformer(config.MODEL_NAME, max_seq_length=config.MAX_LEN)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Create simple pairs (Anchor, Positive)
    train_examples_mnrl = []
    intent_groups = df_train.groupby(label_col)[text_col].apply(list).to_dict()

    for intent, texts in intent_groups.items():
        if len(texts) < 2: continue
        for i in range(len(texts)):
            anchor = texts[i]
            positive = texts[(i + 1) % len(texts)] # Simple cycle
            train_examples_mnrl.append(InputExample(texts=[anchor, positive], label=1))

    # Train Stage 1
    train_dataloader_mnrl = DataLoader(train_examples_mnrl, shuffle=True, batch_size=config.BATCH_SIZE)
    train_loss_mnrl = losses.MultipleNegativesRankingLoss(model)

    model.fit(
        train_objectives=[(train_dataloader_mnrl, train_loss_mnrl)],
        epochs=1, # Just 1 epoch to get embeddings roughly aligned
        warmup_steps=int(len(train_dataloader_mnrl) * 0.1),
        show_progress_bar=True
    )

    # ==========================================
    # STAGE 2: Mining Hard Negatives
    # ==========================================
    logging.info("--- Stage 2: Mining Hard Negatives & Triplet Training ---")
    logging.info("Encoding all training data to find confusion points...")

    # 1. Encode all training texts to find similarities
    # We need a list of all texts and their corresponding labels
    all_texts = df_train[text_col].tolist()
    all_labels = df_train[label_col].tolist()

    # Encode everything (on GPU)
    embeddings = model.encode(all_texts, convert_to_tensor=True, show_progress_bar=True)

    # 2. Mine Triplets (Anchor, Positive, HARD Negative)
    triplets = []

    # Group indices by label for fast positive lookup
    label_to_indices = df_train.groupby(label_col).indices

    # Calculate Cosine Similarity Matrix for the whole dataset
    # This might be heavy for >50k rows, but fine for <10k.
    # If OOM, do this in batches.
    cos_scores = util.cos_sim(embeddings, embeddings)

    logging.info("Generating Triplets...")

    # For every sample in the dataset...
    for idx, anchor_text in enumerate(tqdm(all_texts, desc="Mining")):
        anchor_label = all_labels[idx]

        # A. Find a Positive (Same Label)
        possible_positives = label_to_indices[anchor_label]
        if len(possible_positives) < 2: continue

        # Pick a random positive that isn't the anchor itself
        pos_idx = random.choice(possible_positives)
        while pos_idx == idx:
            pos_idx = random.choice(possible_positives)
        positive_text = all_texts[pos_idx]

        # B. Find a HARD Negative (Different Label, but High Similarity)
        # Get semantic scores for this anchor against EVERYONE else
        scores = cos_scores[idx]

        # Sort scores descending (Highest similarity first)
        top_results = torch.topk(scores, k=20) # Look at top 20 closest sentences

        hard_negative_text = None

        for score, match_idx in zip(top_results.values, top_results.indices):
            match_idx = match_idx.item()
            # If the high-similarity match has a DIFFERENT label, it's a Hard Negative!
            if all_labels[match_idx] != anchor_label:
                hard_negative_text = all_texts[match_idx]
                break # Found the hardest one, stop looking

        # If we didn't find a hard negative in top 20, pick a semi-hard one random
        if hard_negative_text is None:
            # Fallback: Pick a random index from a different label
            neg_idx = random.randint(0, len(all_texts)-1)
            while all_labels[neg_idx] == anchor_label:
                neg_idx = random.randint(0, len(all_texts)-1)
            hard_negative_text = all_texts[neg_idx]

        # Create Triplet
        triplets.append(InputExample(texts=[anchor_text, positive_text, hard_negative_text]))

    logging.info(f"Mined {len(triplets)} triplets.")

    # ==========================================
    # STAGE 3: Train with Triplet Loss
    # ==========================================
    train_dataloader_triplet = DataLoader(triplets, shuffle=True, batch_size=config.BATCH_SIZE)

    # TripletLoss minimizes: dist(anchor, positive) - dist(anchor, negative) + margin
    # FIX: The parameter is named 'triplet_margin'
    train_loss_triplet = losses.TripletLoss(
        model=model,
        distance_metric=losses.TripletDistanceMetric.COSINE,
        triplet_margin=0.5
    )

    model.fit(
        train_objectives=[(train_dataloader_triplet, train_loss_triplet)],
        epochs=2, # Fine-tune specifically on the hard cases
        warmup_steps=int(len(train_dataloader_triplet) * 0.1),
        show_progress_bar=True,
        output_path=output_path
    )

    logging.info(f"Hard Negative Model saved to {output_path}")
    return output_path

# Main executions

In [61]:
# 1. Re-initialize Config to ensure defaults
os.environ["WANDB_DISABLED"] = "true"

config = Config()
# Apply your optimized params (from your V2 code)
config.LEARNING_RATE = 4.755892715970439e-05
config.BATCH_SIZE = 24
config.WEIGHT_DECAY = 0.000175
config.WARMUP_RATIO = 0.000175
config.ALPHA = 0.776
config.FOCAL_GAMMA = 1.38
config.LABEL_SMOOTHING = 0.092

# 2. Reload Data
train_df, test_df, l2id, id2l, n_labels = load_and_prep_data(config)

contrastive_model_path = "contrastive_bert"

if not os.path.exists(contrastive_model_path):
    train_contrastive_embeddings(
        config,
        train_df,
        config.RAW_TEXT_COLUMN,
        config.LABEL_COLUMN,
        output_path=contrastive_model_path
    )

# Point the Config to use the NEW contrastive model instead of the base 'dbmdz/bert...'
logging.info(f"Switching backbone to Contrastive Tuned Model: {contrastive_model_path}")
config.MODEL_NAME = contrastive_model_path  # <--- CRITICAL CHANGE

# 3. Train
# Ensure the model_save_suffix is unique to avoid overwriting issues
model_path = train_final_model(
    config,
    train_df,
    l2id,
    id2l,
    n_labels,
    config.RAW_TEXT_COLUMN,
    #model_save_suffix="_final_v3"  # NEW NAME
)

# 4. Evaluate
# This should now work without any "token_type_id" errors
evaluate_on_test_set(
    config,
    test_df,
    model_path,
    l2id,
    id2l,
    n_labels,
    config.RAW_TEXT_COLUMN,
    report_suffix="_final_v3"
)

LOADING DATASET
Loaded 3990 samples
Model will train on 11 valid intents.

11 intents found:
  Ordini.Richiesta_Stato_Consegna                    :  759 (19.02%)
  ComingSoon.Contabilità.Informazioni                :  619 (15.51%)
  ComingSoon.Procedure.Ordini_Informazioni           :  494 (12.38%)
  Generico.Richiesta_Generica                        :  420 (10.53%)
  Problemi.Lettura_Device                            :  380 ( 9.52%)
  Problemi.Alimentazione_Device                      :  361 ( 9.05%)
  ComingSoon.Problemi.Autenticazione_Smarrimento_Blo :  279 ( 6.99%)
  Problemi.Software_Device                           :  225 ( 5.64%)
  Procedure.Accettazione_Materiale_Problemi          :  224 ( 5.61%)
  NOMATCH                                            :  137 ( 3.43%)

Train/Val: 3391 | Test: 599
Switching backbone to Contrastive Tuned Model: contrastive_bert
TRAINING FINAL MODEL
Using optimized hyperparameters:
  LR=4.76e-05, BS=24, WD=0.000
Training Fold FINAL...
Class balance we

Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.304,0.23938,0.894118,0.895343,0.885977
2,0.1746,0.218332,0.879412,0.880932,0.861607
3,0.0963,0.272664,0.902941,0.901128,0.879421
4,0.0783,0.326748,0.879412,0.876826,0.858737
5,0.0404,0.413297,0.870588,0.870738,0.842096
6,0.0309,0.373174,0.894118,0.892581,0.863233
7,0.0124,0.442378,0.888235,0.88706,0.862104
8,0.0053,0.421826,0.882353,0.883013,0.86216
9,0.0043,0.440482,0.888235,0.887155,0.864609
10,0.0017,0.431495,0.882353,0.880895,0.85324


Fold FINAL: Best F1 = 0.9011
Model saved to italian_intent_model
Final model training complete. Val F1: 0.9011
Model saved to: italian_intent_model

FINAL EVALUATION ON TEST SET
Loading inference model from italian_intent_model to cuda...
Model loaded successfully.
Running inference on 599 samples...


Inference Batch: 100%|██████████| 25/25 [00:00<00:00, 41.49it/s]


Test Accuracy: 0.8898
Test F1 Score: 0.8890

DETAILED CLASSIFICATION REPORT
                                                             precision    recall  f1-score   support

                        ComingSoon.Contabilità.Informazioni       0.96      0.98      0.97        93
ComingSoon.Problemi.Autenticazione_Smarrimento_Blocco_Carta       0.95      0.86      0.90        42
                   ComingSoon.Procedure.Ordini_Informazioni       0.80      0.80      0.80        74
                                Generico.Richiesta_Generica       0.83      0.78      0.80        63
                                                    NOMATCH       0.94      0.75      0.83        20
                            Ordini.Richiesta_Stato_Consegna       0.87      0.85      0.86       114
                              Problemi.Alimentazione_Device       1.00      0.98      0.99        54
                                    Problemi.Lettura_Device       0.92      0.98      0.95        57
             





ERROR ANALYSIS
Total Errors: 66 (11.02%)

Top 10 Error Patterns:
----------------------------------------------------------------------
  Ordini.Richiesta_Stato_Consegna          → ComingSoon.Procedure.Ordini_Informazioni:  13 (19.7%)
  ComingSoon.Procedure.Ordini_Informazioni → Ordini.Richiesta_Stato_Consegna         :   9 (13.6%)
  ComingSoon.Procedure.Ordini_Informazioni → Procedure.Accettazione_Materiale_Problemi:   4 ( 6.1%)
  ComingSoon.Problemi.Autenticazione_Smarrimento_Blocco_Carta → Problemi.Software_Device                :   4 ( 6.1%)
  NOMATCH                                  → Generico.Richiesta_Generica             :   4 ( 6.1%)
  Ordini.Richiesta_Stato_Consegna          → Generico.Richiesta_Generica             :   4 ( 6.1%)
  Generico.Richiesta_Generica              → Ordini.Richiesta_Stato_Consegna         :   4 ( 6.1%)
  Generico.Richiesta_Generica              → Problemi.Lettura_Device                 :   3 ( 4.5%)
  Generico.Richiesta_Generica              → Proced

0.889042079540029

In [62]:
config.TEMPERATURE = 1
config.CONFIDENCE_THRESHOLD = 0
evaluate_on_test_set(
    config,
    test_df,
    model_path,
    l2id,
    id2l,
    n_labels,
    config.RAW_TEXT_COLUMN,
    report_suffix="_final_v3"
)


FINAL EVALUATION ON TEST SET
Loading inference model from italian_intent_model to cuda...
Model loaded successfully.
Running inference on 599 samples...


Inference Batch: 100%|██████████| 25/25 [00:00<00:00, 41.43it/s]


Test Accuracy: 0.8865
Test F1 Score: 0.8852

DETAILED CLASSIFICATION REPORT
                                                             precision    recall  f1-score   support

                        ComingSoon.Contabilità.Informazioni       0.96      0.98      0.97        93
ComingSoon.Problemi.Autenticazione_Smarrimento_Blocco_Carta       0.92      0.86      0.89        42
                   ComingSoon.Procedure.Ordini_Informazioni       0.80      0.80      0.80        74
                                Generico.Richiesta_Generica       0.83      0.78      0.80        63
                                                    NOMATCH       0.93      0.65      0.76        20
                            Ordini.Richiesta_Stato_Consegna       0.87      0.85      0.86       114
                              Problemi.Alimentazione_Device       1.00      0.98      0.99        54
                                    Problemi.Lettura_Device       0.92      0.98      0.95        57
             





ERROR ANALYSIS
Total Errors: 68 (11.35%)

Top 10 Error Patterns:
----------------------------------------------------------------------
  Ordini.Richiesta_Stato_Consegna          → ComingSoon.Procedure.Ordini_Informazioni:  13 (19.1%)
  ComingSoon.Procedure.Ordini_Informazioni → Ordini.Richiesta_Stato_Consegna         :   9 (13.2%)
  ComingSoon.Procedure.Ordini_Informazioni → Procedure.Accettazione_Materiale_Problemi:   4 ( 5.9%)
  Ordini.Richiesta_Stato_Consegna          → Generico.Richiesta_Generica             :   4 ( 5.9%)
  Generico.Richiesta_Generica              → Ordini.Richiesta_Stato_Consegna         :   4 ( 5.9%)
  ComingSoon.Problemi.Autenticazione_Smarrimento_Blocco_Carta → Problemi.Software_Device                :   4 ( 5.9%)
  NOMATCH                                  → Generico.Richiesta_Generica             :   4 ( 5.9%)
  Generico.Richiesta_Generica              → Procedure.Accettazione_Materiale_Problemi:   3 ( 4.4%)
  Generico.Richiesta_Generica              → Probl

0.8852232288750148

In [54]:
# =====================================================================
# POST-TRAINING CONFIDENCE CALIBRATION
# =====================================================================
# This is a SIMPLE approach that you can apply to your EXISTING model
# without retraining from scratch.
# =====================================================================

import torch
import torch.nn as nn
import numpy as np
from sklearn.metrics import log_loss
from scipy.optimize import minimize

class TemperatureScaling:
    """
    Simple temperature scaling for calibration.
    Based on "On Calibration of Modern Neural Networks" (Guo et al., 2017)

    This is a single learned parameter that scales all logits.
    """

    def __init__(self):
        self.temperature = 1.0

    def fit(self, logits: np.ndarray, labels: np.ndarray):
        """
        Find optimal temperature on validation set

        Args:
            logits: Raw model outputs [N, num_classes]
            labels: True labels [N]
        """

        def objective(temp):
            """Minimize negative log likelihood"""
            scaled_logits = logits / temp[0]
            # Compute softmax
            exp_logits = np.exp(scaled_logits - np.max(scaled_logits, axis=1, keepdims=True))
            probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
            # Compute cross-entropy
            loss = log_loss(labels, probs)
            return loss

        # Optimize temperature
        result = minimize(
            objective,
            x0=[1.0],  # Start at 1.0 (no scaling)
            bounds=[(0.1, 10.0)],  # Temperature between 0.1 and 10
            method='L-BFGS-B'
        )

        self.temperature = result.x[0]
        print(f"Optimal temperature: {self.temperature:.3f}")

        # Higher temperature (>1) = less confident
        # Lower temperature (<1) = more confident
        if self.temperature > 1.5:
            print("⚠️  Model is OVERCONFIDENT - temperature scaling will help")
        elif self.temperature < 0.8:
            print("⚠️  Model is UNDERCONFIDENT - might be undertrained")
        else:
            print("✓ Model confidence is reasonable")

    def apply(self, logits: np.ndarray) -> np.ndarray:
        """Apply temperature scaling to logits"""
        scaled_logits = logits / self.temperature
        exp_logits = np.exp(scaled_logits - np.max(scaled_logits, axis=1, keepdims=True))
        probs = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)
        return probs


def calibrate_existing_model(
    model_path: str,
    val_texts: List[str],
    val_labels: List[int],
    config: Config
) -> TemperatureScaling:
    """
    Calibrate an existing trained model using validation set

    Returns:
        calibrator: TemperatureScaling object to use in inference
    """

    logging.info("=" * 70)
    logging.info("CALIBRATING MODEL CONFIDENCE")
    logging.info("=" * 70)

    # Load model
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    # Get logits on validation set
    all_logits = []
    all_labels = []

    batch_size = 32
    for i in tqdm(range(0, len(val_texts), batch_size), desc="Collecting logits"):
        batch_texts = val_texts[i:i + batch_size]
        batch_labels = val_labels[i:i + batch_size]

        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=128
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            all_logits.append(logits.cpu().numpy())
            all_labels.extend(batch_labels)

    all_logits = np.vstack(all_logits)
    all_labels = np.array(all_labels)

    # Fit temperature scaling
    calibrator = TemperatureScaling()
    calibrator.fit(all_logits, all_labels)

    # Compare before/after
    print("\n" + "=" * 70)
    print("CALIBRATION COMPARISON")
    print("=" * 70)

    # Before calibration
    before_probs = torch.nn.functional.softmax(torch.tensor(all_logits), dim=-1).numpy()
    before_conf = np.max(before_probs, axis=1)
    before_preds = np.argmax(before_probs, axis=1)
    before_acc = (before_preds == all_labels).mean()

    # After calibration
    after_probs = calibrator.apply(all_logits)
    after_conf = np.max(after_probs, axis=1)
    after_preds = np.argmax(after_probs, axis=1)
    after_acc = (after_preds == all_labels).mean()

    print(f"Before Calibration:")
    print(f"  Accuracy: {before_acc:.4f}")
    print(f"  Mean Confidence: {before_conf.mean():.4f}")
    print(f"  Median Confidence: {np.median(before_conf):.4f}")

    print(f"\nAfter Calibration:")
    print(f"  Accuracy: {after_acc:.4f}")
    print(f"  Mean Confidence: {after_conf.mean():.4f}")
    print(f"  Median Confidence: {np.median(after_conf):.4f}")

    # Save calibrator
    import pickle
    with open(f"{model_path}/temperature_calibrator.pkl", 'wb') as f:
        pickle.dump(calibrator, f)

    logging.info(f"✓ Calibrator saved to {model_path}/temperature_calibrator.pkl")

    return calibrator


# =====================================================================
# MODIFIED INFERENCE PIPELINE WITH CALIBRATION
# =====================================================================

class CalibratedInferencePipeline:
    """
    Enhanced inference with temperature scaling
    """

    def __init__(
        self,
        model_path: str,
        use_calibration: bool = True,
        device: str = None
    ):
        self.device = device if device else ("cuda" if torch.cuda.is_available() else "cpu")
        self.model_path = model_path

        # Load model
        from transformers import AutoTokenizer, AutoModelForSequenceClassification
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.model.to(self.device)
        self.model.eval()

        # Load calibrator if available
        self.calibrator = None
        if use_calibration:
            calibrator_path = f"{model_path}/temperature_calibrator.pkl"
            if os.path.exists(calibrator_path):
                import pickle
                with open(calibrator_path, 'rb') as f:
                    self.calibrator = pickle.load(f)
                logging.info(f"✓ Loaded calibrator (T={self.calibrator.temperature:.3f})")
            else:
                logging.warning(f"⚠️  No calibrator found at {calibrator_path}")

    def predict_batch(
        self,
        texts: List[str],
        batch_size: int = 32,
        threshold: float = 0.7,
        return_confidences: bool = False
    ):
        """Inference with optional calibration"""

        all_preds_ids = []
        all_confidences = []
        all_probs = []

        for i in tqdm(range(0, len(texts), batch_size), desc="Inference"):
            batch_texts = texts[i:i + batch_size]

            inputs = self.tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=128
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs)
                logits = outputs.logits

                # Apply calibration if available
                if self.calibrator is not None:
                    probs = self.calibrator.apply(logits.cpu().numpy())
                    probs = torch.tensor(probs)
                else:
                    probs = torch.nn.functional.softmax(logits, dim=-1)

                confidences, predictions = torch.max(probs, dim=-1)

                all_preds_ids.extend(predictions.cpu().numpy())
                all_confidences.extend(confidences.cpu().numpy())
                all_probs.extend(probs.cpu().numpy())

        # Apply threshold
        predicted_labels = []
        for idx, conf in zip(all_preds_ids, all_confidences):
            if conf < threshold:
                predicted_labels.append("NOMATCH")
            else:
                predicted_labels.append(self.model.config.id2label[idx])

        if return_confidences:
            return predicted_labels, all_preds_ids, np.array(all_confidences), np.array(all_probs)
        else:
            return predicted_labels, all_preds_ids


# =====================================================================
# INTEGRATION: HOW TO USE
# =====================================================================

def calibrate_and_evaluate(config: Config, model_path: str, train_df, test_df, l2id, id2l, n_labels):
    """
    Complete workflow: Calibrate → Evaluate
    """

    # Step 1: Create calibration set (use 20% of training data)
    from sklearn.model_selection import train_test_split

    _, cal_df = train_test_split(
        train_df,
        test_size=0.2,
        random_state=42,
        stratify=train_df['labels']
    )

    cal_texts = cal_df[config.RAW_TEXT_COLUMN].tolist()
    cal_labels = cal_df['labels'].tolist()

    # Step 2: Calibrate model
    calibrator = calibrate_existing_model(
        model_path=model_path,
        val_texts=cal_texts,
        val_labels=cal_labels,
        config=config
    )

    # Step 3: Evaluate with calibration
    pipeline = CalibratedInferencePipeline(
        model_path=model_path,
        use_calibration=True
    )

    test_texts = test_df[config.RAW_TEXT_COLUMN].tolist()
    true_labels = test_df[config.LABEL_COLUMN].tolist()

    pred_labels, pred_ids, confidences, all_probs = pipeline.predict_batch(
        test_texts,
        batch_size=config.BATCH_SIZE,
        threshold=config.CONFIDENCE_THRESHOLD,
        return_confidences=True
    )

    # Calculate metrics
    from sklearn.metrics import accuracy_score, f1_score
    test_acc = accuracy_score(true_labels, pred_labels)
    test_f1 = f1_score(true_labels, pred_labels, average="weighted", zero_division=0)

    print("\n" + "=" * 70)
    print("CALIBRATED MODEL RESULTS")
    print("=" * 70)
    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")
    print(f"Mean Confidence: {np.mean(confidences):.4f}")
    print(f"Max Confidence: {np.max(confidences):.4f}")
    print(f"# of 1.000 confidence predictions: {np.sum(confidences >= 0.9999)}")

    return test_f1


# =====================================================================
# USAGE IN YOUR MAIN CODE
# =====================================================================

"""
# After training your model:
model_path = train_final_model(...)

# Calibrate and evaluate:
final_f1 = calibrate_and_evaluate(
    config=config,
    model_path=model_path,
    train_df=train_df,
    test_df=test_df,
    l2id=l2id,
    id2l=id2l,
    n_labels=n_labels
)
"""

'\n# After training your model:\nmodel_path = train_final_model(...)\n\n# Calibrate and evaluate:\nfinal_f1 = calibrate_and_evaluate(\n    config=config,\n    model_path=model_path,\n    train_df=train_df,\n    test_df=test_df,\n    l2id=l2id,\n    id2l=id2l,\n    n_labels=n_labels\n)\n'

In [55]:
final_f1 = calibrate_and_evaluate(
    config=config,
    model_path=model_path,
    train_df=train_df,
    test_df=test_df,
    l2id=l2id,
    id2l=id2l,
    n_labels=n_labels
)

CALIBRATING MODEL CONFIDENCE


Collecting logits: 100%|██████████| 22/22 [00:00<00:00, 26.45it/s]

Optimal temperature: 1.394
✓ Model confidence is reasonable

CALIBRATION COMPARISON
Before Calibration:
  Accuracy: 0.9249
  Mean Confidence: 0.9768
  Median Confidence: 0.9962

After Calibration:
  Accuracy: 0.9249
  Mean Confidence: 0.9397
  Median Confidence: 0.9662
✓ Calibrator saved to italian_intent_model/temperature_calibrator.pkl





✓ Loaded calibrator (T=1.394)


Inference: 100%|██████████| 25/25 [00:00<00:00, 40.67it/s]


CALIBRATED MODEL RESULTS
Test Accuracy: 0.8815
Test F1 Score: 0.8806
Mean Confidence: 0.9326
Max Confidence: 0.9748
# of 1.000 confidence predictions: 0





In [None]:
from IPython.core.display import deepcopy
# ===================== MAIN EXECUTION =====================
def main():
    """Main training pipeline"""

    # Initialize configuration
    config = Config()

    logging.info("=" * 70)
    logging.info("BERT INTENT CLASSIFICATION PIPELINE")
    logging.info("=" * 70)
    logging.info(f"Model: {config.MODEL_NAME}")
    logging.info(f"Features enabled:")
    logging.info(f"  - Focal Loss: {config.USE_FOCAL_LOSS}")
    logging.info(f"  - Class Weights: {config.USE_CLASS_WEIGHTS}")
    logging.info(f"  - Priority Scores: {config.USE_PRIORITY_SCORES}")
    logging.info(f"  - Label Smoothing: {config.USE_LABEL_SMOOTHING}")
    logging.info(f"  - Mixed Precision: {config.USE_MIXED_PRECISION}")
    logging.info("=" * 70 + "\n")

    # Load data
    train_df, test_df, l2id, id2l, n_labels = load_and_prep_data(config)
    text_column = config.RAW_TEXT_COLUMN

    # Hyperparameter optimization
    logging.info("Starting hyperparameter optimization...")
    # best_config, trials_df = run_hp_search_optuna(
    #     config, train_df, l2id, id2l, n_labels, text_column
    # )

    best_config = copy.deepcopy(config)
    best_config.LEARNING_RATE = 4.755892715970439e-05
    best_config.BATCH_SIZE = 24
    best_config.WEIGHT_DECAY = 0.0001751018975372498
    best_config.WARMUP_RATIO = 0.0001751018975372498
    best_config.ALPHA = 0.7768600075165988
    best_config.FOCAL_GAMMA = 1.3798296530795757
    best_config.LABEL_SMOOTHING = 0.09242568547841506

    # Train final model
    logging.info("Training final model with best hyperparameters...")
    model_path = train_final_model(
        best_config, train_df, l2id, id2l, n_labels, text_column, "_optimized2"
    )

    # Evaluate
    logging.info("Evaluating on test set...")
    final_f1 = evaluate_on_test_set(
        best_config, test_df, model_path, l2id, id2l, n_labels, text_column, "_optimized2"
    )

    # Final summary
    logging.info("\n" + "=" * 70)
    logging.info("TRAINING COMPLETE")
    logging.info("=" * 70)
    logging.info(f"Final Test F1: {final_f1:.4f}")
    logging.info(f"Model saved to: {model_path}")
    logging.info(f"Best hyperparameters: {best_config.BEST_PARAMS_PATH}")
    logging.info("=" * 70)


if __name__ == "__main__":
    main()

2025-11-24 16:57:45,529 - INFO - BERT INTENT CLASSIFICATION PIPELINE
2025-11-24 16:57:45,530 - INFO - Model: dbmdz/bert-base-italian-xxl-cased
2025-11-24 16:57:45,531 - INFO - Features enabled:
2025-11-24 16:57:45,531 - INFO -   - Focal Loss: True
2025-11-24 16:57:45,532 - INFO -   - Class Weights: True
2025-11-24 16:57:45,532 - INFO -   - Priority Scores: True
2025-11-24 16:57:45,533 - INFO -   - Label Smoothing: True
2025-11-24 16:57:45,533 - INFO -   - Mixed Precision: True

2025-11-24 16:57:45,535 - INFO - LOADING DATASET
2025-11-24 16:57:45,550 - INFO - ✓ Loaded 4021 samples
2025-11-24 16:57:45,553 - INFO - 
11 intents found:
2025-11-24 16:57:45,554 - INFO -   Ordini.Richiesta_Stato_Consegna                    :  763 (18.98%)
2025-11-24 16:57:45,554 - INFO -   ComingSoon.Contabilità.Informazioni                :  620 (15.42%)
2025-11-24 16:57:45,555 - INFO -   ComingSoon.Procedure.Ordini_Informazioni           :  496 (12.34%)
2025-11-24 16:57:45,555 - INFO -   Generico.Richiesta_G

tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

2025-11-24 16:57:48,293 - INFO - Training Fold FINAL...


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

2025-11-24 16:57:51,759 - INFO - ✓ Class balance weights: 0.500 - 2.000
2025-11-24 16:57:53,885 - INFO - ✓ Priority weights: 1.100 - 1.925
2025-11-24 16:57:53,885 - INFO - ✓ Combined weights (α=0.78)
2025-11-24 16:57:53,886 - INFO - ✓ Final weights: 0.677 - 1.752
2025-11-24 16:57:54,059 - INFO - Using Focal Loss (γ=1.3798296530795757, smoothing=0.09242568547841506)


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.6727,0.398074,0.807018,0.806496,0.783163
2,0.2956,0.350473,0.836257,0.83004,0.830532
3,0.1975,0.377643,0.818713,0.813934,0.803499
4,0.1046,0.40962,0.850877,0.851962,0.843673
5,0.0624,0.505578,0.818713,0.817687,0.81271
6,0.0555,0.461739,0.845029,0.84305,0.843845
7,0.0329,0.532822,0.839181,0.836462,0.829863
8,0.0236,0.552957,0.842105,0.839728,0.832825
9,0.0192,0.538714,0.856725,0.856416,0.844802
10,0.0054,0.583954,0.850877,0.85019,0.83779


2025-11-24 17:00:31,634 - INFO - Fold FINAL: Best F1 = 0.8564
2025-11-24 17:00:32,708 - INFO - ✓ Model saved to italian_intent_model_optimized2
2025-11-24 17:00:33,109 - INFO - Final model training complete. Val F1: 0.8564
2025-11-24 17:00:33,110 - INFO - Model saved to: italian_intent_model_optimized2
2025-11-24 17:00:33,124 - INFO - \Evaluating on test set...
2025-11-24 17:00:33,126 - INFO - 
2025-11-24 17:00:33,126 - INFO - FINAL EVALUATION ON TEST SET
2025-11-24 17:00:33,127 - INFO - Loading inference model from italian_intent_model_optimized2 to cuda...
2025-11-24 17:00:34,199 - INFO - ✓ Model loaded successfully.
2025-11-24 17:00:34,200 - INFO - Running inference on 604 samples...


Inference Batch:   0%|          | 0/26 [00:00<?, ?it/s]


TypeError: OPTForSequenceClassification.forward() got an unexpected keyword argument 'token_type_ids'

In [None]:
from google.colab import runtime

runtime.unassign()

# Stakeholder utternaces report

In [39]:
import pandas as pd

def generate_stakeholder_report(config, model_path, input_csv_path, output_csv_path):
    df_new = pd.read_csv(input_csv_path)

    if 'UserUtterance' in df_new.columns:
        texts = df_new['UserUtterance'].astype(str).tolist()
    else:
        texts = df_new.iloc[:, 0].astype(str).tolist()

    pipeline = InferencePipeline(model_path=model_path)

    print(f"Running inference on {len(texts)} new sentences...")

    pred_labels, pred_ids, confidences, all_probs = pipeline.predict_batch(
        texts,
        batch_size=32,
        return_confidences=True,
        temperature=1.5,
        threshold=0.65
    )

    report_df = pd.DataFrame({
        'User_Sentence': texts,
        'Predicted_Intent': pred_labels,
        'Confidence_Score': confidences
    })

    report_df['Model_Certainty'] = report_df['Confidence_Score'].apply(
        lambda x: 'High' if x > 0.8 else ('Medium' if x > 0.6 else 'Low (Check Manually)')
    )

    report_df = report_df[[
        'User_Sentence',
        'Predicted_Intent',
        'Model_Certainty',
        'Confidence_Score'
    ]]

    report_df.to_csv(output_csv_path, index=False)
    print(f"Report generated: {output_csv_path}")
    return report_df

# --- EXECUTION ---
# Assumes you have your config and trained model path ready
# model_path = "models/italian_intent_model_final_v3" # Update this to your actual path
generate_stakeholder_report(config, model_path, "user_utterance_test.csv", "stakeholder_review_v1.csv")

Loading inference model from italian_intent_model to cuda...
Model loaded successfully.
Running inference on 294 new sentences...


Inference Batch: 100%|██████████| 10/10 [00:00<00:00, 28.49it/s]


Report generated: stakeholder_review_v1.csv


Unnamed: 0,User_Sentence,Predicted_Intent,Model_Certainty,Confidence_Score
0,stop quelle però mi servono le ho scaricate,Ordini.Richiesta_Stato_Consegna,High,0.873126
1,non più manco a chiamare così il numero,NOMATCH,Low (Check Manually),0.519890
2,devo sollecitare una riparazione,Generico.Richiesta_Generica,High,0.949710
3,quello che quello che mostra i numeri durante ...,Generico.Richiesta_Generica,Medium,0.668279
4,Non mi funziona la convalida delle vincite del...,Problemi.Lettura_Device,High,0.906829
...,...,...,...,...
289,guardi Ho un problema con la stampante,Generico.Richiesta_Generica,High,0.954317
290,il terminale del Lotto quando inserisco la sch...,ComingSoon.Problemi.Autenticazione_Smarrimento...,High,0.804300
291,il terminale del Lotto fischia,Problemi.Lettura_Device,High,0.856151
292,"cosa €3,50 ti devo dare e sono Eccoli qua Gius...",NOMATCH,Medium,0.697643
