## DeBERTa v3 base

In [None]:
# Keep lightweight imports
import pandas as pd
import numpy as np
import torch
import torch.nn as nn # Needed for custom loss
import os
import shutil
import sys # For exit on critical errors
from sklearn.model_selection import StratifiedKFold
from rich.console import Console
from rich.table import Table
import gc
# Defer heavy imports
# from transformers import ...
# from sklearn.metrics import ...

# Shared rich console used for all status output in this script.
console = Console()

# -------------------------------
# Configuration
# -------------------------------
# Seed applied to NumPy and torch below.
SEED = 42
# Hugging Face model identifier (also used to derive RUN_NAME).
MODEL_NAME = "microsoft/deberta-v3-base"
# Number of cross-validation folds.
N_SPLITS = 5
# Maximum sequence length, in tokens.
MAX_LENGTH = 512
# Target batch size per optimizer step; feeds the gradient-accumulation
# calculation below.
EFFECTIVE_BATCH_SIZE = 8
# Batch size per device per forward pass.
PER_DEVICE_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
EPOCHS = 6
EARLY_STOPPING_PATIENCE = 3
WEIGHT_DECAY = 0.01
# Name of the per-run subfolder created under BASE_OUTPUT_DIR; encodes the
# model short name, fold count, epoch count and effective batch size.
RUN_NAME = f"{MODEL_NAME.split('/')[-1]}_cv_{N_SPLITS}folds_ep{EPOCHS}_bs{EFFECTIVE_BATCH_SIZE}"

# Output directory for CV models and results
BASE_OUTPUT_DIR = "./models" # Base directory where run folders will be created
DATA_DIR = "./data" # Directory containing train.csv and valid.csv

# For reproducibility: seed NumPy and torch (and every CUDA device when
# available). NOTE(review): Python's built-in `random` module is not seeded
# here.
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    NUM_GPUS = torch.cuda.device_count()
    console.print(f"[cyan]CUDA available. Using {NUM_GPUS} GPU(s). Device: {torch.cuda.get_device_name(0)}[/]")
else:
    NUM_GPUS = 1 # Assume 1 for calculation if CPU
    console.print("[yellow]⚠️ CUDA not available. Training on CPU (will be very slow).[/]")

# Calculate Gradient Accumulation Steps.
# Integer division, clamped to at least 1. When
# PER_DEVICE_BATCH_SIZE * NUM_GPUS exceeds EFFECTIVE_BATCH_SIZE the result is
# clamped to 1, so the real per-step batch is then larger than
# EFFECTIVE_BATCH_SIZE.
GRADIENT_ACCUMULATION_STEPS = max(1, EFFECTIVE_BATCH_SIZE // (PER_DEVICE_BATCH_SIZE * NUM_GPUS))
console.print(f"[cyan]Effective Batch Size: {EFFECTIVE_BATCH_SIZE}, Per-Device Batch Size: {PER_DEVICE_BATCH_SIZE}, Num GPUs: {NUM_GPUS} => Gradient Accumulation Steps: {GRADIENT_ACCUMULATION_STEPS}[/]")


# --- Defer heavy library imports ---
console.print("[dim]Importing libraries...[/]")
try:
    from transformers import (
        AutoTokenizer,
        AutoModelForSequenceClassification,
        Trainer,
        TrainingArguments,
        EarlyStoppingCallback
    )
    from sklearn.metrics import f1_score, precision_recall_curve, classification_report, confusion_matrix, precision_recall_fscore_support
    from rich.panel import Panel
    from rich import box
    from rich.progress import track
    from torch.utils.data import Dataset, DataLoader
    from transformers import TrainerCallback # Needed for custom trainer loss
except ImportError as e:
    console.print(f"[bold red]Error: Missing required library -> {e}[/]")
    console.print("[yellow]Please install all necessary libraries (pandas, torch, transformers[accelerate], scikit-learn, rich, tqdm).[/]")
    sys.exit(1)
console.print("[green]✓ Libraries imported.[/]")


# -------------------------------
# 1. Load Data (Separately!)
# -------------------------------
train_csv_path = os.path.join(DATA_DIR, "train.csv")
valid_csv_path = os.path.join(DATA_DIR, "valid.csv")

try:
    train_df = pd.read_csv(train_csv_path)
    # valid.csv is loaded on its own and kept as a holdout set.
    holdout_valid_df = pd.read_csv(valid_csv_path)

    def clean_dataframe(df, name):
        """Normalise and validate a raw text/label dataframe.

        Steps, in order: lower-case and strip column names, accept 'labels'
        as an alias for 'label', require 'text' and 'label', add an
        index-based 'id' if absent, drop rows with missing text/label, coerce
        labels to integers (dropping non-numeric ones), cast text to str,
        and keep only labels 0 and 1.

        Args:
            df: Dataframe as read from CSV. Its column names are rewritten
                in place.
            name: Human-readable dataset name used in log messages. The value
                "Training" additionally enables the two-class check.

        Returns:
            The cleaned dataframe with a fresh 0..n-1 index.

        Raises:
            KeyError: If 'text' or 'label' is missing after normalisation.
            SystemExit: If name is "Training" and fewer than two distinct
                labels remain.
        """
        console.print(f"Cleaning {name} Data...")
        rows_before = len(df)

        # Normalise headers so 'Text', ' label ' etc. are all recognised.
        df.columns = [header.lower().strip() for header in df.columns]
        if 'label' not in df.columns and 'labels' in df.columns:
            df = df.rename(columns={"labels": "label"})

        # Both columns are mandatory from here on.
        missing = [col for col in ['text', 'label'] if col not in df.columns]
        if missing:
            raise KeyError(f"Missing required columns in {name}: {missing}")

        # Fall back to the row index as an identifier.
        if 'id' not in df.columns:
            console.print(f"[dim]Adding 'id' column based on index to {name} data.[/]")
            df['id'] = df.index

        # Remove rows where either the text or the label is missing.
        incomplete = df['text'].isna() | df['label'].isna()
        if incomplete.any():
            console.print(f"[yellow]⚠️ NaNs found in {name} data. Dropping {incomplete.sum()} rows...[/]")
            df = df[~incomplete].copy()

        # Labels that cannot be parsed as numbers become NaN and are dropped.
        df['label'] = pd.to_numeric(df['label'], errors='coerce')
        unparsable = df['label'].isna()
        if unparsable.any():
            console.print(f"[yellow]⚠️ Non-numeric labels found after coercion in {name}. Dropping {unparsable.sum()} rows...[/]")
            df = df[~unparsable].copy()

        df['label'] = df['label'].astype(int)
        df['text'] = df['text'].astype(str)

        rows_after = len(df)
        if rows_after < rows_before:
            console.print(f"[dim]Dropped {rows_before - rows_after} rows from {name}.[/]")

        # Only the binary labels 0 and 1 are accepted.
        allowed = {0, 1}
        present = set(df['label'].unique())
        if not present.issubset(allowed):
            unexpected = present - allowed
            console.print(f"[yellow]⚠️ Invalid labels found in {name}: {unexpected}. Keeping only 0 and 1.[/]")
            df = df[df['label'].isin(allowed)].copy()
            if len(df) < rows_after:
                console.print(f"[dim]Dropped {rows_after - len(df)} rows with invalid labels.[/]")

        console.print(f"[green]✓ {name} data loaded and cleaned. Total: {len(df)} examples.[/]")

        # Report the class balance; the training set needs both classes.
        distribution = df['label'].value_counts()
        console.print(f"{name} data label distribution:\n{distribution}")
        if name == "Training" and len(distribution) < 2:
            console.print("[bold red]Error: Training data must contain both labels 0 and 1 for stratified splitting.[/]")
            sys.exit(1)

        return df.reset_index(drop=True)

    train_df = clean_dataframe(train_df, "Training")
    holdout_valid_df = clean_dataframe(holdout_valid_df, "Holdout Validation")

except FileNotFoundError as e:
    console.print(f"[bold red]Error: CSV file not found - {e}. Check paths '{train_csv_path}' and '{valid_csv_path}'.[/]")
    sys.exit(1)
except KeyError as e:
    console.print(f"[bold red]Error: Missing expected column in CSV - {e}. Ensure 'text' and 'label' (or 'labels') exist.[/]")
    sys.exit(1)
except Exception as e:
    console.print(f"[bold red]Unexpected error loading/cleaning data: {e}[/]")
    sys.exit(1)


# -------------------------------
# 2. Define Dataset Class (No changes needed here, uses MAX_LENGTH from config)
# -------------------------------
class VaccineDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Each item is a dict holding the tokenizer outputs (first dimension
    squeezed away), plus 'labels' (a long tensor) unless the dataset is in
    inference mode, plus 'id' when IDs were supplied.

    Args:
        texts: Indexable collection of texts.
        labels: Optional labels, same length as texts. None implies
            inference mode.
        ids: Optional identifiers, same length as texts.
        tokenizer: Callable invoked as
            tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt'). Required.
        max_length: Length every sequence is padded/truncated to.
        is_inference: When True, labels are never emitted even if provided.

    Raises:
        ValueError: If tokenizer is missing, or texts and labels/ids differ
            in length.
    """

    def __init__(self, texts, labels=None, ids=None, tokenizer=None, max_length=512, is_inference=False):
        if tokenizer is None:
            raise ValueError("Tokenizer must be provided.")
        self.texts = texts
        self.labels = labels
        self.ids = ids # Store IDs if provided
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Honour an explicit is_inference=True; a missing labels argument
        # always forces inference mode as well.
        self.is_inference = bool(is_inference) or labels is None

        if not self.is_inference and len(texts) != len(labels):
            raise ValueError("Texts and Labels must be provided and have the same length for training/evaluation.")
        if self.ids is not None and len(texts) != len(self.ids):
            raise ValueError("Texts and IDs must have the same length if IDs are provided.")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized item at idx.

        Raises:
            IndexError: If idx is past the end of the dataset. Without this,
                plain iteration over the dataset would never terminate.
        """
        if idx >= len(self.texts):
            raise IndexError(f"Index {idx} out of range for dataset of size {len(self.texts)}")

        raw_text = self.texts[idx]
        text = "" if raw_text is None else str(raw_text)
        # Fall back to the positional index when no IDs were supplied.
        item_id = self.ids[idx] if self.ids is not None else idx

        try:
            encoding = self.tokenizer(
                text,
                truncation=True,
                padding='max_length',
                max_length=self.max_length,
                return_tensors='pt'
            )
            # return_tensors='pt' adds a leading batch dimension of 1; drop it.
            item = {key: val.squeeze(0) for key, val in encoding.items()}

            if not self.is_inference:
                item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)

            if self.ids is not None:
                item['id'] = item_id # Keep ID as is (numeric or string)

            return item

        except Exception as e:
            # Best-effort fallback: report the failure and return a
            # zero-filled placeholder whose label is -1, so downstream code
            # can filter it out.
            # NOTE(review): the placeholder only carries input_ids and
            # attention_mask; a tokenizer that emits extra keys would produce
            # a mismatched batch — confirm.
            console.print(f"[bold red]Error in __getitem__ at index {idx} (ID: {item_id}): {e}[/]")
            dummy_item = {
                'input_ids': torch.zeros(self.max_length, dtype=torch.long),
                'attention_mask': torch.zeros(self.max_length, dtype=torch.long),
            }
            if not self.is_inference:
                dummy_item['labels'] = torch.tensor(-1, dtype=torch.long)
            if self.ids is not None:
                dummy_item['id'] = item_id # Return the original ID even for dummy item
            return dummy_item

# -------------------------------
# 3. Load Tokenizer (once)
# -------------------------------
# Load the tokenizer for MODEL_NAME once and bind it to the module-level name
# `tokenizer`; the fold datasets and the holdout dataset below all use it.
# Any failure is reported and aborts the script with exit status 1.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    console.print(f"[green]✓ Tokenizer loaded from '{MODEL_NAME}'.[/]")
except Exception as e:
    console.print(f"[bold red]Error loading tokenizer '{MODEL_NAME}': {e}[/]")
    sys.exit(1)


# -------------------------------
# 4. Metric Function (for Trainer, based on argmax)
# -------------------------------
def compute_metrics_for_trainer(eval_pred):
    """Return the argmax-based F1 of class 1 for checkpoint selection.

    Args:
        eval_pred: Pair (logits, labels) of arrays. Entries whose label is -1
            (the dataset's error placeholder) are ignored.

    Returns:
        A dict {"f1": value}; value is 0.0 when no valid label remains.
    """
    raw_logits, raw_labels = eval_pred

    # Discard placeholder rows produced by failed __getitem__ calls.
    keep = raw_labels != -1
    kept_labels = raw_labels[keep]
    kept_logits = raw_logits[keep]

    if len(kept_labels) == 0:
        return {"f1": 0.0}

    predicted = np.argmax(kept_logits, axis=-1)
    score = f1_score(kept_labels, predicted, average='binary', pos_label=1, zero_division=0)
    # The Trainer looks this metric up under the key "f1".
    return {"f1": score}


# -----------------------------------
# 5. Threshold Optimization Function (MODIFIED: Target Class 1 F1)
# -----------------------------------
def find_optimal_threshold(labels, probs, target_label=1):
    """Find the probability threshold that maximizes F1 for target_label.

    Args:
        labels: True labels (array-like). Entries equal to -1 are treated as
            invalid and ignored together with their probabilities.
        probs: Predicted probability of the positive class for each entry
            (array-like, same length as labels).
        target_label: Label regarded as the positive class.

    Returns:
        Tuple (best_threshold, best_f1). Falls back to (0.5, 0.0) when there
        is no valid data, only one class is present, or no threshold
        candidate exists.
    """
    # Accept plain lists as well as arrays: the boolean mask below needs
    # element-wise comparison and fancy indexing.
    labels = np.asarray(labels)
    probs = np.asarray(probs)

    valid_indices = labels != -1
    labels = labels[valid_indices]
    probs = probs[valid_indices]

    if len(labels) == 0 or len(np.unique(labels)) < 2:
        console.print("[yellow]⚠️ Not enough valid data or classes for threshold optimization. Returning default 0.5.[/]")
        return 0.5, 0.0

    # probs are assumed to already be P(class == target_label).
    precision, recall, thresholds = precision_recall_curve(labels, probs, pos_label=target_label)

    # The epsilon keeps the division defined when precision + recall == 0.
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-9)
    # precision/recall carry one more entry than thresholds; the final entry
    # has no associated threshold, so drop it to keep the arrays aligned.
    f1_scores = np.nan_to_num(f1_scores[:-1])
    thresholds = thresholds[:len(f1_scores)]

    if len(f1_scores) == 0:
        console.print("[yellow]⚠️ No valid F1 scores computed during threshold search. Returning default 0.5.[/]")
        return 0.5, 0.0

    best_f1_idx = np.argmax(f1_scores)
    # The optimized threshold is returned as-is; it is not forced to 0.5.
    return thresholds[best_f1_idx], f1_scores[best_f1_idx]


# -------------------------------
# 6. Custom Trainer for Weighted Loss (CORRECTED)
# -------------------------------
class WeightedLossTrainer(Trainer):
    """Trainer variant that applies per-class weights to the CE loss.

    Args:
        class_weights: Optional tensor of per-class weights. When omitted,
            the parent Trainer's loss computation is used unchanged.
        *args, **kwargs: Forwarded to Trainer.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Weights are kept on CPU and moved to the logits' device on use.
        if class_weights is None:
            self.class_weights_cpu = None
            console.print("[yellow]⚠️ Custom Trainer initialized WITHOUT class weights (will use standard CE loss).[/]")
        else:
            self.class_weights_cpu = class_weights.cpu()
            console.print(f"[cyan]Custom Trainer initialized with class weights (stored on CPU): {self.class_weights_cpu.numpy()}[/]")

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss, or defer to the parent.

        Extra keyword arguments are accepted so that additional arguments
        passed by the Trainer internals do not break the override; they are
        forwarded whenever the parent implementation is used.

        Returns:
            The loss, or (loss, outputs) when return_outputs is true.

        Raises:
            ValueError: If weights are configured but inputs has no 'labels'.
        """
        weights = self.class_weights_cpu
        if weights is None:
            # Nothing to weight: use the stock Hugging Face loss.
            return super().compute_loss(model, inputs, return_outputs=return_outputs, **kwargs)

        if "labels" not in inputs:
            raise ValueError("Inputs must contain 'labels' for custom loss calculation.")

        # Take the labels out so the model is called without them.
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        if logits is None:
            console.print("[yellow]⚠️ Model outputs did not contain 'logits'. Falling back to default loss calculation if possible.[/]")
            # Put the labels back before handing over to the parent.
            inputs["labels"] = targets
            return super().compute_loss(model, inputs, return_outputs=return_outputs, **kwargs)

        # Move the weights to the logits' device just in time.
        criterion = nn.CrossEntropyLoss(weight=weights.to(logits.device))
        flat_logits = logits.view(-1, self.model.config.num_labels)
        loss = criterion(flat_logits, targets.view(-1))

        if return_outputs:
            return (loss, outputs)
        return loss


# -------------------------------
# 7. Cross-Validation Loop (on train_df ONLY) (MODIFIED PATHS)
# -------------------------------
# Shuffled, stratified splitter seeded with SEED.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
# One dict of metrics per fold that was evaluated successfully.
fold_results = []
best_model_paths = [] # Store paths to best model dir (e.g., ./models/RUN_NAME/fold_X/best_model)

# --- Create the main output directory for this run ---
RUN_OUTPUT_DIR = os.path.join(BASE_OUTPUT_DIR, RUN_NAME)
os.makedirs(RUN_OUTPUT_DIR, exist_ok=True)
console.print(f"[INFO] CV Run output will be saved under: '{RUN_OUTPUT_DIR}'")
# --- Create a directory for temporary checkpoints ---
# Sibling of the run folder under BASE_OUTPUT_DIR; it is removed again after
# the cross-validation summary.
TEMP_CHECKPOINT_BASE_DIR = os.path.join(BASE_OUTPUT_DIR, f"_cv_temp_checkpoints_{RUN_NAME}")
os.makedirs(TEMP_CHECKPOINT_BASE_DIR, exist_ok=True)


console.rule("[bold yellow]Starting Cross-Validation on Training Data[/]")

# Use train_df for splitting
# One iteration per stratified fold of train_df: train a fresh model with
# class-weighted loss, pick the best checkpoint by F1, tune the decision
# threshold on the fold's validation split, and save model + threshold.
# A failure in any stage skips the fold instead of aborting the run.
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df['text'], train_df['label'])):
    current_fold = fold + 1
    console.rule(f"[bold blue]CV Fold {current_fold}/{N_SPLITS}[/]")

    # --- Get fold data from train_df ---
    train_fold_df = train_df.iloc[train_idx].copy().reset_index(drop=True)
    fold_valid_df = train_df.iloc[val_idx].copy().reset_index(drop=True) # Validation set FOR THIS FOLD

    console.print(f"Fold Train Size: {len(train_fold_df)}, Fold Validation Size: {len(fold_valid_df)}")
    train_fold_labels_dist = train_fold_df['label'].value_counts(dropna=False).sort_index()
    valid_fold_labels_dist = fold_valid_df['label'].value_counts(dropna=False).sort_index()
    console.print(f"Fold Train Labels:\n{train_fold_labels_dist}")
    console.print(f"Fold Validation Labels:\n{valid_fold_labels_dist}")

    if len(train_fold_labels_dist) < 2:
        console.print(f"[bold red]Error: Fold {current_fold} training data only has one class after splitting. Skipping fold.[/]")
        continue

    # --- Calculate Class Weights for this fold's training data ---
    try:
        n_samples = len(train_fold_df)
        n_classes = 2
        class_counts = train_fold_labels_dist.to_dict()
        # Default a missing count to 1 to avoid division by zero (the
        # stratified split should prevent a missing class anyway).
        count0 = class_counts.get(0, 1)
        count1 = class_counts.get(1, 1)

        # Inverse frequency weighting: weight = total_samples / (n_classes * count_for_class)
        weight0 = n_samples / (n_classes * count0)
        weight1 = n_samples / (n_classes * count1)

        class_weights_tensor = torch.tensor([weight0, weight1], dtype=torch.float)
        console.print(f"Calculated class weights for Fold {current_fold}: {class_weights_tensor.numpy()}")

    except Exception as e:
        console.print(f"[bold red]Error calculating class weights for fold {current_fold}: {e}. Proceeding without weights.[/]")
        class_weights_tensor = None # Fallback

    # --- Create fold datasets ---
    try:
        train_dataset = VaccineDataset(
            texts=train_fold_df['text'].tolist(),
            labels=train_fold_df['label'].tolist(),
            tokenizer=tokenizer,
            max_length=MAX_LENGTH
        )
        fold_eval_dataset = VaccineDataset(
            texts=fold_valid_df['text'].tolist(),
            labels=fold_valid_df['label'].tolist(),
            tokenizer=tokenizer,
            max_length=MAX_LENGTH
        )
        if len(train_dataset) == 0 or len(fold_eval_dataset) == 0:
            console.print(f"[bold red]Error: Empty dataset for fold {current_fold}. Skipping fold.[/]")
            continue
    except Exception as e:
        console.print(f"[bold red]Error creating datasets for fold {current_fold}: {e}[/]")
        continue

    # --- Load fresh model ---
    try:
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
    except Exception as e:
        console.print(f"[bold red]Error loading model for fold {current_fold}: {e}[/]")
        continue

    # --- Define Training Paths ---
    # Temporary directory for checkpoints during this fold's training
    fold_temp_checkpoint_dir = os.path.join(TEMP_CHECKPOINT_BASE_DIR, f"fold_{current_fold}")
    # Final directory structure for the *best* saved model of this fold:
    # RUN_OUTPUT_DIR/fold_X/best_model
    final_fold_output_basedir = os.path.join(RUN_OUTPUT_DIR, f"fold_{current_fold}")
    final_best_model_dir = os.path.join(final_fold_output_basedir, "best_model")

    # --- Training Arguments ---
    training_args = TrainingArguments(
        output_dir=fold_temp_checkpoint_dir,    # Checkpoints go to the temporary dir
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
        per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE * 2,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        disable_tqdm=False,
        load_best_model_at_end=True,            # Crucial for getting the best model
        metric_for_best_model="f1",             # Use F1 on fold's validation set
        greater_is_better=True,
        logging_strategy="epoch",
        logging_steps=max(10, len(train_dataset) // (EFFECTIVE_BATCH_SIZE * 4)),
        log_level="info",
        save_total_limit=2,                     # Limit checkpoints saved in temp dir
        seed=SEED + fold,
        fp16=torch.cuda.is_available(),
        report_to=[],
        dataloader_num_workers=0,
        dataloader_pin_memory=torch.cuda.is_available(),
        optim="adamw_torch",
    )

    # --- Trainer Setup ---
    trainer = WeightedLossTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=fold_eval_dataset,
        compute_metrics=compute_metrics_for_trainer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE,
                                         early_stopping_threshold=0.001)],
        class_weights=class_weights_tensor
    )

    # --- Train ---
    console.print(f"🚀 Training Fold {current_fold}...")
    try:
        train_result = trainer.train()
        console.print(f"[green]✓ Training Fold {current_fold} completed after {train_result.metrics.get('train_runtime', 0):.2f}s.[/]")
        best_metric_val = trainer.state.best_metric
        # Compare against None explicitly: a best F1 of exactly 0.0 is a
        # real (if poor) result, not a missing one.
        if best_metric_val is not None:
            console.print(f"[cyan]Fold {current_fold} - Best F1 score on internal validation set during training: {best_metric_val:.4f}[/]")
        else:
            console.print("[yellow]Could not retrieve best metric from trainer state.[/]")

    except Exception as e:
        console.print(f"[bold red]Error during training for fold {current_fold}: {e}[/]")
        del model, trainer, train_dataset, fold_eval_dataset
        gc.collect()
        torch.cuda.empty_cache()
        continue # Skip to next fold

    # --- Evaluate on Fold's Validation Set & Find Optimal Threshold for THIS fold ---
    console.print(f"🔍 Evaluating and finding best threshold for Fold {current_fold} (using its internal validation split)...")
    # Pre-bind everything the end-of-fold cleanup deletes, so that `del`
    # cannot raise NameError when a branch below never assigns one of them
    # (e.g. `probs` when the prediction output lacks logits or labels).
    predictions_output = logits = labels = probs = None
    try:
        predictions_output = trainer.predict(fold_eval_dataset)
        logits = predictions_output.predictions
        labels = predictions_output.label_ids
        internal_val_metrics = predictions_output.metrics

        console.print(f"Fold {current_fold} Internal Validation Metrics (at threshold 0.5): {internal_val_metrics}")

        if logits is not None and labels is not None:
            # Calculate probabilities for the positive class (1)
            if logits.shape[1] >= 2:
                probs = torch.softmax(torch.tensor(logits), dim=1)[:, 1].numpy()
            elif logits.shape[1] == 1:
                probs = torch.sigmoid(torch.tensor(logits)).squeeze(-1).numpy()
            else:
                probs = np.array([])
                console.print(f"[yellow]⚠️ Unexpected logits shape in fold {current_fold}: {logits.shape}. Cannot calculate probs.[/]")

            if probs.size > 0:
                # Find optimal threshold based on this fold's validation split, targeting class 1
                optimal_thresh_fold, best_f1_fold_val = find_optimal_threshold(labels, probs, target_label=1)

                console.print(f"[cyan]Fold {current_fold} - Best F1 on Internal Validation: {best_f1_fold_val:.4f} @ Optimized Threshold = {optimal_thresh_fold:.4f}[/]")
                fold_results.append({
                    "fold": current_fold,
                    "best_f1_internal_val_optimized": best_f1_fold_val,
                    "optimal_threshold_internal": optimal_thresh_fold,
                    "f1_internal_val_0.5": internal_val_metrics.get('test_f1', 0.0)
                })

                # --- Save Best Model for Ensemble ---
                try:
                    # Create the final directory: RUN_OUTPUT_DIR/fold_X/best_model/
                    os.makedirs(final_best_model_dir, exist_ok=True)
                    trainer.save_model(final_best_model_dir) # Save model files here
                    tokenizer.save_pretrained(final_best_model_dir) # Save tokenizer here
                    best_model_paths.append(final_best_model_dir) # Store path for later ensemble
                    console.print(f"[green]✓ Best model for Fold {current_fold} saved to '{final_best_model_dir}'[/]")

                    # --- Save the optimal threshold next to the model ---
                    threshold_file_path = os.path.join(final_best_model_dir, "threshold.txt")
                    with open(threshold_file_path, "w") as f:
                        f.write(f"{optimal_thresh_fold:.4f}")
                    console.print(f"[green]✓ Optimal threshold ({optimal_thresh_fold:.4f}) saved to '{threshold_file_path}'[/]")

                except Exception as e:
                    console.print(f"[bold red]Error saving best model or threshold for fold {current_fold}: {e}[/]")
                    best_checkpoint_path = trainer.state.best_model_checkpoint
                    if best_checkpoint_path and os.path.isdir(best_checkpoint_path):
                        console.print(f"[yellow]Best checkpoint was at: {best_checkpoint_path}. Consider manually copying.[/]")
                    else:
                        console.print(f"[yellow]⚠️ No best model path found for fold {current_fold}. Cannot use for ensemble.[/]")

            else:
                console.print(f"[yellow]⚠️ No probabilities calculated for fold {current_fold}. Cannot optimize threshold.[/]")
                # Try to save the model anyway if training completed, but without threshold
                try:
                    os.makedirs(final_best_model_dir, exist_ok=True)
                    trainer.save_model(final_best_model_dir)
                    tokenizer.save_pretrained(final_best_model_dir)
                    best_model_paths.append(final_best_model_dir)
                    console.print(f"[yellow]✓ Model saved to '{final_best_model_dir}' despite probability calculation issue (NO threshold file saved).[/]")
                except Exception as e_save:
                    console.print(f"[bold red]Error saving model for fold {current_fold} after probability issue: {e_save}[/]")

        else:
            console.print(f"[yellow]⚠️ Prediction output missing logits or labels for fold {current_fold}. Cannot evaluate or save best model.[/]")

    except Exception as e:
        console.print(f"[bold red]Error during evaluation/optimization/saving for fold {current_fold}: {e}[/]")
        del model, trainer, train_dataset, fold_eval_dataset
        gc.collect()
        torch.cuda.empty_cache()
        continue # Skip to next fold

    # --- Clean Temporary Checkpoints ---
    console.print(f"[dim]Cleaning up temporary checkpoints directory: '{fold_temp_checkpoint_dir}'[/]")
    try:
        shutil.rmtree(fold_temp_checkpoint_dir)
    except OSError as e:
        console.print(f"[yellow]⚠️ Error deleting checkpoint directory {fold_temp_checkpoint_dir}: {e}[/]")


    # --- Free memory after fold completion ---
    console.print(f"[dim]Cleaning up memory for fold {current_fold}...[/dim]")
    del model, trainer, train_dataset, fold_eval_dataset, predictions_output, logits, labels, probs
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# --- End of CV Loop ---

# -------------------------------
# 8. Display CV Summary
# -------------------------------
# Print a per-fold table and mean +/- std of the fold metrics collected in
# fold_results, then remove the temporary checkpoint tree.
console.rule("[bold green]Cross-Validation Summary (on Internal Validation Splits)[/]")
if fold_results:
    results_table = Table(title="Fold Performance on Internal Validation Splits")
    results_table.add_column("Fold", style="cyan")
    results_table.add_column("F1 (Thr=0.5)", style="magenta", justify="right")
    results_table.add_column("Optimized F1", style="green", justify="right")
    results_table.add_column("Optimal Thr (Internal)", style="yellow", justify="right")

    # Column-wise views of the per-fold results, used for the aggregates.
    all_f1s_opt = [res["best_f1_internal_val_optimized"] for res in fold_results]
    all_f1s_05 = [res["f1_internal_val_0.5"] for res in fold_results]
    all_thresholds = [res["optimal_threshold_internal"] for res in fold_results]

    for res in fold_results:
        results_table.add_row(
            str(res["fold"]),
            f"{res['f1_internal_val_0.5']:.4f}",
            f"{res['best_f1_internal_val_optimized']:.4f}",
            f"{res['optimal_threshold_internal']:.4f}"
        )
    console.print(results_table)

    # np.std is called with its defaults here.
    avg_f1_opt = np.mean(all_f1s_opt)
    std_f1_opt = np.std(all_f1s_opt)
    avg_f1_05 = np.mean(all_f1s_05)
    std_f1_05 = np.std(all_f1s_05)
    avg_thresh = np.mean(all_thresholds)
    std_thresh = np.std(all_thresholds)

    console.print(f"\nAvg F1 (Internal Val, Thr=0.5): [bold magenta]{avg_f1_05:.4f} +/- {std_f1_05:.4f}[/]")
    console.print(f"Avg F1 (Internal Val, Optimized Thr): [bold green]{avg_f1_opt:.4f} +/- {std_f1_opt:.4f}[/]")
    console.print(f"Avg Optimal Threshold (Internal): [bold yellow]{avg_thresh:.4f} +/- {std_thresh:.4f}[/]")

    # Warn when some folds did not yield a saved model (skipped or failed).
    if len(best_model_paths) != N_SPLITS:
         console.print(f"[yellow]⚠️ Found {len(best_model_paths)} best models, expected {N_SPLITS}. Ensemble evaluation might be affected.[/]")

else:
    console.print("[yellow]No fold results recorded. Check for errors during training/evaluation.[/]")

# --- Cleanup Overall Temp Checkpoint Dir ---
# Best-effort removal: an OSError is reported but does not stop the script.
console.print(f"[dim]Cleaning up base temporary checkpoint directory: '{TEMP_CHECKPOINT_BASE_DIR}'[/]")
try:
    shutil.rmtree(TEMP_CHECKPOINT_BASE_DIR)
except OSError as e:
    console.print(f"[yellow]⚠️ Error deleting base temp checkpoint directory {TEMP_CHECKPOINT_BASE_DIR}: {e}[/]")


# -----------------------------------------------------
# 9. FINAL EVALUATION ON HOLDOUT VALIDATION SET (valid.csv)
# Using Ensemble of Best Fold Models
# Optimize Threshold DIRECTLY on Holdout Set
# -----------------------------------------------------
# Prepare the holdout set (valid.csv) for the final ensemble evaluation.
# Aborts with exit status 1 when no fold model was saved, the holdout
# dataframe is empty, or the dataset/loader cannot be built.
console.rule("[bold magenta]Final Evaluation on Holdout Set (valid.csv)[/]")

if not best_model_paths:
    console.print("[bold red]❌ No best models saved from CV folds. Cannot perform final evaluation.[/]")
    sys.exit(1)
if len(holdout_valid_df) == 0:
    console.print("[bold red]❌ Holdout validation data ('valid.csv') is empty. Cannot perform final evaluation.[/]")
    sys.exit(1)


# --- Create Dataset & Loader for Holdout Set ---
console.print(f"Preparing holdout validation dataset ({len(holdout_valid_df)} samples)...")
try:
    # Include IDs and labels for evaluation
    holdout_dataset = VaccineDataset(
        texts=holdout_valid_df['text'].tolist(),
        labels=holdout_valid_df['label'].tolist(), # Include true labels
        ids=holdout_valid_df['id'].tolist(),       # Include IDs
        tokenizer=tokenizer, # Use the globally loaded tokenizer
        max_length=MAX_LENGTH,
        is_inference=False # We have labels for evaluation
    )
    # shuffle=False keeps the batches in dataframe order.
    holdout_loader = DataLoader(holdout_dataset, batch_size=PER_DEVICE_BATCH_SIZE * 2, shuffle=False, num_workers=0, pin_memory=torch.cuda.is_available())
    # Ground-truth labels in dataframe order, minus any entries equal to -1.
    # NOTE(review): clean_dataframe keeps only labels 0 and 1, so this mask is
    # presumably always all-True; the -1 placeholder is produced inside
    # VaccineDataset.__getitem__, not stored in the dataframe — confirm that
    # downstream alignment with predictions does not rely on this filter.
    holdout_y_true_raw = np.array(holdout_valid_df['label'].tolist())
    valid_true_indices = holdout_y_true_raw != -1
    holdout_y_true = holdout_y_true_raw[valid_true_indices]
    console.print(f"[dim]Using {len(holdout_y_true)} valid ground truth labels from holdout set for final evaluation.[/]")

except Exception as e:
     console.print(f"[bold red]❌ Error creating holdout dataset/loader: {e}. Cannot perform final evaluation.[/]")
     sys.exit(1)


# --- Ensemble Inference on Holdout Set ---
# Every saved best-fold model scores the full holdout loader. For each model one
# value per dataset position is collected (probability of class 1), with NaN for
# positions that were not scored, so the per-model vectors stay aligned and can
# be averaged afterwards. Models that fail to load are skipped.
console.print(f"Running ensemble inference on holdout set using {len(best_model_paths)} models...")
all_holdout_probs_np = []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Ensure device is set

for i, model_p in enumerate(best_model_paths):
    console.print(f"--- Loading model {i+1}/{len(best_model_paths)} from [yellow]{model_p}[/] ---") # Print full path now
    try:
        # The fold's tokenizer is loaded together with the model, but it is not
        # used below: the holdout loader was built with the global tokenizer.
        # A failure to load either one skips this model.
        fold_tokenizer = AutoTokenizer.from_pretrained(model_p)
        model = AutoModelForSequenceClassification.from_pretrained(model_p).to(device).eval()
    except Exception as e:
        console.print(f"[bold red]❌ Error loading model {model_p}: {e}. Skipping this model for ensemble.[/]")
        continue

    # Pre-bind the per-batch tensors so the cleanup `del` at the end of this
    # iteration cannot raise NameError when no batch reaches the forward pass
    # (all items invalid, or the very first batch fails).
    outputs = logits = probs = None

    fold_holdout_probs_list = []
    with torch.no_grad():
        for batch in track(holdout_loader, description=f"Predicting (Holdout, Model {i+1})...", console=console, transient=False):
            input_ids = batch["input_ids"].to(device, non_blocking=True)
            attention_mask = batch["attention_mask"].to(device, non_blocking=True)
            # If the batch carries no "labels" entry, every item defaults to -1
            # and is therefore treated as invalid below.
            batch_labels = batch.get("labels", torch.tensor([-1]*len(input_ids)))

            # Keep only items whose label is not -1 (the marker the file's
            # comments describe as the dataset's error value).
            valid_batch_indices_mask = (batch_labels != -1).cpu()
            valid_input_ids = input_ids[valid_batch_indices_mask]
            valid_attention_mask = attention_mask[valid_batch_indices_mask]

            # Nothing to score in this batch: record NaN for every position.
            if valid_input_ids.shape[0] == 0:
                fold_holdout_probs_list.extend(np.full(len(input_ids), np.nan))
                continue

            try:
                outputs = model(input_ids=valid_input_ids, attention_mask=valid_attention_mask)
                logits = outputs.logits
                if logits.shape[1] >= 2:
                    probs = torch.softmax(logits, dim=1)[:, 1] # Prob for class 1
                elif logits.shape[1] == 1:
                    probs = torch.sigmoid(logits).squeeze(-1)
                else:
                    probs = torch.full((valid_input_ids.shape[0],), 0.5, device=device) # Fallback guess

                # Scatter the scores back to their batch positions; invalid
                # positions keep NaN.
                batch_probs = np.full(len(input_ids), np.nan)
                batch_probs[valid_batch_indices_mask.numpy()] = probs.cpu().numpy()
                fold_holdout_probs_list.extend(batch_probs)

            except Exception as pred_e:
                console.print(f"\n[bold red]❌ Error during holdout prediction batch with model {i+1}: {pred_e}[/]")
                # Add NaNs for the failed batch to maintain length and indicate failure
                fold_holdout_probs_list.extend(np.full(len(input_ids), np.nan))

    # One flat float vector for this model.
    fold_holdout_probs = np.asarray(fold_holdout_probs_list, dtype=float)

    # Force the vector to the dataset length. np.pad rejects negative widths,
    # so padding is only attempted when predictions are missing; a surplus is
    # handled by the slice below.
    if len(fold_holdout_probs) < len(holdout_dataset):
        console.print(f"[yellow]⚠️ Length mismatch for fold {i+1} predictions ({len(fold_holdout_probs)}) vs dataset ({len(holdout_dataset)}). Padding with NaN.[/]")
        fold_holdout_probs = np.pad(fold_holdout_probs, (0, len(holdout_dataset) - len(fold_holdout_probs)), constant_values=np.nan)
    elif len(fold_holdout_probs) > len(holdout_dataset):
        console.print(f"[yellow]⚠️ Length mismatch for fold {i+1} predictions ({len(fold_holdout_probs)}) vs dataset ({len(holdout_dataset)}). Truncating.[/]")

    all_holdout_probs_np.append(fold_holdout_probs[:len(holdout_dataset)]) # Ensure correct length
    console.print(f"[green]✓ Holdout predictions collected for model {i+1}.[/]")

    # Free memory before the next model is loaded.
    del model, outputs, logits, probs, fold_tokenizer
    gc.collect()
    torch.cuda.empty_cache()


# --- Aggregate and Evaluate Holdout Predictions ---
# Averages the per-model probability vectors, tunes a decision threshold for the
# class-1 F1 score on the holdout set, and prints a classification report, a
# confusion matrix and a check against the 0.96 F1 target.
if not all_holdout_probs_np:
    console.print("[bold red]❌ No predictions generated for the holdout set by any valid model.[/]")
else:
    num_ensemble_models = len(all_holdout_probs_np)
    console.print(f"\n[bold cyan]Combining holdout probabilities from {num_ensemble_models} models (using nanmean)...[/]")
    # Use nanmean to average probabilities, ignoring NaNs from failed predictions/invalid data
    holdout_avg_probs_raw = np.nanmean(np.array(all_holdout_probs_np), axis=0)

    # Filter probabilities corresponding to valid true labels
    holdout_avg_probs = holdout_avg_probs_raw[valid_true_indices]

    # A sample that no model managed to score is still NaN after nanmean.
    # Such samples cannot be thresholded meaningfully (NaN >= t is always
    # False, i.e. a silent class-0 prediction), so they are excluded below.
    scored_mask = ~np.isnan(holdout_avg_probs)
    num_unscored = int((~scored_mask).sum())

    # Check if we have valid probabilities and labels to work with
    if len(holdout_avg_probs) == 0 or not scored_mask.any():
        console.print("[bold red]❌ No valid averaged probabilities obtained for the holdout set. Cannot optimize threshold or evaluate.[/]")
    elif len(holdout_y_true) == 0:
        console.print("[bold red]❌ No valid ground truth labels found in the holdout set. Cannot optimize threshold or evaluate.[/]")
    else:
        if num_unscored:
            console.print(f"[yellow]⚠️ {num_unscored} holdout sample(s) have no valid ensemble probability and are excluded from threshold optimization and evaluation.[/]")
        # Labels and probabilities restricted to the scored samples. When every
        # sample was scored these equal holdout_y_true / holdout_avg_probs.
        eval_y_true = holdout_y_true[scored_mask]
        eval_probs = holdout_avg_probs[scored_mask]

        # --- Optimize Threshold Directly on Holdout Ensemble Probs ---
        # NOTE: the threshold is chosen on the same samples it is then scored
        # on, so the metrics printed below are an optimistic estimate.
        console.print("[cyan]Optimizing final threshold directly on holdout ensemble probabilities (targeting F1 for class 1)...[/]")
        final_holdout_threshold, best_f1_on_holdout = find_optimal_threshold(eval_y_true, eval_probs, target_label=1)
        console.print(f"[bold green]✓ Optimal threshold for holdout set (Class 1 F1): {final_holdout_threshold:.4f} (yielding F1 score: {best_f1_on_holdout:.4f})[/]")

        # Apply this optimal threshold (one prediction per scored sample)
        holdout_predictions = (eval_probs >= final_holdout_threshold).astype(int)

        # --- Final Report on Holdout Set ---
        console.rule("[bold magenta]Final Ensemble Performance on Holdout Set (valid.csv) with Optimized Threshold[/]")
        try:
            console.print(f"Evaluating on {len(eval_y_true)} holdout samples with valid labels.")
            report = classification_report(eval_y_true, holdout_predictions, output_dict=True, digits=4, zero_division=0)

            # Display Report Table
            report_table = Table(show_header=True, header_style="bold magenta", box=box.SIMPLE)
            report_table.add_column("Class", style="dim", width=12)
            report_table.add_column("Precision", justify="right")
            report_table.add_column("Recall", justify="right")
            report_table.add_column("F1-Score", justify="right")
            report_table.add_column("Support", justify="right")

            # Per-class rows first; the aggregate entries are rendered separately.
            labels_in_report = [label for label in sorted(report.keys()) if label not in ['accuracy', 'macro avg', 'weighted avg']]
            for label in labels_in_report:
                metrics = report[label]
                # Highlight class 1 in green once its F1 clears the 0.96 target.
                style = "green" if label == '1' and metrics['f1-score'] > 0.96 else ""
                report_table.add_row(str(label), f"{metrics['precision']:.4f}", f"{metrics['recall']:.4f}", f"[bold {style}]{metrics['f1-score']:.4f}[/]", f"{int(metrics['support'])}")

            report_table.add_section()
            for avg_type in ["macro avg", "weighted avg"]:
                if avg_type in report:
                    metrics = report[avg_type]
                    name = avg_type.replace(" avg", " Avg")
                    report_table.add_row(f"[bold]{name}[/]", f"{metrics['precision']:.4f}", f"{metrics['recall']:.4f}", f"{metrics['f1-score']:.4f}", f"{int(metrics['support'])}")

            if "accuracy" in report:
                accuracy = report["accuracy"]
                total_support = int(report["weighted avg"]["support"]) if "weighted avg" in report else len(eval_y_true)
                report_table.add_section()
                report_table.add_row("[bold]Accuracy[/]", "", "", f"[bold]{accuracy:.4f}[/]", f"{total_support}")

            console.print(report_table)

            # Display Confusion Matrix
            console.print("\n🎯 [bold blue]Holdout Confusion Matrix[/bold blue] (using optimized threshold)\n")
            cm_labels = sorted(list(set(eval_y_true) | set(holdout_predictions)))
            # Always show both classes, even when fewer than two are present/predicted.
            if len(cm_labels) < 2:
                cm_labels = [0, 1]

            cm = confusion_matrix(eval_y_true, holdout_predictions, labels=cm_labels)
            cm_table = Table(title="True \\ Predicted", box=box.SIMPLE_HEAVY, show_header=True, header_style="bold")
            cm_table.add_column("", justify="center", style="dim") # Empty top-left corner
            for label in cm_labels:
                cm_table.add_column(f"Pred {label}", justify="center")

            # A dedicated index name keeps the model-loop variable `i` untouched.
            for row_idx, true_label in enumerate(cm_labels):
                row_data = [f"True {true_label}"] + [str(cm[row_idx, col_idx]) for col_idx in range(len(cm_labels))]
                cm_table.add_row(*row_data)
            console.print(cm_table)

            # Explicitly check the target F1 score
            f1_class_1 = report.get('1', {}).get('f1-score', 0.0)
            if f1_class_1 > 0.96:
                console.print(Panel(f"🚀 [bold green]Success![/] F1 score for Class 1 ({f1_class_1:.4f}) is above the 0.96 target!", title="Target Check", expand=False))
            else:
                console.print(Panel(f"⚠️ [bold yellow]Target Not Met.[/] F1 score for Class 1 ({f1_class_1:.4f}) is below 0.96.", title="Target Check", expand=False))

        except Exception as report_e:
            console.print(f"[bold red]❌ Error generating final holdout report: {report_e}[/]")


# Closing summary: where the run's artifacts were written and how the final
# evaluation was carried out. Lines are printed in the order listed.
for closing_line in (
    f"\n[INFO] CV completed. Best models saved under '{RUN_OUTPUT_DIR}'.",
    "[INFO] Each fold's best model is in 'fold_X/best_model/' including 'threshold.txt'.",
    "[INFO] Final evaluation performed on 'valid.csv' using an ensemble and a threshold optimized directly on holdout probabilities.",
    "[bold green]🏁 Script finished.[/]",
):
    console.print(closing_line)






Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Safetensors PR exists
Using auto half precision backend


***** Running training *****
  Num examples = 2,016
  Num Epochs = 6
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1,512
  Number of trainable parameters = 184,423,682


Epoch,Training Loss,Validation Loss,F1
1,0.3469,0.349356,0.906542
2,0.1459,0.430959,0.912
3,0.0954,0.251892,0.947368
4,0.0571,0.255335,0.951579
5,0.0348,0.240299,0.949153
6,0.0203,0.26003,0.951168



***** Running Evaluation *****
  Num examples = 505
  Batch size = 16
Saving model checkpoint to ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_1\checkpoint-252
Configuration saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_1\checkpoint-252\config.json
Model weights saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_1\checkpoint-252\model.safetensors

***** Running Evaluation *****
  Num examples = 505
  Batch size = 16
Saving model checkpoint to ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_1\checkpoint-504
Configuration saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_1\checkpoint-504\config.json
Model weights saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_1\checkpoint-504\model.safetensors

***** Running Evaluation *****
  Num examples = 505
  Batch size = 16
Saving model checkpoint to ./models\_cv_temp_checkpoints_deberta-


***** Running Prediction *****
  Num examples = 505
  Batch size = 16


Saving model checkpoint to ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_1\best_model
Configuration saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_1\best_model\config.json
Model weights saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_1\best_model\model.safetensors
tokenizer config file saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_1\best_model\tokenizer_config.json
Special tokens file saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_1\best_model\special_tokens_map.json


loading configuration file config.json from cache at C:\Users\Olivier\.cache\huggingface\hub\models--microsoft--deberta-v3-base\snapshots\8ccc9b6f36199bec6961081d44eb72fb3f7353f3\config.json
Model config DebertaV2Config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "legacy": true,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.50.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

loading weights file pyt

***** Running training *****
  Num examples = 2,017
  Num Epochs = 6
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1,518
  Number of trainable parameters = 184,423,682
Safetensors PR exists


Epoch,Training Loss,Validation Loss,F1
1,0.3629,0.188479,0.948276
2,0.1458,0.209172,0.959488
3,0.0944,0.17482,0.958606
4,0.0362,0.223853,0.952586
5,0.0194,0.245156,0.95614



***** Running Evaluation *****
  Num examples = 504
  Batch size = 16
Saving model checkpoint to ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_2\checkpoint-253
Configuration saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_2\checkpoint-253\config.json
Model weights saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_2\checkpoint-253\model.safetensors

***** Running Evaluation *****
  Num examples = 504
  Batch size = 16
Saving model checkpoint to ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_2\checkpoint-506
Configuration saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_2\checkpoint-506\config.json
Model weights saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_2\checkpoint-506\model.safetensors

***** Running Evaluation *****
  Num examples = 504
  Batch size = 16
Saving model checkpoint to ./models\_cv_temp_checkpoints_deberta-


***** Running Prediction *****
  Num examples = 504
  Batch size = 16


Saving model checkpoint to ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_2\best_model
Configuration saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_2\best_model\config.json
Model weights saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_2\best_model\model.safetensors
tokenizer config file saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_2\best_model\tokenizer_config.json
Special tokens file saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_2\best_model\special_tokens_map.json


loading configuration file config.json from cache at C:\Users\Olivier\.cache\huggingface\hub\models--microsoft--deberta-v3-base\snapshots\8ccc9b6f36199bec6961081d44eb72fb3f7353f3\config.json
Model config DebertaV2Config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "legacy": true,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.50.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

loading weights file pyt

***** Running training *****
  Num examples = 2,017
  Num Epochs = 6
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1,518
  Number of trainable parameters = 184,423,682
Safetensors PR exists


Epoch,Training Loss,Validation Loss,F1
1,0.3541,0.211635,0.946004
2,0.166,0.159632,0.963753
3,0.0905,0.258541,0.952381
4,0.0385,0.205654,0.965665
5,0.0176,0.241276,0.961864
6,0.0128,0.227276,0.965957



***** Running Evaluation *****
  Num examples = 504
  Batch size = 16
Saving model checkpoint to ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_3\checkpoint-253
Configuration saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_3\checkpoint-253\config.json
Model weights saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_3\checkpoint-253\model.safetensors

***** Running Evaluation *****
  Num examples = 504
  Batch size = 16
Saving model checkpoint to ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_3\checkpoint-506
Configuration saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_3\checkpoint-506\config.json
Model weights saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_3\checkpoint-506\model.safetensors

***** Running Evaluation *****
  Num examples = 504
  Batch size = 16
Saving model checkpoint to ./models\_cv_temp_checkpoints_deberta-


***** Running Prediction *****
  Num examples = 504
  Batch size = 16


Saving model checkpoint to ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_3\best_model
Configuration saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_3\best_model\config.json
Model weights saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_3\best_model\model.safetensors
tokenizer config file saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_3\best_model\tokenizer_config.json
Special tokens file saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_3\best_model\special_tokens_map.json


loading configuration file config.json from cache at C:\Users\Olivier\.cache\huggingface\hub\models--microsoft--deberta-v3-base\snapshots\8ccc9b6f36199bec6961081d44eb72fb3f7353f3\config.json
Model config DebertaV2Config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "legacy": true,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.50.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

loading weights file pyt

***** Running training *****
  Num examples = 2,017
  Num Epochs = 6
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1,518
  Number of trainable parameters = 184,423,682
Safetensors PR exists


Epoch,Training Loss,Validation Loss,F1
1,0.3586,0.262883,0.928425
2,0.1911,0.245104,0.949367
3,0.1173,0.407538,0.917836
4,0.0503,0.238601,0.948936
5,0.0266,0.265879,0.951168
6,0.0067,0.334746,0.94958



***** Running Evaluation *****
  Num examples = 504
  Batch size = 16
Saving model checkpoint to ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_4\checkpoint-253
Configuration saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_4\checkpoint-253\config.json
Model weights saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_4\checkpoint-253\model.safetensors

***** Running Evaluation *****
  Num examples = 504
  Batch size = 16
Saving model checkpoint to ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_4\checkpoint-506
Configuration saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_4\checkpoint-506\config.json
Model weights saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_4\checkpoint-506\model.safetensors

***** Running Evaluation *****
  Num examples = 504
  Batch size = 16
Saving model checkpoint to ./models\_cv_temp_checkpoints_deberta-


***** Running Prediction *****
  Num examples = 504
  Batch size = 16


Saving model checkpoint to ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_4\best_model
Configuration saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_4\best_model\config.json
Model weights saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_4\best_model\model.safetensors
tokenizer config file saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_4\best_model\tokenizer_config.json
Special tokens file saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_4\best_model\special_tokens_map.json


loading configuration file config.json from cache at C:\Users\Olivier\.cache\huggingface\hub\models--microsoft--deberta-v3-base\snapshots\8ccc9b6f36199bec6961081d44eb72fb3f7353f3\config.json
Model config DebertaV2Config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "legacy": true,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.50.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

loading weights file pyt

***** Running training *****
  Num examples = 2,017
  Num Epochs = 6
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1,518
  Number of trainable parameters = 184,423,682
Attempting to create safetensors variant


Epoch,Training Loss,Validation Loss,F1
1,0.3554,0.521055,0.873077
2,0.1427,0.247488,0.941935
3,0.0826,0.213631,0.954839
4,0.0509,0.223042,0.958606
5,0.025,0.263705,0.952991
6,0.0157,0.287039,0.953191


Safetensors PR exists

***** Running Evaluation *****
  Num examples = 504
  Batch size = 16
Saving model checkpoint to ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_5\checkpoint-253
Configuration saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_5\checkpoint-253\config.json
Model weights saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_5\checkpoint-253\model.safetensors

***** Running Evaluation *****
  Num examples = 504
  Batch size = 16
Saving model checkpoint to ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_5\checkpoint-506
Configuration saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_5\checkpoint-506\config.json
Model weights saved in ./models\_cv_temp_checkpoints_deberta-v3-base_cv_5folds_ep6_bs8\fold_5\checkpoint-506\model.safetensors

***** Running Evaluation *****
  Num examples = 504
  Batch size = 16
Saving model checkpoint to ./models\_cv_tem


***** Running Prediction *****
  Num examples = 504
  Batch size = 16


Saving model checkpoint to ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_5\best_model
Configuration saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_5\best_model\config.json
Model weights saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_5\best_model\model.safetensors
tokenizer config file saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_5\best_model\tokenizer_config.json
Special tokens file saved in ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_5\best_model\special_tokens_map.json


loading file spm.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file chat_template.jinja
loading configuration file ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_1\best_model\config.json
Model config DebertaV2Config {
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "legacy": true,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  

Output()

loading file spm.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file chat_template.jinja
loading configuration file ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_2\best_model\config.json
Model config DebertaV2Config {
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "legacy": true,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  

Output()

loading file spm.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file chat_template.jinja
loading configuration file ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_3\best_model\config.json
Model config DebertaV2Config {
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "legacy": true,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  

Output()

loading file spm.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file chat_template.jinja
loading configuration file ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_4\best_model\config.json
Model config DebertaV2Config {
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "legacy": true,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  

Output()

loading file spm.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading file chat_template.jinja
loading configuration file ./models\deberta-v3-base_cv_5folds_ep6_bs8\fold_5\best_model\config.json
Model config DebertaV2Config {
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "legacy": true,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  

Output()