## Mdeberta

In [4]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback # Importer le callback pour l'arrêt précoce
)
# Assurez-vous que wandb est initialisé si vous l'utilisez, sinon commentez/supprimez les lignes wandb.log
import wandb
wandb.init(project="ade-classification-mdeberta") # Exemple d'initialisation

# CORRECTION ICI: Ajout de precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch
import zipfile # Pour la sauvegarde finale
import os # Pour la sauvegarde finale
import torch._dynamo
torch._dynamo.config.suppress_errors = True

# --- Configuration ---
MODEL_NAME = "microsoft/mdeberta-v3-base"  # Passer à la version Large
TRAIN_CSV = "data/train_data_SMM4H_2025_Task_1.csv" # Utiliser le CSV d'entraînement augmenté
DEV_CSV = "data/dev_data_SMM4H_2025_Task_1.csv"
OUTPUT_DIR = "results_deberta-v3-large"  # Répertoire de sortie pour le modèle et les résultats
NUM_EPOCHS = 6  # Réduire légèrement les epochs
BATCH_SIZE = 2  # Plus petit batch size
LEARNING_RATE = 1e-5  # Learning rate plus bas pour le large
GRADIENT_ACCUMULATION_STEPS = 16  # Accumulation plus importante
EARLY_STOPPING_PATIENCE = 1  # Arrêt plus rapide si stagnation
MAX_LENGTH = 256  # Réduire la longueur maximale des séquences
WEIGHT_DECAY = 0.05  # Poids de régularisation

# --- 1. Load Data ---
print("Loading data...")
try:
    train_df = pd.read_csv(TRAIN_CSV).dropna(subset=['text'])
    dev_df = pd.read_csv(DEV_CSV).dropna(subset=['text'])
except FileNotFoundError as e:
    print(f"Error loading CSV files: {e}")
    exit()

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)
dataset_dict = DatasetDict({'train': train_dataset, 'validation': dev_dataset})
print("Data loaded.")

# --- 2. Tokenization ---
print("Tokenizing data...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# --- Modifications de la fonction de tokenization ---
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True  # Pour éviter de dépasser les limites du modèle
    )

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
print("Tokenization complete.")

# --- Clean up columns ---
print("Cleaning dataset columns...")
print("Columns before removal:", tokenized_datasets['train'].column_names)
columns_to_remove = ["text", "id", "file_name", "origin", "language", "split", "type"]
actual_columns_to_remove = [col for col in columns_to_remove if col in tokenized_datasets['train'].column_names]
print("Removing columns:", actual_columns_to_remove)
tokenized_datasets = tokenized_datasets.remove_columns(actual_columns_to_remove)
tokenized_datasets.set_format("torch")

# Rename 'label' to 'labels'
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
print("Columns after cleaning and rename:", tokenized_datasets['train'].column_names)

# --- 3. Compute Class Weights ---
print("Computing class weights...")
labels_train = train_df['label'].values
if len(np.unique(labels_train)) > 1: # Ensure there are at least two classes
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(labels_train), y=labels_train)
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Class Weights: {class_weights_tensor}")
else:
    print("Warning: Only one class found in training data. Cannot compute class weights.")
    class_weights_tensor = None # Handle this case in the loss function if needed

# --- Custom Trainer for Weighted Loss ---
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Use weights only if they were computed
        if class_weights_tensor is not None:
            loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
        else:
            loss_fct = torch.nn.CrossEntropyLoss() # Default unweighted loss
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# --- 4. Model & Metrics ---
print("Loading model...")
# --- Chargement du modèle avec optimisations ---
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    num_labels=2,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
print("Model loaded.")

# --- Increase dropout to help regularization ---
model.config.hidden_dropout_prob = 0.3  # Dropout on fully connected layers
model.config.attention_probs_dropout_prob = 0.3  # Dropout on attention probabilities
print("Dropout increased: hidden_dropout=0.3, attention_dropout=0.3")

# Activer le gradient checkpointing pour économiser la mémoire
model.gradient_checkpointing_enable()

# Fonction compute_metrics qui utilise la fonction importée
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Utilisation de precision_recall_fscore_support
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=None, labels=[0, 1], zero_division=0)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1_pos': f1[1], # F1 for class 1
        'precision_pos': precision[1],
        'recall_pos': recall[1],
        'f1_neg': f1[0], # F1 for class 0 (for info)
    }

# --- 5. Training Arguments ---
print("Setting training arguments...")
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,  # Modifié (était BATCH_SIZE*2)
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    lr_scheduler_type="cosine",  # <-- Ajouté
    warmup_ratio=0.1,            # <-- Ajouté (10% des steps en warmup)
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_pos",
    greater_is_better=True,
    logging_dir='./logs',
    logging_steps=50,
    report_to="wandb" if "wandb" in locals() else "none",
    fp16=False,  # Disabled FP16 to avoid gradient unscaling issues
    bf16=True if torch.cuda.is_available() else False,  # Use BF16 instead if available
    optim="adafactor",  # Ajouté - Optimiseur plus léger
    gradient_checkpointing=True,  # Ajouté - Économie de mémoire
    #torch_compile=False,  # Ajouté - Acceleration PyTorch 2.x+
    save_total_limit=2
    # early_stopping_patience=EARLY_STOPPING_PATIENCE,  # Déjà géré par le callback
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# --- 6. Trainer ---
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,  # Utilisation du DataCollator
    # Ajout du Callback pour Early Stopping
    callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE)]
)
print("Trainer configured.")

# --- 7. Train ---
print("Starting Training...")
train_result = trainer.train()
print("Training finished.")
trainer.save_model("best_model_f1") # Sauvegarde le meilleur modèle

# Log training metrics
# trainer.log_metrics("train", train_result.metrics) # Décommenter si besoin
# trainer.save_metrics("train", train_result.metrics) # Décommenter si besoin
# trainer.save_state() # Sauvegarde l'état du Trainer

# --- 8. Evaluate on Dev Set (using the best model loaded) ---
print("\nEvaluating on Development Set (Best Model)...")
eval_results = trainer.evaluate()
print("Evaluation Results:")
print(eval_results)
# trainer.log_metrics("eval", eval_results) # Décommenter si besoin
# trainer.save_metrics("eval", eval_results) # Décommenter si besoin

# --- 9. Detailed Evaluation and Submission File Generation ---
print("\nGenerating predictions and detailed metrics for Dev Set...")

# Generate predictions
predictions = trainer.predict(tokenized_datasets["validation"])
predicted_labels = predictions.predictions.argmax(-1)

# --- Threshold tuning ---
print("\nRunning threshold tuning on positive class...")

# Get predicted probabilities for class 1
logits = torch.tensor(predictions.predictions)
probs = torch.nn.functional.softmax(logits, dim=1)[:, 1]  # Probabilities for class 1
y_true = predictions.label_ids

# Baseline (argmax) F1
baseline_f1 = f1_score(y_true, predicted_labels, pos_label=1)
print(f"Baseline F1 (argmax): {baseline_f1:.4f}")

# Search best threshold
best_thresh = 0.5
best_f1 = 0.0
thresholds = np.linspace(0.1, 0.9, 81)

for t in thresholds:
    preds_thresh = (probs >= t).int()
    f1 = f1_score(y_true, preds_thresh, pos_label=1)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"Best threshold found: {best_thresh:.4f} → Tuned F1: {best_f1:.4f}")

# Save threshold to file
threshold_path = os.path.join(OUTPUT_DIR, "threshold.txt")
with open(threshold_path, "w") as f:
    f.write(f"{best_thresh:.4f}")
print(f"Threshold saved to {threshold_path}")

# Add predictions to the dev dataframe (ensure index alignment if necessary)
# If dev_df was filtered by dropna, indices might not match directly. Resetting index helps.
dev_df_eval = dev_df.reset_index(drop=True)
# Check lengths match before assigning
if len(dev_df_eval) == len(predicted_labels):
    dev_df_eval['predicted_label'] = predicted_labels
else:
    print(f"Error: Length mismatch! Dev DF has {len(dev_df_eval)} rows, Predictions have {len(predicted_labels)} entries.")
    # Handle error appropriately, maybe skip detailed eval or investigate dropna impact
    exit()


# Re-extract language (assuming ID format 'lang_...') - Be careful if IDs differ
dev_df_eval["language"] = dev_df_eval["id"].apply(lambda x: str(x).split("_")[0] if isinstance(x, str) and "_" in x else "unknown")

# --- Calculate Per-Language and Overall Metrics ---
languages = sorted(dev_df_eval['language'].unique())
per_language_metrics = {}
language_f1_scores = []
all_true_labels_eval = []
all_pred_labels_eval = []

print("\n--- Detailed Evaluation on Development Set ---")
wandb_logs = {} # Collect logs for wandb

for lang in languages:
    if lang == "unknown": continue # Skip if language couldn't be extracted
    lang_mask = dev_df_eval['language'] == lang
    y_true_lang = dev_df_eval.loc[lang_mask, 'label']
    y_pred_lang = dev_df_eval.loc[lang_mask, 'predicted_label']

    if len(y_true_lang) == 0: continue # Skip if no data for this language

    all_true_labels_eval.extend(y_true_lang.tolist())
    all_pred_labels_eval.extend(y_pred_lang.tolist())

    # Utilisation de precision_recall_fscore_support (qui est maintenant importé)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true_lang, y_pred_lang, average=None, labels=[0, 1], zero_division=0)
    accuracy_lang = accuracy_score(y_true_lang, y_pred_lang) # Renommé pour éviter conflit avec la fonction accuracy_score

    per_language_metrics[lang] = {'precision': precision[1], 'recall': recall[1], 'f1': f1[1], 'accuracy': accuracy_lang}
    language_f1_scores.append(f1[1]) # On stocke le F1 de la classe positive (1)

    print(f"\nMetrics for language: {lang.upper()}")
    print(f"  Precision-{lang} (Pos): {precision[1]:.4f}")
    print(f"  Recall-{lang}    (Pos): {recall[1]:.4f}")
    print(f"  F1-{lang}        (Pos): {f1[1]:.4f}")
    print(f"  Accuracy-{lang}:        {accuracy_lang:.4f}")

    # Prepare logs for wandb
    wandb_logs[f"{lang}/precision_pos"] = precision[1]
    wandb_logs[f"{lang}/recall_pos"] = recall[1]
    wandb_logs[f"{lang}/f1_pos"] = f1[1]
    wandb_logs[f"{lang}/accuracy"] = accuracy_lang


# Calculate Overall Metrics (using the full dev set lists)
cm_overall = confusion_matrix(all_true_labels_eval, all_pred_labels_eval, labels=[0, 1])
tn, fp, fn, tp = cm_overall.ravel() if cm_overall.size == 4 else (0, 0, 0, 0) # Handle cases with missing classes

overall_precision_pos = tp / (tp + fp) if (tp + fp) > 0 else 0
overall_recall_pos = tp / (tp + fn) if (tp + fn) > 0 else 0
# Overall F1 (Primary Metric for Positive Class)
overall_f1_pos = 2 * (overall_precision_pos * overall_recall_pos) / (overall_precision_pos + overall_recall_pos) if (overall_precision_pos + overall_recall_pos) > 0 else 0
# Macro F1 (Average of per-language F1s for Positive Class)
macro_f1_pos = np.mean(language_f1_scores) if language_f1_scores else 0
# Overall Accuracy
overall_accuracy = accuracy_score(all_true_labels_eval, all_pred_labels_eval)

print("\n--- Overall Evaluation Summary (Positive Class Focus) ---")
print(f"F1-score across all languages (Positive Class): {overall_f1_pos:.4f}  <-- Primary Metric")
print(f"Macro F1-score across all languages (Pos Class):{macro_f1_pos:.4f}")
print(f"Overall Precision (Positive Class):             {overall_precision_pos:.4f}")
print(f"Overall Recall (Positive Class):                {overall_recall_pos:.4f}")
print(f"Overall Accuracy across all languages:          {overall_accuracy:.4f}")

print("\nOverall Confusion Matrix (All Languages):")
print(f"[[TN={tn}  FP={fp}]")
print(f" [FN={fn}  TP={tp}]]")

# Add overall metrics to wandb logs
wandb_logs["overall/f1_pos"] = overall_f1_pos
wandb_logs["overall/macro_f1_pos"] = macro_f1_pos
wandb_logs["overall/precision_pos"] = overall_precision_pos
wandb_logs["overall/recall_pos"] = overall_recall_pos
wandb_logs["overall/accuracy"] = overall_accuracy
wandb_logs["overall/TP"] = tp
wandb_logs["overall/FP"] = fp
wandb_logs["overall/FN"] = fn
wandb_logs["overall/TN"] = tn

# Log Confusion Matrix to wandb (optional)
if "wandb" in locals() and tp+fp+fn+tn > 0:
     try:
        wandb_logs["confusion_matrix"] = wandb.plot.confusion_matrix(
             probs=None,
             y_true=all_true_labels_eval,
             preds=all_pred_labels_eval,
             class_names=["Negative (0)", "Positive (1)"]
         )
     except Exception as e:
         print(f"Could not log confusion matrix to wandb: {e}")


# Log all collected metrics to wandb (if initialized)
if "wandb" in locals():
    try:
        wandb.log(wandb_logs)
        print("Metrics logged to WandB.")
    except Exception as e:
        print(f"Could not log metrics to wandb: {e}")


# --- 10. Save Submission File ---
print("\nSaving predictions for submission...")
# Ensure results directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Prepare submission dataframe
submission_df = dev_df_eval[['id', 'predicted_label']]

# Define CSV and ZIP paths
csv_filename = "predictions_task1.csv"
zip_filename = "submission_task1.zip"
csv_path = os.path.join(OUTPUT_DIR, csv_filename)
zip_path = os.path.join(OUTPUT_DIR, zip_filename)

# Save CSV
submission_df.to_csv(csv_path, index=False)
print(f"Predictions saved to {csv_path}")

# Zip the CSV file
with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
    zf.write(csv_path, arcname=csv_filename)
print(f"{csv_filename} has been zipped into {zip_path}")


print("\nScript finished.")

Loading data...
Data loaded.
Tokenizing data...




Map:   0%|          | 0/31187 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/4625 [00:00<?, ? examples/s]

Tokenization complete.
Cleaning dataset columns...
Columns before removal: ['id', 'text', 'label', 'file_name', 'origin', 'type', 'language', 'split', 'input_ids', 'token_type_ids', 'attention_mask']
Removing columns: ['text', 'id', 'file_name', 'origin', 'language', 'split', 'type']
Columns after cleaning and rename: ['labels', 'input_ids', 'token_type_ids', 'attention_mask']
Computing class weights...
Class Weights: tensor([0.5426, 6.3621], device='cuda:0')
Loading model...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded.
Dropout increased: hidden_dropout=0.3, attention_dropout=0.3
Setting training arguments...


  trainer = WeightedTrainer(


Trainer configured.
Starting Training...




Epoch,Training Loss,Validation Loss,Accuracy,F1 Pos,Precision Pos,Recall Pos,F1 Neg
0,0.2468,0.258236,0.928432,0.561589,0.593838,0.532663,0.961036
1,0.157,0.217694,0.932757,0.633687,0.596452,0.675879,0.962981
2,0.1334,0.22185,0.931892,0.654226,0.580897,0.748744,0.962226
3,0.136,0.252743,0.937081,0.661234,0.616052,0.713568,0.96532
4,0.0884,0.255575,0.935351,0.662147,0.601643,0.736181,0.964256
5,0.1538,0.256488,0.932973,0.654788,0.588,0.738693,0.962883


Training finished.

Evaluating on Development Set (Best Model)...


Evaluation Results:
{'eval_loss': 0.25557541847229004, 'eval_accuracy': 0.9353513513513514, 'eval_f1_pos': 0.6621468926553672, 'eval_precision_pos': 0.6016427104722792, 'eval_recall_pos': 0.7361809045226131, 'eval_f1_neg': 0.9642558278541542, 'eval_runtime': 51.7866, 'eval_samples_per_second': 89.309, 'eval_steps_per_second': 44.664, 'epoch': 5.999358727715788}

Generating predictions and detailed metrics for Dev Set...

Running threshold tuning on positive class...
Baseline F1 (argmax): 0.6621
Best threshold found: 0.6300 → Tuned F1: 0.6713
Threshold saved to results_deberta-v3-large\threshold.txt

--- Detailed Evaluation on Development Set ---

Metrics for language: DE
  Precision-de (Pos): 0.6216
  Recall-de    (Pos): 0.6571
  F1-de        (Pos): 0.6389
  Accuracy-de:        0.9590

Metrics for language: EN
  Precision-en (Pos): 0.6220
  Recall-en    (Pos): 0.8361
  F1-en        (Pos): 0.7133
  Accuracy-en:        0.9545

Metrics for language: FR
  Precision-fr (Pos): 0.5750
  Recal

## Optuna XLM Roberta Large

In [None]:
# --- [Imports et configuration initiale comme avant] ---
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    set_seed,
    TrainerCallback,
    TrainerState,
    TrainerControl
)
# import wandb # Décommente si utilisé
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch
import optuna
import zipfile
import os
import shutil
import json
import traceback
from typing import Dict, Any

# --- Configuration Initiale ---
BASE_MODEL_NAME = "cardiffnlp/twitter-roberta-base"
TRAIN_CSV = "data/train_data_SMM4H_2025_Task_1.csv" # Adapte le chemin si nécessaire
DEV_CSV = "data/dev_data_SMM4H_2025_Task_1.csv"   # Adapte le chemin si nécessaire
BASE_OUTPUT_DIR = "optuna_cardiffnlp-twitter-roberta-base" # Nouveau nom: recherche focalisée
NUM_TRIALS = 25 # Peut-être moins d'essais suffisent avec des plages plus étroites
N_BEST_MODELS = 5
MAX_LENGTH = 512 #Gardé à 256 pour VRAM Optuna run
BATCH_SIZE = 8
GRADIENT_ACCUMULATION_STEPS = 2
EARLY_STOPPING_PATIENCE = 2
SEED = 42
WANDB_PROJECT_NAME = "ade-classification-optuna-focused-v4"
WANDB_LOGGING = False # Met à True si configuré

# --- Seed & Setup GPU ---
set_seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)
    print(f"CUDA disponible. Utilisation de GPU: {torch.cuda.get_device_name(0)}")
    if not torch.cuda.is_bf16_supported():
        print("Attention: BF16 non supporté par ce GPU. Utilisation de FP32.")
else:
    print("CUDA non disponible. Utilisation de CPU.")

# --- 1. Load Data ---
print("Loading data...")
try:
    train_df = pd.read_csv(TRAIN_CSV).dropna(subset=['text'])
    dev_df = pd.read_csv(DEV_CSV).dropna(subset=['text'])
    print(f"Train data: {len(train_df)} rows, Dev data: {len(dev_df)} rows")
except FileNotFoundError as e:
    print(f"Error loading CSV files: {e}. Vérifie les chemins TRAIN_CSV et DEV_CSV.")
    exit()
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)
dataset_dict = DatasetDict({'train': train_dataset, 'validation': dev_dataset})
print("Data loaded.")

# --- 2. Tokenizer & tokenize_function ---
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
def tokenize_function(examples):
    return tokenizer(
        examples["text"], truncation=True, padding=False, max_length=MAX_LENGTH
    )

# --- 3. Class Weights ---
print("Computing class weights...")
labels_train = train_df['label'].values
class_weights_tensor = None
unique_labels = np.unique(labels_train)
if len(unique_labels) > 1:
    class_weights = compute_class_weight(class_weight='balanced', classes=unique_labels, y=labels_train)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)
    print(f"Class Weights (pour classes {unique_labels}) sur {device}: {class_weights_tensor}")
else:
    print(f"Warning: Only one class ({unique_labels[0]}) found. Cannot compute class weights.")

# --- 4. WeightedTrainer ---
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        if class_weights_tensor is not None:
            loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor.to(logits.device))
        else:
            loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# --- 5. compute_metrics ---
def compute_metrics(pred) -> Dict[str, float]:
    labels = pred.label_ids
    if isinstance(pred.predictions, tuple): logits = pred.predictions[0]
    else: logits = pred.predictions
    if logits is None: return {'f1_pos': 0.0}
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=None, labels=[0, 1], zero_division=0)
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1_pos': f1[1], 'precision_pos': precision[1], 'recall_pos': recall[1], 'f1_neg': f1[0]}

# --- 6. Data Collator ---
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# --- 7. Callback Personnalisé pour le Pruning Optuna ---
class OptunaPruningCallback(TrainerCallback):
    def __init__(self, trial: optuna.Trial):
        self.trial = trial

    def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics: Dict[str, float], **kwargs):
        metric_to_report = args.metric_for_best_model
        if not metric_to_report.startswith("eval_"): metric_to_report = f"eval_{metric_to_report}"
        metric_value = metrics.get(metric_to_report)
        if metric_value is None: return
        current_epoch = int(state.epoch) if state.epoch is not None else 0
        self.trial.report(metric_value, step=current_epoch)
        if self.trial.should_prune():
            message = f"Trial {self.trial.number} pruned at epoch {current_epoch}."
            print(message)
            raise optuna.exceptions.TrialPruned(message)

# --- 8. Fonction Objective pour Optuna ---
def objective(trial: optuna.Trial) -> float:
    torch.cuda.empty_cache()

    # --- Hyperparamètres à optimiser (PLAGES RESTREINTES) ---
    # *** MODIFICATIONS ICI ***
    learning_rate = trial.suggest_float("learning_rate", 8e-6, 2e-5, log=True) # Autour de 1e-5
    num_train_epochs = trial.suggest_int("num_train_epochs", 4, 7)             # Autour de 6
    weight_decay = trial.suggest_float("weight_decay", 0.02, 0.08)             # Autour de 0.05
    warmup_ratio = trial.suggest_float("warmup_ratio", 0.05, 0.15)             # Autour de 0.1
    lr_scheduler_type = trial.suggest_categorical("lr_scheduler_type", ["cosine", "linear"]) # Garde les deux options
    hidden_dropout_prob = trial.suggest_float("hidden_dropout_prob", 0.2, 0.4) # Autour de 0.3
    attention_probs_dropout_prob = trial.suggest_float("attention_probs_dropout_prob", 0.2, 0.4) # Autour de 0.3
    optim = "adafactor" # Gardé fixe

    # --- Configuration spécifique à l'essai ---
    trial_run_name = f"trial_{trial.number}_lr{learning_rate:.1e}_ep{num_train_epochs}_wd{weight_decay:.2f}"
    trial_output_dir = os.path.join(BASE_OUTPUT_DIR, f"trial_{trial.number}")
    os.makedirs(trial_output_dir, exist_ok=True)
    print(f"\n--- Starting Trial {trial.number} ---")
    print(f"  Output Dir: {trial_output_dir}")
    print(f"  Hyperparameters: {trial.params}")

    # --- Tokenization ---
    print("Tokenizing data for trial...")
    try:
        tokenized_datasets = dataset_dict.map(tokenize_function, batched=True, load_from_cache_file=False, desc="Running tokenizer")
        cols_before = tokenized_datasets['train'].column_names
        cols_to_remove = ["text", "id", "file_name", "origin", "language", "split", "type"] + [c for c in cols_before if c.startswith("__")]
        actual_cols_to_remove = [col for col in cols_to_remove if col in cols_before]
        tokenized_datasets = tokenized_datasets.remove_columns(actual_cols_to_remove)
        tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
        tokenized_datasets.set_format("torch")
        print(f"Tokenization complete. Remaining cols: {tokenized_datasets['train'].column_names}")
    except Exception as e: print(f"Data processing error: {e}"); return 0.0

    # --- Chargement du modèle ---
    print("Loading model for trial...")
    try:
        model_dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
        model = AutoModelForSequenceClassification.from_pretrained(
            BASE_MODEL_NAME, num_labels=2, hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob, torch_dtype=model_dtype,
            ignore_mismatched_sizes=True )
        model.gradient_checkpointing_enable()
        print(f"Model loaded (dtype: {model_dtype}).")
    except Exception as e: print(f"Model loading error: {e}"); traceback.print_exc(); return 0.0

    # --- Training Arguments ---
    training_args = TrainingArguments(
        output_dir=trial_output_dir, num_train_epochs=num_train_epochs, learning_rate=learning_rate,
        weight_decay=weight_decay, lr_scheduler_type=lr_scheduler_type, warmup_ratio=warmup_ratio, optim=optim,
        per_device_train_batch_size=BATCH_SIZE, per_device_eval_batch_size=BATCH_SIZE * 2,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS, fp16=False,
        bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(), gradient_checkpointing=True,
        eval_strategy="epoch", save_strategy="epoch", save_total_limit=1,
        load_best_model_at_end=True, metric_for_best_model="f1_pos", greater_is_better=True,
        logging_dir=os.path.join(trial_output_dir, 'logs'), logging_strategy="epoch",
        report_to="wandb" if WANDB_LOGGING else "none", seed=SEED, disable_tqdm=False )

    # --- Callbacks ---
    early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE)
    pruning_callback = OptunaPruningCallback(trial)

    # --- Trainer ---
    trainer = WeightedTrainer(
        model=model, args=training_args, train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"], tokenizer=tokenizer,
        compute_metrics=compute_metrics, data_collator=data_collator,
        callbacks=[early_stopping_callback, pruning_callback] )

    # --- W&B Run (Optionnel) ---
    if WANDB_LOGGING:
        try:
            wandb.init(project=WANDB_PROJECT_NAME, name=trial_run_name, config=trial.params,
                       reinit=True, group="Optuna Focused Search", resume="allow")
        except Exception as e: print(f"WandB init error trial {trial.number}: {e}"); trainer.args.report_to = "none"

    # --- Entraînement & Gestion Erreurs/Pruning ---
    final_f1_score = 0.0
    try:
        print(f"Starting training for trial {trial.number}...")
        train_result = trainer.train()
        print(f"Training finished for trial {trial.number}.")
        print(f"Retrieving best metric for trial {trial.number}...")
        best_metric = trainer.state.best_metric
        if best_metric is None and trainer.state.log_history:
             # Essaye de trouver la meilleure métrique dans l'historique si state.best_metric est None
             eval_logs = [log for log in trainer.state.log_history if 'eval_f1_pos' in log]
             if eval_logs:
                 final_f1_score = max(log['eval_f1_pos'] for log in eval_logs)
             else: # Si toujours rien, évalue une dernière fois
                 eval_results = trainer.evaluate()
                 final_f1_score = eval_results.get("eval_f1_pos", 0.0)
        elif best_metric is not None:
             final_f1_score = best_metric
        else: # Cas où l'entraînement a échoué très tôt
             final_f1_score = 0.0

        print(f"Trial {trial.number} - Best Validation F1_pos recorded: {final_f1_score:.4f}")
        if WANDB_LOGGING and wandb.run: wandb.log({"best_eval_f1_pos": final_f1_score})

    except optuna.exceptions.TrialPruned as e:
        print(f"!!! {e}")
        if WANDB_LOGGING and wandb.run: wandb.log({"status": "pruned", "eval_f1_pos": 0.0})
        final_f1_score = 0.0

    except Exception as e:
        print(f"!!! Trial {trial.number} failed with error: {e}")
        traceback.print_exc()
        if WANDB_LOGGING and wandb.run: wandb.log({"error": str(e), "eval_f1_pos": 0.0})
        final_f1_score = 0.0

    finally:
        if WANDB_LOGGING and wandb.run:
            try: wandb.finish()
            except Exception as e: print(f"WandB finish error: {e}")
        del model, trainer, tokenized_datasets
        torch.cuda.empty_cache()

    return final_f1_score

# --- Création et lancement de l'étude Optuna ---
print("\n--- Starting Optuna Focused Study ---")
os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)
sampler = optuna.samplers.TPESampler(seed=SEED)
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=0, interval_steps=1) # Warmup 0
storage_name = f"sqlite:///{os.path.join(BASE_OUTPUT_DIR, 'optuna_study.db')}"
study = optuna.create_study(
    study_name="ade_xlmr_large_opt_focused_v4", direction="maximize", sampler=sampler, pruner=pruner,
    storage=storage_name, load_if_exists=True )

print(f"Optuna study using storage: {storage_name}")
print(f"Number of trials already completed in storage: {len(study.trials)}")

# Lancer l'optimisation
try:
    study.optimize(objective, n_trials=NUM_TRIALS, timeout=3600 * 12, gc_after_trial=True)
except KeyboardInterrupt: print("\nOptuna study interrupted.")
except Exception as e: print(f"\nOptuna study error: {e}"); traceback.print_exc()

# --- Analyse des résultats ---
print("\n--- Optuna Study Finished ---")
print(f"Number of finished trials: {len(study.trials)}")
completed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE and t.value is not None and t.value > 1e-6]
if not completed_trials:
    print("No trials completed successfully with a positive score.")
else:
    best_trial = max(completed_trials, key=lambda t: t.value)
    print(f"\nBest trial number (among completed): {best_trial.number}")
    print(f"  Best Validation F1_pos: {best_trial.value:.4f}")
    print("  Best Hyperparameters:"); [print(f"    {k}: {v}") for k, v in best_trial.params.items()]

# --- Sauvegarder les N meilleurs modèles ---
print(f"\n--- Saving Top {N_BEST_MODELS} Models ---")
top_models_dir = os.path.join(BASE_OUTPUT_DIR, "top_models")
os.makedirs(top_models_dir, exist_ok=True)
best_completed_trials = sorted(completed_trials, key=lambda t: t.value, reverse=True)
saved_model_count = 0
for rank, trial in enumerate(best_completed_trials):
    if saved_model_count >= N_BEST_MODELS: break
    trial_output_dir = os.path.join(BASE_OUTPUT_DIR, f"trial_{trial.number}")
    source_model_dir = trial_output_dir
    model_file = "model.safetensors" if os.path.exists(os.path.join(source_model_dir, "model.safetensors")) else "pytorch_model.bin"
    required_files = ["config.json", model_file, "tokenizer_config.json", "special_tokens_map.json"]
    if not all(os.path.exists(os.path.join(source_model_dir, f)) for f in required_files):
         print(f"!!! Warning: Missing files in {source_model_dir} for trial {trial.number}. Skipping save.")
         continue
    destination_dir = os.path.join(top_models_dir, f"rank_{rank+1}_trial_{trial.number}_f1_{trial.value:.4f}")
    try:
        ignore_patterns = shutil.ignore_patterns('optimizer.pt', 'scheduler.pt', 'trainer_state.json', 'training_args.bin','logs', '*.log', 'rng_state.pth', 'events.out.tfevents.*','checkpoint-*')
        shutil.copytree(source_model_dir, destination_dir, ignore=ignore_patterns, dirs_exist_ok=True)
        trial_info = {"rank": rank + 1, "trial_number": trial.number, "validation_f1_pos": trial.value,"hyperparameters": trial.params}
        with open(os.path.join(destination_dir, "trial_info.json"), "w") as f: json.dump(trial_info, f, indent=4)
        print(f"  Rank {rank+1}: Saved model from trial {trial.number} to {destination_dir} (F1: {trial.value:.4f})")
        saved_model_count += 1
    except Exception as e: print(f"!!! Error saving model for trial {trial.number}: {e}"); traceback.print_exc()

# --- Nettoyage Optionnel ---
# Décommente pour supprimer les dossiers d'essais non conservés
# print("\n--- Cleaning up non-top trial directories ---")
# ...

# --- Message Final ---
print(f"\nScript finished. Optuna focused study completed.")
if saved_model_count > 0:
    print(f"Top {saved_model_count} models saved in '{top_models_dir}'.")
    print("You can now use these models for ensembling.")
else:
    print("No models were saved.")

CUDA disponible. Utilisation de GPU: NVIDIA GeForce RTX 3070
Loading data...
Train data: 31187 rows, Dev data: 4625 rows
Data loaded.


config.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

[I 2025-04-11 17:03:51,696] A new study created in RDB with name: ade_xlmr_large_opt_focused_v4


Computing class weights...
Class Weights (pour classes [0 1]) sur cuda: tensor([0.5426, 6.3621], device='cuda:0')

--- Starting Optuna Focused Study ---
Optuna study using storage: sqlite:///optuna_cardiffnlp-twitter-roberta-base\optuna_study.db
Number of trials already completed in storage: 0

--- Starting Trial 0 ---
  Output Dir: optuna_cardiffnlp-twitter-roberta-base\trial_0
  Hyperparameters: {'learning_rate': 1.1275465620953943e-05, 'num_train_epochs': 7, 'weight_decay': 0.0639196365086843, 'warmup_ratio': 0.10986584841970365, 'lr_scheduler_type': 'cosine', 'hidden_dropout_prob': 0.2116167224336399, 'attention_probs_dropout_prob': 0.3732352291549871}
Tokenizing data for trial...


Running tokenizer:   0%|          | 0/31187 [00:00<?, ? examples/s]

Running tokenizer:   0%|          | 0/4625 [00:00<?, ? examples/s]

Tokenization complete. Remaining cols: ['labels', 'input_ids', 'attention_mask']
Loading model for trial...


pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(


Model loaded (dtype: torch.bfloat16).
Starting training for trial 0...


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1 Pos,Precision Pos,Recall Pos,F1 Neg
1,0.6523,0.63143,0.868108,0.2375,0.236318,0.238693,0.927811
2,0.5304,0.600862,0.771027,0.284943,0.194829,0.530151,0.863689
3,0.4969,0.571057,0.634811,0.265971,0.160799,0.768844,0.756943
4,0.4721,0.573957,0.645838,0.272647,0.165588,0.771357,0.765933


[I 2025-04-11 17:34:10,658] Trial 0 finished with value: 0.2849426063470628 and parameters: {'learning_rate': 1.1275465620953943e-05, 'num_train_epochs': 7, 'weight_decay': 0.0639196365086843, 'warmup_ratio': 0.10986584841970365, 'lr_scheduler_type': 'cosine', 'hidden_dropout_prob': 0.2116167224336399, 'attention_probs_dropout_prob': 0.3732352291549871}. Best is trial 0 with value: 0.2849426063470628.


Training finished for trial 0.
Retrieving best metric for trial 0...
Trial 0 - Best Validation F1_pos recorded: 0.2849

--- Starting Trial 1 ---
  Output Dir: optuna_cardiffnlp-twitter-roberta-base\trial_1
  Hyperparameters: {'learning_rate': 1.3877067474879668e-05, 'num_train_epochs': 6, 'weight_decay': 0.021235069657748146, 'warmup_ratio': 0.1469909852161994, 'lr_scheduler_type': 'cosine', 'hidden_dropout_prob': 0.23636499344142015, 'attention_probs_dropout_prob': 0.23668090197068678}
Tokenizing data for trial...


Running tokenizer:   0%|          | 0/31187 [00:00<?, ? examples/s]

Running tokenizer:   0%|          | 0/4625 [00:00<?, ? examples/s]

Tokenization complete. Remaining cols: ['labels', 'input_ids', 'attention_mask']
Loading model for trial...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(


Model loaded (dtype: torch.bfloat16).
Starting training for trial 1...


Epoch,Training Loss,Validation Loss,Accuracy,F1 Pos,Precision Pos,Recall Pos,F1 Neg
1,0.6545,0.636211,0.884973,0.273224,0.299401,0.251256,0.937544
2,0.5318,0.752782,0.912432,0.201183,0.46789,0.128141,0.953677
3,0.5015,0.550574,0.760649,0.305956,0.203843,0.613065,0.855389
4,0.4793,0.589865,0.848649,0.332061,0.267692,0.437186,0.914655
5,0.4762,0.612825,0.862919,0.33543,0.28777,0.40201,0.923578


[I 2025-04-11 18:17:19,594] Trial 1 finished with value: 0.33542976939203356 and parameters: {'learning_rate': 1.3877067474879668e-05, 'num_train_epochs': 6, 'weight_decay': 0.021235069657748146, 'warmup_ratio': 0.1469909852161994, 'lr_scheduler_type': 'cosine', 'hidden_dropout_prob': 0.23636499344142015, 'attention_probs_dropout_prob': 0.23668090197068678}. Best is trial 1 with value: 0.33542976939203356.


Training finished for trial 1.
Retrieving best metric for trial 1...
Trial 1 - Best Validation F1_pos recorded: 0.3354

--- Starting Trial 2 ---
  Output Dir: optuna_cardiffnlp-twitter-roberta-base\trial_2
  Hyperparameters: {'learning_rate': 1.0572072866769706e-05, 'num_train_epochs': 6, 'weight_decay': 0.04591670111852694, 'warmup_ratio': 0.07912291401980419, 'lr_scheduler_type': 'cosine', 'hidden_dropout_prob': 0.2584289297070437, 'attention_probs_dropout_prob': 0.2732723686587384}
Tokenizing data for trial...


Running tokenizer:   0%|          | 0/31187 [00:00<?, ? examples/s]

## Best model ever

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback # Importer le callback pour l'arrêt précoce
)
# Assurez-vous que wandb est initialisé si vous l'utilisez, sinon commentez/supprimez les lignes wandb.log
import wandb
wandb.init(project="ade-classification-chatgptuamgneted_xlmr") # Exemple d'initialisation

# CORRECTION ICI: Ajout de precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch
import zipfile # Pour la sauvegarde finale
import os # Pour la sauvegarde finale
import torch._dynamo
torch._dynamo.config.suppress_errors = True

# --- Configuration ---
MODEL_NAME = "xlm-roberta-large"  # Passer à la version Large
TRAIN_CSV = "data/train_data_SMM4H_2025_Task_1.csv" # Utiliser le CSV d'entraînement augmenté
DEV_CSV = "data/dev_data_SMM4H_2025_Task_1.csv"
OUTPUT_DIR = "easy_xlmr_large_256"  # Répertoire de sortie pour le modèle et les résultats
NUM_EPOCHS = 6  # Réduire légèrement les epochs
BATCH_SIZE = 2  # Plus petit batch size
LEARNING_RATE = 1e-5  # Learning rate plus bas pour le large
GRADIENT_ACCUMULATION_STEPS = 16  # Accumulation plus importante
EARLY_STOPPING_PATIENCE = 1  # Arrêt plus rapide si stagnation
MAX_LENGTH = 512  # Réduire la longueur maximale des séquences
WEIGHT_DECAY = 0.05  # Poids de régularisation

# --- 1. Load Data ---
print("Loading data...")
try:
    train_df = pd.read_csv(TRAIN_CSV).dropna(subset=['text'])
    dev_df = pd.read_csv(DEV_CSV).dropna(subset=['text'])
except FileNotFoundError as e:
    print(f"Error loading CSV files: {e}")
    exit()

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)
dataset_dict = DatasetDict({'train': train_dataset, 'validation': dev_dataset})
print("Data loaded.")

# --- 2. Tokenization ---
print("Tokenizing data...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# --- Modifications de la fonction de tokenization ---
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True  # Pour éviter de dépasser les limites du modèle
    )

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
print("Tokenization complete.")

# --- Clean up columns ---
print("Cleaning dataset columns...")
print("Columns before removal:", tokenized_datasets['train'].column_names)
columns_to_remove = ["text", "id", "file_name", "origin", "language", "split", "type"]
actual_columns_to_remove = [col for col in columns_to_remove if col in tokenized_datasets['train'].column_names]
print("Removing columns:", actual_columns_to_remove)
tokenized_datasets = tokenized_datasets.remove_columns(actual_columns_to_remove)
tokenized_datasets.set_format("torch")

# Rename 'label' to 'labels'
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
print("Columns after cleaning and rename:", tokenized_datasets['train'].column_names)

# --- 3. Compute Class Weights ---
print("Computing class weights...")
labels_train = train_df['label'].values
if len(np.unique(labels_train)) > 1: # Ensure there are at least two classes
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(labels_train), y=labels_train)
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Class Weights: {class_weights_tensor}")
else:
    print("Warning: Only one class found in training data. Cannot compute class weights.")
    class_weights_tensor = None # Handle this case in the loss function if needed

# --- Custom Trainer for Weighted Loss ---
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Use weights only if they were computed
        if class_weights_tensor is not None:
            loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
        else:
            loss_fct = torch.nn.CrossEntropyLoss() # Default unweighted loss
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# --- 4. Model & Metrics ---
print("Loading model...")
# --- Chargement du modèle avec optimisations ---
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, 
    num_labels=2,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
print("Model loaded.")

# --- Increase dropout to help regularization ---
model.config.hidden_dropout_prob = 0.3  # Dropout on fully connected layers
model.config.attention_probs_dropout_prob = 0.3  # Dropout on attention probabilities
print("Dropout increased: hidden_dropout=0.3, attention_dropout=0.3")

# Activer le gradient checkpointing pour économiser la mémoire
model.gradient_checkpointing_enable()

# Fonction compute_metrics qui utilise la fonction importée
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # Utilisation de precision_recall_fscore_support
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=None, labels=[0, 1], zero_division=0)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1_pos': f1[1], # F1 for class 1
        'precision_pos': precision[1],
        'recall_pos': recall[1],
        'f1_neg': f1[0], # F1 for class 0 (for info)
    }

# --- 5. Training Arguments ---
print("Setting training arguments...")
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,  # Modifié (était BATCH_SIZE*2)
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    lr_scheduler_type="cosine",  # <-- Ajouté
    warmup_ratio=0.1,            # <-- Ajouté (10% des steps en warmup)
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_pos",
    greater_is_better=True,
    logging_dir='./logs',
    logging_steps=50,
    report_to="wandb" if "wandb" in locals() else "none",
    fp16=False,  # Disabled FP16 to avoid gradient unscaling issues
    bf16=True if torch.cuda.is_available() else False,  # Use BF16 instead if available
    optim="adafactor",  # Ajouté - Optimiseur plus léger
    gradient_checkpointing=True,  # Ajouté - Économie de mémoire
    #torch_compile=False,  # Ajouté - Acceleration PyTorch 2.x+
    save_total_limit=2
    # early_stopping_patience=EARLY_STOPPING_PATIENCE,  # Déjà géré par le callback
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# --- 6. Trainer ---
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator,  # Utilisation du DataCollator
    # Ajout du Callback pour Early Stopping
    callbacks=[EarlyStoppingCallback(early_stopping_patience=EARLY_STOPPING_PATIENCE)]
)
print("Trainer configured.")

# --- 7. Train ---
print("Starting Training...")
train_result = trainer.train()
print("Training finished.")
trainer.save_model("best_model_f1") # Sauvegarde le meilleur modèle

# Log training metrics
# trainer.log_metrics("train", train_result.metrics) # Décommenter si besoin
# trainer.save_metrics("train", train_result.metrics) # Décommenter si besoin
# trainer.save_state() # Sauvegarde l'état du Trainer

# --- 8. Evaluate on Dev Set (using the best model loaded) ---
print("\nEvaluating on Development Set (Best Model)...")
eval_results = trainer.evaluate()
print("Evaluation Results:")
print(eval_results)
# trainer.log_metrics("eval", eval_results) # Décommenter si besoin
# trainer.save_metrics("eval", eval_results) # Décommenter si besoin

# --- 9. Detailed Evaluation and Submission File Generation ---
print("\nGenerating predictions and detailed metrics for Dev Set...")

# Generate predictions
predictions = trainer.predict(tokenized_datasets["validation"])
predicted_labels = predictions.predictions.argmax(-1)

# --- Threshold tuning ---
print("\nRunning threshold tuning on positive class...")

# Get predicted probabilities for class 1
logits = torch.tensor(predictions.predictions)
probs = torch.nn.functional.softmax(logits, dim=1)[:, 1]  # Probabilities for class 1
y_true = predictions.label_ids

# Baseline (argmax) F1
baseline_f1 = f1_score(y_true, predicted_labels, pos_label=1)
print(f"Baseline F1 (argmax): {baseline_f1:.4f}")

# Search best threshold
best_thresh = 0.5
best_f1 = 0.0
thresholds = np.linspace(0.1, 0.9, 81)

for t in thresholds:
    preds_thresh = (probs >= t).int()
    f1 = f1_score(y_true, preds_thresh, pos_label=1)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"Best threshold found: {best_thresh:.4f} → Tuned F1: {best_f1:.4f}")

# Save threshold to file
threshold_path = os.path.join(OUTPUT_DIR, "threshold.txt")
with open(threshold_path, "w") as f:
    f.write(f"{best_thresh:.4f}")
print(f"Threshold saved to {threshold_path}")

# Add predictions to the dev dataframe (ensure index alignment if necessary)
# If dev_df was filtered by dropna, indices might not match directly. Resetting index helps.
dev_df_eval = dev_df.reset_index(drop=True)
# Check lengths match before assigning
if len(dev_df_eval) == len(predicted_labels):
    dev_df_eval['predicted_label'] = predicted_labels
else:
    print(f"Error: Length mismatch! Dev DF has {len(dev_df_eval)} rows, Predictions have {len(predicted_labels)} entries.")
    # Handle error appropriately, maybe skip detailed eval or investigate dropna impact
    exit()


# Re-extract language (assuming ID format 'lang_...') - Be careful if IDs differ
dev_df_eval["language"] = dev_df_eval["id"].apply(lambda x: str(x).split("_")[0] if isinstance(x, str) and "_" in x else "unknown")

# --- Calculate Per-Language and Overall Metrics ---
languages = sorted(dev_df_eval['language'].unique())
per_language_metrics = {}
language_f1_scores = []
all_true_labels_eval = []
all_pred_labels_eval = []

print("\n--- Detailed Evaluation on Development Set ---")
wandb_logs = {} # Collect logs for wandb

for lang in languages:
    if lang == "unknown": continue # Skip if language couldn't be extracted
    lang_mask = dev_df_eval['language'] == lang
    y_true_lang = dev_df_eval.loc[lang_mask, 'label']
    y_pred_lang = dev_df_eval.loc[lang_mask, 'predicted_label']

    if len(y_true_lang) == 0: continue # Skip if no data for this language

    all_true_labels_eval.extend(y_true_lang.tolist())
    all_pred_labels_eval.extend(y_pred_lang.tolist())

    # Utilisation de precision_recall_fscore_support (qui est maintenant importé)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true_lang, y_pred_lang, average=None, labels=[0, 1], zero_division=0)
    accuracy_lang = accuracy_score(y_true_lang, y_pred_lang) # Renommé pour éviter conflit avec la fonction accuracy_score

    per_language_metrics[lang] = {'precision': precision[1], 'recall': recall[1], 'f1': f1[1], 'accuracy': accuracy_lang}
    language_f1_scores.append(f1[1]) # On stocke le F1 de la classe positive (1)

    print(f"\nMetrics for language: {lang.upper()}")
    print(f"  Precision-{lang} (Pos): {precision[1]:.4f}")
    print(f"  Recall-{lang}    (Pos): {recall[1]:.4f}")
    print(f"  F1-{lang}        (Pos): {f1[1]:.4f}")
    print(f"  Accuracy-{lang}:        {accuracy_lang:.4f}")

    # Prepare logs for wandb
    wandb_logs[f"{lang}/precision_pos"] = precision[1]
    wandb_logs[f"{lang}/recall_pos"] = recall[1]
    wandb_logs[f"{lang}/f1_pos"] = f1[1]
    wandb_logs[f"{lang}/accuracy"] = accuracy_lang


# Calculate Overall Metrics (using the full dev set lists)
cm_overall = confusion_matrix(all_true_labels_eval, all_pred_labels_eval, labels=[0, 1])
tn, fp, fn, tp = cm_overall.ravel() if cm_overall.size == 4 else (0, 0, 0, 0) # Handle cases with missing classes

overall_precision_pos = tp / (tp + fp) if (tp + fp) > 0 else 0
overall_recall_pos = tp / (tp + fn) if (tp + fn) > 0 else 0
# Overall F1 (Primary Metric for Positive Class)
overall_f1_pos = 2 * (overall_precision_pos * overall_recall_pos) / (overall_precision_pos + overall_recall_pos) if (overall_precision_pos + overall_recall_pos) > 0 else 0
# Macro F1 (Average of per-language F1s for Positive Class)
macro_f1_pos = np.mean(language_f1_scores) if language_f1_scores else 0
# Overall Accuracy
overall_accuracy = accuracy_score(all_true_labels_eval, all_pred_labels_eval)

print("\n--- Overall Evaluation Summary (Positive Class Focus) ---")
print(f"F1-score across all languages (Positive Class): {overall_f1_pos:.4f}  <-- Primary Metric")
print(f"Macro F1-score across all languages (Pos Class):{macro_f1_pos:.4f}")
print(f"Overall Precision (Positive Class):             {overall_precision_pos:.4f}")
print(f"Overall Recall (Positive Class):                {overall_recall_pos:.4f}")
print(f"Overall Accuracy across all languages:          {overall_accuracy:.4f}")

print("\nOverall Confusion Matrix (All Languages):")
print(f"[[TN={tn}  FP={fp}]")
print(f" [FN={fn}  TP={tp}]]")

# Add overall metrics to wandb logs
wandb_logs["overall/f1_pos"] = overall_f1_pos
wandb_logs["overall/macro_f1_pos"] = macro_f1_pos
wandb_logs["overall/precision_pos"] = overall_precision_pos
wandb_logs["overall/recall_pos"] = overall_recall_pos
wandb_logs["overall/accuracy"] = overall_accuracy
wandb_logs["overall/TP"] = tp
wandb_logs["overall/FP"] = fp
wandb_logs["overall/FN"] = fn
wandb_logs["overall/TN"] = tn

# Log Confusion Matrix to wandb (optional)
if "wandb" in locals() and tp+fp+fn+tn > 0:
     try:
        wandb_logs["confusion_matrix"] = wandb.plot.confusion_matrix(
             probs=None,
             y_true=all_true_labels_eval,
             preds=all_pred_labels_eval,
             class_names=["Negative (0)", "Positive (1)"]
         )
     except Exception as e:
         print(f"Could not log confusion matrix to wandb: {e}")


# Log all collected metrics to wandb (if initialized)
if "wandb" in locals():
    try:
        wandb.log(wandb_logs)
        print("Metrics logged to WandB.")
    except Exception as e:
        print(f"Could not log metrics to wandb: {e}")


# --- 10. Save Submission File ---
print("\nSaving predictions for submission...")
# Ensure results directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Prepare submission dataframe
submission_df = dev_df_eval[['id', 'predicted_label']]

# Define CSV and ZIP paths
csv_filename = "predictions_task1.csv"
zip_filename = "submission_task1.zip"
csv_path = os.path.join(OUTPUT_DIR, csv_filename)
zip_path = os.path.join(OUTPUT_DIR, zip_filename)

# Save CSV
submission_df.to_csv(csv_path, index=False)
print(f"Predictions saved to {csv_path}")

# Zip the CSV file
with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
    zf.write(csv_path, arcname=csv_filename)
print(f"{csv_filename} has been zipped into {zip_path}")


print("\nScript finished.")