## XLM-RoBERTa large full fine-tuning BEST


In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    EarlyStoppingCallback
)


from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import torch
import zipfile # Pour la sauvegarde finale
import os # Pour la sauvegarde finale
import torch._dynamo
torch._dynamo.config.suppress_errors = True

# --- Configuration ---
# Paths and hyperparameters for this run; the names below are read by the rest of the script.
MODEL_NAME = "xlm-roberta-large"  # checkpoint name passed to the tokenizer and model loaders
TRAIN_CSV = "data/train_data_SMM4H_2025_Task_1.csv"  # training split
DEV_CSV = "data/dev_data_SMM4H_2025_Task_1.csv"  # development split
OUTPUT_DIR = "easy_xlmr_large_256"  # Trainer output directory; also receives threshold.txt and the submission files
NUM_EPOCHS = 6  # upper bound on epochs (an early-stopping callback is attached to the trainer)
BATCH_SIZE = 2  # per-device batch size, used for both training and evaluation
LEARNING_RATE = 1e-5  
GRADIENT_ACCUMULATION_STEPS = 16  # with BATCH_SIZE = 2 this gives 32 examples per optimizer step per device
EARLY_STOPPING_PATIENCE = 1  # patience handed to EarlyStoppingCallback
MAX_LENGTH = 256  # intended maximum sequence length in tokens; 512 did not fit on the author's machine
WEIGHT_DECAY = 0.05

# --- 1. Load Data ---
# Read both CSV splits and drop rows without text, since they cannot be
# tokenized. The index is reset after dropping rows so that
# Dataset.from_pandas does not store a leftover non-range index as an extra
# `__index_level_0__` column.
print("Loading data...")
try:
    train_df = pd.read_csv(TRAIN_CSV).dropna(subset=['text']).reset_index(drop=True)
    dev_df = pd.read_csv(DEV_CSV).dropna(subset=['text']).reset_index(drop=True)
except FileNotFoundError as e:
    print(f"Error loading CSV files: {e}")
    # Raise instead of calling the interactive `exit()` helper: with no
    # argument `exit()` reports success (status 0), and in IPython/Jupyter it
    # may not stop the rest of the cell from running.
    raise SystemExit(1) from e

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)
dataset_dict = DatasetDict({'train': train_dataset, 'validation': dev_dataset})
print("Data loaded.")

# --- 2. Tokenization ---
print("Tokenizing data...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(examples):
    """Tokenize a batch of examples, truncating each text to MAX_LENGTH tokens.

    Args:
        examples: A batch from `Dataset.map(..., batched=True)`; only its
            "text" entry is read.

    Returns:
        The tokenizer output for the batch. No padding is applied here; the
        script pads per batch later through DataCollatorWithPadding.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        # Without an explicit max_length, truncation falls back to the model's
        # own maximum and the MAX_LENGTH setting above is silently ignored.
        max_length=MAX_LENGTH,
    )


tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
print("Tokenization complete.")

# --- Clean up columns ---
# Keep only what the model consumes (token ids, attention mask, labels) and
# expose everything as torch tensors.
print("Cleaning dataset columns...")
train_columns = tokenized_datasets['train'].column_names
print("Columns before removal:", train_columns)

# Candidate metadata columns; only those present in the train split are dropped.
columns_to_remove = ["text", "id", "file_name", "origin", "language", "split", "type"]
actual_columns_to_remove = [
    name for name in columns_to_remove if name in train_columns
]
print("Removing columns:", actual_columns_to_remove)

tokenized_datasets = tokenized_datasets.remove_columns(actual_columns_to_remove)
tokenized_datasets.set_format("torch")

# The target column is renamed from 'label' to 'labels', the key the custom loss reads.
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
print("Columns after cleaning and rename:", tokenized_datasets['train'].column_names)

# --- 3. Compute Class Weights ---
# 'balanced' weights from scikit-learn, computed on the training labels.
print("Computing class weights...")
labels_train = train_df['label'].values
unique_labels = np.unique(labels_train)

if len(unique_labels) <= 1:
    # A single class cannot be balanced against anything.
    print("Warning: Only one class found in training data. Cannot compute class weights.")
    class_weights_tensor = None  # the loss falls back to unweighted cross-entropy
else:
    weights_device = "cuda" if torch.cuda.is_available() else "cpu"
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=unique_labels,
        y=labels_train,
    )
    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(weights_device)
    print(f"Class Weights: {class_weights_tensor}")

# --- Custom Trainer for Weighted Loss ---
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by `class_weights_tensor`.

    The weights are read from the module-level `class_weights_tensor`; when it
    is None the loss is plain (unweighted) cross-entropy.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on `inputs` and return the (optionally weighted) loss.

        Args:
            model: The model to call with `**inputs`.
            inputs: Batch dict; must contain "labels".
            return_outputs: When true, return `(loss, outputs)` instead of
                just `loss`.
            **kwargs: Accepted for compatibility with the caller and ignored.

        Returns:
            The loss tensor, or a `(loss, outputs)` tuple.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        weight = None
        if class_weights_tensor is not None:
            # The weights were placed on "cuda"/"cpu" when they were computed,
            # which may differ from where the model actually runs; align them
            # with the logits so the loss never mixes devices.
            weight = class_weights_tensor.to(logits.device)

        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# --- 4. Model & Metrics ---
print("Loading model...")
# Load the classifier with the higher dropout applied at construction time.
# The dropout probabilities are read from the config while the layers are
# being built, so assigning to `model.config` after loading leaves the
# already-built layers unchanged. Passing them as keyword arguments overrides
# the config before the model is instantiated.
# NOTE(review): runs made with the previous code effectively trained with the
# checkpoint's default dropout; re-validate metrics with 0.3 actually applied.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    hidden_dropout_prob=0.3,  # dropout on fully connected layers
    attention_probs_dropout_prob=0.3,  # dropout on attention probabilities
)
print("Model loaded.")
print("Dropout increased: hidden_dropout=0.3, attention_dropout=0.3")

# Enable gradient checkpointing to save memory
model.gradient_checkpointing_enable()

# Metric callback handed to the Trainer
def compute_metrics(pred):
    """Return accuracy plus per-class precision/recall/F1 for labels 0 and 1.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (scores whose last axis is arg-maxed into predicted labels).

    Returns:
        Dict with keys 'accuracy', 'f1_pos', 'precision_pos', 'recall_pos'
        (class 1) and 'f1_neg' (class 0, informational).
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # average=None yields one value per label, ordered as labels=[0, 1].
    per_class_precision, per_class_recall, per_class_f1, _support = (
        precision_recall_fscore_support(
            y_true, y_pred, average=None, labels=[0, 1], zero_division=0
        )
    )

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1_pos'] = per_class_f1[1]
    metrics['precision_pos'] = per_class_precision[1]
    metrics['recall_pos'] = per_class_recall[1]
    metrics['f1_neg'] = per_class_f1[0]
    return metrics

# --- 5. Training Arguments ---
print("Setting training arguments...")
# Report to Weights & Biases only when a `wandb` name exists in this scope.
report_target = "wandb" if "wandb" in locals() else "none"

training_args = TrainingArguments(
    # Output, checkpointing and model selection
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_pos",
    greater_is_better=True,
    # Schedule and batch sizes
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    # Optimisation
    optim="adafactor",  # lighter-weight optimizer
    learning_rate=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,  # 10% of the steps are warmup
    # Precision and memory
    fp16=False,  # disabled to avoid gradient unscaling issues
    bf16=torch.cuda.is_available(),  # BF16 is requested whenever CUDA is available
    gradient_checkpointing=True,  # saves memory
    # Logging
    logging_dir='./logs',
    logging_steps=50,
    report_to=report_target,
)

# Pads each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# --- 6. Trainer ---
# Early stopping is attached as a callback with the configured patience.
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=EARLY_STOPPING_PATIENCE
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)
print("Trainer configured.")

# --- 7. Train ---
print("Starting Training...")
train_result = trainer.train()
print("Training finished.")
# Saves the model currently held by the trainer; presumably the best checkpoint,
# since load_best_model_at_end=True is set in the training arguments.
trainer.save_model("best_model_f1")

# Log training metrics
# trainer.log_metrics("train", train_result.metrics) # Uncomment if needed
# trainer.save_metrics("train", train_result.metrics) # Uncomment if needed
# trainer.save_state() # Saves the Trainer state

# --- 8. Evaluate on Dev Set (using the best model loaded) ---
print("\nEvaluating on Development Set (Best Model)...")
# Runs evaluation on the eval_dataset given to the trainer and returns a metrics dict.
eval_results = trainer.evaluate()
print("Evaluation Results:")
print(eval_results)
# trainer.log_metrics("eval", eval_results) # Uncomment if needed
# trainer.save_metrics("eval", eval_results) # Uncomment if needed

# --- 9. Detailed Evaluation and Submission File Generation ---
print("\nGenerating predictions and detailed metrics for Dev Set...")

# Generate predictions; the arg-max labels are what the rest of the script reports.
predictions = trainer.predict(tokenized_datasets["validation"])
predicted_labels = predictions.predictions.argmax(-1)

# --- Threshold tuning ---
# NOTE(review): the threshold is both tuned and scored on the same dev set, so
# the tuned F1 is optimistic. It is only saved to disk here; `predicted_labels`
# above stays arg-max based.
print("\nRunning threshold tuning on positive class...")

# Get predicted probabilities for class 1
logits = torch.tensor(predictions.predictions)
probs = torch.nn.functional.softmax(logits, dim=1)[:, 1]
y_true = predictions.label_ids

# Baseline (argmax) F1
baseline_f1 = f1_score(y_true, predicted_labels, pos_label=1)
print(f"Baseline F1 (argmax): {baseline_f1:.4f}")

# Search thresholds 0.10 .. 0.90 in steps of 0.01; the first threshold that
# reaches the highest F1 wins (strict '>' keeps the earliest on ties).
best_thresh = 0.5
best_f1 = 0.0
thresholds = np.linspace(0.1, 0.9, 81)

for t in thresholds:
    preds_thresh = (probs >= t).int()
    f1 = f1_score(y_true, preds_thresh, pos_label=1)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

print(f"Best threshold found: {best_thresh:.4f} → Tuned F1: {best_f1:.4f}")

# Save threshold to file. Create the directory first: nothing earlier in this
# script creates OUTPUT_DIR explicitly, so do not rely on the trainer having
# written a checkpoint there.
os.makedirs(OUTPUT_DIR, exist_ok=True)
threshold_path = os.path.join(OUTPUT_DIR, "threshold.txt")
with open(threshold_path, "w", encoding="utf-8") as f:
    f.write(f"{best_thresh:.4f}")
print(f"Threshold saved to {threshold_path}")

# Add predictions to the dev dataframe. The index is reset so that positional
# predictions line up with rows even if dropna removed some of them.
dev_df_eval = dev_df.reset_index(drop=True)

# Every row needs exactly one prediction; anything else means the dataframe and
# the tokenized validation set have diverged.
if len(dev_df_eval) != len(predicted_labels):
    print(f"Error: Length mismatch! Dev DF has {len(dev_df_eval)} rows, Predictions have {len(predicted_labels)} entries.")
    # Raise instead of calling the interactive `exit()` helper: with no
    # argument `exit()` reports success (status 0), and in IPython/Jupyter it
    # may not stop the rest of the cell from running.
    raise SystemExit(1)
dev_df_eval['predicted_label'] = predicted_labels


# Re-derive the language from the id prefix (assumes ids look like 'lang_...');
# ids that are not strings or contain no underscore are tagged "unknown".
dev_df_eval["language"] = dev_df_eval["id"].apply(lambda x: str(x).split("_")[0] if isinstance(x, str) and "_" in x else "unknown")

# --- Calculate Per-Language and Overall Metrics ---
# Accumulators filled by the loop below and consumed by the overall summary.
languages = sorted(dev_df_eval['language'].unique())
per_language_metrics = {}
language_f1_scores = []
all_true_labels_eval = []
all_pred_labels_eval = []

print("\n--- Detailed Evaluation on Development Set ---")
wandb_logs = {}  # metrics collected for an optional wandb.log call

for lang in languages:
    # Rows whose language could not be extracted are left out entirely.
    if lang == "unknown":
        continue

    rows = dev_df_eval[dev_df_eval['language'] == lang]
    if rows.shape[0] == 0:
        continue

    true_labels = rows['label']
    pred_labels = rows['predicted_label']
    all_true_labels_eval.extend(true_labels.tolist())
    all_pred_labels_eval.extend(pred_labels.tolist())

    # Per-class scores ordered as labels=[0, 1]; index 1 is the positive class.
    prec_by_class, rec_by_class, f1_by_class, _support = precision_recall_fscore_support(
        true_labels, pred_labels, average=None, labels=[0, 1], zero_division=0
    )
    pos_precision = prec_by_class[1]
    pos_recall = rec_by_class[1]
    pos_f1 = f1_by_class[1]
    lang_accuracy = accuracy_score(true_labels, pred_labels)

    per_language_metrics[lang] = {
        'precision': pos_precision,
        'recall': pos_recall,
        'f1': pos_f1,
        'accuracy': lang_accuracy,
    }
    language_f1_scores.append(pos_f1)

    print(f"\nMetrics for language: {lang.upper()}")
    print(f"  Precision-{lang} (Pos): {pos_precision:.4f}")
    print(f"  Recall-{lang}    (Pos): {pos_recall:.4f}")
    print(f"  F1-{lang}        (Pos): {pos_f1:.4f}")
    print(f"  Accuracy-{lang}:        {lang_accuracy:.4f}")

    # Same numbers, keyed per language for wandb.
    wandb_logs.update({
        f"{lang}/precision_pos": pos_precision,
        f"{lang}/recall_pos": pos_recall,
        f"{lang}/f1_pos": pos_f1,
        f"{lang}/accuracy": lang_accuracy,
    })

# Overall metrics over the labels gathered by the per-language loop
# (rows tagged "unknown" were not gathered, so they are not counted here).
cm_overall = confusion_matrix(all_true_labels_eval, all_pred_labels_eval, labels=[0, 1])
if cm_overall.size == 4:
    tn, fp, fn, tp = cm_overall.ravel()
else:
    # Fallback for an unexpectedly shaped matrix.
    tn, fp, fn, tp = 0, 0, 0, 0

# Positive-class precision and recall, guarding against empty denominators.
predicted_positive = tp + fp
actual_positive = tp + fn
if predicted_positive > 0:
    overall_precision_pos = tp / predicted_positive
else:
    overall_precision_pos = 0
if actual_positive > 0:
    overall_recall_pos = tp / actual_positive
else:
    overall_recall_pos = 0

# Overall F1 for the positive class (primary metric).
precision_plus_recall = overall_precision_pos + overall_recall_pos
if precision_plus_recall > 0:
    overall_f1_pos = 2 * (overall_precision_pos * overall_recall_pos) / precision_plus_recall
else:
    overall_f1_pos = 0

# Macro F1: unweighted mean of the per-language positive-class F1 scores.
if language_f1_scores:
    macro_f1_pos = np.mean(language_f1_scores)
else:
    macro_f1_pos = 0

overall_accuracy = accuracy_score(all_true_labels_eval, all_pred_labels_eval)

print("\n--- Overall Evaluation Summary (Positive Class Focus) ---")
print(f"F1-score across all languages (Positive Class): {overall_f1_pos:.4f}  <-- Primary Metric")
print(f"Macro F1-score across all languages (Pos Class):{macro_f1_pos:.4f}")
print(f"Overall Precision (Positive Class):             {overall_precision_pos:.4f}")
print(f"Overall Recall (Positive Class):                {overall_recall_pos:.4f}")
print(f"Overall Accuracy across all languages:          {overall_accuracy:.4f}")

print("\nOverall Confusion Matrix (All Languages):")
print(f"[[TN={tn}  FP={fp}]")
print(f" [FN={fn}  TP={tp}]]")

# Queue the overall numbers for wandb alongside the per-language ones.
wandb_logs.update({
    "overall/f1_pos": overall_f1_pos,
    "overall/macro_f1_pos": macro_f1_pos,
    "overall/precision_pos": overall_precision_pos,
    "overall/recall_pos": overall_recall_pos,
    "overall/accuracy": overall_accuracy,
    "overall/TP": tp,
    "overall/FP": fp,
    "overall/FN": fn,
    "overall/TN": tn,
})

# Optional Weights & Biases logging: runs only when a `wandb` name is defined
# in this scope. Both steps are best-effort; a failure is reported and the
# script carries on.
if "wandb" in locals():
    # Confusion-matrix plot, only when there is at least one counted example.
    if tp + fp + fn + tn > 0:
        try:
            wandb_logs["confusion_matrix"] = wandb.plot.confusion_matrix(
                probs=None,
                y_true=all_true_labels_eval,
                preds=all_pred_labels_eval,
                class_names=["Negative (0)", "Positive (1)"],
            )
        except Exception as e:
            print(f"Could not log confusion matrix to wandb: {e}")

    # Send everything collected so far in a single call.
    try:
        wandb.log(wandb_logs)
        print("Metrics logged to WandB.")
    except Exception as e:
        print(f"Could not log metrics to wandb: {e}")


# --- 10. Save Submission File ---
print("\nSaving predictions for submission...")
os.makedirs(OUTPUT_DIR, exist_ok=True)  # make sure the results directory exists

# File names and their locations inside the output directory.
csv_filename = "predictions_task1.csv"
zip_filename = "submission_task1.zip"
csv_path, zip_path = (
    os.path.join(OUTPUT_DIR, name) for name in (csv_filename, zip_filename)
)

# The submission holds one row per dev example: its id and predicted label.
submission_df = dev_df_eval[['id', 'predicted_label']]
submission_df.to_csv(csv_path, index=False)
print(f"Predictions saved to {csv_path}")

# Pack the CSV into a deflate-compressed archive, stored under its bare file name.
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as archive:
    archive.write(csv_path, arcname=csv_filename)
print(f"{csv_filename} has been zipped into {zip_path}")


print("\nScript finished.")




wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: caron-olivier-80 (caron-olivier-80-universit-paris-dauphine-psl) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


Loading data...
Data loaded.
Tokenizing data...


Map:   0%|          | 0/31187 [00:00<?, ? examples/s]

Map:   0%|          | 0/4625 [00:00<?, ? examples/s]

Tokenization complete.
Cleaning dataset columns...
Columns before removal: ['id', 'text', 'label', 'file_name', 'origin', 'type', 'language', 'split', 'input_ids', 'attention_mask']
Removing columns: ['text', 'id', 'file_name', 'origin', 'language', 'split', 'type']
Columns after cleaning and rename: ['labels', 'input_ids', 'attention_mask']
Computing class weights...
Class Weights: tensor([0.5426, 6.3621], device='cuda:0')
Loading model...


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded.
Dropout increased: hidden_dropout=0.3, attention_dropout=0.3
Setting training arguments...


  trainer = WeightedTrainer(


Trainer configured.
Starting Training...




Epoch,Training Loss,Validation Loss,Accuracy,F1 Pos,Precision Pos,Recall Pos,F1 Neg
0,0.2374,0.242307,0.946162,0.628912,0.772894,0.530151,0.970976
1,0.1689,0.196939,0.936216,0.674033,0.601578,0.766332,0.964649
2,0.1383,0.270457,0.936432,0.687898,0.595588,0.81407,0.964612
3,0.1111,0.277541,0.95027,0.718137,0.700957,0.736181,0.972729
4,0.0516,0.30639,0.953081,0.730435,0.722359,0.738693,0.974304
5,0.0737,0.305603,0.952865,0.731527,0.717391,0.746231,0.974164


Training finished.

Evaluating on Development Set (Best Model)...


Evaluation Results:
{'eval_loss': 0.3056026101112366, 'eval_accuracy': 0.9528648648648649, 'eval_f1_pos': 0.7315270935960592, 'eval_precision_pos': 0.717391304347826, 'eval_recall_pos': 0.7462311557788944, 'eval_f1_neg': 0.9741644939559138, 'eval_runtime': 47.3617, 'eval_samples_per_second': 97.653, 'eval_steps_per_second': 48.837, 'epoch': 5.999358727715788}

Generating predictions and detailed metrics for Dev Set...

Running threshold tuning on positive class...
Baseline F1 (argmax): 0.7315
Best threshold found: 0.6000 → Tuned F1: 0.7338
Threshold saved to easy_xlmr_large_256\threshold.txt

--- Detailed Evaluation on Development Set ---

Metrics for language: DE
  Precision-de (Pos): 0.6053
  Recall-de    (Pos): 0.6571
  F1-de        (Pos): 0.6301
  Accuracy-de:        0.9574

Metrics for language: EN
  Precision-en (Pos): 0.7727
  Recall-en    (Pos): 0.8361
  F1-en        (Pos): 0.8031
  Accuracy-en:        0.9723

Metrics for language: FR
  Precision-fr (Pos): 0.7297
  Recall-fr   