# 4. Global Demo & Model Comparison (Batch Evaluation)

Ce notebook permet de charger les modèles et de les **tester massivement** sur le dataset de validation.

**Fonctionnalités :**
1.  Chargement des modèles (Baseline + Fine-Tuned).
2.  **Batch Test** : On prend 50 (ou plus) tweets au hasard dans la validation.
3.  **Comparatif** : On affiche les scores de chacun et un tableau détaillé des erreurs.

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import joblib
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report
from IPython.display import display, HTML
from tqdm.notebook import tqdm

# Configuration: input locations (relative to the notebook's working directory)
VAL_PATH = "../data/twitter_val_clean.csv"
BASELINE_PATH = "../models/baseline/baseline_model.joblib"
FINETUNED_PATH = "../models/bert_finetuned"

# Integer class id -> human-readable sentiment name used in the result tables
LABEL_MAP = {0: "Negative", 1: "Neutral", 2: "Positive", 3: "Irrelevant"}

# Load the validation dataset
print("‚è≥ Chargement Dataset Validation...")
val_df = pd.read_csv(VAL_PATH)

# Fail fast with an explicit message when the columns that the evaluation
# cells read ("clean_text" and "label") are absent, instead of hitting an
# opaque KeyError only after the models have been loaded.
_missing_cols = {"clean_text", "label"} - set(val_df.columns)
if _missing_cols:
    raise KeyError(
        f"{VAL_PATH} is missing required column(s): {sorted(_missing_cols)}"
    )

print(f"‚úÖ {len(val_df)} tweets disponibles.")

## 1. Chargement des Modèles

In [None]:
# --- Model 1: baseline ------------------------------------------------------
# A classifier restored with joblib, paired with the stock BERT checkpoint
# whose tokenizer and encoder are loaded below.
# NOTE(review): joblib.load unpickles the file; only load artifacts you trust.
BASELINE_ENCODER_NAME = "bert-base-uncased"

print("‚è≥ Chargement Baseline...")
bl_clf = joblib.load(BASELINE_PATH)
bl_tokenizer = AutoTokenizer.from_pretrained(BASELINE_ENCODER_NAME)
# .eval() returns the module itself, so the call can be chained
bl_bert = AutoModel.from_pretrained(BASELINE_ENCODER_NAME).eval()
print("‚úÖ Baseline OK.")

# --- Model 2: fine-tuned sequence classifier --------------------------------
# Tokenizer and classification model are both read from FINETUNED_PATH.
print("‚è≥ Chargement Fine-Tuned...")
ft_tokenizer = AutoTokenizer.from_pretrained(FINETUNED_PATH)
ft_model = AutoModelForSequenceClassification.from_pretrained(FINETUNED_PATH).eval()
print("‚úÖ Fine-Tuned OK.")

In [None]:
def get_baseline_pred(texts, batch_size=32):
    """Predict class ids with the baseline (BERT embeddings + joblib classifier).

    The texts are tokenized and encoded in chunks of ``batch_size`` so that
    memory use stays bounded regardless of how many texts are passed. The
    hidden state at position 0 of each sequence is used as the feature vector
    given to ``bl_clf.predict``.

    Args:
        texts: A list of strings (a single string is treated as one text).
        batch_size: Maximum number of texts per forward pass. Must be >= 1.

    Returns:
        Whatever ``bl_clf.predict`` returns for the stacked embeddings
        (one prediction per text); an empty NumPy array for empty input.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    if isinstance(texts, str):
        # The tokenizer accepts a bare string; keep that working
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to encode; avoid calling the tokenizer on an empty batch
        return np.empty(0, dtype=int)

    emb_chunks = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        inputs = bl_tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            outputs = bl_bert(**inputs)
        # Position 0 of every sequence is taken as the sentence embedding
        emb_chunks.append(outputs.last_hidden_state[:, 0, :].numpy())

    embs = np.concatenate(emb_chunks, axis=0)
    return bl_clf.predict(embs)

def get_finetuned_pred(texts, batch_size=32):
    """Predict class ids with the fine-tuned sequence-classification model.

    The texts are tokenized and scored in chunks of ``batch_size`` so that
    memory use stays bounded regardless of how many texts are passed. The
    predicted class is the argmax over the logits; softmax is monotonic, so
    applying it first would not change which class is selected.

    Args:
        texts: A list of strings (a single string is treated as one text).
        batch_size: Maximum number of texts per forward pass. Must be >= 1.

    Returns:
        A 1-D NumPy integer array with one predicted class index per text
        (empty for empty input).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")
    if isinstance(texts, str):
        # The tokenizer accepts a bare string; keep that working
        texts = [texts]
    texts = list(texts)
    if not texts:
        # Nothing to score; avoid calling the tokenizer on an empty batch
        return np.empty(0, dtype=np.int64)

    pred_chunks = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        inputs = ft_tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=128)
        with torch.no_grad():
            logits = ft_model(**inputs).logits
        pred_chunks.append(torch.argmax(logits, dim=1))

    return torch.cat(pred_chunks).numpy()

## 2. Exécution du Test (Batch)

In [None]:
# Choose how many tweets to evaluate
N_REQUESTED = 100
# DataFrame.sample (without replacement) raises when asked for more rows than
# exist, so the effective sample size is capped at the dataset size.
N_TEST = min(N_REQUESTED, len(val_df))
# Fixed seed so a re-run evaluates the same tweets and the scores are
# comparable between runs; set to None to draw a fresh sample every time.
SAMPLE_SEED = 42
print(f"üß™ Lancement du test sur {N_TEST} tweets al√©atoires...")

sample = val_df.sample(n=N_TEST, random_state=SAMPLE_SEED).reset_index(drop=True)
# astype(str) guarantees the tokenizers receive strings even for non-string cells
texts = sample["clean_text"].astype(str).tolist()
labels = sample["label"].values

# Predictions: the whole sample is handed to each model in a single call
print("   ... processing Baseline")
pred_bl = get_baseline_pred(texts)

print("   ... processing Fine-Tuning")
pred_ft = get_finetuned_pred(texts)

print("‚úÖ Termin√©.")

## 3. Résultats et Comparaison

In [None]:
# Accuracy of each model against the ground-truth labels of the sample
acc_bl, acc_ft = (accuracy_score(labels, preds) for preds in (pred_bl, pred_ft))

# Horizontal rule framing the two score lines
rule = "-----------------------------------"

print(f"üìä R√âSULTATS SUR {N_TEST} TWEETS :")
print(rule)
print(f"üéØ Baseline Accuracy   : {acc_bl*100:.2f}%")
print(f"üöÄ Fine-Tuned Accuracy : {acc_ft*100:.2f}%")
print(rule)

# Pick the verdict line; the comparison order matches the score lines above
if acc_ft > acc_bl:
    verdict = "üèÜ Le Fine-Tuned gagne !"
elif acc_bl > acc_ft:
    verdict = "üèÜ La Baseline gagne ! (Surprise ?)"
else:
    verdict = "ü§ù √âgalit√© parfaite."
print(verdict)

## 4. Analyse Détaillée (Tableau)

In [None]:
# Build a per-tweet table so individual errors can be inspected.
# Class ids are translated to their names through LABEL_MAP; the two boolean
# columns flag whether each model matched the true label.
true_names = [LABEL_MAP[class_id] for class_id in labels]
baseline_names = [LABEL_MAP[class_id] for class_id in pred_bl]
finetuned_names = [LABEL_MAP[class_id] for class_id in pred_ft]

results = pd.DataFrame(
    {
        "Tweet": texts,
        "True": true_names,
        "Baseline": baseline_names,
        "FineTuned": finetuned_names,
        "BL_Correct": labels == pred_bl,
        "FT_Correct": labels == pred_ft,
    }
)

# Styling helper used to colour the prediction cells of a results row
def highlight_vals(row):
    """Return one CSS string per cell of ``row`` for ``Styler.apply(axis=1)``.

    The 'Baseline' and 'FineTuned' cells are coloured green when they equal
    the row's 'True' value and red otherwise; every other cell gets an empty
    style. Cell positions are looked up by column name, so the colouring
    stays on the right cells if columns are reordered, added or removed.

    Args:
        row: A pandas Series holding at least the 'True', 'Baseline' and
            'FineTuned' entries.

    Returns:
        A list of CSS strings with the same length and order as ``row``.

    Raises:
        KeyError: If one of the required entries is missing from ``row``.
    """
    ok_css = 'background-color: #d4edda; color: #155724'   # green
    err_css = 'background-color: #f8d7da; color: #721c24'  # red

    columns = list(row.index)
    styles = [''] * len(columns)
    for col in ('Baseline', 'FineTuned'):
        # Read the values first so a missing column surfaces as KeyError
        is_correct = row[col] == row['True']
        styles[columns.index(col)] = ok_css if is_correct else err_css
    return styles

# Preview of the first 20 rows, with prediction cells coloured by highlight_vals
print("Affichage des 20 premiers r√©sultats (Vert = Correct, Rouge = Erreur) :")
preview_rows = results.head(20)
preview_styler = preview_rows.style.apply(highlight_vals, axis=1)
display(preview_styler)

### Exemples où les modèles ne sont pas d'accord

In [None]:
# Rows where the two models predicted different classes
models_differ = results["Baseline"].ne(results["FineTuned"])
disagreement = results.loc[models_differ]
print(f"Il y a {len(disagreement)} tweets o√π les mod√®les ne sont pas d'accord.")
# Show at most 10 of them, using the same colouring as the main table
top_disagreements = disagreement.head(10)
display(top_disagreements.style.apply(highlight_vals, axis=1))