# 🍽️ Projet V11 - CAMEMBERT-LARGE + R-DROP

**Combinaison du meilleur modèle + meilleure technique**

| Version | Modèle | Technique | Accuracy |
|---------|--------|-----------|----------|
| V9 | camembert-large | Mean Pooling | 88.44% |
| V10 | camembertav2-base | R-Drop | 87.78% |
| **V11** | **camembert-large** | **R-Drop** | **~89%+** |

In [None]:
# Quietly install the libraries this notebook imports (IPython shell escape).
!pip install -q transformers torch pandas numpy tqdm accelerate

In [None]:
import torch

# Report whether a CUDA device is visible and, if so, the name of device 0.
_cuda_visible = torch.cuda.is_available()
print(f"CUDA: {_cuda_visible}")
if _cuda_visible:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# --------------------------------------------
# V11 configuration: camembert-large + R-Drop
# --------------------------------------------
# Every key below is read by Trainer / ClassifierV11 through CONFIG[...].
CONFIG = dict(
    model_name="camembert/camembert-large",  # 338M params
    num_epochs=4,
    batch_size=12,
    learning_rate=2e-5,
    max_length=256,
    dropout=0.1,
    label_smoothing=0.1,
    weight_decay=0.01,
    warmup_ratio=0.1,
    hidden_dim=384,
    patience=2,
    use_fp16=True,
    use_mean_pooling=True,
    use_rdrop=True,
    rdrop_alpha=0.5,  # KL weight, slightly reduced
)

# Echo the configuration so it is recorded in the notebook output.
print("üìã Config V11:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

In [None]:
import os
from google.colab import files
os.makedirs('/content/data', exist_ok=True)
print("Uploadez ftdataset_train.tsv et ftdataset_val.tsv:")
for f in files.upload().keys(): os.rename(f, f'/content/data/{f}')
!ls /content/data/

In [None]:
import torch, torch.nn as nn, torch.nn.functional as F, pandas as pd, time
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.amp import autocast, GradScaler
from transformers import AutoTokenizer, AutoModel, get_cosine_schedule_with_warmup
from tqdm.auto import tqdm

# Sentiment classes and their integer ids. The accented key must be the real
# UTF-8 string "Négative": the TSV files are decoded as UTF-8, and prep_l
# silently maps any label it cannot find here to 3 ("NE").
LABEL_TO_IDX = {"Positive": 0, "Négative": 1, "Neutre": 2, "NE": 3}
# Reverse lookup (class id -> label string), used to decode predictions.
IDX_TO_LABEL = {v: k for k, v in LABEL_TO_IDX.items()}
# Dataset columns that each get their own classification head.
ASPECTS = ["Prix", "Cuisine", "Service"]

class DS(Dataset):
    """Dataset that tokenizes all texts once in __init__ and serves rows by index.

    Args:
        t: the texts, passed as-is to the tokenizer.
        tok: callable tokenizer; its result is indexed with "input_ids"
            and "attention_mask".
        l: optional mapping aspect -> sequence of class ids; when falsy,
            items carry no labels.
        ml: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, t, tok, l=None, ml=256):
        self.e = tok(t, truncation=True, padding=True, max_length=ml, return_tensors="pt")
        self.l = l

    def __len__(self):
        return len(self.e["input_ids"])

    def __getitem__(self, i):
        item = {key: self.e[key][i] for key in ("input_ids", "attention_mask")}
        if not self.l:
            return item
        # One long tensor per aspect, stored under "label_<aspect in lowercase>".
        for aspect in ASPECTS:
            item[f"label_{aspect.lower()}"] = torch.tensor(self.l[aspect][i], dtype=torch.long)
        return item

def prep_l(d):
    """Build, for each aspect, the list of class ids of the records in ``d``.

    A label missing from LABEL_TO_IDX is mapped to 3, the id LABEL_TO_IDX
    gives to "NE".
    """
    encoded = {}
    for aspect in ASPECTS:
        encoded[aspect] = [LABEL_TO_IDX.get(record[aspect], 3) for record in d]
    return encoded
def get_t(d):
    """Return the "Avis" field of every record in ``d``, in order."""
    return [record["Avis"] for record in d]
def coll(f):
    """Collate a list of DS items into a single dict of stacked tensors.

    Label tensors are included only for the label keys present in the
    first item of ``f``.
    """
    def stack(key):
        return torch.stack([item[key] for item in f])

    batch = {key: stack(key) for key in ("input_ids", "attention_mask")}
    for aspect in ASPECTS:
        label_key = f"label_{aspect.lower()}"
        if label_key not in f[0]:
            continue
        batch[label_key] = stack(label_key)
    return batch

def kl_div_loss(p, q):
    """Symmetric KL divergence between two sets of logits, for R-Drop.

    Both directions are computed on the softmax of the logits with
    ``reduction='batchmean'`` and then averaged.
    """
    def one_way(input_logits, target_logits):
        return F.kl_div(
            F.log_softmax(input_logits, dim=-1),
            F.softmax(target_logits, dim=-1),
            reduction='batchmean',
        )

    return (one_way(p, q) + one_way(q, p)) / 2

class ClassifierV11(nn.Module):
    """Pretrained encoder followed by one 4-way classification head per aspect.

    Args:
        mn: model name or path passed to ``AutoModel.from_pretrained``.
        hd: width of the hidden layer inside each head.
        dr: dropout probability used twice in each head.
        use_mean_pooling: pool with the masked mean of the last hidden
            states when True, otherwise take the vector at position 0.
    """

    def __init__(self, mn, hd=384, dr=0.1, use_mean_pooling=True):
        super().__init__()
        self.enc = AutoModel.from_pretrained(mn)
        self.use_mean_pooling = use_mean_pooling
        enc_width = self.enc.config.hidden_size

        # Heads are created in ASPECTS order, each with the same layer layout.
        heads = nn.ModuleDict()
        for aspect in ASPECTS:
            heads[aspect] = nn.Sequential(
                nn.Dropout(dr),
                nn.Linear(enc_width, hd),
                nn.GELU(),
                nn.LayerNorm(hd),
                nn.Dropout(dr),
                nn.Linear(hd, 4),
            )
        self.cls = heads

    def mean_pooling(self, hidden_states, attention_mask):
        """Average ``hidden_states`` over dim 1, counting only positions where the mask is set."""
        weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        summed = (hidden_states * weights).sum(dim=1)
        # Clamp so a fully masked row divides by a tiny number instead of zero.
        counts = weights.sum(dim=1).clamp(min=1e-9)
        return summed / counts

    def forward(self, ids, mask):
        """Return a dict mapping each aspect to the logits of its head."""
        hidden = self.enc(input_ids=ids, attention_mask=mask).last_hidden_state
        if self.use_mean_pooling:
            pooled = self.mean_pooling(hidden, mask)
        else:
            pooled = hidden[:, 0, :]
        return {aspect: self.cls[aspect](pooled) for aspect in ASPECTS}

class Trainer:
    """Fine-tune ClassifierV11 on the three aspects, with optional AMP and R-Drop.

    The config dict ``c`` is read for: model_name, hidden_dim, dropout,
    use_mean_pooling, label_smoothing, use_fp16, use_rdrop, rdrop_alpha,
    max_length, batch_size, learning_rate, weight_decay, num_epochs,
    warmup_ratio and patience.
    """

    def __init__(s, c):
        """Load the tokenizer and model named in ``c`` and move the model to the device."""
        s.c = c
        s.dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # The AMP objects used here are CUDA ones, so mixed precision is only
        # switched on when it was requested AND the model lives on a CUDA device.
        s.amp = bool(c['use_fp16']) and s.dev.type == "cuda"
        print(f"üîß Chargement de {c['model_name']}...")
        s.tok = AutoTokenizer.from_pretrained(c['model_name'])
        s.m = ClassifierV11(
            c['model_name'], c['hidden_dim'], c['dropout'], c['use_mean_pooling']
        ).to(s.dev)
        s.ce_crit = nn.CrossEntropyLoss(label_smoothing=c['label_smoothing'])
        s.scaler = GradScaler('cuda') if s.amp else None
        params = sum(p.numel() for p in s.m.parameters())/1e6
        print(f"‚úÖ Mod√®le sur {s.dev} ({params:.0f}M params)")
        print(f"   Mean Pooling: {c['use_mean_pooling']} | FP16: {c['use_fp16']} | R-Drop: {c['use_rdrop']}")

    def _autocast(s):
        """Return the autocast context for the current device (a no-op when AMP is off)."""
        return autocast(s.dev.type, enabled=s.amp)

    def _labels(s, b):
        """Move the per-aspect label tensors of batch ``b`` to the device, once per batch."""
        return {a: b[f"label_{a.lower()}"].to(s.dev) for a in ASPECTS}

    def _losses(s, ids, mask, labels):
        """Run the forward pass(es) and return ``(loss, ce_loss, kl_loss)``.

        ``ce_loss`` is the cross-entropy of the first pass summed over aspects.
        With R-Drop, a second pass is run, the two cross-entropies are
        averaged and the symmetric KL between both passes is added with
        weight ``rdrop_alpha``. Without R-Drop, ``kl_loss`` is None and
        ``loss`` is ``ce_loss``.
        """
        lo1 = s.m(ids, mask)
        ce_loss = sum(s.ce_crit(lo1[a], labels[a]) for a in ASPECTS)
        if not s.c['use_rdrop']:
            return ce_loss, ce_loss, None
        # Second pass through the same model in train mode; the KL term
        # below penalises disagreement between the two passes.
        lo2 = s.m(ids, mask)
        ce_loss2 = sum(s.ce_crit(lo2[a], labels[a]) for a in ASPECTS)
        kl_loss = sum(kl_div_loss(lo1[a], lo2[a]) for a in ASPECTS)
        loss = (ce_loss + ce_loss2) / 2 + s.c['rdrop_alpha'] * kl_loss
        return loss, ce_loss, kl_loss

    def _backward_step(s, loss, opt):
        """Backpropagate ``loss``, clip gradients to norm 1.0 and step the optimizer."""
        if s.scaler is None:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(s.m.parameters(), 1.0)
            opt.step()
            return
        s.scaler.scale(loss).backward()
        # Unscale first so the clipping threshold applies to the true gradients.
        s.scaler.unscale_(opt)
        torch.nn.utils.clip_grad_norm_(s.m.parameters(), 1.0)
        s.scaler.step(opt)
        s.scaler.update()

    def train(s, td, vd):
        """Train on ``td``, validate on ``vd`` after each epoch, keep the best weights.

        Args:
            td: training records, read through ``get_t`` and ``prep_l``.
            vd: validation records, read the same way.

        Returns:
            ``(best validation accuracy in percent, elapsed time in seconds)``.
            The model is left holding the weights of the best epoch, if any
            epoch improved on the initial best of 0.
        """
        c = s.c; st = time.time()
        tds = DS(get_t(td), s.tok, prep_l(td), c['max_length'])
        vds = DS(get_t(vd), s.tok, prep_l(vd), c['max_length'])
        tl = DataLoader(tds, batch_size=c['batch_size'], shuffle=True, collate_fn=coll)
        vl = DataLoader(vds, batch_size=c['batch_size'], shuffle=False, collate_fn=coll)

        opt = AdamW(s.m.parameters(), lr=c['learning_rate'], weight_decay=c['weight_decay'])
        ts = len(tl) * c['num_epochs']
        sch = get_cosine_schedule_with_warmup(opt, int(ts * c['warmup_ratio']), ts)

        # ba: best accuracy so far, bs: CPU copy of the best weights, pa: epochs without improvement.
        ba, bs, pa = 0, None, 0

        for ep in range(c['num_epochs']):
            t0 = time.time()
            print(f"\n{'='*60}\nEpoch {ep+1}/{c['num_epochs']}\n{'='*60}")
            s.m.train(); tls = 0; kls = 0

            for b in tqdm(tl, desc="Training"):
                ids, mask = b["input_ids"].to(s.dev), b["attention_mask"].to(s.dev)
                labels = s._labels(b)
                opt.zero_grad()

                with s._autocast():
                    loss, ce_loss, kl_loss = s._losses(ids, mask, labels)
                s._backward_step(loss, opt)

                sch.step()
                tls += ce_loss.item()
                if kl_loss is not None:
                    kls += kl_loss.item()

            avg_ce = tls/len(tl)
            avg_kl = kls/len(tl) if c['use_rdrop'] else 0
            print(f"CE Loss: {avg_ce:.4f}" + (f" | KL Loss: {avg_kl:.4f}" if c['use_rdrop'] else ""))

            ac, dt = s._ev(vl)
            print(f"Val: {ac:.2f}% | {dt} | ‚è±Ô∏è {time.time()-t0:.0f}s")

            if ac > ba:
                # Snapshot on CPU so the copy does not take device memory.
                ba, bs, pa = ac, {k: v.cpu().clone() for k, v in s.m.state_dict().items()}, 0
                print("‚≠ê Best!")
            else:
                pa += 1
                print(f"‚è≥ Patience: {pa}/{c['patience']}")
                if pa >= c['patience']:
                    print("‚ö†Ô∏è Early stop")
                    break

        if bs is not None:
            s.m.load_state_dict(bs)
            s.m.to(s.dev)
        tt = time.time() - st
        print(f"\n{'='*60}\nüèÜ BEST: {ba:.2f}% | ‚è±Ô∏è {tt/60:.1f} min\n{'='*60}")
        return ba, tt

    def _ev(s, ld):
        """Evaluate on loader ``ld``.

        Returns:
            ``(macro accuracy in percent, {aspect: accuracy rounded to 1 decimal})``.
            The macro value is the mean of the unrounded per-aspect
            accuracies, so it matches a macro computed directly from
            predictions. An empty loader yields 0.0 everywhere.
        """
        s.m.eval(); cor, tot = {a: 0 for a in ASPECTS}, 0
        with torch.no_grad():
            for b in ld:
                ids, mask = b["input_ids"].to(s.dev), b["attention_mask"].to(s.dev)
                labels = s._labels(b)
                with s._autocast():
                    lo = s.m(ids, mask)
                for a in ASPECTS:
                    cor[a] += (torch.argmax(lo[a], -1) == labels[a]).sum().item()
                tot += ids.size(0)
        if tot == 0:
            return 0.0, {a: 0.0 for a in ASPECTS}
        exact = {a: 100*cor[a]/tot for a in ASPECTS}
        dt = {a: round(exact[a], 1) for a in ASPECTS}
        return sum(exact.values())/len(ASPECTS), dt

    def predict(s, t, batch_size=32):
        """Predict a label per aspect for every text in ``t``.

        Args:
            t: list of texts.
            batch_size: number of texts tokenized and scored per forward pass.

        Returns:
            One dict ``{aspect: label string}`` per input text, in input order.
        """
        s.m.eval(); p = []
        for i in range(0, len(t), batch_size):
            chunk = t[i:i+batch_size]
            e = s.tok(chunk, truncation=True, padding=True, max_length=s.c['max_length'], return_tensors="pt")
            with torch.no_grad(), s._autocast():
                lo = s.m(e["input_ids"].to(s.dev), e["attention_mask"].to(s.dev))
            # One argmax and one host transfer per aspect instead of one per sample.
            idx = {a: torch.argmax(lo[a], -1).tolist() for a in ASPECTS}
            p.extend({a: IDX_TO_LABEL[idx[a][j]] for a in ASPECTS} for j in range(len(chunk)))
        return p

# Confirmation message printed once every definition in this cell has run.
print("‚úÖ Code V11 pr√™t!")

In [None]:
# Same parsing options for both files: the separator is a regex matching a
# tab together with any spaces around it.
_tsv_options = dict(sep=' *\t *', encoding='utf-8', engine='python')
df_train = pd.read_csv("/content/data/ftdataset_train.tsv", **_tsv_options)
df_val = pd.read_csv("/content/data/ftdataset_val.tsv", **_tsv_options)

# One dict per row, keyed by column name.
train_data = df_train.to_dict('records')
val_data = df_val.to_dict('records')
print(f"‚úÖ Train={len(train_data)}, Val={len(val_data)}")

In [None]:
# Build the trainer from CONFIG (this loads the tokenizer and the model), then
# fine-tune; train() returns the best validation accuracy and the elapsed seconds.
trainer = Trainer(CONFIG)
best_acc, total_time = trainer.train(train_data, val_data)

In [None]:
# Re-score the validation set through predict() and compare label strings.
print("\nüìà √âvaluation finale...")
preds = trainer.predict(get_t(val_data))
n = len(val_data)
correct = {}
for a in ASPECTS:
    correct[a] = sum(1 for p, r in zip(preds, val_data) if p[a] == r[a])

_rule = "="*60
print("\n" + _rule)
print("üìä R√âSULTATS V11 (CAMEMBERT-LARGE + R-DROP)")
print(_rule)
for a in ASPECTS:
    print(f"  {a}: {100*correct[a]/n:.2f}%")

# Macro accuracy: plain mean of the three per-aspect accuracies.
macro = sum(100*correct[a]/n for a in ASPECTS)/3
print(f"\nüéØ MACRO: {macro:.2f}% | ‚è±Ô∏è {total_time/60:.1f} min")
print(_rule)
print(f"\nüìà Am√©lioration vs V9 (88.44%): {macro - 88.44:+.2f}%")

In [None]:
# Bundle the weights with the config and the best validation accuracy,
# write them to disk, then offer the file for download.
_checkpoint = {'model': trainer.m.state_dict(), 'config': CONFIG, 'acc': best_acc}
torch.save(_checkpoint, '/content/model_v11.pt')
print(f"‚úÖ Sauvegard√© (acc: {best_acc:.2f}%)")

from google.colab import files
files.download('/content/model_v11.pt')