# 🍽️ Projet Fouille d'Opinions - V4 OPTIMISÉ (Meilleur compromis temps/accuracy)

**Optimisations :**
- ✅ **camembert-base** au lieu de large (~3x plus rapide)
- ✅ **4 epochs** au lieu de 5
- ✅ **batch_size=32** (2x plus rapide)
- ✅ **max_length=128** (textes plus courts)
- ✅ Architecture optimisée V2

| Version | Temps | Accuracy |
|---------|-------|----------|
| V2 (large, 5 epochs) | ~50 min | 88.72% |
| **V4 (base, 4 epochs, optimisé)** | **~10-12 min** | **~85-86%** |

In [None]:
# Notebook shell magic: quietly install the third-party packages this notebook imports.
!pip install -q transformers torch pandas numpy tqdm

In [None]:
import torch
# Report whether a CUDA device is visible to PyTorch and, if so, the name of device 0.
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# --------------------------------------------------------------
# V4 configuration: tuned for a training-time / accuracy trade-off
# --------------------------------------------------------------
CONFIG = dict(
    # Encoder checkpoint (the notebook header estimates base is ~3x faster than large).
    model_name="camembert-base",
    # Number of passes over the training set.
    num_epochs=4,
    # Samples per optimisation step.
    batch_size=32,
    learning_rate=2e-5,
    # Token budget per review; longer reviews are truncated.
    max_length=128,
    dropout=0.1,
    label_smoothing=0.1,
    weight_decay=0.01,
    # Fraction of the total steps spent in learning-rate warm-up.
    warmup_ratio=0.1,
    # Width of the hidden layer in each classification head.
    hidden_dim=256,
    # Consecutive non-improving epochs tolerated before early stopping.
    patience=2,
)
print("üìã Config V4:", CONFIG)

In [None]:
import os
from google.colab import files
# Create the data directory (no error if it already exists).
os.makedirs('/content/data', exist_ok=True)
print("Uploadez ftdataset_train.tsv et ftdataset_val.tsv:")
# Each key returned by files.upload() is used as a file name and moved into /content/data.
# NOTE(review): presumably the uploaded files land in the current working directory — confirm.
for f in files.upload().keys():
    os.rename(f, f'/content/data/{f}')
# Notebook shell magic: list the directory to verify the upload.
!ls /content/data/

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoConfig, AutoTokenizer, AutoModel, get_cosine_schedule_with_warmup
from tqdm.auto import tqdm
import pandas as pd
import time

# Sentiment classes and their integer ids. "NE" is id 3, which prepare_labels
# also uses as the fallback for any label string it does not recognise.
# The accented key must be real UTF-8: a mis-encoded spelling can never equal a
# correctly decoded label, so such rows would silently fall back to id 3 and
# predictions of class 1 would never match the reference.
# NOTE(review): assumes the TSV files spell this label "Négative" — confirm.
LABEL_TO_IDX = {"Positive": 0, "Négative": 1, "Neutre": 2, "NE": 3}
# Reverse mapping used to turn predicted class ids back into label strings.
IDX_TO_LABEL = {idx: label for label, idx in LABEL_TO_IDX.items()}
# Column names of the aspects; each one gets its own classification head.
ASPECTS = ["Prix", "Cuisine", "Service"]

print("‚úÖ Imports OK")

In [None]:
class OpinionDataset(Dataset):
    """Dataset of pre-tokenized reviews with optional per-aspect labels.

    All texts are tokenized once in the constructor; indexing then only
    slices the stored tensors.

    Args:
        texts: The review texts handed to ``tokenizer``.
        tokenizer: Callable returning a mapping with ``"input_ids"`` and
            ``"attention_mask"`` entries that can be indexed by sample.
        labels: Optional mapping ``aspect -> sequence of class ids``. When
            falsy, items carry no label entries.
        max_length: Maximum sequence length forwarded to the tokenizer.
    """

    def __init__(self, texts, tokenizer, labels=None, max_length=128):
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )
        self.labels = labels

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        sample = {
            key: self.encodings[key][idx]
            for key in ("input_ids", "attention_mask")
        }
        # Unlabelled datasets (inference) return the encodings only.
        if not self.labels:
            return sample
        for aspect in ASPECTS:
            sample[f"label_{aspect.lower()}"] = torch.tensor(
                self.labels[aspect][idx], dtype=torch.long
            )
        return sample

def prepare_labels(data):
    """Encode the label of every aspect as a class id.

    Args:
        data: Iterable of records indexable by each name in ``ASPECTS``.

    Returns:
        Dict mapping each aspect to the list of class ids, one per record.
        Label strings missing from ``LABEL_TO_IDX`` are encoded as 3.
    """
    encoded = {}
    for aspect in ASPECTS:
        encoded[aspect] = [LABEL_TO_IDX.get(record[aspect], 3) for record in data]
    return encoded
def get_texts(data):
    """Return the ``"Avis"`` field of every record, in input order."""
    texts = []
    for record in data:
        texts.append(record["Avis"])
    return texts
def collate_fn(f):
    """Stack a list of dataset items into one batch dictionary.

    Args:
        f: Non-empty list of items, each holding ``"input_ids"`` and
            ``"attention_mask"`` tensors and optionally ``label_<aspect>``
            tensors.

    Returns:
        Dict of stacked tensors. A label key is included only when it is
        present in the first item.
    """
    batch = {
        key: torch.stack([sample[key] for sample in f])
        for key in ("input_ids", "attention_mask")
    }
    for aspect in ASPECTS:
        label_key = f"label_{aspect.lower()}"
        if label_key not in f[0]:
            continue
        batch[label_key] = torch.stack([sample[label_key] for sample in f])
    return batch

print("‚úÖ Dataset OK")

In [None]:
class OptimizedClassifier(nn.Module):
    """Shared pretrained encoder followed by one small MLP head per aspect.

    Args:
        model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        hidden_dim: Width of the hidden layer inside every head.
        dropout: Dropout probability used before both linear layers of a head.
    """

    def __init__(self, model_name, hidden_dim=256, dropout=0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        encoder_width = self.encoder.config.hidden_size

        # Build the heads in ASPECTS order; each one ends in 4 output logits.
        heads = {}
        for aspect in ASPECTS:
            heads[aspect] = nn.Sequential(
                nn.Dropout(dropout),
                nn.Linear(encoder_width, hidden_dim),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_dim, 4),
            )
        self.classifiers = nn.ModuleDict(heads)

    def forward(self, input_ids, attention_mask):
        """Return a dict ``aspect -> logits`` for the given batch."""
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sentence representation.
        first_token = encoded.last_hidden_state[:, 0, :]
        outputs = {}
        for aspect in ASPECTS:
            outputs[aspect] = self.classifiers[aspect](first_token)
        return outputs

print("‚úÖ Mod√®le OK")

In [None]:
class Trainer:
    """Fine-tune, evaluate and run inference with an ``OptimizedClassifier``.

    Args:
        config: Mapping with the keys read below: ``model_name``,
            ``hidden_dim``, ``dropout``, ``label_smoothing``, ``max_length``,
            ``batch_size``, ``learning_rate``, ``weight_decay``,
            ``num_epochs``, ``warmup_ratio`` and ``patience``.
    """

    def __init__(self, config):
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"üîß Chargement de {config['model_name']}...")
        self.tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
        self.model = OptimizedClassifier(
            config['model_name'], config['hidden_dim'], config['dropout']
        ).to(self.device)
        self.criterion = nn.CrossEntropyLoss(label_smoothing=config['label_smoothing'])

        params = sum(p.numel() for p in self.model.parameters()) / 1e6
        print(f"‚úÖ Mod√®le sur {self.device} ({params:.1f}M params)")

    def train(self, train_data, val_data):
        """Train with per-epoch validation, early stopping and best-model restore.

        Args:
            train_data: Training records, passed to ``get_texts`` and
                ``prepare_labels``.
            val_data: Validation records, processed the same way.

        Returns:
            Tuple ``(best_acc, total_time)``: the best validation macro
            accuracy in percent and the elapsed wall-clock time in seconds.
        """
        cfg = self.config
        start_time = time.time()

        train_ds = OpinionDataset(get_texts(train_data), self.tokenizer, prepare_labels(train_data), cfg['max_length'])
        val_ds = OpinionDataset(get_texts(val_data), self.tokenizer, prepare_labels(val_data), cfg['max_length'])
        train_loader = DataLoader(train_ds, batch_size=cfg['batch_size'], shuffle=True, collate_fn=collate_fn)
        val_loader = DataLoader(val_ds, batch_size=cfg['batch_size'], shuffle=False, collate_fn=collate_fn)

        optimizer = AdamW(self.model.parameters(), lr=cfg['learning_rate'], weight_decay=cfg['weight_decay'])
        total_steps = len(train_loader) * cfg['num_epochs']
        scheduler = get_cosine_schedule_with_warmup(optimizer, int(total_steps * cfg['warmup_ratio']), total_steps)

        rule = '=' * 50
        best_acc, best_state, stale_epochs = 0.0, None, 0

        for epoch in range(cfg['num_epochs']):
            epoch_start = time.time()
            print(f"\n{rule}\nEpoch {epoch+1}/{cfg['num_epochs']}\n{rule}")

            avg_loss = self._train_epoch(train_loader, optimizer, scheduler)
            print(f"Loss: {avg_loss:.4f}")

            acc, details = self._eval(val_loader)
            epoch_time = time.time() - epoch_start
            print(f"Val Acc: {acc:.2f}% | {details} | ‚è±Ô∏è {epoch_time:.1f}s")

            # Always snapshot the first evaluated epoch so a model is restored
            # even when its accuracy does not exceed the initial 0.
            if best_state is None or acc > best_acc:
                best_acc = acc
                # Keep the snapshot on CPU so it does not occupy device memory.
                best_state = {k: v.cpu().clone() for k, v in self.model.state_dict().items()}
                stale_epochs = 0
                print("‚≠ê Nouveau meilleur!")
            else:
                stale_epochs += 1
                if stale_epochs >= cfg['patience']:
                    print("‚ö†Ô∏è Early stop")
                    break

        if best_state:
            self.model.load_state_dict(best_state)
            self.model.to(self.device)
        total_time = time.time() - start_time
        print(f"\n{rule}\nüèÜ BEST: {best_acc:.2f}% | ‚è±Ô∏è Total: {total_time/60:.1f} min\n{rule}")
        return best_acc, total_time

    def _train_epoch(self, loader, optimizer, scheduler):
        """Run one optimisation pass over ``loader`` and return the mean batch loss."""
        self.model.train()
        total_loss = 0.0
        for batch in tqdm(loader, desc="Training"):
            ids = batch["input_ids"].to(self.device)
            mask = batch["attention_mask"].to(self.device)
            logits = self.model(ids, mask)
            # The objective is the sum of the per-aspect cross-entropy losses.
            loss = sum(
                self.criterion(logits[a], batch[f"label_{a.lower()}"].to(self.device))
                for a in ASPECTS
            )
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
        # max(..., 1) avoids a division by zero when the loader is empty.
        return total_loss / max(len(loader), 1)

    def _eval(self, loader):
        """Compute accuracy on ``loader``.

        Returns:
            Tuple ``(macro_acc, details)``. ``macro_acc`` is the mean of the
            unrounded per-aspect accuracies in percent; ``details`` maps each
            aspect to its accuracy rounded to one decimal for display.
            An empty loader yields ``0.0`` for every value.
        """
        self.model.eval()
        correct = {a: 0 for a in ASPECTS}
        total = 0
        with torch.no_grad():
            for batch in loader:
                ids = batch["input_ids"].to(self.device)
                mask = batch["attention_mask"].to(self.device)
                logits = self.model(ids, mask)
                for a in ASPECTS:
                    preds = torch.argmax(logits[a], dim=-1)
                    correct[a] += (preds == batch[f"label_{a.lower()}"].to(self.device)).sum().item()
                total += ids.size(0)
        if total == 0:
            return 0.0, {a: 0.0 for a in ASPECTS}
        exact = {a: 100 * correct[a] / total for a in ASPECTS}
        # Round only the displayed values; the macro score used for model
        # selection is averaged from the exact ones.
        details = {a: round(value, 1) for a, value in exact.items()}
        return sum(exact.values()) / len(ASPECTS), details

    def predict(self, texts, batch_size=64):
        """Predict one label per aspect for every text.

        Args:
            texts: Iterable of strings; it is materialised into a list.
            batch_size: Number of texts tokenized and scored per forward pass.

        Returns:
            List of dicts ``aspect -> label string``, one per input text,
            in input order. An empty input yields an empty list.
        """
        self.model.eval()
        texts = list(texts)
        preds = []
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            enc = self.tokenizer(chunk, truncation=True, padding=True, max_length=self.config['max_length'], return_tensors="pt")
            with torch.no_grad():
                logits = self.model(enc["input_ids"].to(self.device), enc["attention_mask"].to(self.device))
            # One argmax per aspect for the whole chunk instead of one per row.
            class_ids = {a: logits[a].argmax(dim=-1).tolist() for a in ASPECTS}
            preds.extend(
                {a: IDX_TO_LABEL[class_ids[a][row]] for a in ASPECTS}
                for row in range(len(chunk))
            )
        return preds

print("‚úÖ Trainer OK")

In [None]:
# Load both splits with identical parser settings. The regex separator is a
# tab with optional surrounding spaces, which requires the python engine.
df_train, df_val = (
    pd.read_csv(path, sep=' *\t *', encoding='utf-8', engine='python')
    for path in (
        "/content/data/ftdataset_train.tsv",
        "/content/data/ftdataset_val.tsv",
    )
)
# One dict per row, keyed by column name.
train_data = df_train.to_dict('records')
val_data = df_val.to_dict('records')
print(f"‚úÖ Train={len(train_data)}, Val={len(val_data)}")

In [None]:
# Build the trainer from CONFIG, then fine-tune.
# train() returns the best validation accuracy and the total elapsed time in seconds.
trainer = Trainer(CONFIG)
best_acc, total_time = trainer.train(train_data, val_data)

In [None]:
# Final evaluation: compare predicted label strings with the reference
# columns of the validation records.
print("\nüìà √âvaluation finale...")
preds = trainer.predict(get_texts(val_data))
n = len(val_data)
correct = {a: sum(1 for p, r in zip(preds, val_data) if p[a] == r[a]) for a in ASPECTS}
# Compute each per-aspect accuracy once; report 0.0 when the validation set
# is empty instead of dividing by zero.
acc_by_aspect = {a: (100 * correct[a] / n if n else 0.0) for a in ASPECTS}

print("\n" + "="*50)
print("üìä R√âSULTATS FINAUX V4")
print("="*50)
for a in ASPECTS:
    print(f"  {a}: {acc_by_aspect[a]:.2f}%")
# Average over the aspects actually configured rather than a hard-coded count.
macro = sum(acc_by_aspect.values()) / len(ASPECTS)
print(f"\nüéØ MACRO ACCURACY: {macro:.2f}%")
print(f"‚è±Ô∏è Temps total: {total_time/60:.1f} min")
print("="*50)

In [None]:
# Smoke test: run the trained model on a few hand-written reviews and print
# the first 50 characters of each review next to its predicted labels.
# NOTE(review): the accented characters in these literals look mis-encoded;
# confirm the notebook's encoding, since the model receives them as written.
test_texts = [
    "Excellente cuisine, plats savoureux. Service un peu lent mais correct. Prix raisonnables.",
    "Tr√®s d√©√ßu. Nourriture froide et serveur d√©sagr√©able. Bien trop cher.",
    "Bon rapport qualit√©-prix. Service efficace. Cuisine correcte."
]
print("üß™ Test:\n")
for t, p in zip(test_texts, trainer.predict(test_texts)):
    print(f"'{t[:50]}...'\n  ‚Üí {p}\n")

In [None]:
# Save the model weights together with the config, the best accuracy and the training time.
torch.save({'model': trainer.model.state_dict(), 'config': CONFIG, 'acc': best_acc, 'time': total_time}, '/content/model_v4_optimized.pt')
print(f"‚úÖ Sauvegard√© (acc: {best_acc:.2f}%, temps: {total_time/60:.1f} min)")
# Hand the saved checkpoint file to Colab's download helper.
from google.colab import files
files.download('/content/model_v4_optimized.pt')