# 🍽️ Projet Fouille d'Opinions - V6 OPTIMISÉ+

| Version | Temps | Accuracy |
|---------|-------|----------|
| V2 (large) | ~50 min | 88.72% |
| V5 (base) | ~12 min | 83.33% |
| **V6 (base+)** | **~15-18 min** | **~84-85%** |

In [None]:
# Install the third-party packages this notebook relies on (-q keeps pip quiet).
!pip install -q transformers torch pandas numpy tqdm

In [None]:
import torch

# Report whether a CUDA device is visible and, when one is, print the name of
# device 0.
cuda_available = torch.cuda.is_available()
print(f"CUDA: {cuda_available}")
if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# ------------------------------------------------------------
# CONFIG V6 - tuned for better accuracy.
# Every hyper-parameter read by the training code lives here.
# ------------------------------------------------------------
CONFIG = {
    # Pretrained encoder checkpoint name.
    "model_name": "camembert-base",
    # One more epoch.
    "num_epochs": 6,
    "batch_size": 16,
    "learning_rate": 2e-5,
    # More context per review.
    "max_length": 256,
    "dropout": 0.1,
    "label_smoothing": 0.1,
    "weight_decay": 0.01,
    # Fraction of the total optimizer steps spent warming up.
    "warmup_ratio": 0.1,
    # Width of the hidden layer in each classification head.
    "hidden_dim": 256,
    # More patience before early stopping.
    "patience": 3,
}
print("üìã Config V6:", CONFIG)

In [None]:
import os
from google.colab import files
# Make sure the data directory exists (no error if it is already there).
os.makedirs('/content/data', exist_ok=True)
print("Uploadez ftdataset_train.tsv et ftdataset_val.tsv:")
# The upload result is iterated by key; each key is used as the uploaded
# file's name and that file is moved into /content/data/.
for f in files.upload().keys():
    os.rename(f, f'/content/data/{f}')
# List the data directory to confirm the files are in place.
!ls /content/data/

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel, get_cosine_schedule_with_warmup
from tqdm.auto import tqdm
import pandas as pd
import time

# Mapping from the label strings found in the dataset to class indices.
# NOTE(review): "N√©gative" looks like a mis-decoded "Négative" — confirm the
# key matches the label text in the TSV files, otherwise those rows fall back
# to the default index in prepare_labels.
LABEL_TO_IDX = {"Positive": 0, "N√©gative": 1, "Neutre": 2, "NE": 3}
# Reverse mapping, used to turn predicted indices back into label strings.
IDX_TO_LABEL = {v: k for k, v in LABEL_TO_IDX.items()}
# Aspects predicted for each review; these are also the record keys that hold
# the reference labels.
ASPECTS = ["Prix", "Cuisine", "Service"]

class OpinionDataset(Dataset):
    """Pre-tokenized texts with optional per-aspect labels.

    All texts are tokenized once in the constructor; individual items are
    then sliced out of the stored encodings.
    """

    def __init__(self, texts, tokenizer, labels=None, max_length=256):
        """Tokenize ``texts`` and keep the labels alongside.

        Args:
            texts: the texts handed to ``tokenizer`` in a single call.
            tokenizer: callable invoked with truncation, padding,
                ``max_length`` and ``return_tensors="pt"``; its result must
                be indexable by "input_ids" and "attention_mask".
            labels: optional mapping from aspect name to a sequence of class
                indices, one per text.
            max_length: truncation length forwarded to the tokenizer.
        """
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )
        self.labels = labels

    def __len__(self):
        """Number of encoded texts."""
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return the encoding at ``idx``, plus ``label_<aspect>`` tensors if labelled."""
        sample = {
            key: self.encodings[key][idx]
            for key in ("input_ids", "attention_mask")
        }
        # Unlabelled datasets (labels missing or empty) yield inputs only.
        if not self.labels:
            return sample
        for aspect in ASPECTS:
            sample[f"label_{aspect.lower()}"] = torch.tensor(
                self.labels[aspect][idx], dtype=torch.long
            )
        return sample

def prepare_labels(data):
    """Convert the aspect labels of each record into class indices.

    Returns a dict keyed by aspect name; each value is a list holding one
    index per record of ``data``. A label that is not a key of LABEL_TO_IDX
    is mapped to index 3.
    """
    encoded = {}
    for aspect in ASPECTS:
        encoded[aspect] = [LABEL_TO_IDX.get(record[aspect], 3) for record in data]
    return encoded
def get_texts(data):
    """Return the "Avis" field of every record in ``data``, in order."""
    return [
        record["Avis"]
        for record in data
    ]
def collate_fn(f):
    """Stack a list of dataset items into one batch dict.

    "input_ids" and "attention_mask" are always stacked. A ``label_<aspect>``
    entry is stacked only when that key is present in the first item.
    """
    batch = {
        key: torch.stack([sample[key] for sample in f])
        for key in ("input_ids", "attention_mask")
    }
    first = f[0]
    for aspect in ASPECTS:
        label_key = f"label_{aspect.lower()}"
        if label_key in first:
            batch[label_key] = torch.stack([sample[label_key] for sample in f])
    return batch

class Classifier(nn.Module):
    """Shared pretrained encoder followed by one small MLP head per aspect."""

    def __init__(self, model_name, hidden_dim=256, dropout=0.1):
        """Load the encoder and build one head for each entry of ASPECTS.

        Args:
            model_name: checkpoint name passed to ``AutoModel.from_pretrained``.
            hidden_dim: width of the hidden layer inside each head.
            dropout: dropout probability used twice in each head.
        """
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        encoder_width = self.encoder.config.hidden_size

        heads = {}
        for aspect in ASPECTS:
            # Layer order is kept as-is so state-dict keys stay the same.
            heads[aspect] = nn.Sequential(
                nn.Dropout(dropout),
                nn.Linear(encoder_width, hidden_dim),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_dim, 4),  # 4 logits, one per LABEL_TO_IDX entry
            )
        self.classifiers = nn.ModuleDict(heads)

    def forward(self, input_ids, attention_mask):
        """Return a dict mapping each aspect to the logits of its head."""
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Each head reads the hidden state at sequence position 0.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = {}
        for aspect in ASPECTS:
            logits[aspect] = self.classifiers[aspect](first_token)
        return logits

class Trainer:
    """Fine-tune and run the multi-aspect opinion classifier.

    Holds the tokenizer, the model and the loss. ``train`` fits on labelled
    records with early stopping; ``predict`` labels raw texts.
    """

    def __init__(self, config):
        """Load the tokenizer and model named by ``config['model_name']``.

        Args:
            config: dict of hyper-parameters. This class reads the keys
                ``model_name``, ``hidden_dim``, ``dropout``,
                ``label_smoothing``, ``max_length``, ``batch_size``,
                ``learning_rate``, ``weight_decay``, ``num_epochs``,
                ``warmup_ratio`` and ``patience``.
        """
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"üîß Chargement de {config['model_name']}...")
        self.tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
        self.model = Classifier(config['model_name'], config['hidden_dim'], config['dropout']).to(self.device)
        self.criterion = nn.CrossEntropyLoss(label_smoothing=config['label_smoothing'])
        print(f"‚úÖ Mod√®le sur {self.device}")

    def _make_loader(self, data, shuffle):
        """Tokenize ``data`` records and wrap them in a DataLoader."""
        cfg = self.config
        dataset = OpinionDataset(get_texts(data), self.tokenizer, prepare_labels(data), cfg['max_length'])
        return DataLoader(dataset, batch_size=cfg['batch_size'], shuffle=shuffle, collate_fn=collate_fn)

    def train(self, train_data, val_data):
        """Fine-tune the model and restore the best validation weights.

        Args:
            train_data: list of records (dicts) with an "Avis" text and one
                label per aspect.
            val_data: records of the same form, scored after every epoch.

        Returns:
            ``(best_acc, total_time)``: the best validation macro accuracy in
            percent and the wall-clock training time in seconds.
        """
        cfg = self.config
        start = time.time()
        train_loader = self._make_loader(train_data, shuffle=True)
        val_loader = self._make_loader(val_data, shuffle=False)

        optimizer = AdamW(self.model.parameters(), lr=cfg['learning_rate'], weight_decay=cfg['weight_decay'])
        # The scheduler is stepped once per batch, so its horizon is in batches.
        total_steps = len(train_loader) * cfg['num_epochs']
        scheduler = get_cosine_schedule_with_warmup(optimizer, int(total_steps * cfg['warmup_ratio']), total_steps)

        best_acc, best_state, patience = 0, None, 0

        for epoch in range(cfg['num_epochs']):
            t0 = time.time()
            print(f"\n{'='*50}\nEpoch {epoch+1}/{cfg['num_epochs']}\n{'='*50}")
            self.model.train()
            total_loss = 0
            for batch in tqdm(train_loader, desc="Training"):
                ids, mask = batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device)
                logits = self.model(ids, mask)
                # The training loss is the sum of the per-aspect losses.
                loss = sum(self.criterion(logits[a], batch[f"label_{a.lower()}"].to(self.device)) for a in ASPECTS)
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                total_loss += loss.item()

            print(f"Loss: {total_loss/len(train_loader):.4f}")
            acc, det = self._eval(val_loader)
            print(f"Val: {acc:.2f}% | {det} | ‚è±Ô∏è {time.time()-t0:.0f}s")

            if acc > best_acc:
                # Snapshot the weights on CPU so later epochs cannot change them.
                best_state = {k: v.cpu().clone() for k, v in self.model.state_dict().items()}
                best_acc, patience = acc, 0
                print("‚≠ê Best!")
            else:
                patience += 1
                print(f"‚è≥ Patience: {patience}/{cfg['patience']}")
                if patience >= cfg['patience']:
                    print("‚ö†Ô∏è Early stop")
                    break

        if best_state:
            self.model.load_state_dict(best_state)
            self.model.to(self.device)
        total_time = time.time() - start
        print(f"\nüèÜ BEST: {best_acc:.2f}% | ‚è±Ô∏è {total_time/60:.1f} min")
        return best_acc, total_time

    def _eval(self, loader):
        """Score the model on ``loader``.

        Returns:
            ``(macro, det)``: ``macro`` is the mean of the unrounded
            per-aspect accuracies in percent, and ``det`` maps each aspect to
            its accuracy rounded to one decimal. An empty loader yields
            ``0.0`` and all-zero details instead of dividing by zero.
        """
        self.model.eval()
        correct = {a: 0 for a in ASPECTS}
        total = 0
        with torch.no_grad():
            for batch in loader:
                ids, mask = batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device)
                logits = self.model(ids, mask)
                for a in ASPECTS:
                    correct[a] += (torch.argmax(logits[a], -1) == batch[f"label_{a.lower()}"].to(self.device)).sum().item()
                total += ids.size(0)
        if total == 0:
            return 0.0, {a: 0.0 for a in ASPECTS}
        per_aspect = {a: 100 * correct[a] / total for a in ASPECTS}
        # Rounding is for display only; model selection uses the exact mean.
        det = {a: round(value, 1) for a, value in per_aspect.items()}
        return sum(per_aspect.values()) / len(ASPECTS), det

    def predict(self, texts, batch_size=32):
        """Predict one label per aspect for each text.

        Args:
            texts: sliceable sequence of strings (sliced into batches).
            batch_size: number of texts tokenized and scored per forward
                pass; must be positive.

        Returns:
            A list with one dict per text, mapping each aspect name to the
            predicted label string from IDX_TO_LABEL.
        """
        self.model.eval()
        preds = []
        with torch.no_grad():
            for i in range(0, len(texts), batch_size):
                chunk = texts[i:i + batch_size]
                enc = self.tokenizer(chunk, truncation=True, padding=True, max_length=self.config['max_length'], return_tensors="pt")
                logits = self.model(enc["input_ids"].to(self.device), enc["attention_mask"].to(self.device))
                # One argmax per aspect for the whole batch, not one per sample.
                best = {a: logits[a].argmax(dim=-1).tolist() for a in ASPECTS}
                preds.extend(
                    {a: IDX_TO_LABEL[best[a][j]] for a in ASPECTS}
                    for j in range(len(chunk))
                )
        return preds

# Confirmation message printed once the definitions above have been executed.
print("‚úÖ Tout pr√™t!")

In [None]:
# Both files share the same parsing options: the separator is a regex matching
# a tab with optional surrounding spaces, read with the python parser engine.
_tsv_options = dict(sep=' *\t *', encoding='utf-8', engine='python')
df_train = pd.read_csv("/content/data/ftdataset_train.tsv", **_tsv_options)
df_val = pd.read_csv("/content/data/ftdataset_val.tsv", **_tsv_options)

# One dict per row, keyed by column name.
train_data = df_train.to_dict('records')
val_data = df_val.to_dict('records')
print(f"‚úÖ Train={len(train_data)}, Val={len(val_data)}")

In [None]:
# Build the trainer from CONFIG (this loads the tokenizer and the model).
trainer = Trainer(CONFIG)
# Fine-tune; keeps the best validation accuracy and the elapsed time in seconds.
best_acc, total_time = trainer.train(train_data, val_data)

In [None]:
print("\nüìà √âvaluation finale...")
preds = trainer.predict(get_texts(val_data))
correct = {a: sum(1 for p, r in zip(preds, val_data) if p[a] == r[a]) for a in ASPECTS}
n = len(val_data)
print("\n" + "="*50)
print("üìä R√âSULTATS V6")
print("="*50)
for a in ASPECTS: print(f"  {a}: {100*correct[a]/n:.2f}%")
macro = sum(100*correct[a]/n for a in ASPECTS)/3
print(f"\nüéØ MACRO: {macro:.2f}% | ‚è±Ô∏è {total_time/60:.1f} min")
print("="*50)

In [None]:
# Bundle the weights with the config and best accuracy, write the checkpoint,
# then hand the file to the browser for download.
checkpoint_path = '/content/model_v6.pt'
checkpoint = {
    'model': trainer.model.state_dict(),
    'config': CONFIG,
    'acc': best_acc,
}
torch.save(checkpoint, checkpoint_path)
print("‚úÖ Sauvegard√©")
from google.colab import files
files.download(checkpoint_path)