# 🍽️ Projet Fouille d'Opinions - V3 FREEZE (~3x plus rapide)

**Changement principal :** L'encodeur CamemBERT est gelé, seules les têtes de classification sont entraînées.

| Version | Temps | Accuracy |
|---------|-------|----------|
| V2 (large, full) | ~50 min | 88.72% |
| **V3 (large, freeze)** | **~15 min** | ~85-87% |

In [None]:
# Install the packages this notebook imports (IPython shell magic; -q keeps pip quiet).
!pip install -q transformers torch pandas numpy tqdm

In [None]:
import torch
# Report whether a CUDA device is visible and, if so, the name of device 0.
_cuda_available = torch.cuda.is_available()
print(f"CUDA: {_cuda_available}")
if _cuda_available:
    _gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {_gpu_name}")

In [None]:
# Hyper-parameters of the frozen-encoder (V3) run; read by Trainer and saved with the checkpoint.
CONFIG = dict(
    model_name="camembert/camembert-large",
    num_epochs=5,
    batch_size=16,        # larger, because less memory is used
    learning_rate=1e-3,   # larger, because only the heads are trained
    max_length=256,
    dropout=0.2,
    hidden_dim=256,
    freeze_encoder=True,  # new in V3: freeze the encoder
    patience=2,
)
print("üìã Config:", CONFIG)

In [None]:
import os
from google.colab import files
# Create the data directory (no error if it already exists).
os.makedirs('/content/data', exist_ok=True)
print("Uploadez ftdataset_train.tsv et ftdataset_val.tsv:")
# Each key returned by files.upload() is used as a path relative to the current
# directory and moved into /content/data/ under the same name.
# NOTE(review): presumably the keys are the uploaded file names - confirm against the Colab API.
for f in files.upload().keys():
    os.rename(f, f'/content/data/{f}')
# List the directory so the user can check that both files arrived (IPython shell magic).
!ls /content/data/

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Sentiment label -> class index. The accented key must be the correctly encoded
# "Négative": the TSV files are read as UTF-8, and a mis-encoded key would never match
# the data, so every negative review would silently fall back to the "NE" index in
# prepare_labels and decoded predictions would never equal the gold label.
LABEL_TO_IDX = {"Positive": 0, "Négative": 1, "Neutre": 2, "NE": 3}
# Inverse mapping, used to turn predicted class indices back into label strings.
IDX_TO_LABEL = {v: k for k, v in LABEL_TO_IDX.items()}
# Aspects scored for each review; each one is a column of the dataset and gets its own head.
ASPECTS = ["Prix", "Cuisine", "Service"]

class OpinionDataset(Dataset):
    """Pre-tokenized texts with optional per-aspect labels.

    All texts are tokenized once in the constructor (truncated to ``max_length``,
    padded, returned as "pt" tensors). Each item is a dict holding the
    ``input_ids`` and ``attention_mask`` rows for that index and, when labels
    were given, one ``label_<aspect>`` long tensor per entry of ``ASPECTS``.

    Args:
        texts: the texts to encode.
        tokenizer: callable invoked as ``tokenizer(texts, truncation=..., ...)``.
        labels: optional mapping ``aspect -> sequence of class indices``.
        max_length: truncation length passed to the tokenizer.
    """

    def __init__(self, texts, tokenizer, labels=None, max_length=256):
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )
        self.labels = labels

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        sample = {
            key: self.encodings[key][idx]
            for key in ("input_ids", "attention_mask")
        }
        # No (or empty) labels: inference-style item without label tensors.
        if not self.labels:
            return sample
        for aspect in ASPECTS:
            target = self.labels[aspect][idx]
            sample[f"label_{aspect.lower()}"] = torch.tensor(target, dtype=torch.long)
        return sample

def prepare_labels(data):
    """Build ``{aspect: [class index per record]}`` from the raw records.

    Each record's label string is looked up in ``LABEL_TO_IDX``; a value that
    is not a known label maps to index 3.
    """
    labels = {}
    for aspect in ASPECTS:
        labels[aspect] = [LABEL_TO_IDX.get(record[aspect], 3) for record in data]
    return labels
def get_texts(data):
    """Return the "Avis" field of every record, in input order."""
    return [record["Avis"] for record in data]
def collate_fn(f):
    """Stack a list of dataset items into one batch dict.

    ``input_ids`` and ``attention_mask`` are always stacked. A ``label_<aspect>``
    entry is stacked only when the first item carries that key.
    """
    def _stack(key):
        return torch.stack([sample[key] for sample in f])

    batch = {key: _stack(key) for key in ("input_ids", "attention_mask")}
    first = f[0]
    for aspect in ASPECTS:
        label_key = f"label_{aspect.lower()}"
        if label_key in first:
            batch[label_key] = _stack(label_key)
    return batch

# Cell marker: printed once the data utilities above have been defined.
print("‚úÖ Data utils OK")

In [None]:
import torch.nn as nn
from torch.optim import AdamW
from transformers import AutoConfig, AutoTokenizer, AutoModel
from tqdm.auto import tqdm

class FreezeClassifier(nn.Module):
    """Pretrained encoder with one small classification head per aspect.

    The encoder output at the first token position is fed to an independent
    MLP head (Dropout -> Linear -> GELU -> Dropout -> Linear) for each entry
    of ``ASPECTS``; every head produces 4 logits.

    Args:
        model_name: identifier passed to ``AutoModel.from_pretrained``.
        hidden_dim: width of the hidden layer of each head.
        dropout: dropout probability used inside the heads.
        freeze_encoder: when True, the encoder weights get
            ``requires_grad=False``, the encoder is kept in eval mode and is
            run without gradient tracking.
    """

    def __init__(self, model_name, hidden_dim=256, dropout=0.2, freeze_encoder=True):
        super().__init__()
        self.freeze_encoder = freeze_encoder
        self.encoder = AutoModel.from_pretrained(model_name)

        # FREEZE: exclude the encoder weights from training.
        if freeze_encoder:
            for param in self.encoder.parameters():
                param.requires_grad = False
            self.encoder.eval()
            print("üîí Encodeur GEL√â - seules les t√™tes sont entra√Æn√©es")

        hidden_size = self.encoder.config.hidden_size
        self.classifiers = nn.ModuleDict({
            a: nn.Sequential(
                nn.Dropout(dropout),
                nn.Linear(hidden_size, hidden_dim),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_dim, 4)
            ) for a in ASPECTS
        })

    def train(self, mode=True):
        """Set the training mode, keeping a frozen encoder in eval mode.

        ``nn.Module.train`` switches every sub-module, which would re-enable
        the encoder's dropout although its weights are frozen. The heads would
        then train on noisy features that differ from what they see at
        evaluation time.
        """
        super().train(mode)
        if self.freeze_encoder:
            self.encoder.eval()
        return self

    def forward(self, input_ids, attention_mask):
        """Return ``{aspect: logits}`` for a batch of token ids.

        Gradient tracking through the encoder is disabled only when it is
        frozen. Otherwise the ambient grad mode is left untouched, so a
        caller's ``torch.no_grad()`` is never overridden.
        """
        if self.freeze_encoder:
            with torch.no_grad():
                out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        else:
            out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first token position, used as the sequence summary.
        cls = out.last_hidden_state[:, 0, :]
        return {a: self.classifiers[a](cls) for a in ASPECTS}


class Trainer:
    """Train, evaluate and run a ``FreezeClassifier`` on aspect-labelled reviews.

    Args:
        config: dict providing ``model_name``, ``hidden_dim``, ``dropout``,
            ``freeze_encoder``, ``max_length``, ``batch_size``,
            ``learning_rate``, ``num_epochs`` and ``patience``.
    """

    def __init__(self, config):
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"üîß Chargement de {config['model_name']}...")
        self.tokenizer = AutoTokenizer.from_pretrained(config['model_name'])
        self.model = FreezeClassifier(
            config['model_name'], config['hidden_dim'],
            config['dropout'], config['freeze_encoder']
        ).to(self.device)
        self.criterion = nn.CrossEntropyLoss()

        trainable = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
        total = sum(p.numel() for p in self.model.parameters())
        print(f"‚úÖ Mod√®le sur {self.device}")
        print(f"   Param√®tres entra√Ænables: {trainable/1e6:.2f}M / {total/1e6:.1f}M")

    def train(self, train_data, val_data):
        """Fit the model with early stopping on validation accuracy.

        Args:
            train_data: records with an "Avis" text and one label per aspect.
            val_data: records of the same shape, evaluated after every epoch.

        Returns:
            ``(history, best_acc)`` where ``history`` holds the per-epoch mean
            training loss (``'loss'``) and validation accuracy (``'acc'``),
            and ``best_acc`` is the best validation accuracy in percent. The
            weights of the best epoch are restored before returning.
        """
        cfg = self.config
        train_ds = OpinionDataset(get_texts(train_data), self.tokenizer, prepare_labels(train_data), cfg['max_length'])
        val_ds = OpinionDataset(get_texts(val_data), self.tokenizer, prepare_labels(val_data), cfg['max_length'])
        train_loader = DataLoader(train_ds, batch_size=cfg['batch_size'], shuffle=True, collate_fn=collate_fn)
        val_loader = DataLoader(val_ds, batch_size=cfg['batch_size'], shuffle=False, collate_fn=collate_fn)

        # Only parameters that still require gradients are handed to the optimizer.
        optimizer = AdamW(filter(lambda p: p.requires_grad, self.model.parameters()), lr=cfg['learning_rate'])

        best_acc, best_state, stale_epochs = 0, None, 0
        history = {'loss': [], 'acc': []}

        for epoch in range(cfg['num_epochs']):
            print(f"\n--- Epoch {epoch+1}/{cfg['num_epochs']} ---")
            self.model.train()
            total_loss = 0
            for batch in tqdm(train_loader, desc="Training"):
                ids, mask = batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device)
                logits = self.model(ids, mask)
                # Joint objective: unweighted sum of the per-aspect cross-entropies.
                loss = sum(self.criterion(logits[a], batch[f"label_{a.lower()}"].to(self.device)) for a in ASPECTS)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            # max(..., 1) avoids a ZeroDivisionError when the training set is empty.
            avg_loss = total_loss / max(len(train_loader), 1)
            history['loss'].append(avg_loss)
            print(f"Loss: {avg_loss:.4f}")

            # Validation
            acc, details = self._eval(val_loader)
            history['acc'].append(acc)
            print(f"Val Acc: {acc:.2f}% | {details}")

            if acc > best_acc:
                # Snapshot on CPU so the copy does not occupy device memory.
                best_state = {k: v.cpu().clone() for k, v in self.model.state_dict().items()}
                best_acc, stale_epochs = acc, 0
                print("‚≠ê Best!")
            else:
                stale_epochs += 1
                if stale_epochs >= cfg['patience']:
                    print("‚ö†Ô∏è Early stop")
                    break

        if best_state is not None:
            self.model.load_state_dict(best_state)
            self.model.to(self.device)
        print(f"\nüèÜ BEST: {best_acc:.2f}%")
        return history, best_acc

    def _eval(self, loader):
        """Compute accuracy over ``loader``.

        Returns:
            ``(macro_acc, details)``: the mean of the per-aspect accuracies in
            percent, computed from the unrounded values, and a dict of
            per-aspect accuracies rounded to one decimal for display. An empty
            loader yields ``0.0`` for every value.
        """
        self.model.eval()
        correct = {a: 0 for a in ASPECTS}
        total = 0
        with torch.no_grad():
            for batch in loader:
                ids, mask = batch["input_ids"].to(self.device), batch["attention_mask"].to(self.device)
                logits = self.model(ids, mask)
                for a in ASPECTS:
                    preds = torch.argmax(logits[a], dim=-1)
                    correct[a] += (preds == batch[f"label_{a.lower()}"].to(self.device)).sum().item()
                total += ids.size(0)
        if total == 0:
            return 0.0, {a: 0.0 for a in ASPECTS}
        accuracies = {a: 100 * correct[a] / total for a in ASPECTS}
        # Round only the displayed values: model selection uses the exact average.
        details = {a: round(acc, 1) for a, acc in accuracies.items()}
        return sum(accuracies.values()) / len(ASPECTS), details

    def predict(self, texts, batch_size=32):
        """Predict one label per aspect for each text.

        Args:
            texts: sliceable sequence of strings.
            batch_size: number of texts encoded and scored per forward pass.

        Returns:
            A list with one ``{aspect: label string}`` dict per input text, in
            input order.
        """
        self.model.eval()
        preds = []
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            enc = self.tokenizer(chunk, truncation=True, padding=True, max_length=self.config['max_length'], return_tensors="pt")
            with torch.no_grad():
                logits = self.model(enc["input_ids"].to(self.device), enc["attention_mask"].to(self.device))
            # One argmax and one host transfer per aspect instead of one per text.
            indices = {a: torch.argmax(logits[a], dim=-1).tolist() for a in ASPECTS}
            for j in range(len(chunk)):
                preds.append({a: IDX_TO_LABEL[indices[a][j]] for a in ASPECTS})
        return preds

# Cell marker: printed once the model and trainer classes above have been defined.
print("‚úÖ Mod√®le OK")

In [None]:
import pandas as pd
# Both splits are parsed the same way: columns are separated by a tab optionally
# surrounded by spaces (a regex separator, hence the python engine).
_tsv_options = {"sep": ' *\t *', "encoding": 'utf-8', "engine": 'python'}
df_train = pd.read_csv("/content/data/ftdataset_train.tsv", **_tsv_options)
df_val = pd.read_csv("/content/data/ftdataset_val.tsv", **_tsv_options)
# One dict per row, keyed by column name.
train_data = df_train.to_dict('records')
val_data = df_val.to_dict('records')
print(f"‚úÖ Train={len(train_data)}, Val={len(val_data)}")

In [None]:
# Build the trainer (loads the tokenizer and the model), then train with per-epoch
# validation; train() returns the loss/accuracy history and the best validation accuracy.
trainer = Trainer(CONFIG)
history, best_acc = trainer.train(train_data, val_data)

In [None]:
# Final evaluation: decode predictions for the validation texts and compare the
# predicted label strings with the gold labels stored in each record.
print("üìà √âvaluation finale...")
preds = trainer.predict(get_texts(val_data))
correct = {a: sum(1 for p, r in zip(preds, val_data) if p[a] == r[a]) for a in ASPECTS}
n = len(val_data)
# Per-aspect accuracy in percent, computed once. An empty validation set reports 0.0
# instead of raising ZeroDivisionError.
accuracy = {a: (100 * correct[a] / n if n else 0.0) for a in ASPECTS}
print("\n" + "="*40)
for a in ASPECTS: print(f"  {a}: {accuracy[a]:.2f}%")
# Macro average over however many aspects are configured (not a hard-coded 3).
print(f"\nüéØ MACRO: {sum(accuracy.values())/len(ASPECTS):.2f}%")
print("="*40)

In [None]:
# Save the model weights together with the config and the best validation accuracy
# in a single checkpoint file.
torch.save({'model': trainer.model.state_dict(), 'config': CONFIG, 'acc': best_acc}, '/content/model_v3_freeze.pt')
print(f"‚úÖ Sauvegard√© (acc: {best_acc:.2f}%)")
# NOTE(review): presumably this sends the checkpoint to the user's browser - confirm against the Colab API.
from google.colab import files
files.download('/content/model_v3_freeze.pt')