# 🍽️ Projet Fouille d'Opinions - Classification Multi-Aspects

Ce notebook entraîne un classificateur CamemBERT pour prédire les opinions sur 3 aspects de restaurants:
- **Prix** : prix des plats et boissons
- **Cuisine** : qualité de la nourriture
- **Service** : qualité du service

**Instructions:**
1. Allez dans `Runtime` > `Change runtime type` > Sélectionnez `T4 GPU`
2. Uploadez vos fichiers de données dans le répertoire `/content/data/`
3. Exécutez toutes les cellules

## 1. Installation des dépendances

In [None]:
!pip install -q transformers datasets torch lightning pandas numpy tqdm

In [None]:
import torch
# Report the installed PyTorch build and whether this runtime exposes a CUDA
# device; the availability check is done once and reused for both lines.
_cuda_available = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {_cuda_available}")
if _cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Upload des données

Exécutez cette cellule et uploadez vos fichiers:
- `ftdataset_train.tsv`
- `ftdataset_val.tsv`

In [None]:
import os
from google.colab import files

# Cr√©er le r√©pertoire data
os.makedirs('/content/data', exist_ok=True)

print("Uploadez vos fichiers de donn√©es (ftdataset_train.tsv et ftdataset_val.tsv):")
uploaded = files.upload()

# D√©placer les fichiers vers le r√©pertoire data
for filename in uploaded.keys():
    os.rename(filename, f'/content/data/{filename}')
    print(f"Fichier {filename} d√©plac√© vers /content/data/")

# V√©rifier les fichiers
print("\nFichiers dans /content/data:")
!ls -la /content/data/

## 3. Définition des utilitaires de données

In [None]:
from typing import Optional
import torch
from torch.utils.data import Dataset
from transformers import PreTrainedTokenizer

# Class index assigned to each of the four opinion labels.
# These strings are compared verbatim against the label columns of the data,
# so the accented spelling of "Négative" must be exact: a mis-encoded key never
# matches, and prepare_labels() would then file every negative opinion under
# the "NE" fallback without any warning.
LABEL_TO_IDX = {
    "Positive": 0,
    "Négative": 1,
    "Neutre": 2,
    "NE": 3,
}

# Reverse lookup used to turn predicted class indices back into label strings.
IDX_TO_LABEL = {idx: label for label, idx in LABEL_TO_IDX.items()}

# Aspects that each receive their own opinion label; the same names are used
# as keys when reading labels from the data rows.
ASPECTS = ["Prix", "Cuisine", "Service"]


class OpinionDataset(Dataset):
    """Torch dataset of tokenized restaurant reviews with optional aspect labels.

    Every text is tokenized once, in a single tokenizer call, when the dataset
    is built. Because padding is requested in that one call, all samples come
    out with the same sequence length (presumably the longest review in
    ``texts``, capped at ``max_length`` -- confirm against the tokenizer docs).
    """

    def __init__(
        self,
        texts: list[str],
        tokenizer: PreTrainedTokenizer,
        labels: Optional[dict[str, list[int]]] = None,
        max_length: int = 256
    ):
        """Tokenize ``texts`` and keep ``labels`` for later lookup.

        Args:
            texts: Raw review strings.
            tokenizer: Callable tokenizer returning ``input_ids`` and
                ``attention_mask`` tensors.
            labels: Optional mapping from aspect name to one class index per
                text; omit it for unlabeled data.
            max_length: Truncation limit passed to the tokenizer.
        """
        self.labels = labels
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )

    def __len__(self) -> int:
        """Return the number of tokenized samples."""
        input_ids = self.encodings["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx: int) -> dict:
        """Return the tensors of sample ``idx``.

        The result always holds ``input_ids`` and ``attention_mask``; when
        labels were supplied it also holds one ``label_<aspect>`` long tensor
        per aspect.
        """
        sample = {
            field: self.encodings[field][idx]
            for field in ("input_ids", "attention_mask")
        }

        if self.labels is None:
            return sample

        for aspect in ASPECTS:
            class_index = self.labels[aspect][idx]
            sample[f"label_{aspect.lower()}"] = torch.tensor(class_index, dtype=torch.long)

        return sample


def prepare_labels(data: list[dict]) -> dict[str, list[int]]:
    """Convert the textual aspect labels of ``data`` into class indices.

    Returns a dict with one list per aspect, aligned with the order of
    ``data``. Any value that is not a key of ``LABEL_TO_IDX`` is mapped to the
    index of ``"NE"``.
    """
    fallback = LABEL_TO_IDX["NE"]
    per_aspect = {aspect: [] for aspect in ASPECTS}

    for row in data:
        for aspect, indices in per_aspect.items():
            indices.append(LABEL_TO_IDX.get(row[aspect], fallback))

    return per_aspect


def get_texts(data: list[dict]) -> list[str]:
    """Return the ``"Avis"`` (review text) field of every row, in order."""
    return [row["Avis"] for row in data]


class DataCollatorWithPadding:
    """Collate pre-tokenized samples into a batch padded only as far as needed.

    The samples handed to this collator already share one sequence length, so
    stacking them is always possible. With ``padding`` enabled the stacked
    batch is then cut down to the longest real sequence it contains, which
    removes columns that hold nothing but padding and saves encoder compute.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, padding: bool = True):
        """Store the collator settings.

        Args:
            tokenizer: Tokenizer that produced the samples; only its
                ``padding_side`` attribute is consulted (``"right"`` is
                assumed when the attribute is missing).
            padding: When true, trim each batch to its longest sequence.
                When false, return the stacked tensors untouched.
        """
        self.tokenizer = tokenizer
        self.padding = padding

    def __call__(self, features: list[dict]) -> dict:
        """Stack ``features`` into batch tensors.

        Args:
            features: Samples holding ``input_ids`` and ``attention_mask``
                tensors of equal length, plus optional ``label_<aspect>``
                tensors.

        Returns:
            A dict with ``input_ids``, ``attention_mask`` and, when the first
            sample carries them, one stacked ``label_<aspect>`` tensor per
            aspect.
        """
        input_ids = torch.stack([f["input_ids"] for f in features])
        attention_mask = torch.stack([f["attention_mask"] for f in features])

        if self.padding:
            input_ids, attention_mask = self._trim_padding(input_ids, attention_mask)

        batch = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
        }

        # Labels are optional: the first sample decides whether they are present.
        for aspect in ASPECTS:
            key = f"label_{aspect.lower()}"
            if key in features[0]:
                batch[key] = torch.stack([f[key] for f in features])

        return batch

    def _trim_padding(self, input_ids: torch.Tensor, attention_mask: torch.Tensor):
        """Drop the columns that are padding for every row of the batch.

        Assumes the attention mask is 1 on real tokens and 0 on padding, with
        all padding on one side of the sequence (the tokenizer's
        ``padding_side``).
        """
        # Keep at least one column so the batch never becomes zero-width.
        longest = max(int(attention_mask.sum(dim=1).max().item()), 1)
        if longest >= input_ids.size(1):
            return input_ids, attention_mask

        if getattr(self.tokenizer, "padding_side", "right") == "left":
            columns = slice(-longest, None)
        else:
            columns = slice(0, longest)

        return (
            input_ids[:, columns].contiguous(),
            attention_mask[:, columns].contiguous(),
        )

# Printed once every definition in this cell has been executed.
print("‚úÖ Utilitaires de donn√©es charg√©s")

## 4. Définition du modèle CamemBERT Multi-Têtes

In [None]:
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModel,
    get_linear_schedule_with_warmup
)
from tqdm.auto import tqdm


class MultiHeadClassifier(nn.Module):
    """Pretrained encoder shared by three aspect-specific classification heads.

    The hidden state at sequence position 0 is fed to one head per aspect
    (Prix, Cuisine, Service); each head is a dropout layer followed by a
    linear projection to ``num_classes`` logits.
    """

    def __init__(self, plm_name: str = "camembert-base", num_classes: int = 4, dropout: float = 0.1):
        """Load the encoder and create the three heads.

        Args:
            plm_name: Model identifier passed to ``AutoConfig`` and ``AutoModel``.
            num_classes: Number of logits produced by each head.
            dropout: Dropout probability applied in front of each head.
        """
        super().__init__()

        self.config = AutoConfig.from_pretrained(plm_name)
        self.encoder = AutoModel.from_pretrained(plm_name)

        encoder_width = self.config.hidden_size

        def build_head() -> nn.Sequential:
            # Dropout on the sentence vector, then a projection to class logits.
            return nn.Sequential(
                nn.Dropout(dropout),
                nn.Linear(encoder_width, num_classes),
            )

        # Each aspect owns an independently initialised head.
        self.classifier_prix = build_head()
        self.classifier_cuisine = build_head()
        self.classifier_service = build_head()

        # Name-to-head lookup kept alongside the attributes above.
        self.classifiers = {
            "Prix": self.classifier_prix,
            "Cuisine": self.classifier_cuisine,
            "Service": self.classifier_service,
        }

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> dict[str, torch.Tensor]:
        """Encode the batch and return one logits tensor per aspect name."""
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden layer serves as the sentence representation.
        sentence_vector = encoded.last_hidden_state[:, 0, :]

        prix_logits = self.classifier_prix(sentence_vector)
        cuisine_logits = self.classifier_cuisine(sentence_vector)
        service_logits = self.classifier_service(sentence_vector)

        return {
            "Prix": prix_logits,
            "Cuisine": cuisine_logits,
            "Service": service_logits,
        }


class PLMClassifier:
    """Train and run the multi-aspect opinion classifier.

    The wrapper owns the tokenizer, the ``MultiHeadClassifier`` network and the
    loss function, and exposes ``train`` and ``predict``. Hyper-parameters are
    plain attributes set in ``__init__``; change them on the instance before
    calling ``train`` to override the defaults.
    """

    def __init__(self, device="cuda" if torch.cuda.is_available() else "cpu"):
        """Load the tokenizer and the model, and move the model to ``device``.

        Args:
            device: Anything accepted by ``torch.device``. The default is
                chosen once, when the class is defined: CUDA when available,
                CPU otherwise.
        """
        self.plm_name = "camembert-base"
        self.num_classes = 4
        self.max_length = 256
        self.batch_size = 16
        self.learning_rate = 2e-5
        self.num_epochs = 3
        self.warmup_ratio = 0.1
        self.device = torch.device(device)

        print(f"Chargement de CamemBERT...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.plm_name)
        self.model = MultiHeadClassifier(
            plm_name=self.plm_name,
            num_classes=self.num_classes,
            dropout=0.1
        ).to(self.device)

        self.criterion = nn.CrossEntropyLoss()
        print(f"‚úÖ Mod√®le charg√© sur {self.device}")

    def train(self, train_data: list[dict], val_data: list[dict]) -> None:
        """Fine-tune the model and keep the weights of the best epoch.

        After every epoch the model is scored on ``val_data``. The weights of
        the epoch with the highest validation accuracy are snapshotted on the
        CPU and loaded back once training ends, so the model left on the
        instance is the one whose accuracy is reported as the best.

        Args:
            train_data: Rows with an ``"Avis"`` text and one label per aspect.
            val_data: Rows of the same shape, used only for evaluation.
        """
        print(f"\nüìä Entra√Ænement sur {self.device}")
        print(f"  - Exemples d'entra√Ænement: {len(train_data)}")
        print(f"  - Exemples de validation: {len(val_data)}")

        train_loader = self._make_loader(train_data, shuffle=True)
        val_loader = self._make_loader(val_data, shuffle=False)

        optimizer = AdamW(self.model.parameters(), lr=self.learning_rate)
        # The scheduler is stepped once per batch, so its horizon is counted in batches.
        total_steps = len(train_loader) * self.num_epochs
        warmup_steps = int(total_steps * self.warmup_ratio)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps,
        )

        best_val_acc = 0.0
        best_state = None

        for epoch in range(self.num_epochs):
            print(f"\n--- Epoch {epoch + 1}/{self.num_epochs} ---")

            avg_train_loss = self._train_one_epoch(train_loader, optimizer, scheduler)
            print(f"Train Loss: {avg_train_loss:.4f}")

            val_acc = self._evaluate(val_loader)
            print(f"Validation Accuracy: {val_acc:.2f}%")

            if val_acc > best_val_acc:
                best_val_acc = val_acc
                # Clone to CPU so later optimizer steps cannot alter the snapshot
                # and no extra accelerator memory is held.
                best_state = {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in self.model.state_dict().items()
                }

        # Without this, the model would simply keep the last epoch's weights.
        if best_state is not None:
            self.model.load_state_dict(best_state)

        print(f"\nüèÜ Meilleure exactitude de validation: {best_val_acc:.2f}%")

    def _make_loader(self, data: list[dict], shuffle: bool) -> DataLoader:
        """Tokenize ``data`` and wrap it in a DataLoader of ``self.batch_size``."""
        dataset = OpinionDataset(
            get_texts(data),
            self.tokenizer,
            labels=prepare_labels(data),
            max_length=self.max_length,
        )
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            collate_fn=DataCollatorWithPadding(self.tokenizer),
        )

    def _train_one_epoch(self, train_loader: DataLoader, optimizer, scheduler) -> float:
        """Run one optimisation pass over ``train_loader``; return the mean batch loss."""
        self.model.train()
        total_train_loss = 0.0

        progress_bar = tqdm(train_loader, desc="Training")

        for batch in progress_bar:
            input_ids = batch["input_ids"].to(self.device)
            attention_mask = batch["attention_mask"].to(self.device)

            logits = self.model(input_ids, attention_mask)

            # The training objective is the unweighted sum of the per-aspect losses.
            loss = 0.0
            for aspect in ASPECTS:
                labels = batch[f"label_{aspect.lower()}"].to(self.device)
                loss += self.criterion(logits[aspect], labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Read the scalar once; each .item() forces a host/device sync.
            batch_loss = loss.item()
            total_train_loss += batch_loss
            progress_bar.set_postfix({"loss": f"{batch_loss:.4f}"})

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        return total_train_loss / max(len(train_loader), 1)

    def _evaluate(self, dataloader: DataLoader) -> float:
        """Return the accuracy in percent, averaged over the aspects.

        Returns 0.0 when ``dataloader`` yields no samples.
        """
        self.model.eval()
        correct_counts = {aspect: 0 for aspect in ASPECTS}
        total_counts = 0

        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch["input_ids"].to(self.device)
                attention_mask = batch["attention_mask"].to(self.device)

                logits = self.model(input_ids, attention_mask)

                for aspect in ASPECTS:
                    labels = batch[f"label_{aspect.lower()}"].to(self.device)
                    preds = torch.argmax(logits[aspect], dim=-1)
                    correct_counts[aspect] += (preds == labels).sum().item()

                total_counts += input_ids.size(0)

        if total_counts == 0:
            return 0.0

        accuracies = {aspect: 100 * correct_counts[aspect] / total_counts for aspect in ASPECTS}
        return sum(accuracies.values()) / len(ASPECTS)

    def predict(self, texts: list[str], batch_size: int = 32) -> list[dict[str, str]]:
        """Predict one opinion label per aspect for every text.

        Args:
            texts: Raw review strings.
            batch_size: Number of texts tokenized and scored together.

        Returns:
            One dict per input text, in input order, mapping each aspect name
            to its predicted label string.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        self.model.eval()
        all_predictions = []

        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]

            encodings = self.tokenizer(
                batch_texts,
                truncation=True,
                padding=True,
                max_length=self.max_length,
                return_tensors="pt"
            )

            input_ids = encodings["input_ids"].to(self.device)
            attention_mask = encodings["attention_mask"].to(self.device)

            with torch.no_grad():
                logits = self.model(input_ids, attention_mask)

            # One argmax and one transfer per aspect for the whole batch,
            # instead of one per sample.
            per_aspect = [logits[aspect].argmax(dim=-1).tolist() for aspect in ASPECTS]
            all_predictions.extend(
                {aspect: IDX_TO_LABEL[idx] for aspect, idx in zip(ASPECTS, row)}
                for row in zip(*per_aspect)
            )

        return all_predictions

# Printed once both model classes in this cell have been defined.
print("‚úÖ Mod√®le d√©fini")

## 5. Chargement des données

In [None]:
import pandas as pd

# Both splits are parsed the same way: fields are separated by a tab, and the
# regular expression also swallows any spaces around it. A regex separator is
# handled by pandas' python parsing engine.
_tsv_options = {"sep": ' *\t *', "encoding": 'utf-8', "engine": 'python'}

df_train = pd.read_csv("/content/data/ftdataset_train.tsv", **_tsv_options)
df_val = pd.read_csv("/content/data/ftdataset_val.tsv", **_tsv_options)

# One dict per row, keyed by column name.
train_data = df_train.to_dict(orient='records')
val_data = df_val.to_dict(orient='records')

print(f"‚úÖ Donn√©es charg√©es:")
print(f"  - Train: {len(train_data)} exemples")
print(f"  - Validation: {len(val_data)} exemples")
print(f"\nExemple d'avis:")
print(train_data[0])

## 6. Entraînement du modèle 🚀

In [None]:
# Build the classifier (loads the tokenizer and the pretrained model), then
# fine-tune it on the training rows, scoring it on the validation rows after
# every epoch.
classifier = PLMClassifier()
classifier.train(train_data, val_data)

## 7. Évaluation finale

In [None]:
# Score the trained classifier on the validation rows through predict(),
# comparing predicted label strings with the reference strings.
print("\nüìà √âvaluation finale sur les donn√©es de validation...")

val_texts = get_texts(val_data)
predictions = classifier.predict(val_texts)

# Per aspect, count the rows whose predicted label equals the reference label.
n = len(val_data)
correct_counts = {
    aspect: sum(pred[aspect] == ref[aspect] for pred, ref in zip(predictions, val_data))
    for aspect in ASPECTS
}

print("\nüìä R√©sultats par aspect:")
accuracies = {aspect: 100 * correct_counts[aspect] / n for aspect in ASPECTS}
for aspect, acc in accuracies.items():
    print(f"  - {aspect}: {acc:.2f}%")

# Macro accuracy: unweighted mean of the per-aspect accuracies.
macro_acc = sum(accuracies.values()) / len(ASPECTS)
print(f"\nüéØ Exactitude moyenne (macro_acc): {macro_acc:.2f}%")

## 8. Test sur quelques exemples

In [None]:
# Three hand-written reviews used as a quick sanity check of the trained model.
test_texts = [
    "Excellente cuisine, plats savoureux et copieux. Le service √©tait un peu lent mais correct. Prix raisonnables.",
    "Tr√®s d√©√ßu par ce restaurant. La nourriture √©tait froide et le serveur d√©sagr√©able. Bien trop cher pour ce que c'est.",
    "Bon rapport qualit√©-prix. Service efficace et souriant. La cuisine √©tait correcte sans √™tre exceptionnelle."
]

print("\nüß™ Test sur quelques exemples:\n")
predictions = classifier.predict(test_texts)

# Show the first 80 characters of each review next to its three predicted labels.
for text, pred in zip(test_texts, predictions):
    print(f"üìù Avis: {text[:80]}...")
    print(f"   ‚Üí Prix: {pred['Prix']}, Cuisine: {pred['Cuisine']}, Service: {pred['Service']}")
    print()

## 9. Sauvegarde du modèle (optionnel)

In [None]:
# Save only the weights (the state_dict), not the whole model object; reloading
# them requires building a MultiHeadClassifier first.
torch.save(classifier.model.state_dict(), '/content/model_weights.pt')
print("‚úÖ Mod√®le sauvegard√© dans /content/model_weights.pt")

# Hand the saved file to Colab's download helper.
from google.colab import files
files.download('/content/model_weights.pt')