<a href="https://colab.research.google.com/github/ludoveltz/test_github_fev25/blob/main/Exc_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Installation des d√©pendances
!pip install -q rouge-score transformers torch tqdm

import pandas as pd
import numpy as np
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    GPT2LMHeadModel,
    GPT2Tokenizer
)
from rouge_score import rouge_scorer
from typing import Dict, List
from tqdm import tqdm
import torch
from google.colab import drive

class ModelEvaluator:
    def __init__(self):
        """Initialisation des mod√®les et m√©triques"""
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"üñ•Ô∏è Utilisation de: {self.device}")

        # Initialisation des mod√®les
        self.models = {
            't5-small': (
                T5ForConditionalGeneration.from_pretrained('t5-small').to(self.device),
                T5Tokenizer.from_pretrained('t5-small')
            ),
            't5-base': (
                T5ForConditionalGeneration.from_pretrained('t5-base').to(self.device),
                T5Tokenizer.from_pretrained('t5-base')
            ),
            'gpt2': (
                GPT2LMHeadModel.from_pretrained('gpt2').to(self.device),
                GPT2Tokenizer.from_pretrained('gpt2', pad_token='<|endoftext|>')  # D√©finition explicite du pad_token
            )
        }

        # Configuration sp√©ciale pour GPT2
        self.models['gpt2'][1].padding_side = 'left'  # Important pour GPT2

        # Initialisation du scorer ROUGE
        self.rouge_scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'],
            use_stemmer=True
        )

    def summarize_with_t5(self, text: str, model_name: str) -> str:
        """G√©n√©ration de r√©sum√© avec T5"""
        try:
            model, tokenizer = self.models[model_name]

            inputs = tokenizer(
                f"summarize: {text}",
                max_length=512,
                truncation=True,
                padding=True,
                return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                outputs = model.generate(
                    inputs.input_ids,
                    max_new_tokens=150,  # Utilisation de max_new_tokens au lieu de max_length
                    min_length=40,
                    num_beams=4,
                    length_penalty=2.0,
                    early_stopping=True
                )

            return tokenizer.decode(outputs[0], skip_special_tokens=True)
        except Exception as e:
            print(f"‚ö†Ô∏è Erreur T5 ({model_name}): {e}")
            return ""

    def summarize_with_gpt2(self, text: str) -> str:
        """G√©n√©ration de r√©sum√© avec GPT-2"""
        try:
            model, tokenizer = self.models['gpt2']

            input_text = f"{text}\nTL;DR:"
            inputs = tokenizer(
                input_text,
                max_length=1024,
                truncation=True,
                padding=True,
                return_tensors="pt",
                return_attention_mask=True  # Explicitement demander le masque d'attention
            ).to(self.device)

            with torch.no_grad():
                outputs = model.generate(
                    input_ids=inputs.input_ids,
                    attention_mask=inputs.attention_mask,
                    max_new_tokens=150,  # Utilisation de max_new_tokens
                    min_length=30,
                    num_beams=4,
                    length_penalty=2.0,
                    pad_token_id=tokenizer.eos_token_id,
                    early_stopping=True
                )

            summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
            return summary.split("TL;DR:")[-1].strip()
        except Exception as e:
            print(f"‚ö†Ô∏è Erreur GPT2: {e}")
            return ""

    def compute_rouge_score(self, reference: str, prediction: str) -> Dict[str, float]:
        """Calcul des scores ROUGE"""
        scores = self.rouge_scorer.score(reference, prediction)
        return {
            'rouge1': scores['rouge1'].fmeasure,
            'rouge2': scores['rouge2'].fmeasure,
            'rougeL': scores['rougeL'].fmeasure
        }

    def compute_rouge_per_row(self, df: pd.DataFrame) -> Dict[str, pd.DataFrame]:
        """Calcul des scores ROUGE pour chaque ligne et chaque mod√®le"""
        results = {}

        for model_name in self.models.keys():
            model_results = []

            for _, row in tqdm(df.iterrows(), desc=f"√âvaluation de {model_name}"):
                # G√©n√©ration du r√©sum√©
                if model_name.startswith('t5'):
                    prediction = self.summarize_with_t5(row['premise'], model_name)
                else:
                    prediction = self.summarize_with_gpt2(row['premise'])

                # Calcul des scores ROUGE
                scores = self.compute_rouge_score(row['hypothesis'], prediction)

                model_results.append({
                    'id': row['id'],
                    'language': row['language'],
                    'original': row['premise'],
                    'reference': row['hypothesis'],
                    'prediction': prediction,
                    **scores
                })

            results[model_name] = pd.DataFrame(model_results)

        return results

def main():
    try:
        # Montage du Drive et chargement des donn√©es
        drive.mount('/content/drive')
        base_path = "/content/drive/MyDrive/DATASET"

        # Chargement des donn√©es
        train_df = pd.read_csv(f"{base_path}/train.csv")
        print(f"üìä Donn√©es charg√©es: {len(train_df)} exemples")

        # √âchantillonnage pour l'√©valuation (r√©duit pour test)
        sample_size = 20  # R√©duit √† 20 pour test
        eval_df = train_df.sample(n=sample_size, random_state=42)

        # Initialisation et √©valuation
        evaluator = ModelEvaluator()
        results = evaluator.compute_rouge_per_row(eval_df)

        # Affichage et sauvegarde des r√©sultats
        from datetime import datetime
        timestamp = datetime.now().strftime("%Y%m%d_%H%M")

        print("\nüìä R√©sultats de l'√©valuation:")
        for model_name, df in results.items():
            print(f"\nü§ñ Mod√®le: {model_name}")
            print("\nScores moyens:")
            print(df[['rouge1', 'rouge2', 'rougeL']].mean())

            # Sauvegarde avec timestamp
            output_file = f"{base_path}/results_{model_name}_{timestamp}.csv"
            df.to_csv(output_file, index=False)
            print(f"‚úÖ R√©sultats sauvegard√©s: {output_file}")

    except Exception as e:
        print(f"‚ùå Erreur: {e}")
        import traceback
        print(traceback.format_exc())

if __name__ == "__main__":
    main()



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
üìä Donn√©es charg√©es: 12120 exemples
üñ•Ô∏è Utilisation de: cpu


√âvaluation de t5-small: 20it [00:54,  2.74s/it]
√âvaluation de t5-base: 20it [02:47,  8.36s/it]
√âvaluation de gpt2: 20it [05:55, 17.77s/it]



üìä R√©sultats de l'√©valuation:

ü§ñ Mod√®le: t5-small

Scores moyens:
rouge1    0.203724
rouge2    0.055118
rougeL    0.191737
dtype: float64
‚úÖ R√©sultats sauvegard√©s: /content/drive/MyDrive/DATASET/results_t5-small_20250319_2232.csv

ü§ñ Mod√®le: t5-base

Scores moyens:
rouge1    0.194170
rouge2    0.055759
rougeL    0.181527
dtype: float64
‚úÖ R√©sultats sauvegard√©s: /content/drive/MyDrive/DATASET/results_t5-base_20250319_2232.csv

ü§ñ Mod√®le: gpt2

Scores moyens:
rouge1    0.035225
rouge2    0.005264
rougeL    0.035225
dtype: float64
‚úÖ R√©sultats sauvegard√©s: /content/drive/MyDrive/DATASET/results_gpt2_20250319_2232.csv


Performance Comparative des Mod√®les

T5-Small (meilleure performance globale) :
- ROUGE-1 : 0.204 (20.4%) - Meilleur score de correspondance de mots uniques
- ROUGE-2 : 0.055 (5.5%) - Capture des bi-grammes
- ROUGE-L : 0.192 (19.2%) - Meilleure s√©quence commune

T5-Base (performance similaire) :
- ROUGE-1 : 0.194 (19.4%)
- ROUGE-2 : 0.056 (5.6%) - L√©g√®rement meilleur en bi-grammes
- ROUGE-L : 0.182 (18.2%)

GPT-2 (performance significativement inf√©rieure) :
- ROUGE-1 : 0.035 (3.5%)
- ROUGE-2 : 0.005 (0.5%)
- ROUGE-L : 0.035 (3.5%)

Analyse des Temps d'Ex√©cution
- T5-Small : 2.74s/it
- T5-Base : 8.36s/it
- GPT-2 : 17.77s/it

Points Cl√©s üîç
a) Efficacit√© des Mod√®les T5 :
- T5-Small surpasse T5-Base malgr√© sa taille r√©duite
- Meilleur compromis performance/rapidit√©
- Particuli√®rement efficace pour la correspondance lexicale (ROUGE-1)

b) Limitations de GPT-2 :
- Performances nettement inf√©rieures
- Temps d'ex√©cution le plus long
- Possible inad√©quation avec la t√¢che de r√©sum√©

Choix de Mod√®le : T5-Small appara√Æt comme le meilleur choix pour :
- Meilleure performance globale
- Temps d'ex√©cution plus rapide
- Utilisation efficace des ressources