# Projet de Resumé de texte

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install datasets
!pip install sentence_transformers
!pip install nltk sentence_transformers rouge-score

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

## Importation des packages

In [4]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, concatenate_datasets, DatasetDict
from sentence_transformers import SentenceTransformer, util
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
from datasets import Dataset
import pandas as pd
import spacy
import json
import nltk



## Chargement de données

In [5]:

# Charger le jeu de données initial CNN/Daily Mail
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Combiner toutes les données en une seule
combined_dataset = concatenate_datasets([dataset["train"], dataset["validation"], dataset["test"]])

# Réaliser une nouvelle subdivision en 80/10/10
new_split = combined_dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
# Split du 20% obtenu en deux ensembles de validation et de test (chacun représentant 10% de l'ensemble initial)
valid_test_split = new_split["test"].train_test_split(test_size=0.5, shuffle=True, seed=42)

# Création du nouveau dictionnaire de données
new_dataset = DatasetDict({
    "train": new_split["train"],
    "validation": valid_test_split["train"],
    "test": valid_test_split["test"]
})

# Afficher la taille des nouvelles subdivisions
print(new_dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 249576
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 31197
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 31198
    })
})


## Sauvegarde du dataset nettoyé

In [None]:
# Charger le modèle de langue de SpaCy (par exemple, l'anglais)
nlp = spacy.load("en_core_web_sm")

# Charger et subdiviser l'ensemble de données comme précédemment
dataset = load_dataset("cnn_dailymail", "3.0.0")
combined_dataset = concatenate_datasets([dataset["train"], dataset["validation"], dataset["test"]])
new_split = combined_dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
valid_test_split = new_split["test"].train_test_split(test_size=0.5, shuffle=True, seed=42)
new_dataset = DatasetDict({
    "train": new_split["train"],
    "validation": valid_test_split["train"],
    "test": valid_test_split["test"]
})

# Fonction de nettoyage avec SpaCy
def clean_text(text):
    doc = nlp(text)
    cleaned_tokens = [
        token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct
    ]
    return " ".join(cleaned_tokens)

# Appliquer la fonction de nettoyage à chaque sous-ensemble
for split in ['train', 'validation', 'test']:
    new_dataset[split] = new_dataset[split].map(
        lambda example: {"article": clean_text(example["article"]), "highlights": clean_text(example["highlights"])}
    )

# Sauvegarder chaque sous-ensemble nettoyé dans un fichier JSON
for split in ['train', 'validation', 'test']:
    with open(f"{split}_cleaned.json", "w") as f:
        for example in new_dataset[split]:
            json.dump({"article": example["article"], "highlights": example["highlights"]}, f)
            f.write("\n")

print("Nettoyage et sauvegarde terminés.")


Map:   0%|          | 0/249576 [00:00<?, ? examples/s]

Map:   0%|          | 0/31197 [00:00<?, ? examples/s]

Map:   0%|          | 0/31198 [00:00<?, ? examples/s]

Nettoyage et sauvegarde terminés.


## Chargement des fichiers sauvegardés

In [6]:
def load_jsonl(file_path):
    """Loads a JSON Lines file into a list of dictionaries."""
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            try:  # Try to parse the line as JSON
                data.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Warning: Skipping invalid JSON line: {line.strip()} - Error: {e}")
                # Handle invalid lines (e.g., skip or attempt to repair)
    return data

# Load data using the function
train_cleaned = pd.DataFrame(load_jsonl('/content/drive/MyDrive/Projet_Kader/train_cleaned.json'))
test_cleaned = pd.DataFrame(load_jsonl('/content/drive/MyDrive/Projet_Kader/test_cleaned.json'))
validation_cleaned = pd.DataFrame(load_jsonl('/content/drive/MyDrive/Projet_Kader/validation_cleaned.json'))



## Modèle de résumé extractif (Utiliser le jeu de données de tests uniquement)

### Approche et Implementation

In [7]:
# Télécharger les ressources NLTK nécessaires
nltk.download('punkt')

# Initialisation du modèle de sentence embedding
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Fonction pour extraire les phrases pertinentes
def extract_key_sentences(text, num_sentences=3):
    sentences = text.split('.')  # Diviser le texte en phrases
    embeddings = embedder.encode(sentences, convert_to_tensor=True)  # Embedding des phrases

    # Calculer l'embedding du texte entier (texte global)
    text_embedding = embedder.encode([text], convert_to_tensor=True)

    # Calculer la similarité cosine entre chaque phrase et le texte entier
    similarities = util.pytorch_cos_sim(text_embedding, embeddings)[0]

    # Trier les phrases par similarité et sélectionner les meilleures
    ranked_sentences = [sentences[i].strip() for i in similarities.argsort(descending=True)[:num_sentences]]

    return ranked_sentences


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

### Evaluation

In [13]:
# Fonction pour calculer le score BLEU
def calculate_bleu(reference_summary, generated_summary):
    # Calcul du score BLEU
    score = sentence_bleu([reference_summary], generated_summary)
    return score

# Fonction pour calculer les scores ROUGE
def calculate_rouge(reference_summary, generated_summary):
    # Initialisation du calculateur ROUGE avec les métriques ROUGE-1, ROUGE-2 et ROUGE-L
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    # Calcul des scores ROUGE
    scores = scorer.score(reference_summary, generated_summary)
    return scores

# Exemple d'article et résumé de référence
text = test_cleaned['article'][1]  # Extraction de l'article
reference_summary = test_cleaned['highlights'][1]  # Extraction du résumé de référence

# Extraction des phrases clés
key_sentences = extract_key_sentences(text, num_sentences=3)

# Affichage des phrases extraites et calcul des scores
print("Phrases extraites :")
for idx, sentence in enumerate(key_sentences):
    # Calcul du score BLEU pour chaque phrase extraite
    bleu_score = calculate_bleu(reference_summary, sentence)
    # Calcul des scores ROUGE pour chaque phrase extraite
    rouge_scores = calculate_rouge(reference_summary, sentence)
    # Affichage des résultats
    print(f"\nPhrase {idx + 1}:")
    print(f"Phrase extraite : {sentence}")
    print(f"Score BLEU : {bleu_score:.4f}")
    print(f"Score ROUGE-1 : {rouge_scores['rouge1'].fmeasure:.4f}")
    print(f"Score ROUGE-2 : {rouge_scores['rouge2'].fmeasure:.4f}")
    print(f"Score ROUGE-L : {rouge_scores['rougeL'].fmeasure:.4f}")


Phrases extraites :

Phrase 1:
Score BLEU : 0.1924
Score ROUGE-1 : 0.3655
Score ROUGE-2 : 0.1436
Score ROUGE-L : 0.1827


## Modèle de résumé abstrait (Encoder-Decoder)

### Architecture

In [14]:
# Convertir les dataframes en objets Dataset de Hugging Face
train_dataset = Dataset.from_pandas(train_cleaned)
validation_dataset = Dataset.from_pandas(validation_cleaned)
test_dataset = Dataset.from_pandas(test_cleaned)

# Charger le tokenizer et le modèle
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def preprocess_function(examples):
    # Tokeniser le texte d'entrée et le résumé, ajuster la longueur maximale selon les besoins
    model_inputs = tokenizer(examples['article'], max_length=512, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['highlights'], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokeniser les jeux de données
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_validation_dataset = validation_dataset.map(preprocess_function, batched=True)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/94716 [00:00<?, ? examples/s]



Map:   0%|          | 0/31197 [00:00<?, ? examples/s]

### Entraînement du modèle

In [None]:
# Arguments d'entraînement
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
)

# Entraîneur
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
    tokenizer=tokenizer,
)

# Entraîner le modèle
trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,1.0309,0.967481
