## Membres du Groupe de Projet

- Maxence KAMIONKA
- Mikhaïl BENALI
- Hadja BAH
- Emmanuel DAGNOGO


In [1]:
import torch

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)


Using device: cuda


In [2]:
import gc

# Libérer la RAM Python
gc.collect()
# Libérer la mémoire GPU inutilisée
if torch.cuda.is_available():
    torch.cuda.empty_cache()

## Test du modèle de base

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_NAME = "google/flan-t5-base"
INSTRUCTION = "Translate English to French: "

base_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(DEVICE)

def translate_base(text):
    inputs = base_tokenizer(INSTRUCTION + text, return_tensors="pt").to(DEVICE)
    outputs = base_model.generate(
        **inputs,
        num_beams=4,
        max_new_tokens=128,
        no_repeat_ngram_size=3,
    )
    return base_tokenizer.decode(outputs[0], skip_special_tokens=True)

for s in [
    "Hello, how are you?",
    "This project is about automatic translation.",
    "The weather is nice today.",
]:
    print("EN:", s)
    print("FR:", translate_base(s))


  from .autonotebook import tqdm as notebook_tqdm


EN: Hello, how are you?
FR: Bonjour, c'est-à-dire?
EN: This project is about automatic translation.
FR: Cette projet concerne la traduction automatique.
EN: The weather is nice today.
FR: Le temps est bon aujourd'hui.


## Ce qu’il ne faut pas faire pour le modèle T5

- Pas de lemmatisation / stemming : tu casserais la forme exacte que le modèle attend.

- Pas de suppression de stopwords (“le, de, et…”) : les Transformers utilisent ces mots pour comprendre la syntaxe.

- Pas de passage forcé en minuscules si le modèle a été pré-entraîné en respectant la casse.

- Pas de suppression massive de ponctuation (les modèles utilisent “?”, “.”, “,”, “:” pour le sens et la segmentation).

### Choix des jeux de données

**OPUS et Europarl**

**OPUS-100 en-fr** : mélange de sources, phrases souvent plus courtes, variées (titres, sous-phrases, etc.).

**Europarl en-fr** : phrases plus longues, style plus formel, discours parlementaires → plus de contexte par phrase.

Du point de vue du modèle :

- OPUS -> donne de la diversité (beaucoup de styles, domaines).

- Europarl ->  apprend à gérer des phrases longues / syntaxe compliquée (subordonnées, tournures formelles) => **augmenter** ***max_length*** pour moins tronquer les phrases


**Ajouter les TED Talks plus tard pour le langage oral, plus “conversational”, avec un ton explicatif.**

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import LoraConfig, TaskType, get_peft_model

MODEL_NAME = "google/flan-t5-base"
INSTRUCTION = "Translate English to French: "


def load_model_and_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

    # Paramètres de décodage par défaut pour .generate()
    # pendant l’éval (puisque predict_with_generate=True), les traductions seront générées avec ces paramètres -> métriques qui reflètent ce décodage
    # pendant l’inférence manuelle (model.generate(...) sans redonner ces arguments), ce seront les valeurs par défaut.
    
    model.config.num_beams = 4              # beam search
    model.config.length_penalty = 0.9       # < 1 = un peu plus court, > 1 = plus long
    model.config.no_repeat_ngram_size = 3   # évite de répéter des 3-grammes

    peft_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        r=8,
        lora_alpha=16,
        lora_dropout=0.05,
    )

    model = get_peft_model(model, peft_config)
    model.to(DEVICE)
    model.print_trainable_parameters()

    return model, tokenizer


### Charger et échantilloner le dataset OPUS et Europarl

In [5]:
from datasets import load_dataset, concatenate_datasets

def load_data_old(sample_size_train=10000, sample_size_val=2000):

    ### OPUS ###
    opus = load_dataset("Helsinki-NLP/opus-100", "en-fr")
    opus_train = opus["train"].shuffle(seed=42).select(range(sample_size_train))
    opus_val = opus["validation"].shuffle(seed=42).select(range(sample_size_val))

    ### EUROPARL EN-FR ###
    europarl = load_dataset("Helsinki-NLP/europarl", "en-fr", split="train").shuffle(seed=42)
    euro_train = europarl.select(range(sample_size_train))
    euro_val = europarl.select(range(sample_size_train, sample_size_train + sample_size_val))

    ### FUSION des 2 datasets ###
    train_ds = concatenate_datasets([opus_train, euro_train])
    val_ds = concatenate_datasets([opus_val, euro_val])
    
    # re-shuffle global
    train_ds = train_ds.shuffle(seed=43)
    val_ds   = val_ds.shuffle(seed=43)
    
    return train_ds, val_ds


def load_data(sample_size_train=80000, sample_size_val=800):

    ### EUROPARL EN-FR ###
    europarl = load_dataset("Helsinki-NLP/europarl", "en-fr", split="train").shuffle(seed=42)
    euro_train = europarl.select(range(sample_size_train))
    euro_val = europarl.select(range(sample_size_train, sample_size_train + sample_size_val))
    
    return euro_train, euro_val


In [6]:
# Charger les données brutes
train_ds, val_ds = load_data()

def show_raw_example(ds, idx=0, prefix="train"):
    ex = ds[idx]
    print(f"--- {prefix} example {idx} ---")
    print("EN :", ex["translation"]["en"])
    print("FR :", ex["translation"]["fr"])
    print()

# 3 exemples du train et 3 de la val
for i in range(3):
    show_raw_example(train_ds, i, prefix="train")

for i in range(3):
    show_raw_example(val_ds, i, prefix="val")


--- train example 0 ---
EN : As Europeans, with our experience, our culture of peace and our economic opportunities, we too are called upon to make our contribution towards a better future for Iraq.
FR : Les Européens que nous sommes, avec leur expérience, leur culture de la paix et leurs moyens économiques, sont appelés à apporter leur contribution en faveur d'un avenir meilleur en Irak.

--- train example 1 ---
EN : It does indeed speak for itself that those who are around the negotiating table are most sensitive to their own issues; this is always the case.
FR : Il est d'ailleurs évident que les personnes qui siègent autour de la table de négociations sont les plus sensibles à leurs propres problèmes. Il en est toujours ainsi.

--- train example 2 ---
EN : We nevertheless believe that the compromise is sound overall because it constitutes a clear improvement upon the original proposal.
FR : Cependant, nous estimons que le compromis est dans l’ensemble satisfaisant, en ce sens qu’il 

### Encoder l’anglais comme input et le français comme labels

In [7]:
import numpy as np

def preprocess_function(examples, tokenizer):
    inputs = [INSTRUCTION + ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]

    # Encodage des entrées
    model_inputs = tokenizer(
        inputs,
        max_length=256,
        truncation=True,
        padding="max_length",
    )

    # Encodage des cibles
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=256,
            truncation=True,
            padding="max_length",
        )

    label_ids = labels["input_ids"]
    pad_token_id = tokenizer.pad_token_id

    # ignorer les PAD dans la loss
    label_ids = [
        [(tok if tok != pad_token_id else -100) for tok in seq]
        for seq in label_ids
    ]

    model_inputs["labels"] = label_ids
    return model_inputs


In [8]:
def tokenize_datasets(train, val, tokenizer):
    tokenized_train = train.map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=train.column_names,
    )
    tokenized_val = val.map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=val.column_names,
    )
    return tokenized_train, tokenized_val


In [9]:
# Charger modèle & tokenizer
model, tokenizer = load_model_and_tokenizer()

# Re-tokeniser
train_ds, val_ds = load_data()
tokenized_train, tokenized_val = tokenize_datasets(train_ds, val_ds, tokenizer)


trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3561


Map: 100%|██████████████████████████████████████████████████████████████| 80000/80000 [00:10<00:00, 7818.47 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 800/800 [00:00<00:00, 7417.93 examples/s]


In [10]:
def show_processed_example(raw_ds, tokenized_ds, idx=0, prefix="train"):
    raw = raw_ds[idx]
    tok = tokenized_ds[idx]

    print(f"=== {prefix} example {idx} ===")
    print("RAW EN :", raw["translation"]["en"])
    print("RAW FR :", raw["translation"]["fr"])
    print()

    # Inputs
    print("input_ids[:20] :", tok["input_ids"][:20])
    print("Decoded input  :", tokenizer.decode(tok["input_ids"], skip_special_tokens=True))
    print()

    # Labels (enlevant les -100 pour re-décoder)
    labels = tok["labels"]
    # On remplace les -100 par pad_token_id pour pouvoir décoder
    pad_id = tokenizer.pad_token_id
    labels_for_decode = [pad_id if x == -100 else x for x in labels]
    print("labels[:20]     :", labels[:20])
    print("Decoded labels  :", tokenizer.decode(labels_for_decode, skip_special_tokens=True))
    print()


In [11]:
for i in range(3):
    show_processed_example(train_ds, tokenized_train, i, prefix="train")

for i in range(3):
    show_processed_example(val_ds, tokenized_val, i, prefix="val")


=== train example 0 ===
RAW EN : As Europeans, with our experience, our culture of peace and our economic opportunities, we too are called upon to make our contribution towards a better future for Iraq.
RAW FR : Les Européens que nous sommes, avec leur expérience, leur culture de la paix et leurs moyens économiques, sont appelés à apporter leur contribution en faveur d'un avenir meilleur en Irak.

input_ids[:20] : [30355, 15, 1566, 12, 2379, 10, 282, 1611, 7, 6, 28, 69, 351, 6, 69, 1543, 13, 3065, 11, 69]
Decoded input  : Translate English to French: As Europeans, with our experience, our culture of peace and our economic opportunities, we too are called upon to make our contribution towards a better future for Iraq.

labels[:20]     : [622, 2430, 3890, 35, 7, 238, 678, 7056, 6, 393, 1089, 11183, 6, 1089, 1543, 20, 50, 25060, 3, 15]
Decoded labels  : Les Européens que nous sommes, avec leur expérience, leur culture de la paix et leurs moyens économiques, sont appelés à apporter leur co

### Métrique d'évaluation de la traduction

In [12]:
import evaluate

sacrebleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

## Pour nettoyer le texte
def postprocess_text(preds, labels):
    preds = [p.strip() for p in preds]
    labels = [l.strip() for l in labels]
    return preds, labels


def compute_metrics(eval_preds, tokenizer):
    # eval_preds peut être un tuple (preds, labels)
    # ou un objet EvalPrediction avec .predictions et .label_ids
    if hasattr(eval_preds, "predictions"):
        preds = eval_preds.predictions
        labels = eval_preds.label_ids
    else:
        preds, labels = eval_preds

    # Certains modèles renvoient (logits, ...) -> on garde seulement le 1er élément
    if isinstance(preds, tuple):
        preds = preds[0]

    # On met tout en np.array pour être tranquille
    preds = np.array(preds)

    # Cas où preds = logits (batch, seq_len, vocab_size) -> on prend l'argmax
    if preds.ndim == 3:
        preds = np.argmax(preds, axis=-1)

    # On s'assure que ce sont bien des entiers
    preds = preds.astype("int64")

    # si jamais il y a des valeurs négatives dans preds on les remplace par pad_token_id avant decode
    preds[preds < 0] = tokenizer.pad_token_id

    # Gestion des labels : on remet pad_token_id à la place des -100 pour décoder
    labels = np.array(labels)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode des prédictions et des labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Nettoyage simple
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # sacreBLEU / METEOR / ROUGE
    # sacreBLEU attend une liste de listes pour les références
    refs_list = [[r] for r in decoded_labels]

    bleu_res = sacrebleu.compute(
        predictions=decoded_preds,
        references=refs_list,
    )

    rouge_res = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )

    meteor_res = meteor.compute(
        predictions=decoded_preds,
        references=decoded_labels,
    )

    return {
        "bleu": bleu_res["score"],
        "meteor": meteor_res["meteor"],
        "rouge1": rouge_res["rouge1"],
        "rouge2": rouge_res["rouge2"],
        "rougeL": rouge_res["rougeL"],
    }

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maxka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\maxka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\maxka\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Métriques sur modèle de base

In [13]:
from transformers import DataCollatorForSeq2Seq


BASE_MODEL_NAME = "google/flan-t5-base"

tokenizer_base = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
model_base = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL_NAME).to(DEVICE)

train_ds, val_ds = load_data()
tokenized_train_base, tokenized_val_base = tokenize_datasets(train_ds, val_ds, tokenizer_base)


data_collator_base = DataCollatorForSeq2Seq(
    tokenizer=tokenizer_base,
    model=model_base,
)

eval_args_base = Seq2SeqTrainingArguments(
    output_dir="baseline_flan_t5_en_fr",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    # paramètres de génération cohérents avec ceux du fine-tuning
    generation_max_length=128,
    generation_num_beams=4,
    do_train=False,
    do_eval=True,
    logging_dir="logs_baseline",
)

trainer_base = Seq2SeqTrainer(
    model=model_base,
    args=eval_args_base,
    eval_dataset=tokenized_val_base,
    tokenizer=tokenizer_base,
    data_collator=data_collator_base,
    compute_metrics=lambda p: compute_metrics(p, tokenizer_base),
)

baseline_metrics = trainer_base.evaluate()
print(baseline_metrics)


Map: 100%|██████████████████████████████████████████████████████████████████| 800/800 [00:00<00:00, 7169.00 examples/s]
  trainer_base = Seq2SeqTrainer(


{'eval_loss': 1.4469153881072998, 'eval_model_preparation_time': 0.003, 'eval_bleu': 24.503120896483793, 'eval_meteor': 0.49564799912880275, 'eval_rouge1': 0.5573144763373334, 'eval_rouge2': 0.33833429470799736, 'eval_rougeL': 0.5113933142445597, 'eval_runtime': 170.6516, 'eval_samples_per_second': 4.688, 'eval_steps_per_second': 0.586}


In [14]:
from transformers import TrainerCallback

class GarbageCollectorCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        # Libérer la RAM Python
        gc.collect()
        # Libérer la mémoire GPU inutilisée
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return control


In [15]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

def train(model, tokenizer, tokenized_train, tokenized_val):
    training_args = Seq2SeqTrainingArguments(
        output_dir="finetuned_flan_t5_en_fr",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        predict_with_generate=True,
        load_best_model_at_end=True,
        metric_for_best_model="eval_bleu",  
        greater_is_better=True,       
        logging_steps=50,
    )

    gc_callback = GarbageCollectorCallback()  

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        compute_metrics=lambda p: compute_metrics(p, tokenizer),
        callbacks=[gc_callback],
    )

    trainer.train()

    # Sauvegarde du modèle LoRA + tokenizer dans le même dossier
    trainer.model.save_pretrained(training_args.output_dir)
    tokenizer.save_pretrained(training_args.output_dir)
    print("Best eval :", trainer.state.best_metric)
    print("Best checkpoint :", trainer.state.best_model_checkpoint)

    return trainer



In [16]:
INSTRUCTION = "Translate English to French: "

def translate_sentence(sentence, model, tokenizer, max_length=256):
    model.eval()

    input_text = INSTRUCTION + sentence
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    ).to(DEVICE)  # les tensors vont sur le même device que le modèle

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=128)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [17]:
if __name__ == "__main__":
    model, tokenizer = load_model_and_tokenizer()
    train_ds, val_ds = load_data()
    tokenized_train, tokenized_val = tokenize_datasets(train_ds, val_ds, tokenizer)
    trainer = train(model, tokenizer, tokenized_train, tokenized_val)


trainable params: 884,736 || all params: 248,462,592 || trainable%: 0.3561


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Bleu,Meteor,Rouge1,Rouge2,Rougel
1,1.7832,1.44507,6.318779,0.269898,0.38266,0.216753,0.348862
2,1.6509,1.437017,6.397862,0.270863,0.382085,0.217926,0.349424


KeyboardInterrupt: 

In [None]:
import pandas as pd
logs = pd.DataFrame(trainer.state.log_history)

# Lignes d'éval (celles qui ont une eval_loss)
eval_logs = logs[logs["eval_loss"].notna()]

# Colonnes qui nous intéressent
cols = ["epoch", "step", "eval_loss", "eval_bleu", "eval_meteor", "eval_rougeL"]

print(eval_logs[cols])

# Recharger le modèle fine-tuné

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftConfig, PeftModel

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)

PEFT_DIR = "./finetuned_flan_t5_en_fr"


## Charger le modèle LoRA + tokenizer

In [None]:
# 1) Lire la config PEFT (LoRA) depuis le dossier local
peft_config = PeftConfig.from_pretrained(PEFT_DIR, local_files_only=True)

# 2) Charger le modèle de base utilisé pendant le fine-tuning
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    peft_config.base_model_name_or_path
)

# 3) Appliquer les poids LoRA entraînés
model = PeftModel.from_pretrained(
    base_model,
    PEFT_DIR,
    local_files_only=True,
)

model.to(DEVICE)
model.eval()

# 4) Charger le tokenizer
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)

print("Modèle LoRA + tokenizer rechargés")


## Fonction de traduction + tests

In [None]:
INSTRUCTION = "Translate English to French: "

def translate_sentence(sentence, model, tokenizer, max_length=256):
    model.eval()
    inputs = tokenizer(
        INSTRUCTION + sentence,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    ).to(DEVICE)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=128)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# 1) Quelques phrases du dataset train
for i in range(3):
    en = train_ds[i]["translation"]["en"]
    fr_gold = train_ds[i]["translation"]["fr"]
    fr_pred = translate_sentence(en, model, tokenizer)
    print("=== Exemple train", i, "===")
    print("EN     :", en)
    print("FR gold:", fr_gold)
    print("FR pred:", fr_pred)
    print()

# 2) phrases de test perso
tests = [
    "Hello, how are you?",
    "This project is about automatic translation.",
    "The weather is nice today.",
]

for s in tests:
    print("EN :", s)
    print("FR :", translate_sentence(s, model, tokenizer))
    print()


In [None]:
print(translate_sentence("This is a small translation test.", model, tokenizer))
