In [1]:
import torch

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)


Using device: cuda


In [2]:
import gc

# Libérer la RAM Python
gc.collect()
# Libérer la mémoire GPU inutilisée
if torch.cuda.is_available():
    torch.cuda.empty_cache()

## Ce qu’il ne faut pas faire pour le modèle T5

- Pas de lemmatisation / stemming : tu casserais la forme exacte que le modèle attend.

- Pas de suppression de stopwords (“le, de, et…”) : les Transformers utilisent ces mots pour comprendre la syntaxe.

- Pas de passage forcé en minuscules si le modèle a été pré-entraîné en respectant la casse.

- Pas de suppression massive de ponctuation (les modèles utilisent “?”, “.”, “,”, “:” pour le sens et la segmentation).

### Choix des jeux de données

**OPUS et Europarl**

**OPUS-100 en-fr** : mélange de sources, phrases souvent plus courtes, variées (titres, sous-phrases, etc.).

**Europarl en-fr** : phrases plus longues, style plus formel, discours parlementaires → plus de contexte par phrase.

Du point de vue du modèle :

- OPUS -> donne de la diversité (beaucoup de styles, domaines).

- Europarl ->  apprend à gérer des phrases longues / syntaxe compliquée (subordonnées, tournures formelles) => **augmenter** ***max_length*** pour moins tronquer les phrases


**Ajouter les TED Talks plus tard pour le langage oral, plus “conversational”, avec un ton explicatif.**

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import LoraConfig, TaskType, get_peft_model

MODEL_NAME = "google/flan-t5-base"
INSTRUCTION = "Translate English to French: "


def load_model_and_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

    # Paramètres de décodage par défaut pour .generate()
    # pendant l’éval (puisque predict_with_generate=True), les traductions seront générées avec ces paramètres -> métriques qui reflètent ce décodage
    # pendant l’inférence manuelle (model.generate(...) sans redonner ces arguments), ce seront les valeurs par défaut.
    
    model.config.num_beams = 4              # beam search
    model.config.length_penalty = 0.9       # < 1 = un peu plus court, > 1 = plus long
    model.config.no_repeat_ngram_size = 3   # évite de répéter des 3-grammes

    peft_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
    )

    model = get_peft_model(model, peft_config)
    model.to(DEVICE)
    model.print_trainable_parameters()

    return model, tokenizer


  from .autonotebook import tqdm as notebook_tqdm


### Charger et échantilloner le dataset OPUS et Europarl

In [4]:
from datasets import load_dataset, concatenate_datasets

def load_data(sample_size_train=10000, sample_size_val=300):

    ### OPUS ###
    opus = load_dataset("Helsinki-NLP/opus-100", "en-fr")
    opus_train = opus["train"].shuffle(seed=42).select(range(sample_size_train))
    opus_val = opus["validation"].shuffle(seed=42).select(range(sample_size_val))

    ### EUROPARL EN-FR ###
    europarl = load_dataset("Helsinki-NLP/europarl", "en-fr", split="train").shuffle(seed=43)
    euro_train = europarl.select(range(sample_size_train))
    euro_val   = europarl.select(range(sample_size_train, sample_size_train + sample_size_val))

    ### FUSION des 2 datasets ###
    train_ds = concatenate_datasets([opus_train, euro_train])
    val_ds   = concatenate_datasets([opus_val, euro_val])
    
    # re-shuffle global
    train_ds = train_ds.shuffle(seed=43)
    val_ds   = val_ds.shuffle(seed=43)
    
    return train_ds, val_ds


### Encoder l’anglais comme input et le français comme labels

In [5]:
import numpy as np

def preprocess_function(examples, tokenizer):
    inputs = [INSTRUCTION + ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]

    model_inputs = tokenizer(
        inputs,
        max_length=256,
        truncation=True,
        padding="max_length",
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=256,
            truncation=True,
            padding="max_length",
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [6]:
def tokenize_datasets(train, val, tokenizer):
    tokenized_train = train.map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=train.column_names,
    )
    tokenized_val = val.map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=val.column_names,
    )
    return tokenized_train, tokenized_val


### Métrique d'évaluation de la traduction

In [7]:
import evaluate

sacrebleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

## Pour nettoyer le texte
def postprocess_text(preds, labels):
    preds = [p.strip() for p in preds]
    labels = [l.strip() for l in labels]
    return preds, labels


def compute_metrics(eval_preds, tokenizer):
    preds, labels = eval_preds

    # Certains modèles renvoient (logits, ...) -> on garde seulement les ids
    if isinstance(preds, tuple):
        preds = preds[0]

    # Decode des prédictions
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # On remet pad_token_id à la place des -100 pour décoder correctement les labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Nettoyage simple
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # sacreBLEU / METEOR / ROUGE
    # sacreBLEU attend une liste de listes pour les références
    refs_list = [[r] for r in decoded_labels]

    bleu_res = sacrebleu.compute(
        predictions=decoded_preds,
        references=refs_list,
    )

    rouge_res = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,

        
    )

    meteor_res = meteor.compute(
        predictions=decoded_preds,
        references=decoded_labels,
    )

    return {
        "bleu": bleu_res["score"],
        "meteor": meteor_res["meteor"],
        "rouge1": rouge_res["rouge1"],
        "rouge2": rouge_res["rouge2"],
        "rougeL": rouge_res["rougeL"],
    }


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\maxka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\maxka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\maxka\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
from transformers import TrainerCallback

class GarbageCollectorCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        # Libérer la RAM Python
        gc.collect()
        # Libérer la mémoire GPU inutilisée
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return control


In [9]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

def train(model, tokenizer, tokenized_train, tokenized_val):
    training_args = Seq2SeqTrainingArguments(
        output_dir="finetuned_flan_t5_en_fr",
        learning_rate=1e-4,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=30,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        predict_with_generate=True,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",  
        greater_is_better=False,       
        logging_steps=50,
    )

    gc_callback = GarbageCollectorCallback()  

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
        compute_metrics=lambda p: compute_metrics(p, tokenizer),
        callbacks=[gc_callback],
    )

    trainer.train()

    # Sauvegarde du modèle LoRA + tokenizer dans le même dossier
    trainer.model.save_pretrained(training_args.output_dir)
    tokenizer.save_pretrained(training_args.output_dir)
    print("Best eval :", trainer.state.best_metric)
    print("Best checkpoint :", trainer.state.best_model_checkpoint)

    return trainer



In [10]:
INSTRUCTION = "Translate English to French: "

def translate_sentence(sentence, model, tokenizer, max_length=256):
    model.eval()

    input_text = INSTRUCTION + sentence
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    ).to(DEVICE)  # les tensors vont sur le même device que le modèle

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=128)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [11]:
if __name__ == "__main__":
    model, tokenizer = load_model_and_tokenizer()
    train_ds, val_ds = load_data()
    tokenized_train, tokenized_val = tokenize_datasets(train_ds, val_ds, tokenizer)
    trainer = train(model, tokenizer, tokenized_train, tokenized_val)


trainable params: 1,769,472 || all params: 249,347,328 || trainable%: 0.7096


  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss,Bleu,Meteor,Rouge1,Rouge2,Rougel
1,0.3166,0.277945,6.258771,0.26133,0.359372,0.177789,0.332368
2,0.3213,0.272326,6.509489,0.263228,0.36134,0.180543,0.332797
3,0.2926,0.269727,6.417688,0.262687,0.363001,0.178816,0.333536
4,0.2802,0.267651,6.433599,0.264593,0.363964,0.180733,0.335548
5,0.3006,0.267071,6.669093,0.266846,0.366098,0.182794,0.338517
6,0.2993,0.265574,6.647753,0.265751,0.364865,0.185565,0.3371
7,0.2867,0.264946,6.514973,0.264489,0.365101,0.183196,0.335902
8,0.286,0.264099,6.601684,0.269214,0.369466,0.186579,0.341583
9,0.2921,0.263701,6.693993,0.273334,0.373951,0.186501,0.345634
10,0.2841,0.263391,6.713036,0.271984,0.37347,0.188961,0.344164


Best BLEU : 0.2604176104068756
Best checkpoint : finetuned_flan_t5_en_fr\checkpoint-67500


In [13]:
import pandas as pd
logs = pd.DataFrame(trainer.state.log_history)

# Lignes d'éval (celles qui ont une eval_loss)
eval_logs = logs[logs["eval_loss"].notna()]

# Colonnes qui nous intéressent
cols = ["epoch", "step", "eval_loss", "eval_bleu", "eval_meteor", "eval_rougeL"]

print(eval_logs[cols])

      epoch   step  eval_loss  eval_bleu  eval_meteor  eval_rougeL
50      1.0   2500   0.277945   6.258771     0.261330     0.332368
101     2.0   5000   0.272326   6.509489     0.263228     0.332797
152     3.0   7500   0.269727   6.417688     0.262687     0.333536
203     4.0  10000   0.267651   6.433599     0.264593     0.335548
254     5.0  12500   0.267071   6.669093     0.266846     0.338517
305     6.0  15000   0.265574   6.647753     0.265751     0.337100
356     7.0  17500   0.264946   6.514973     0.264489     0.335902
407     8.0  20000   0.264099   6.601684     0.269214     0.341583
458     9.0  22500   0.263701   6.693993     0.273334     0.345634
509    10.0  25000   0.263391   6.713036     0.271984     0.344164
560    11.0  27500   0.263937   6.572273     0.272996     0.344244
611    12.0  30000   0.262175   6.500738     0.270215     0.343555
662    13.0  32500   0.262407   6.729949     0.271882     0.345584
713    14.0  35000   0.261609   6.670890     0.273576     0.34

# Recharger le modèle fine-tuné

In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftConfig, PeftModel

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)

PEFT_DIR = "./finetuned_flan_t5_en_fr/checkpoint-"


Using device: cuda


## Charger le modèle LoRA + tokenizer

In [15]:
# 1) Lire la config PEFT (LoRA) depuis le dossier local
peft_config = PeftConfig.from_pretrained(PEFT_DIR, local_files_only=True)

# 2) Charger le modèle de base utilisé pendant le fine-tuning
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    peft_config.base_model_name_or_path
)

# 3) Appliquer les poids LoRA entraînés
model = PeftModel.from_pretrained(
    base_model,
    PEFT_DIR,
    local_files_only=True,
)

model.to(DEVICE)
model.eval()

# 4) Charger le tokenizer
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)

print("Modèle LoRA + tokenizer rechargés")


Modèle LoRA + tokenizer rechargés


## Fonction de traduction + tests

In [16]:
INSTRUCTION = "Translate English to French: "

def translate_sentence(sentence, model, tokenizer, max_length=256):
    model.eval()
    inputs = tokenizer(
        INSTRUCTION + sentence,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    ).to(DEVICE)

    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=128)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

test_sentences = [
    "Hello, how are you?",
    "This project is about automatic translation.",
    "The weather is nice today.",
]

for s in test_sentences:
    print("\nEN :", s)
    print("FR :", translate_sentence(s, model, tokenizer))



EN : Hello, how are you?
FR : Bien, tu ?

EN : This project is about automatic translation.
FR : Cette projet concerne la traduction automatique.

EN : The weather is nice today.
FR : Le temps est agréable aujourd'hui.


In [17]:
print(translate_sentence("This is a small translation test.", model, tokenizer))


C'est un petit test de traduction.
