In [18]:
import torch
torch.cuda.is_available(), torch.cuda.get_device_name(0)

(True, 'Tesla T4')

In [19]:
from google.colab import drive
drive.mount('/content/drive')
%cd "/content/drive/My Drive/DLAA_2025"
%ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/DLAA_2025
plotloss.py                         qwen_train.py     time_series.ipynb
[0m[01;34mqwen2.5-0.5b-instruct-lora-output[0m/  test_jsonl.jsonl  ts_check_training.py


In [None]:
!head dataset.jsonl

{"index": 0, "input": "\n/think Describe the time series in three sentences. First sentence: describe trend (increasing/decreasing/flat). Second sentence: noise intensity (low/medium/high). Third sentence: approximate localisation of global maximum (beginning/middle/end) and global minimum (beginning/middle/end).\nPut the description in a JSON format with the following pattern\n```json\n{ \"trend\": <sentence1>,\n  \"noise\": <sentence2>,\n  \"extrema\": <sentence3> }\n```\n Series: [00, 01, 00, 05, 07, 06, 09, 08, 10, 10, 12, 13, 16, 16, 17, 17, 15, 14, 14, 16, 18, 18, 19, 22, 26, 29, 26, 25, 24, 25, 24, 24, 26, 24, 24, 25, 27, 28, 26, 24, 26, 31, 30, 30, 30, 33, 34, 32, 33, 35, 35, 40, 41, 41, 44, 45, 45, 48, 48, 51, 49, 51, 53, 55, 54, 56, 59, 60, 57, 57, 58, 58, 60, 59, 60, 62, 61, 60, 60, 59, 59, 62, 65, 65, 63, 64, 63, 65, 68, 70, 74, 69, 71, 70, 70, 69, 70, 66, 69, 70, 69, 70, 72, 72, 74, 74, 74, 76, 76, 79, 79, 78, 79, 78, 80, 81, 82, 87, 90, 90, 92, 92, 90, 91, 92, 95, 95, 99]

In [12]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    TrainerCallback
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset, DatasetDict
import torch
import json
import os
import re
import logging

from sentence_transformers import SentenceTransformer, util

In [None]:
# === CONFIGURATION ===
MODEL_NAME = "Qwen/Qwen2.5-0.5B-Instruct"
DATASET_PATH = "dataset.jsonl"

# Extraire le nom court du modèle
model_name_parts = MODEL_NAME.split('/')
model_short_name = model_name_parts[-1] if model_name_parts else "unknown_model"
OUTPUT_DIR = f"./{model_short_name.lower()}-lora-output"

print("Training outputs saved in:", OUTPUT_DIR)

Training outputs saved in: ./qwen2.5-0.5b-instruct-lora-output


In [14]:
# === CHARGEMENT DU DATASET JSONL ===
def load_jsonl_dataset(path):
    with open(path, 'r') as f:
        lines = [json.loads(l) for l in f]
    return Dataset.from_list(lines)

raw_dataset = load_jsonl_dataset(DATASET_PATH)
# only keep 50 first samples
raw_dataset = raw_dataset.select(range(50)) ###########################################
split_dataset = raw_dataset.train_test_split(test_size=0.10)
dataset = DatasetDict({
    "train": split_dataset["train"],
    "test": split_dataset["test"]
})

print(f"Train size: {len(dataset['train'])}")
print(f"Test size: {len(dataset['test'])}")

Train size: 45
Test size: 5


In [15]:
# === TOKENIZER ===
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
####### xxxxx tokenizer.pad_token = tokenizer.eos_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [16]:
# === PREPROCESSING ===
def tokenize(example):
    prompt = example["input"]
    response = example["output"]

    # Tokenise prompt et réponse séparément
    prompt_ids = tokenizer(prompt, truncation=False)["input_ids"]
    response_ids = tokenizer(response, truncation=True, max_length=256)["input_ids"]

    # Calcul de l'espace disponible pour le prompt
    max_total_len = 1024
    max_prompt_len = max_total_len - len(response_ids)
    prompt_ids = prompt_ids[-max_prompt_len:]  # Coupe si trop long

    input_ids = prompt_ids + response_ids
    attention_mask = [1] * len(input_ids)

    # Labels : on ignore les tokens du prompt
    labels = [-100] * len(prompt_ids) + response_ids

    # Padding si nécessaire
    pad_len = max_total_len - len(input_ids)
    if pad_len > 0:
        input_ids += [tokenizer.pad_token_id] * pad_len
        attention_mask += [0] * pad_len
        labels += [-100] * pad_len

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

print("Tokenisation des données...")
tokenized_dataset = dataset.map(tokenize, remove_columns=dataset["train"].column_names)
print("Tokenisation terminée.")


Tokenisation des données...


Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Tokenisation terminée.


In [21]:
# === MODÈLE ===
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True
)

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

In [22]:
# === CONFIGURATION LORA ===
lora_config = LoraConfig(
    r=4,  # r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)

In [23]:
# === MODÈLE DE SIMILARITÉ SÉMANTIQUE ===
model_st = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

# === FONCTION D'ÉVALUATION SÉMANTIQUE ===
def compute_semantic_similarity(model, tokenizer, dataset, output_file=None):
    model.eval()
    examples = dataset.to_list()
    inputs = [ex["input"] for ex in examples]
    gold_outputs = [ex["output"] for ex in examples]

    generated_outputs = []
    for inp in inputs:
        inputs_tokenized = tokenizer(inp, return_tensors="pt").to(model.device)
        prompt_len = inputs_tokenized.input_ids.shape[1]

        with torch.no_grad():
            output_ids = model.generate(
                **inputs_tokenized,
                max_new_tokens=256,
                do_sample=True,
                temperature=0.7
            )

        generated_only_ids = output_ids[0][prompt_len:]
        output_text = tokenizer.decode(generated_only_ids, skip_special_tokens=True)
        generated_outputs.append(output_text)

    emb_generated = model_st.encode(generated_outputs, convert_to_tensor=True)
    emb_gold = model_st.encode(gold_outputs, convert_to_tensor=True)
    scores = torch.nn.functional.cosine_similarity(emb_generated, emb_gold)
    avg_score = float(scores.mean())

    if output_file:
        output_data = [{
            "input": inp,
            "generated_output": gen_out,
            "gold_output": gold_out,
            "score": float(score)
        } for inp, gen_out, gold_out, score in zip(inputs, generated_outputs, gold_outputs, scores)]

        with open(output_file, "w") as f:
            json.dump(output_data, f, indent=4)

    model.train()
    return avg_score


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [24]:
# === CALLBACK POUR ÉVALUATION À CHAQUE CHECKPOINT ===
class SemanticSimilarityCallback(TrainerCallback):
    def __init__(self, model, tokenizer, test_dataset, output_dir):
        self.model = model
        self.tokenizer = tokenizer
        self.test_dataset = test_dataset
        self.output_dir = output_dir

    def on_step_end(self, args, state, control, **kwargs):
        print(f"🔄 Étape terminée : {state.global_step}")

    def on_evaluate(self, args, state, control, **kwargs):
        print("\n✨ Évaluation de la similarité sémantique à la sauvegarde du checkpoint ✨\n")
        if not hasattr(self, "model") or self.model is None:
            print("Erreur : self.model est None.")
            return

        #trainer.model.eval()
        step = state.global_step
        print(f"création du fichier d'évaluation pour le checkpoint {step}")
        output_file = os.path.join(self.output_dir, f"evaluation_checkpoint-{step}.json")
        print(f"fichier d'évaluation : {output_file}")

        print(f"calcul de la similarité sémantique pour le checkpoint {step}")
        score = compute_semantic_similarity(
            self.model,
            self.tokenizer,
            self.test_dataset,
            output_file=output_file
        )

        #trainer.model.train()
        print(f"\n✅ Checkpoint {step}: Similarité sémantique = {score:.4f}\n")


In [25]:
# === ENTRAÎNEMENT ===
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=2e-4,
    bf16=True,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=10,
    save_strategy="steps",
    eval_strategy="steps",
    eval_steps=20,
    save_steps=20,
    save_total_limit=10,
    report_to="none",
    max_steps=160,
    disable_tqdm=False
)

# Configurer le logging
logging.basicConfig(level=logging.INFO, force=True)

# Créer le répertoire de sortie
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Initialiser le Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    callbacks=[SemanticSimilarityCallback(model, tokenizer, dataset["test"], OUTPUT_DIR)],
)

# Ajouter le callback d'évaluation sémantique
# semantic_callback = SemanticSimilarityCallback(tokenizer, dataset["test"], OUTPUT_DIR)
# trainer.add_callback(semantic_callback)

# Évaluation initiale avant entraînement
print("\n🔍 Évaluation avant entraînement (similarité sémantique)...")
score_before = compute_semantic_similarity(model, tokenizer, dataset["test"], output_file=os.path.join(OUTPUT_DIR, "evaluation_avant.json"))
print(f"Score moyen avant entraînement : {score_before:.4f}")

# Entraînement
trainer.train()


  trainer = Trainer(



🔍 Évaluation avant entraînement (similarité sémantique)...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Score moyen avant entraînement : 0.5675
🔄 Étape terminée : 1


Step,Training Loss,Validation Loss
20,0.5734,0.488568


🔄 Étape terminée : 2
🔄 Étape terminée : 3
🔄 Étape terminée : 4
🔄 Étape terminée : 5
🔄 Étape terminée : 6
🔄 Étape terminée : 7
🔄 Étape terminée : 8
🔄 Étape terminée : 9
🔄 Étape terminée : 10
🔄 Étape terminée : 11
🔄 Étape terminée : 12
🔄 Étape terminée : 13
🔄 Étape terminée : 14
🔄 Étape terminée : 15
🔄 Étape terminée : 16
🔄 Étape terminée : 17
🔄 Étape terminée : 18
🔄 Étape terminée : 19
🔄 Étape terminée : 20

✨ Évaluation de la similarité sémantique à la sauvegarde du checkpoint ✨

création du fichier d'évaluation pour le checkpoint 20
fichier d'évaluation : ./qwen2.5-0.5b-instruct-lora-output/evaluation_checkpoint-20.json
calcul de la similarité sémantique pour le checkpoint 20


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


✅ Checkpoint 20: Similarité sémantique = 0.9673

🔄 Étape terminée : 21
🔄 Étape terminée : 22
🔄 Étape terminée : 23
🔄 Étape terminée : 24
🔄 Étape terminée : 25
🔄 Étape terminée : 26
🔄 Étape terminée : 27


KeyboardInterrupt: 

In [32]:
%cd qwen2.5-0.5b-instruct-lora-output/
!pwd
!ls

/content/drive/MyDrive/DLAA_2025/qwen2.5-0.5b-instruct-lora-output
/content/drive/MyDrive/DLAA_2025/qwen2.5-0.5b-instruct-lora-output
checkpoint-20  evaluation_avant.json  evaluation_checkpoint-20.json


In [33]:
!python ../ts_check_training.py evaluation_checkpoint-20.json

2025-08-26 13:51:11.903094: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756216271.927229    7528 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756216271.934607    7528 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756216271.953911    7528 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756216271.953955    7528 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756216271.953959    7528 computation_placer.cc:177] computation placer alr