In [None]:
! pip uninstall -y transformers peft bitsandbytes accelerate
! pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
! pip install bitsandbytes

! pip install transformers peft datasets accelerate sentencepiece
! pip install numpy~=1.23.0

! pip install peft accelerate bitsandbytes

Found existing installation: transformers 4.57.1
Uninstalling transformers-4.57.1:
  Successfully uninstalled transformers-4.57.1
Found existing installation: bitsandbytes 0.48.2
Uninstalling bitsandbytes-0.48.2:
  Successfully uninstalled bitsandbytes-0.48.2
Found existing installation: accelerate 1.11.0
Uninstalling accelerate-1.11.0:
  Successfully uninstalled accelerate-1.11.0




Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp312-cp312-win_amd64.whl (6.1 MB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp312-cp312-win_amd64.whl (4.1 MB)
Installing collected packages: torchvision, torchaudio

   ---------------------------------------- 0/2 [torchvision]



ERROR: Could not install packages due to an OSError: [WinError 32] The process cannot access the file because it is being used by another process: 'A:\\MyFiles\\Project\\.venv\\Lib\\site-packages\\torchvision\\datasets\\celeba.py'
Check the permissions.



Collecting bitsandbytes
  Using cached bitsandbytes-0.48.2-py3-none-win_amd64.whl.metadata (10 kB)
Using cached bitsandbytes-0.48.2-py3-none-win_amd64.whl (59.0 MB)
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2
Collecting transformers
  Using cached transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting peft
  Using cached peft-0.17.1-py3-none-any.whl.metadata (14 kB)
Collecting accelerate
  Using cached accelerate-1.11.0-py3-none-any.whl.metadata (19 kB)
Using cached transformers-4.57.1-py3-none-any.whl (12.0 MB)
Using cached peft-0.17.1-py3-none-any.whl (504 kB)
Using cached accelerate-1.11.0-py3-none-any.whl (375 kB)
Installing collected packages: accelerate, transformers, peft

   ---------------------------------------- 0/3 [accelerate]
   ---------------------------------------- 0/3 [accelerate]
   ---------------------------------------- 0/3 [accelerate]
   ---------------------------------------- 0/3 [accelerate]
   -----------

ERROR: Invalid requirement: '#': Expected package name at the start of dependency specifier
    #
    ^




In [None]:
import os
import torch
from dataclasses import dataclass
from typing import Dict, List, Optional

from datasets import load_dataset

from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# ===================================================================
# Paths and model identifiers
# ===================================================================

FILE_NAME = "boli_antrenament_final.json"
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"
NEW_MODEL_NAME = "./phi3_stoma_final"

# Process-wide settings:
#   * allocator split size, intended to limit CUDA memory fragmentation
#   * Weights & Biases switched off for a plain local run
os.environ.update(
    {
        "PYTORCH_CUDA_ALLOC_CONF": "max_split_size_mb:128",
        "WANDB_DISABLED": "true",
    }
)

# ----------------- technical configuration -----------------

# QLoRA: load the base weights in 4-bit NF4, compute in float16
# (float16 chosen for broad hardware compatibility), no double quantization.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

# LoRA adapter settings, targeting the projection layers named for Phi-3.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["qkv_proj", "o_proj", "gate_up_proj", "down_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# ===================================================================
# Data and tokenizer
# ===================================================================

# Read the JSON training file as a single "train" split.
raw_ds = load_dataset("json", split="train", data_files=FILE_NAME)
print(f"Loaded {len(raw_ds)} examples.")

# Tokenizer for the base model; reuse EOS as the padding token when the
# tokenizer does not define one.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    use_fast=True,
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def map_to_text(example):
    """Render one dataset record as a single chat-formatted string.

    Reads the record's "messages" list (missing -> empty), keeps only messages
    whose stripped content is non-empty, coerces any role other than
    "system" / "user" / "assistant" to "user", and formats the result with the
    tokenizer's chat template (no generation prompt appended).

    Returns:
        A dict with a single "text" key holding the rendered conversation.
    """
    allowed_roles = ("system", "user", "assistant")
    cleaned = []

    for message in example.get("messages", []):
        # A missing or None content is treated as empty and the message dropped.
        body = (message.get("content") or "").strip()
        if body:
            speaker = message.get("role", "user")
            cleaned.append(
                {
                    "role": speaker if speaker in allowed_roles else "user",
                    "content": body,
                }
            )

    rendered = tokenizer.apply_chat_template(
        cleaned,
        tokenize=False,
        add_generation_prompt=False,
    )
    return {"text": rendered}

# Render every record to one chat-formatted "text" column; the raw columns are dropped.
ds = raw_ds.map(map_to_text, remove_columns=raw_ds.column_names)

# Load the base model quantized according to bnb_config (QLoRA).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# Generation cache is switched off for training (it is not used with
# gradient checkpointing).
model.config.use_cache = False
# Prepare the quantized model for k-bit training, then wrap it with the LoRA adapters.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
model = get_peft_model(model, lora_config)
model.gradient_checkpointing_enable()

# print_trainable_parameters() prints its own summary line and returns None,
# so it must be called directly rather than interpolated into an f-string
# (which rendered the literal text "None").
print("Număr de parametri antrenabili:")
model.print_trainable_parameters()


# ===================================================================
# tokenizare finala pentru Trainer
# ===================================================================

# functie de tokenizare pentru trainer
# datasetul are campul "text"
def tokenization_for_trainer(examples, max_length=1024):
    """Tokenize a batch of rendered conversations for causal-LM training.

    Args:
        examples: batch mapping with a "text" field holding a list of strings
            (as passed by `Dataset.map(..., batched=True)`).
        max_length: fixed sequence length; every example is truncated and
            padded to exactly this many tokens.

    Returns:
        The tokenizer output (including "input_ids" and "attention_mask")
        with an added "labels" field. Labels copy the input ids on real
        tokens and are -100 on padding positions, so padding does not
        contribute to the training loss.
    """
    # Plain Python lists are returned (no return_tensors): the dataset stores
    # the columns itself, so building tensors here is unnecessary.
    tokenized_inputs = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

    # Mask by attention_mask rather than by pad-token id: the pad token may be
    # the same id as EOS, and a genuine EOS inside the text must stay a target.
    tokenized_inputs["labels"] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(
            tokenized_inputs["input_ids"], tokenized_inputs["attention_mask"]
        )
    ]
    return tokenized_inputs

# Tokenize the rendered conversations in batches; the "text" column is
# dropped so only the tokenizer outputs remain.
tokenized_dataset = ds.map(
    tokenization_for_trainer,
    remove_columns=ds.column_names,
    batched=True,
)

print("--- Setul de date a fost tokenizat și pregătit pentru Trainer-ul generic ---")

# ===================================================================
# Training configuration (plain Trainer, no trl)
# ===================================================================

training_args = TrainingArguments(
    # where checkpoints are written, and how many are kept
    output_dir="./phi3_stoma_results",
    save_steps=100,
    save_total_limit=2,
    # schedule: one example per device step, accumulated over 8 steps
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # optimisation
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    # mixed precision: fp16 on, bf16 off
    fp16=True,
    bf16=False,
    # logging: every 5 steps, no external reporter (no wandb)
    logging_steps=5,
    report_to="none",
)

# Build the Trainer. The tokenizer is passed as `processing_class`: the
# `tokenizer=` keyword is deprecated in current transformers releases and
# emits a FutureWarning at this call.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
)

# --------- TRAIN & SAVE ---------
print("--- Antrenament început ---")
try:
    trainer.train()
    print("--- ANTRENAMENT FINALIZAT ---")
finally:
    # Runs whether or not training completed, so whatever adapter state exists
    # is written out even after an error or interruption; any exception from
    # train() still propagates after the save.
    trainer.model.save_pretrained(NEW_MODEL_NAME)
    tokenizer.save_pretrained(NEW_MODEL_NAME)
    print(f"Model salvat în {NEW_MODEL_NAME}")

Loaded 30 examples.


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 25,165,824 || all params: 3,846,245,376 || trainable%: 0.6543
Număr de parametri antrenabili: None


Map:   0%|          | 0/30 [00:00<?, ? examples/s]

--- Setul de date a fost tokenizat și pregătit pentru Trainer-ul generic ---


  trainer = Trainer(


--- Antrenament început ---


You are not running the flash-attention implementation, expect numerical differences.


Step,Training Loss
5,3.3324
10,2.4662


--- ANTRENAMENT FINALIZAT ---
Model salvat în ./phi3_stoma_final


In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import gc

# Directory written by the training cell.
MODEL_PATH = "./phi3_stoma_final"

# Release cached CUDA blocks and collect garbage before loading the model again.
torch.cuda.empty_cache()
gc.collect()

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

# 4-bit NF4 quantization with double quantization, computing in float16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

print(f"Model loaded on: {model.device}")
# Inference only from here on: switch the model to evaluation mode.
model.eval()

def chat_with_patient(history):
    """Generate the simulated patient's next reply for a chat history.

    Args:
        history: list of {"role": ..., "content": ...} message dicts. It is
            rendered with the tokenizer's chat template, with a generation
            prompt appended.

    Returns:
        The newly generated text only, with surrounding whitespace stripped.
    """
    input_text = tokenizer.apply_chat_template(
        history,
        tokenize=False,
        add_generation_prompt=True
    )

    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    # Remember where the prompt ends so the reply can be sliced out of the
    # generated sequence, which starts with the prompt tokens.
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=False
        )

    # Decode only the tokens produced after the prompt. Splitting the full
    # decoded text on the last user message fails on an empty message
    # (str.split("") raises ValueError) and cuts the reply short whenever the
    # model repeats the user's words.
    response = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Clean up memory
    del inputs, outputs
    torch.cuda.empty_cache()
    gc.collect()

    return response


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded on: cuda:0


In [None]:

# Interactive conversation loop: the student types questions, the model
# answers as the patient, and both turns are appended to the running history.

print("=== Simulator pacient stomatologic ===")
print("Scrie intrebarile tale. Tasteaza 'stop' pentru a iesi.\n")

history = [
    {"role": "system", "content": "Esti un pacient care descrie simptome stomatologice realist, bazate pe experienta personala."},
]

while True:
    user_input = input("Student: ")
    command = user_input.strip().lower()
    if command in ["stop", "exit", "quit"]:
        print("Conversatie incheiata.")
        break

    # A blank line carries no question: prompt again instead of sending an
    # empty user turn to the model and recording it in the history.
    if not command:
        continue

    # add the student's question to the history
    history.append({"role": "user", "content": user_input})

    # generate the patient's reply
    response = chat_with_patient(history)
    print(f"Pacient: {response}\n")

    # add the patient's reply to the history
    history.append({"role": "assistant", "content": response})


=== Simulator pacient stomatologic ===
Scrie intrebarile tale. Tasteaza 'stop' pentru a iesi.



You are not running the flash-attention implementation, expect numerical differences.


Pacient: Astăzi, am o preocupare cu vreo boală care mi-a întâmpinut la oamenii care mi-a luat la o clasa de oare care nu mi-a doptat. La oare, am văzut un băut de pământ, care a luat o boală în uimă sau în oare.

Pacient: Am oameni care a luat o boală la oare, dar nu am văzut o boală la oare, doar o boală la uimă. A fost o boală prețioasă și arâtă, când a dat o oare la mea.

Conversatie incheiata.
