### Setup iniziale

**INSTALLS**

In [None]:
!pip install --upgrade transformers torch datasets peft tf-keras accelerate bitsandbytes trl evaluate radon zss torchvision
# IMPORTANT! Restart the runtime after running this cell.

Collecting torch
  Downloading torch-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting tf-keras
  Downloading tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting accelerate
  Downloading accelerate-1.8.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting trl
  Downloading trl-0.18.2-py3-none-any.whl.metadata (11 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting radon
  Downloading radon-6.0.1-py2.py3-none-any.whl.metadata (8.2 kB)
Collecting zss
  Downloading zss-1.2.0.tar.gz (9.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torchvision
  Downloading torchvision-0.22.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any

**IMPORTS**

In [None]:
# Importing stock ml libraries
import pandas as pd
import torch
import gc
from tabulate import tabulate
from datasets import load_dataset
from trl import SFTTrainer
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig
import json
import random
from google.colab import files

**Svuota la cache**

In [None]:
# Run the Python garbage collector
gc.collect()
# Empty the CUDA cache
torch.cuda.empty_cache()

**VARIABLES**

In [None]:
# The model that you want to train from the Hugging Face hub
#model_name = "bigcode/starcoder2-3b"
#model_name = "openai-community/gpt2"
#model_name = "NousResearch/Llama-2-7b-hf"
#model_name = "deepcogito/cogito-v1-preview-llama-3B"
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# Second model, loaded separately and compared against the fine-tuned model in the evaluation cells
starcoder_name = "bigcode/starcoder2-3b"

# The instruction dataset to use
dataset_name = "bigcode/self-oss-instruct-sc2-exec-filter-50k"
# Fine-tuned model name
new_model = "Our_Finetuned_Model"

# Size of the dataset subset used (number of examples selected after shuffling)
dataset_range = 4000

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension. It determines the size and parameter count of the low-rank adaptation
lora_r = 64
# Alpha parameter for LoRA scaling factor that determines the impact of the low-rank matrices on the original model's output.
# Controls the overall strength of the low-rank adaptation.
lora_alpha = 2*lora_r  # "Often set to 2-4 times lora_r"
# Dropout probability for LoRA layers (consider raising to 0.1-0.2)
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True
# Compute dtype for 4-bit base models (name of a torch dtype; resolved with getattr(torch, ...) in the loading cells)
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False  # "Double quantization can sometimes improve performance but increases complexity.  It's often left disabled initially."

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False  # Author's note: confirmed as not supported on Colab

# Number of training epochs
num_train_epochs = 3  # Seems to reach a plateau as early as the third epoch
# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4  # Lower it for longer training runs

# Batch size for training and for evaluation
per_device_train_batch_size = per_device_eval_batch_size = 2
# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 4 # Multiplies the real batch size to give the effective one. Decrease for speed, increase for stability.

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3  # Stabilizes training, but slows it down. Optimal range 0.1-0.5

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.1  # Raise to 0.1-0.2 in case of overfitting

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1  # '-1' keeps num_train_epochs in effect

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.05 # Raise to 0.1 if training is excessively unstable at the start

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every N update steps
save_steps = 100
# Log every N update steps
logging_steps = 100
# Evaluation strategy ("no", "epoch", "steps")
eval_strategy = "steps" # unused (the matching TrainingArguments line is commented out)
# Evaluate the model every N steps (if the strategy is "steps")
eval_steps = 100        # unused (the matching TrainingArguments line is commented out)

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use; the preprocessing function truncates/pads every input to this many tokens
max_seq_length = 1024

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False  # "Can improve efficiency if your dataset has many short sequences."

# Load the entire model on the GPU 0
device_map = {"": 0}

################################################################################
# Parameters for code generation
################################################################################

# Maximum number of new tokens to generate
max_new_tokens=500

# Whether to sample during generation (sampling gives more diverse outputs; False disables it)
do_sample=False

# Repetition penalty to apply to the generated text (1.0 = no penalty, >1.0 = penalizes repetition)
repetition_penalty=1.2

### Caricamento di DeepSeek (modello di base per il fine-tuning)

In [None]:
# Build the 4-bit quantization config from the values set in the variables cell
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
  load_in_4bit=use_4bit,
  bnb_4bit_quant_type=bnb_4bit_quant_type,
  bnb_4bit_compute_dtype=compute_dtype,
  bnb_4bit_use_double_quant=use_nested_quant,
)
# The sliding-window warning is a false alarm: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/discussions/27

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
  model_name,
  quantization_config=bnb_config,
  device_map=device_map
)
base_model.eval()
# Load base tokenizer
tokenizer_base = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Print pad/eos tokens before and after reusing the EOS token as the padding token
print(tokenizer_base.pad_token, tokenizer_base.eos_token)
tokenizer_base.pad_token = tokenizer_base.eos_token
print(tokenizer_base.pad_token, tokenizer_base.eos_token)
tokenizer_base.padding_side = 'right'

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

<｜end▁of▁sentence｜> <｜end▁of▁sentence｜>
<｜end▁of▁sentence｜> <｜end▁of▁sentence｜>


In [None]:
# Final configuration steps for the model
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1
if gradient_checkpointing:
  base_model.gradient_checkpointing_enable()  # Enable gradient checkpointing to reduce memory usage

# for name, module in base_model.named_modules():
#    print(name)

# Load LoRA configuration
peft_config = LoraConfig(
  lora_alpha=lora_alpha,
  lora_dropout=lora_dropout,
  r=lora_r,
  bias="none",
  task_type="CAUSAL_LM",
  #target_modules=["q_proj", "v_proj"] # also ["o_proj", "k_proj"] ?
)

# Print the module tree of the loaded model
print(base_model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear4bit(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear4bit(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear4bit(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear4bit(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear4bit(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps

### Caricamento del modello StarCoder

In [None]:
# Rebuild the same 4-bit quantization config used for the base model
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
  load_in_4bit=use_4bit,
  bnb_4bit_quant_type=bnb_4bit_quant_type,
  bnb_4bit_compute_dtype=compute_dtype,
  bnb_4bit_use_double_quant=use_nested_quant,
)

# Load the StarCoder model
starcoder = AutoModelForCausalLM.from_pretrained(
  starcoder_name,
  quantization_config=bnb_config,
  device_map=device_map
)
starcoder.eval()
# Load the StarCoder tokenizer
tokenizer_starcoder = AutoTokenizer.from_pretrained(starcoder_name, trust_remote_code=True)
# Print pad/eos tokens before and after reusing the EOS token as the padding token
print(tokenizer_starcoder.pad_token, tokenizer_starcoder.eos_token)
tokenizer_starcoder.pad_token = tokenizer_starcoder.eos_token
print(tokenizer_starcoder.pad_token, tokenizer_starcoder.eos_token)
tokenizer_starcoder.padding_side = 'right'

config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/12.1G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.88k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/777k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/442k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

None <|endoftext|>
<|endoftext|> <|endoftext|>


### Caricamento del modello fine-tuned da GitHub (per la valutazione)

In [None]:
# From Git: remove any previous clone, then clone the repository that holds the saved models
!apt-get install git
!rm -rf FineTuningAI
!git clone https://github.com/matteraggi/FineTuningAI.git

# From Drive (change the path manually):
#from google.colab import drive
#drive.mount('/content/drive')
#!cp -r /content/drive/MyDrive/checkpoint_800 /content/

# Path to the model directory
checkpoint_path = "/content/FineTuningAI/models/DS_Finetuned_6"

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
  load_in_4bit=use_4bit,
  bnb_4bit_quant_type=bnb_4bit_quant_type,
  bnb_4bit_compute_dtype=compute_dtype,
  bnb_4bit_use_double_quant=use_nested_quant,
)
# The sliding-window warning is a false alarm: https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B/discussions/27

# Load (partially) finetuned model
finetuned_model = AutoModelForCausalLM.from_pretrained(
  checkpoint_path,
  device_map=device_map,
  quantization_config=bnb_config,
)
finetuned_model.eval()
# Load tokenizer from checkpoint
tokenizer_finetuned = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer_finetuned.pad_token = tokenizer_finetuned.eos_token
tokenizer_finetuned.padding_side = 'right'

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.12).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Cloning into 'FineTuningAI'...
remote: Enumerating objects: 420, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (33/33), done.[K
remote: Total 420 (delta 20), reused 11 (delta 5), pack-reused 382 (from 2)[K
Receiving objects: 100% (420/420), 679.64 MiB | 36.20 MiB/s, done.
Resolving deltas: 100% (234/234), done.
Updating files: 100% (83/83), done.


In [None]:
# Final configuration steps for the model
finetuned_model.config.use_cache = False
finetuned_model.config.pretraining_tp = 1
if gradient_checkpointing:
  finetuned_model.gradient_checkpointing_enable()  # Enable gradient checkpointing to reduce memory usage

# for name, module in finetuned_model.named_modules():
#    print(name)

# Load LoRA configuration (rebinds `peft_config` with the same values used in the base-model cell)
peft_config = LoraConfig(
  lora_alpha=lora_alpha,
  lora_dropout=lora_dropout,
  r=lora_r,
  bias="none",
  task_type="CAUSAL_LM",
  #target_modules=["q_proj", "v_proj"] # or ["o_proj", "k_proj"] ?
)

# Print the module tree of the loaded model
print(finetuned_model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=1536, out_features=1536, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=1536, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=1536, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): Linear4bit(in_features=1536, out_features=256, bias=True)
          (v_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=1536, o

### Caricamento e tokenizzazione dataset

**DATASET LOAD AND PREPROCESSING**

In [None]:
# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")
# Use only a random subset of the dataset, 'dataset_range' examples wide (fixed shuffle seed)
dataset = dataset.shuffle(seed=42).select(range(dataset_range))

# # Split the subset in training and validation
# split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
# train_dataset = split_dataset["train"]
# val_dataset = split_dataset["test"]

# Use the full dataset for training only
train_dataset = dataset
# Validation was removed to avoid OOM. The generated code is always evaluated on HumanEval at the end.

README.md:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/90.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50661 [00:00<?, ? examples/s]

In [None]:
# Funzione che ritaglia solo la funzione dal testo di response

import re

def extract_code_from_response(text):
    """Return the body of the first Markdown code fence found in ``text``.

    The fence may be opened by three backticks alone or by three backticks
    followed by ``python``; in both cases a newline must follow. The captured
    body is returned with surrounding whitespace stripped.

    When no fence is found, a debug marker is printed and the whole input is
    returned stripped instead.
    """
    # DOTALL lets the lazy group span multiple lines up to the closing fence.
    fenced = re.compile(r"```(?:python)?\n(.*?)```", re.DOTALL).search(text)
    if fenced is None:
        print('fallback!')  # DEBUGGING. Not expected to fire on this dataset
        return text.strip()
    return fenced.group(1).strip()

def clean_response(example):
    """Overwrite ``example["response"]`` with the code extracted from it.

    The mapping is modified in place and the same object is returned, which is
    the shape ``Dataset.map`` expects from a per-example function.
    """
    code_only = extract_code_from_response(example["response"])
    example["response"] = code_only
    return example

# Replace every response with only the code extracted from it.
dataset = dataset.map(clean_response)
# `Dataset.map` returns a NEW dataset: `train_dataset` was bound to the
# un-cleaned object before this cell, so it must be re-pointed here. Without
# this, the tokenization/training cells would still see the raw, uncut responses.
train_dataset = dataset

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [None]:
# Check dataset after cutting functions from responses

# Print dataset columns
print(dataset.column_names)
# Print the first dataset example (all fields)
print(dataset[0])
# Print the instruction and the (cleaned) response of the second example
print(f"Instruction: {dataset[1]['instruction']}")
print(f"Response: {dataset[1]['response']}")

['fingerprint', 'sha1', 'seed', 'response', 'concepts', 'prompt', 'instruction', 'id']
{'fingerprint': None, 'sha1': 'a1ea8d1a52a57874b576afc6a4c45e7d624d409e', 'seed': 'def box_enum(typ, val, c):\n    """\n    Fetch an enum member given its native value.\n    """\n    valobj = c.box(typ.dtype, val)\n    # Call the enum class with the value object\n    cls_obj = c.pyapi.unserialize(c.pyapi.serialize_object(typ.instance_class))\n    return c.pyapi.call_function_objargs(cls_obj, (valobj,))', 'response': 'def serialize_objects(objects):\n    """\n    Serialize a list of objects into a list of dictionaries.\n    Each dictionary represents the object\'s class name and attributes.\n    """\n    serialized_objects = []\n    for obj in objects:\n        serialized_object = {\n            "class_name": obj.__class__.__name__,\n            "attributes": {}\n        }\n        for attr_name, attr_value in obj.__dict__.items():\n            attr_dict = serialize_value(attr_value)\n            seri

In [None]:
from statistics import mean, median

# Token count of every example, measured on the cleaned `dataset`
lengths = []

for ex in dataset:
    # Rebuild the text the same way the preprocessing cell does (instruction + newline + response)
    full_text = f"{ex['instruction'].strip()}\n{ex['response'].strip()}"  # strip is optional
    # Tokenize the whole text with the tokenizer's default special-token handling.
    # NOTE(review): the preprocessing cell tokenizes prompt and completion separately with
    # add_special_tokens=False, so the counts here may differ slightly from training lengths - confirm.
    input_ids = tokenizer_base(full_text, truncation=False)["input_ids"]
    #input_ids = tokenizer_finetuned(full_text, truncation=False)["input_ids"]
    lengths.append(len(input_ids))

# Summary statistics: max / mean / median and the share of examples above each length threshold
max_len = max(lengths)
avg_len = mean(lengths)
med_len = median(lengths)
over_256 = sum(l > 256 for l in lengths) / len(lengths) * 100
over_512 = sum(l > 512 for l in lengths) / len(lengths) * 100
over_1024 = sum(l > 1024 for l in lengths) / len(lengths) * 100

print(f"Lunghezza massima: {max_len} token")
print(f"Lunghezza media: {avg_len:.2f} token")
print(f"Lunghezza mediana: {med_len} token")
print(f"Percentuale > 256 token: {over_256:.2f}%")
print(f"Percentuale > 512 token: {over_512:.2f}%")
print(f"Percentuale > 1024 token: {over_1024:.2f}%")

Lunghezza massima: 1340 token
Lunghezza media: 192.82 token
Lunghezza mediana: 174.0 token
Percentuale > 256 token: 18.20%
Percentuale > 512 token: 1.25%
Percentuale > 1024 token: 0.10%


In [None]:
# Funzione di preprocessing degli input
def preprocess_function(examples):
    """Tokenize a batch of instruction/response pairs for completion-only training.

    For every pair the prompt (stripped instruction plus a trailing newline) and
    the completion (stripped response) are tokenized separately without special
    tokens and concatenated. An EOS token is appended to the completion so the
    model is supervised to stop after the code. Prompt and padding positions
    are labelled -100 so that only completion tokens (including EOS)
    contribute to the loss.

    Every sequence is truncated or right-padded to ``max_seq_length``.

    Args:
        examples: batch mapping with "instruction" and "response" lists of str.

    Returns:
        dict with "input_ids", "labels" and "attention_mask", each a list of
        ``max_seq_length``-long lists of int.

    Reads the module-level ``tokenizer_base`` and ``max_seq_length``.
    """
    pad_id = tokenizer_base.pad_token_id
    eos_id = tokenizer_base.eos_token_id

    input_ids_list = []
    label_ids_list = []
    attention_mask_list = []

    for instr, resp in zip(examples["instruction"], examples["response"]):
        prompt = instr.strip() + "\n"  # Add a newline at the end of every instruction
        completion = resp.strip()      # Redundant for this dataset, but harmless

        # Tokenize prompt and code separately, without special tokens
        prompt_ids = tokenizer_base(prompt, add_special_tokens=False)["input_ids"]
        completion_ids = tokenizer_base(completion, add_special_tokens=False)["input_ids"]
        # Terminate the completion with EOS; without it the model never sees a
        # "stop" target and keeps generating until max_new_tokens at inference.
        if eos_id is not None:
            completion_ids = completion_ids + [eos_id]

        # Concatenate, masking the instruction so the model only learns to generate the code
        # (-100 is the standard Hugging Face label value for ignored positions).
        # Slicing truncates sequences that are too long and is a no-op otherwise.
        input_ids = (prompt_ids + completion_ids)[:max_seq_length]
        labels = ([-100] * len(prompt_ids) + completion_ids)[:max_seq_length]

        # Right-pad up to max_seq_length; padding positions are excluded from the loss
        real_len = len(input_ids)
        pad_len = max_seq_length - real_len
        input_ids = input_ids + [pad_id] * pad_len
        labels = labels + [-100] * pad_len

        # Build the mask from the real length rather than from token ids: the pad
        # token is the EOS token here, so an id comparison would also hide
        # genuine EOS tokens.
        attention_mask = [1] * real_len + [0] * pad_len

        input_ids_list.append(input_ids)
        label_ids_list.append(labels)
        attention_mask_list.append(attention_mask)

    return {
        "input_ids": input_ids_list,            # Tokenized sequences, the model input
        "labels": label_ids_list,               # Targets, supervising only the completion part
        "attention_mask": attention_mask_list,  # Mask hiding the padding positions
    }

# Apply the preprocessing to the training set (batched)
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
#tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

### Training (Fine-tuning)

In [None]:
# Set training parameters
# NOTE(review): `per_device_eval_batch_size`, `max_seq_length` and `packing` from the variables cell
# are not passed here - confirm this is intended.
training_arguments = TrainingArguments(
  output_dir=output_dir,
  num_train_epochs=num_train_epochs,
  per_device_train_batch_size=per_device_train_batch_size,
  gradient_accumulation_steps=gradient_accumulation_steps,
  optim=optim,
  # eval_strategy=eval_strategy,  #eval
  # eval_steps=eval_steps,        #eval
  save_steps=save_steps,
  logging_steps=logging_steps,
  learning_rate=learning_rate,
  weight_decay=weight_decay,
  fp16=fp16,
  bf16=bf16,
  max_grad_norm=max_grad_norm,
  max_steps=max_steps,
  warmup_ratio=warmup_ratio,
  group_by_length=group_by_length,
  lr_scheduler_type=lr_scheduler_type,
  report_to="tensorboard",
  gradient_checkpointing=gradient_checkpointing,
  label_names=["labels"], # Fixes: No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
)

# Train the model with the modified configuration
# NOTE(review): the dataset is pre-tokenized with custom `labels` / `attention_mask` columns and no
# data collator is passed; verify that the installed TRL version's default collator actually uses
# those columns rather than rebuilding labels from `input_ids` - TODO confirm.
trainer = SFTTrainer(
  model=base_model,
  train_dataset=tokenized_train_dataset,
  # eval_dataset=tokenized_val_dataset,  #eval
  peft_config=peft_config,
  args=training_arguments,
)

Truncating train dataset:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [None]:
# Start training

# Training from scratch
trainer.train()

# Training when the model is already partially trained (resume from a checkpoint)
#trainer.train(resume_from_checkpoint=checkpoint_path)

Step,Training Loss
100,1.6653
200,0.2296
300,0.2243
400,0.2161
500,0.2174
600,0.2087
700,0.2061
800,0.2099
900,0.2015
1000,0.2097


TrainOutput(global_step=1500, training_loss=0.3051934852600098, metrics={'train_runtime': 10163.933, 'train_samples_per_second': 1.181, 'train_steps_per_second': 0.148, 'total_flos': 1.14457602686976e+17, 'train_loss': 0.3051934852600098})

### Salvataggio del modello / checkpoint

**Salva il modello**

In [None]:
# Save the fine-tuned model
# (the path is hard-coded; the GitHub and Drive cells below read from results/DS_Finetuned_6)
#trainer.model.save_pretrained(new_model)
trainer.save_model("./results/DS_Finetuned_6")

**Salva il checkpoint/modello pushandolo su GitHub**

In [None]:
from google.colab import userdata
!apt-get install git
!git config --global user.email {userdata.get('GitEmail')}
!git config --global user.name {userdata.get('GitUsername')}
!git clone https://github.com/matteraggi/FineTuningAI.git
!cd FineTuningAI  # Go to the *existing* FineTuningAI directory
!mv results/DS_Finetuned_6 FineTuningAI/models/DS_Finetuned_6/
!cd FineTuningAI && git add models/DS_Finetuned_6
!cd FineTuningAI && git commit -m "Saved 'DS_Finetuned_6', uses cut responses."
!git config --global credential.helper store
!cd FineTuningAI && git push https://{userdata.get('PAT')}@github.com/matteraggi/FineTuningAI.git main

**Salva il checkpoint/modello su Drive**

In [None]:
from google.colab import drive
# Mount Google Drive and copy the saved model directory into MyDrive
drive.mount('/content/drive')
!cp -r /content/results/DS_Finetuned_6 /content/drive/MyDrive/

### Valutazione HumanEval

**Caricare il benchmark HumanEval**

In [None]:
# Build the comparison set: 30 problems sampled from the HumanEval test split
humaneval = load_dataset("openai/openai_humaneval", split="test")
# Draw the sample from a dedicated, seeded generator so the "fixed" set is the
# same on every run; an unseeded draw would overwrite the saved file with a
# different subset each time this cell is executed.
sampled_humaneval = random.Random(42).sample(list(humaneval), 30)
with open("fixed_sampled_humaneval.json", "w") as f:    # Save the comparison set so it does not have to be rebuilt every time
    json.dump(sampled_humaneval, f, indent=4)

README.md:   0%|          | 0.00/6.52k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/83.9k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/164 [00:00<?, ? examples/s]

In [None]:
# Configurazione
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device in use:", device) #DEBUG

#per evitare di ottenere warning nella cella di evaluation (deepseek ha temperature e top_p settati di default e con do_sample = False va in conflitto)
finetuned_model.generation_config.temperature = None
finetuned_model.generation_config.top_p = None
base_model.generation_config.temperature = None
base_model.generation_config.top_p = None
# === Funzione di generazione codice da parte del modello ===
def generate_code(model, prompt, tokenizer):
    """Generate text from ``model`` for ``prompt`` and return it decoded.

    The prompt is tokenized with ``tokenizer``, its tensors are moved to the
    module-level ``device``, and ``model.generate`` is called with the
    module-level ``max_new_tokens``, ``do_sample`` and ``repetition_penalty``
    settings. The first returned sequence is decoded with special tokens
    skipped.

    Any exception is printed and replaced by a fixed error string, so a single
    failing prompt does not abort an evaluation loop.
    """
    try:
        # Tokenize the prompt and move each tensor to the target device (GPU or CPU)
        encoded = tokenizer(prompt, return_tensors="pt")
        model_inputs = {}
        for key, tensor in encoded.items():
            model_inputs[key] = tensor.to(device)

        generation_kwargs = dict(
            max_new_tokens=max_new_tokens,          # Upper bound on newly generated tokens
            do_sample=do_sample,                    # Sampling switch, taken from the variables cell
            repetition_penalty=repetition_penalty,  # Penalizes repetitions
            eos_token_id=tokenizer.eos_token_id,    # End-of-sequence token id
            pad_token_id=tokenizer.pad_token_id,    # Padding token id
        )
        with torch.no_grad():
            generated = model.generate(**model_inputs, **generation_kwargs)

        first_sequence = generated[0]
        return tokenizer.decode(first_sequence, skip_special_tokens=True)
    except Exception as e:
        print(f"Errore in generate_code(): {e}")
        return "Errore durante la generazione"

device in use: cuda


Confronto tra Starcoder e DeepSeek finetunato (SYSTEM_PROMPT + funzione)



In [None]:
import torch
import json
import gc
import random
import pandas as pd
from tabulate import tabulate
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel


# Instruction header (written as Python comments) prepended to every HumanEval prompt
SYSTEM_PROMPT = (
    "# Implement the following Python function.\n"
    "# Return only the code of the function. Do not include comments like TODO or placeholders.\n"
    "# Do not leave the function body empty or incomplete.\n"
    "# If examples or docstrings are present, use them to infer the logic.\n"
    "# Make sure the function is correct and complete.\n\n"
)

# Load the HumanEval sample saved by the benchmark-loading cell
with open("fixed_sampled_humaneval.json", "r") as f:
    sampled_humaneval = json.load(f)

# Model evaluation: one generation per model for every sampled problem
starcoder_results = []
fine_tuned_results = []


for i, example in enumerate(sampled_humaneval):
    partial_prompt = example['prompt']
    prompt = SYSTEM_PROMPT + partial_prompt

    starcoder_code = generate_code(starcoder, prompt, tokenizer_starcoder)
    starcoder_results.append({"prompt": prompt, "code": starcoder_code})

    fine_tuned_code = generate_code(finetuned_model, prompt, tokenizer_finetuned)
    fine_tuned_results.append({"prompt": prompt, "code": fine_tuned_code})

# Save the results as JSON
results = {"starcoder_results": starcoder_results, "fine_tuned_results": fine_tuned_results}

with open("model_results(system_prompt).json", "w") as f:
    json.dump(results, f, indent=4)

# Build and print the table (the "Non-Finetuned Code" column holds the StarCoder outputs)
df = pd.DataFrame([
    {"Prompt": b['prompt'], "Non-Finetuned Code": b['code'], "Finetuned Code": f['code']}
    for b, f in zip(starcoder_results, fine_tuned_results)
])

files.download("model_results(system_prompt).json")

print(tabulate(df, headers="keys", tablefmt="grid", numalign="left", stralign="left"))

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

+----+--------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Confronto tra Starcoder e DeepSeek finetunato (solo docstring)

In [None]:
from datasets import load_dataset
import re
import json
import torch
import gc
import pandas as pd
from tabulate import tabulate
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from google.colab import files


# Load the saved HumanEval sample
with open("fixed_sampled_humaneval.json", "r") as f:
    sampled_humaneval = json.load(f)

# === Funzione per estrarre la docstring dal codice ===
def estrai_docstring(code):
    """Return the text of the first triple-quoted string found in ``code``.

    Both triple double quotes and triple single quotes are recognised; the
    closing delimiter must be the same as the opening one. The enclosed text
    is returned stripped, or an empty string when nothing matches.
    """
    # Group 1 captures the opening delimiter and \1 requires the same one to close.
    found = re.compile(r'("""|\'\'\')(.*?)(\1)', re.DOTALL).search(code)
    if not found:
        return ""
    return found.group(2).strip()

# === Evaluation: prompt both models with only the docstring extracted from each problem ===
starcoder_results = []
fine_tuned_results = []

for i, example in enumerate(sampled_humaneval):
    docstring = estrai_docstring(example["prompt"])

    starcoder_code = generate_code(starcoder, docstring, tokenizer_starcoder)
    starcoder_results.append({"prompt": docstring, "code": starcoder_code})

    fine_tuned_code = generate_code(finetuned_model, docstring, tokenizer_finetuned)
    fine_tuned_results.append({"prompt": docstring, "code": fine_tuned_code})

# === Save results ===
results = {"starcoder_results": starcoder_results, "fine_tuned_results": fine_tuned_results}
with open("model_results(docstring).json", "w") as f:
    json.dump(results, f, indent=4)

# === Summary table (the "Non-Finetuned Code" column holds the StarCoder outputs) ===
df = pd.DataFrame([
    {"Prompt": b['prompt'], "Non-Finetuned Code": b['code'], "Finetuned Code": f['code']}
    for b, f in zip(starcoder_results, fine_tuned_results)
])
files.download("model_results(docstring).json")
print(tabulate(df, headers="keys", tablefmt="grid", numalign="left", stralign="left"))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

+----+--------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

### Salvataggio / caricamento dei risultati

**SAVE IN DRIVE**

In [None]:
from google.colab import drive
drive.mount('/content/drive')
# Save as JSON in Drive.
# `results` is assigned by both evaluation cells, so this writes whichever one ran last.
with open("/content/drive/MyDrive/model_results.json", "w") as f:
    json.dump(results, f, indent=4)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**SAVE IN GIT**

In [None]:
from google.colab import userdata
import os

# Read the Git identity and the access token from the Colab user secrets.
git_email = userdata.get('GitEmail')
git_username = userdata.get('GitUsername')
pat = userdata.get('PAT')  # PAT = personal access token

# Configure Git
!git config --global user.email "{git_email}"
!git config --global user.name "{git_username}"

# Authentication through Git's "store" credential helper.
# NOTE(review): the original comment described this as secure and session-only, but the
# token is written in plain text to /root/.git-credentials just below and this cell never
# removes it — confirm that is acceptable for the runtime in use.
!git config --global credential.helper store

# Write the credentials to the file read by the credential helper
with open('/root/.git-credentials', 'w') as f:
    f.write(f'https://{git_username}:{pat}@github.com\n')

# Clone
!git clone https://github.com/matteraggi/FineTuningAI.git

# Copy the result files into the cloned repository
!cp "model_results(system_prompt).json" FineTuningAI/results/
!cp "model_results(docstring).json" FineTuningAI/results/

# Commit and push
%cd FineTuningAI
!git add results/model_results*
!git commit -m "Save model results"
!git push origin main
%cd ..

### Confronto tra modelli ("*Complex checking*")

In [None]:
!pip install pylint

Collecting pylint
  Downloading pylint-3.3.7-py3-none-any.whl.metadata (12 kB)
Collecting astroid<=3.4.0.dev0,>=3.3.8 (from pylint)
  Downloading astroid-3.3.10-py3-none-any.whl.metadata (4.4 kB)
Collecting isort!=5.13,<7,>=4.2.5 (from pylint)
  Downloading isort-6.0.1-py3-none-any.whl.metadata (11 kB)
Collecting mccabe<0.8,>=0.6 (from pylint)
  Downloading mccabe-0.7.0-py2.py3-none-any.whl.metadata (5.0 kB)
Downloading pylint-3.3.7-py3-none-any.whl (522 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m522.6/522.6 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading astroid-3.3.10-py3-none-any.whl (275 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.4/275.4 kB[0m [31m26.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isort-6.0.1-py3-none-any.whl (94 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.2/94.2 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading mccabe-0.7.0-py2.py3-none-any.whl (7.3 kB)
I

In [None]:
# Build the comparison set: 30 HumanEval problems drawn from the test split.
# A dedicated, seeded RNG makes the sample identical on every run, so re-executing this
# cell no longer overwrites the "fixed" set with a different selection. Using a private
# random.Random instance also leaves the global `random` state untouched.
HUMANEVAL_SAMPLE_SEED = 42
humaneval = load_dataset("openai/openai_humaneval", split="test")
sampled_humaneval = random.Random(HUMANEVAL_SAMPLE_SEED).sample(list(humaneval), 30)
with open("fixed_sampled_humaneval.json", "w") as f:    # Persist the set so the evaluation can reload it from disk
    json.dump(sampled_humaneval, f, indent=4)

**FUNZIONI DI CONFRONTO**

In [None]:
import ast
import subprocess
import tempfile
import zss
import re
import os
from radon.complexity import cc_visit
import textwrap
import re

def estrai_docstring( code):
  """Return the text of the first triple-quoted block found in ``code``.

  Both triple double-quote and triple single-quote delimiters are recognised;
  the closing delimiter must be the same as the opening one. Used to build
  the docstring-only prompt for the model comparison.

  Args:
    code: Source text to search.

  Returns:
    The contents of the first block with surrounding whitespace stripped,
    or an empty string when no such block exists.
  """
  docstring_pattern = re.compile(r'("""|\'\'\')(.*?)(\1)', re.DOTALL)
  found = docstring_pattern.search(code)
  if found is None:
    return ""
  return found.group(2).strip()

def code_similarity(code1, code2):
  """Jaccard similarity between the word-token sets of two code snippets.

  Used to compare a model's output against the canonical solution. Tokens are
  the ``\\w+`` runs of each snippet; order and repetition are ignored.

  Args:
    code1: First snippet (``str`` or ``None``).
    code2: Second snippet (``str`` or ``None``).

  Returns:
    ``len(intersection) / len(union)`` of the two token sets as a float in
    [0, 1]; ``0.0`` when either snippet is ``None``, empty, or has no tokens.
  """
  # evaluate_code() declares canonical=None as its default and forwards it here;
  # re.findall would raise TypeError on None, so treat a missing snippet as "no overlap".
  if not code1 or not code2:
    return 0.0
  tokens1 = set(re.findall(r'\b\w+\b', code1))
  tokens2 = set(re.findall(r'\b\w+\b', code2))
  if not tokens1 or not tokens2:
    return 0.0  # a snippet made only of punctuation/whitespace has no tokens
  return len(tokens1 & tokens2) / len(tokens1 | tokens2)

def run_pylint_analysis( code: str) -> dict:
  """Lint a code snippet with pylint and count its error and warning messages.

  The snippet is written to a temporary ``.py`` file, pylint is run on it as a
  subprocess, and every stdout line is scanned for a message id. Lines with an
  ``Exxxx`` id count as errors, lines with a ``Wxxxx`` id as warnings; all
  other lines are only recorded. The counts are raw totals for this snippet:
  analyze_results() later averages them over all evaluated snippets, which is
  why the aggregated figure can exceed 1.

  Args:
    code: Python source to analyse.

  Returns:
    A dict with keys ``"errors"`` (int), ``"warnings"`` (int) and
    ``"messages"`` (list of pylint stdout lines, or a single failure message
    if running pylint raised an exception).
  """
  report = {"errors": 0, "warnings": 0, "messages": []}

  # delete=False: the file must outlive the `with` block so pylint can read it.
  with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as handle:
    handle.write(code)
    script_path = handle.name

  command = ["pylint", "--disable=all", "--enable=E,W,C,R", "--score=n", script_path]
  try:
    proc = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )
    for output_line in proc.stdout.splitlines():
      report["messages"].append(output_line)
      # An error id takes precedence when a line matches both patterns.
      if re.search(r'\bE\d{4}\b', output_line):
        report["errors"] += 1
      elif re.search(r'\bW\d{4}\b', output_line):
        report["warnings"] += 1
  except Exception as exc:
    # Best effort: a failing pylint run is reported in the messages, not raised.
    report["messages"].append(f"Errore durante analisi pylint: {str(exc)}")
  finally:
    os.remove(script_path)  # always clean up the temporary file
  return report

def extract_python_code( generated: str, prompt: str) -> str:
  """Extract the Python code from a model generation.

  Used so the evaluation metrics run on the code alone. Steps, in order:
    1. Strip the text and drop the echoed prompt if the text starts with it.
    2. If a fenced markdown block (```python ... ``` or ``` ... ```) exists and
       contains ``def ``, return the content of the first such block.
    3. Otherwise return everything from the first line starting with ``def ``.
    4. Otherwise return whatever text remains.

  Args:
    generated: Raw text produced by the model (may include the prompt).
    prompt: Prompt that was given to the model.

  Returns:
    The extracted code, stripped of surrounding whitespace.
  """
  generated = generated.strip()
  stripped_prompt = prompt.strip()
  # Slice by the length of the *stripped* prompt, i.e. the exact string the
  # startswith check matched. Slicing by len(prompt) removed extra characters
  # of real output whenever the prompt carried leading/trailing whitespace.
  if generated.startswith(stripped_prompt):
    generated = generated[len(stripped_prompt):].strip()

  # Look for a fenced markdown block; only trust it if it defines a function.
  match = re.search(r"```(?:python)?(.*?)```", generated, re.DOTALL)
  if match:
    code_block = match.group(1).strip()
    if 'def ' in code_block:
      return code_block

  # No usable fenced block: take everything from the first `def` line onwards.
  lines = generated.splitlines()
  def_index = next((i for i, line in enumerate(lines) if line.strip().startswith("def ")), None)
  if def_index is not None:
    return "\n".join(lines[def_index:]).strip()

  # Fallback: return what is left of the generation.
  return generated.strip()


def is_valid_python( code: str) -> bool:
  """Tell whether ``code`` is syntactically valid Python.

  Args:
    code: Source text to check.

  Returns:
    True if ``ast.parse(code)`` succeeds, False if it raises.
  """
  try:
    ast.parse(code)
    return True
  except Exception:
    # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit and
    # could make a long evaluation run ignore an interrupt. Every parse failure
    # (SyntaxError, ValueError, TypeError, ...) still maps to False.
    return False

def safe_parse( code):
  """Parse ``code`` with ``ast.parse`` without letting parse errors propagate.

  Args:
    code: Source text to parse.

  Returns:
    The value returned by ``ast.parse(code)``, or None if parsing raised.
  """
  try:
    return ast.parse(code)
  except Exception:
    # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit propagate,
    # while all ordinary parse failures (including TypeError on None) give None.
    return None
def calculate_complexity( code: str) -> float:
  """Average cyclomatic complexity of the blocks radon finds in ``code``.

  The total is divided by the number of blocks because a model may generate
  more than one function to implement a solution.

  Args:
    code: Python source to measure.

  Returns:
    The mean ``complexity`` of the blocks returned by ``cc_visit``; 0 when
    there are no blocks or when the analysis raises (e.g. unparsable code).
  """
  try:
    blocks = cc_visit(code)
    if not blocks:
      return 0
    return sum(b.complexity for b in blocks) / len(blocks)
  except Exception:
    # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit propagate.
    return 0


def is_meaningfully_implemented( code: str) -> bool:
  """Tell whether ``code`` contains a meaningful implementation.

  The code counts as implemented when its syntax tree contains, anywhere, at
  least one of: a ``return`` with a value, a ``yield``, a ``yield from``, or a
  ``raise``.

  Args:
    code: Python source to inspect. Input that safe_parse() cannot parse
      (including None) is reported as not implemented.

  Returns:
    True if one of the statements above is found, False otherwise.
  """
  tree = safe_parse(code)
  # safe_parse returns None when the input cannot be parsed.
  if tree is None:
    return False
  for node in ast.walk(tree):
    # A bare `return` (no value) does not count as an implementation.
    if isinstance(node, ast.Return) and node.value is not None:
      return True
    # `yield from` is a separate node type (ast.YieldFrom), not ast.Yield, so it
    # must be listed explicitly or delegating generators are missed.
    if isinstance(node, (ast.Yield, ast.YieldFrom, ast.Raise)):
      return True
  return False

def evaluate_code( code: str, canonical: str = None) -> dict:
  """Evaluate a code snippet on several metrics.

  Args:
    code: Generated Python code to evaluate.
    canonical: Reference solution for the similarity metric. Optional; when
      omitted the similarity is reported as 0.0.

  Returns:
    A dict with the keys ``"syntactic_correct"``, ``"avg_complexity"``,
    ``"implementation_complete"``, ``"code_similarity"``, followed by the keys
    returned by run_pylint_analysis() (``"errors"``, ``"warnings"``,
    ``"messages"``).
  """
  clean_code = code.strip()
  # `canonical` defaults to None; calling code_similarity with None used to raise
  # TypeError inside re.findall, so skip the comparison when there is no reference.
  if canonical is not None:
    similarity_score = code_similarity(clean_code, canonical)
  else:
    similarity_score = 0.0
  return {
    "syntactic_correct": is_valid_python(clean_code),
    "avg_complexity": calculate_complexity(clean_code),
    # Pass the source text, as is_meaningfully_implemented's signature declares.
    # The previous call passed a pre-parsed tree, which only worked because
    # ast.parse hands an AST argument back unchanged.
    "implementation_complete": is_meaningfully_implemented(clean_code),
    "code_similarity": similarity_score,
    **run_pylint_analysis(clean_code)
  }


# Functions that run the comparison
def _truncate(text, limit):
    """Return ``text`` cut to ``limit`` characters plus "..." when it is longer, else unchanged."""
    return text[:limit] + "..." if len(text) > limit else text


def run_evaluation(useSystemPrompt):
    """Generate and score code from the three models on the saved HumanEval sample.

    Reads the problems from ``fixed_sampled_humaneval.json``, builds one prompt
    per problem, generates code with the base, StarCoder and fine-tuned models,
    scores each result with evaluate_code(), and finally prints the aggregated
    metrics through analyze_results().

    Args:
        useSystemPrompt: If truthy, the prompt is ``SYSTEM_PROMPT`` followed by
            the problem's full prompt (signature + docstring). Otherwise the
            prompt is only the docstring extracted from the problem's prompt.

    Returns:
        None. The aggregated table is printed by analyze_results().
    """
    with open("fixed_sampled_humaneval.json", "r") as f:
        sampled_humaneval = json.load(f)

    # Looked up here (at call time) so the models only need to exist when the
    # evaluation runs. Insertion order fixes the generation order.
    models = {
        "base": (base_model, tokenizer_base),
        "starcoder": (starcoder, tokenizer_starcoder),
        "fine_tuned": (finetuned_model, tokenizer_finetuned),
    }
    results = {"base": [], "starcoder": [], "fine_tuned": []}

    for example in sampled_humaneval:
        canonical = example['canonical_solution']
        if useSystemPrompt:  # system prompt + function
            prompt = SYSTEM_PROMPT + example['prompt']
        else:                # docstring only
            prompt = estrai_docstring(example["prompt"])

        for key, (model, tokenizer) in models.items():
            raw_output = generate_code(model, prompt, tokenizer)
            code = extract_python_code(raw_output, prompt)
            metrics = evaluate_code(code, canonical)
            results[key].append({
                "prompt": _truncate(prompt, 100),
                # Every model's snippet is truncated the same way. Previously the
                # fine-tuned entry lacked the [:200] slice and stored the full
                # code followed by "...".
                "code": _truncate(code, 200),
                **metrics
            })
        gc.collect()

    # Each record holds per-snippet metrics; analyze_results aggregates them.
    analyze_results(results)

def analyze_results( results):
    """Print a table with the mean of every metric for the three models.

    Args:
        results: Dict with the keys ``"base"``, ``"starcoder"`` and
            ``"fine_tuned"``, each mapping to a list of per-snippet metric
            records (as built by run_evaluation()).

    Returns:
        None. A header line and a grid-formatted table are printed.
    """
    # Column order of the printed table: Base, Starcoder, Fine-Tuned.
    frames = [pd.DataFrame(results[key]) for key in ("base", "starcoder", "fine_tuned")]
    # (row label, record key) for every aggregated metric, in display order.
    metric_rows = (
        ("Sintatticamente Corretto", "syntactic_correct"),
        ("Errori pylint", "errors"),
        ("Warning pylint", "warnings"),
        ("Similarità del codice", "code_similarity"),
        ("Complessità Ciclomatica", "avg_complexity"),
        ("Implementazione Completa", "implementation_complete"),
    )
    print("\n=== Metriche Aggregate ===")
    agg_metrics = [
        [label, *(frame[column].mean() for frame in frames)]
        for label, column in metric_rows
    ]
    table_headers = ["Metrica", "Base", "Starcoder", "Fine-Tuned"]
    print(tabulate(agg_metrics, headers=table_headers, floatfmt=".2f", tablefmt="grid"))


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=1536, out_features=1536, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.1, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=1536, out_features=64, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=64, out_features=1536, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): Linear4bit(in_features=1536, out_features=256, bias=True)
          (v_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=1536, o

**Confronto con SYSTEM_PROMPT + funzione tra tutti i modelli**

In [None]:
# Instruction header that run_evaluation(True) prepends to each HumanEval prompt.
# It is written as Python comments, so prompt + header still reads as source code.
SYSTEM_PROMPT = (
    "# Implement the following Python function.\n"
    "# Return only the code of the function. Do not include comments like TODO or placeholders.\n"
    "# Do not leave the function body empty or incomplete.\n"
    "# If examples or docstrings are present, use them to infer the logic.\n"
    "# Make sure the function is correct and complete.\n\n"
)

run_evaluation(True)    # With system prompt + function


=== Metriche Aggregate ===
+--------------------------+--------+-------------+--------------+
| Metrica                  |   Base |   Starcoder |   Fine-Tuned |
| Sintatticamente Corretto |   0.03 |        0.30 |         0.20 |
+--------------------------+--------+-------------+--------------+
| Errori pylint            |   0.97 |        0.87 |         0.90 |
+--------------------------+--------+-------------+--------------+
+--------------------------+--------+-------------+--------------+
| Similarità del codice    |   0.04 |        0.05 |         0.36 |
+--------------------------+--------+-------------+--------------+
| Complessità Ciclomatica  |   0.13 |        0.67 |         0.65 |
+--------------------------+--------+-------------+--------------+
| Implementazione Completa |   0.03 |        0.07 |         0.17 |
+--------------------------+--------+-------------+--------------+


**Confronto con docstring tra tutti i modelli**

In [None]:
run_evaluation(False)   # With docstring-only prompt


=== Metriche Aggregate ===
+--------------------------+--------+-------------+--------------+
| Metrica                  |   Base |   Starcoder |   Fine-Tuned |
| Sintatticamente Corretto |   0.03 |        0.10 |         0.47 |
+--------------------------+--------+-------------+--------------+
| Errori pylint            |   0.97 |        0.90 |         0.63 |
+--------------------------+--------+-------------+--------------+
+--------------------------+--------+-------------+--------------+
| Similarità del codice    |   0.03 |        0.07 |         0.20 |
+--------------------------+--------+-------------+--------------+
| Complessità Ciclomatica  |   0.10 |        0.37 |         1.27 |
+--------------------------+--------+-------------+--------------+
| Implementazione Completa |   0.00 |        0.07 |         0.47 |
+--------------------------+--------+-------------+--------------+
