<a href="https://colab.research.google.com/github/matteraggi/FineTuningAI/blob/main/Progetto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**INSTALLS**

In [1]:
!pip install --user transformers torch datasets peft tf-keras accelerate bitsandbytes trl evaluate radon zss
# IMPORTANTE! RIAVVIARE IL RUNTIME DOPO L'ESECUZIONE

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting radon
  Downloading radon-6.0.1-py2.py3-none-any.whl.metadata (8.2 kB)
Collecting zss
  Downloading zss-1.2.0.tar.gz (9.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadat

**IMPORTS**

In [1]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
import gc
from datasets import load_dataset
from trl import SFTTrainer
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

**Svuota la cache**

In [2]:
# Garbace collect
gc.collect()
# Svuota la cache
torch.cuda.empty_cache()

**VARIABLES**

In [3]:
# The model that you want to train from the Hugging Face hub
model_name = "bigcode/starcoder2-3b"
# The instruction dataset to use
dataset_name = "bigcode/self-oss-instruct-sc2-exec-filter-50k"
# Fine-tuned model name
new_model = "starcoder2-finetuned"

# Dimension of dataset subset used
dataset_range = 4000

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension. It determines the size and parameter count of the low-rank adaptation
lora_r = 64  # XXXXXXXXXXX
# Alpha parameter for LoRA scaling factor that determines the impact of the low-rank matrices on the original model's output.
# Controls the overall strength of the low-rank adaptation.
lora_alpha = 2*lora_r  # "Often set to 2-4 times lora_r"
# Dropout probability for LoRA layers (considera aumento a 0.1-0.2)
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True
# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False  # "Double quantization can sometimes improve performance but increases complexity.  It's often left disabled initially."

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False  # CONFERMO CHE NON E' SUPPORTATO SU COLAB

# Number of training epochs
num_train_epochs = 3  # Sembra arrivare a un plateau già alla terza
# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4  # Abbassare in caso di training più lunghi

# Batch size per training e per evaluation
per_device_train_batch_size = per_device_eval_batch_size = 2 # A 16 usava quasi tutta la vram ma max_seq_length era minimo
# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 4 # Moltiplica la batch_size vera per ottenere quella simulata. Dinimuire per velocità, alzare per stabilità.

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3  # Stabilizza il training, ma rallentandolo. Range ottimale 0.1-0.5

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.1  # Aumentare in caso di overfitting a 0.1-0.2

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1  # '-1' mantains num_train_epochs

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.05 # Aumentare a 0.1 in caso di instabilità eccessiva iniziale XXXXX

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every tot updates steps
save_steps = 100
# Log every tot updates steps
logging_steps = 100
# Evaluation strategy ("no", "epochs", "steps")
evaluation_strategy="steps"
# Evaluate the model every tot steps (if the strategy is "steps")
eval_steps=100

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use; tronca l'input a tot tokens
max_seq_length = 2048 # Abbassare a 1024 se la RAM dà problemi (ma ridurrebbe le)

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False  # "Can improve efficiency if your dataset has many short sequences."

# Load the entire model on the GPU 0
device_map = {"": 0}

### SCELTA: carica il modello di base per allenarlo o carica il modello finetunato da git per valutarlo

**NON RUNNARE DI DEFAULT**
*- LOAD BASE MODEL*

In [4]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
  load_in_4bit=use_4bit,
  bnb_4bit_quant_type=bnb_4bit_quant_type,
  bnb_4bit_compute_dtype=compute_dtype,
  bnb_4bit_use_double_quant=use_nested_quant,
)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
  model_name,
  quantization_config=bnb_config,
  device_map=device_map
)

config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/12.1G [00:00<?, ?B/s]

**NON RUNNARE DI DEFAULT**
*- LOAD FINETUNED MODEL*

In [None]:
!apt-get install git
!git clone https://github.com/matteraggi/FineTuningAI.git
# Percorso alla directory del modello
model_dir = "FineTuningAI/models/finetuned_model"

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
  load_in_4bit=use_4bit,
  bnb_4bit_quant_type=bnb_4bit_quant_type,
  bnb_4bit_compute_dtype=compute_dtype,
  bnb_4bit_use_double_quant=use_nested_quant,
)

model = AutoModelForCausalLM.from_pretrained(
  model_dir,
  device_map=device_map,
  quantization_config=bnb_config,
)

# Da qui, continui l'esecuzione tramite le celle seguenti, ovvero quelle di LoRA

**Last model configs and apply LoRA configuration to the model**

In [5]:
# Ultime configurazioni per il modello
model.config.use_cache = False
model.config.pretraining_tp = 1
if gradient_checkpointing:
  model.gradient_checkpointing_enable()  # Attiva gradient checkpointing per ridurre l'uso di memoria

# Load LoRA configuration
# Example configuration for target modules (can vary per model)
peft_config = LoraConfig(
  lora_alpha=lora_alpha,
  lora_dropout=lora_dropout,
  r=lora_r,
  bias="none",
  task_type="CAUSAL_LM",
  target_modules=["q_proj", "v_proj"]  # These are common modules for transformers (queries and values in attention layers)
)

for param in model.parameters():
    if param.dtype in [torch.float32, torch.float64, torch.float16, torch.bfloat16]:  # Check for floating point types
        param.requires_grad = True  # Ensure only floating-point parameters require gradients
#print(any(param.requires_grad for param in model.parameters() if param.dtype in [torch.float32, torch.float64, torch.float16, torch.bfloat16]))  # Should print True
# Apply LoRA using PEFT
model = get_peft_model(model, peft_config)  # Wrap the model with the LoRA configuration
# Now, only LoRA layers will have requires_grad=True
#print(any(param.requires_grad for param in model.parameters()))  # This should be True for LoRA layers

### Caricamento e tokenizzazione dataset

**DATASET LOAD AND PROCESSING**

In [6]:
# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")
# Use only a random subset of the dataset 'dataset_range'-wide
dataset = dataset.shuffle(seed=42).select(range(dataset_range))
# Split the subset in training and validation
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

README.md:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/90.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50661 [00:00<?, ? examples/s]

In [7]:
# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

tokenizer_config.json:   0%|          | 0.00/7.88k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/777k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/442k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

In [8]:
# Inspect dataset columns
# print(dataset.column_names)

# Funzione di preprocessamento migliorata
def preprocess_function(examples):
  input_texts = [f"{instr} {prompt}" if instr else prompt for instr, prompt in zip(examples["instruction"], examples["prompt"])]
  encodings = tokenizer(input_texts, truncation=True, max_length=max_seq_length)
  #encodings = tokenizer(input_texts)  #, truncation=False, max_length=None)
  encodings["text"] = input_texts  # Aggiungi la colonna "text" per evitare KeyError
  return encodings  # max length aumentata (più efficienza a discapito della memoria) e ora ritorna tensori al posto di lists

# Apply tokenization
##tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

# Inspect dataset columns
# #print(tokenized_dataset.column_names)
# print(tokenized_train_dataset.column_names)
# print(tokenized_val_dataset.column_names)

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

### Training

In [9]:
# Set training parameters
training_arguments = TrainingArguments(
  output_dir=output_dir,
  num_train_epochs=num_train_epochs,
  per_device_train_batch_size=per_device_train_batch_size,
  gradient_accumulation_steps=gradient_accumulation_steps,
  optim=optim,
  evaluation_strategy=evaluation_strategy,
  eval_steps=eval_steps,
  save_steps=save_steps,
  logging_steps=logging_steps,
  learning_rate=learning_rate,
  weight_decay=weight_decay,
  fp16=fp16,
  bf16=bf16,
  max_grad_norm=max_grad_norm,
  max_steps=max_steps,
  warmup_ratio=warmup_ratio,
  group_by_length=group_by_length,
  lr_scheduler_type=lr_scheduler_type,
  report_to="tensorboard",
  gradient_checkpointing=gradient_checkpointing,
)

# Train the model with the modified configuration
trainer = SFTTrainer(
  model=model,
  train_dataset=tokenized_train_dataset,
  eval_dataset=tokenized_val_dataset,
  peft_config=peft_config,
  args=training_arguments,
)



Converting train dataset to ChatML:   0%|          | 0/3200 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/3200 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/3200 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/800 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

In [10]:
# Start training

# Training da zero
trainer.train()

# Training se già parzialmente allenato
#trainer.train(resume_from_checkpoint=True)

# IGNORE 'Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.' WARNING; CAUSED BY LIBRARY GLITCH

Step,Training Loss,Validation Loss
100,1.6841,0.454828
200,0.3393,0.143694
300,0.183,0.129278


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


KeyboardInterrupt: 

In [None]:
# Test per vedere il numero totale di passi per cui è stato allenato il modello
# Utile per vedere se effettivamente l'allenamento è stato ripreso
print(trainer.state.global_step)

871


In [None]:
# Save the fine-tuned model
#trainer.model.save_pretrained(new_model) # Salva il modello nella directory specificata da 'new_model', che nel nostro caso è "starcoder2-finetuned"
trainer.save_model("./results/messia_3")

**NON RUNNARE DI DEFAULT**
Per pushare il modello che hai appena trainato su git

In [None]:
from google.colab import userdata
!apt-get install git
!git config --global user.email {userdata.get('GitEmail')}
!git config --global user.name {userdata.get('GitUsername')}
!git clone https://github.com/matteraggi/FineTuningAI.git
!cd FineTuningAI  # Go to the *existing* FineTuningAI directory
!mv results/messia_3 FineTuningAI/models/messia_3/
!cd FineTuningAI && git add models/messia_3
!cd FineTuningAI && git commit -m "Saved new finetuned model, ... "
!git config --global credential.helper store  # Or 'cache' if you prefer
!cd FineTuningAI && git push https://{userdata.get('PAT')}@github.com/matteraggi/FineTuningAI.git main

Se è troppo pesante (principalmente nel caso di checkpoints), salvali in drive

In [19]:
from google.colab import drive
#drive.mount('/content/drive')
!cp -r /content/FineTuningAI/models/checkpoint-300 /content/drive/MyDrive/

### HumanEval evaluation

**Caricare il benchmark HumanEval**
Dopo il fine-tuning, devi confrontare il tuo modello con uno pre-addestrato (StarCoder2 senza fine-tuning) e con il modello fine-tunato.

In [None]:
SYSTEM_PROMPT = "Generate Python code for the following task:\n"
NON_FINETUNED_ADDITIONAL_PROMPT = ""

In [None]:
# Configurazione
base_model_name = "bigcode/starcoder2-3b"
model_path = "FineTuningAI/starcoder2_Model_Tensors"  # Path del modello fine-tunato
device = "cuda" if torch.cuda.is_available() else "cpu"

# Caricamento del dataset HumanEval e selezione casuale di 10 esempi
humaneval = load_dataset("openai/openai_humaneval", split="test")
sampled_humaneval = random.sample(list(humaneval), 10)  # Estrazione casuale di 10 elementi

# Configurazione quantizzazione
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Caricamento dei modelli
non_finetuned_model = AutoModelForCausalLM.from_pretrained(
    base_model_name, device_map={"": device}, quantization_config=quantization_config
).eval()

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Funzione per generare codice
def generate_code(model, prompt, is_finetuned=False):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=200)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Valutazione del modello
base_results = []
fine_tuned_results = []

for i, example in enumerate(sampled_humaneval):
    prompt = example['prompt']

    base_code = generate_code(non_finetuned_model, prompt, is_finetuned=False)
    base_results.append({"prompt": prompt, "code": base_code})

    fine_tuned_code = generate_code(non_finetuned_model, prompt, is_finetuned=True)
    fine_tuned_results.append({"prompt": prompt, "code": fine_tuned_code})

    print(f"Processed example {i+1}/10")
    gc.collect()  # Pulizia della memoria

# Salvataggio dei risultati in JSON
results = {"base_results": base_results, "fine_tuned_results": fine_tuned_results}

with open("model_results.json", "w") as f:
    json.dump(results, f, indent=4)

# Creazione e stampa tabella
df = pd.DataFrame([
    {"Prompt": b['prompt'], "Non-Finetuned Code": b['code'], "Finetuned Code": f['code']}
    for b, f in zip(base_results, fine_tuned_results)
])

print(tabulate(df, headers="keys", tablefmt="grid", numalign="left", stralign="left"))


NameError: name 'torch' is not defined

In [None]:
import json

# Assuming base_results and fine_tuned_results are your arrays
results = {
    "base_results": base_results,
    "fine_tuned_results": fine_tuned_results
}

# Save as JSON
with open("model_results.json", "w") as f:
    json.dump(results, f, indent=4)



In [None]:
import pandas as pd
from tabulate import tabulate

table_data = []
for non_fine_tuned, fine_tuned in zip(base_results, fine_tuned_results):
    table_data.append({
        "Prompt": non_fine_tuned['prompt'],
        "Non-Finetuned Code": non_fine_tuned['code'],
        "Finetuned Code": fine_tuned['code']
    })
# Use tabulate for better formatting
df = pd.DataFrame(table_data)

# Print only the first 5 rows with tabulate
print(tabulate(df.head(), headers='keys', tablefmt='grid', numalign="left", stralign="left"))

+----+--------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+
|    | Prompt                                                                                                       | Non-Finetuned Code                                                                                                                                                                                                  | Finetuned Code                                                                                                       |
| 0  | from typing import List                                                                    

### Save/Load results

**SAVE IN DRIVE**

In [None]:
from google.colab import drive
drive.mount('/content/drive')
# Save as JSON in Drive
with open("/content/drive/MyDrive/model_results.json", "w") as f:
    json.dump(results, f, indent=4)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**SAVE IN GIT**

In [None]:
from google.colab import userdata
!apt-get install git
!git config --global user.email {userdata.get('GitEmail')}
!git config --global user.name {userdata.get('GitUsername')}
!git clone https://github.com/matteraggi/FineTuningAI.git
!cp model_results.json FineTuningAI/results/
!cd FineTuningAI && git add results/model_results.json
!cd FineTuningAI && git commit -m "Save model results"
!git config --global credential.helper store  # Or 'cache' if you prefer
!cd FineTuningAI && git push https://{userdata.get('PAT')}@github.com/matteraggi/FineTuningAI.git main


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.12).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.
fatal: destination path 'FineTuningAI' already exists and is not an empty directory.
On branch main
Your branch is ahead of 'origin/main' by 2 commits.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean
Everything up-to-date


**LOAD FROM DRIVE**

In [None]:
from google.colab import drive
drive.mount('/content/drive')
import json

# Load JSON file from Drive
with open("/content/drive/MyDrive/model_results.json", "r") as f:
    results = json.load(f)

# Now results['base_results'] and results['fine_tuned_results'] hold the data
base_results = results['base_results']
fine_tuned_results = results['fine_tuned_results']


Mounted at /content/drive


**LOAD FROM GIT**

In [None]:
import json
!apt-get install git
!git clone https://github.com/matteraggi/FineTuningAI.git

with open("FineTuningAI/results/model_results.json", "r") as f:
    results = json.load(f)

base_results = results['base_results']
fine_tuned_results = results['fine_tuned_results']

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.12).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.
fatal: destination path 'FineTuningAI' already exists and is not an empty directory.


### **Complex checking** holy shit me vs God he loses

In [None]:
import ast
import radon.complexity
import io
import subprocess  # For running the cc command
import json
from pylint import lint
from pylint.reporters.json_reporter import JSONReporter
import tempfile  # For creating a temporary file
import zss
import math


class WritableObject(object):
    "dummy output stream for pylint"
    def __init__(self):
        self.content = []
    def write(self, st):
        "dummy write"
        self.content.append(st)
    def read(self):
        "dummy read"
        return self.content

def ast_to_tree(node):
    """Convert an AST node to a zss-compatible tree."""
    if isinstance(node, ast.AST):
        children = [ast_to_tree(child) for child in ast.iter_child_nodes(node)]
        return zss.Node(type(node).__name__, children)
    else:
        return zss.Node(str(node))

def sigmoid(x, k=1, x0=0):
    """Sigmoid function to transform distance into similarity."""
    return 1 / (1 + math.exp(-k * (x - x0)))

def normalized_ast_distance(code1, code2):
    """Calculate normalized tree edit distance."""
    ast1 = ast.parse(code1)
    ast2 = ast.parse(code2)
    tree1 = ast_to_tree(ast1)
    tree2 = ast_to_tree(ast2)
    distance = zss.simple_distance(tree1, tree2)
    max_distance = len(list(ast.walk(ast1))) + len(list(ast.walk(ast2)))
    return distance / max_distance

def ast_similarity(code1, code2, k=10, x0=0.5):
    """Calculate similarity between two ASTs using sigmoid."""
    distance = normalized_ast_distance(code1, code2)
    similarity = sigmoid(distance, k=k, x0=x0)
    return similarity

def evaluate_code(code, canonical_solution_ast=None):
    metrics = {}

    # 1. Syntactic Correctness & Static Analysis
    try:
        ast.parse(code)
        metrics["syntactic_correct"] = True
    except SyntaxError as e:
        metrics["syntactic_correct"] = False
        metrics["syntax_error"] = str(e)

    with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as temp_file:
        temp_filename = temp_file.name
        temp_file.write(code)
    pylint_output = WritableObject()
    json_reporter = JSONReporter(pylint_output)

    # Pylint options
    pylint_opts = [
        "--disable=missing-module-docstring,C0301",  # Suppress some messages
        "--rcfile=/dev/null",  # Disable rc file
        temp_filename,  # The temporary filename
    ]

    # Run pylint
    lint.Run(pylint_opts, reporter=json_reporter, exit=False)


    pylint_json_output = pylint_output.read()
    for a in pylint_json_output:
      print(a)
    print("pylint_json_output"+str(pylint_json_output))
    try:
      pylint_results = json.loads(''.join(pylint_json_output))
      metrics["pylint_errors"] = len([m for m in pylint_results if m.get("type") == "error"])
      metrics["pylint_warnings"] = len([m for m in pylint_results if m.get("type") == "warning"])
    except json.JSONDecodeError:
        metrics["pylint"] = "Failed to parse pylint output."


    os.remove(temp_filename) # Clean up the temporary file


    # 2. Code Complexity
    try:
        process = subprocess.run(['radon', 'cc', '-j', '-'],  # -j for JSON output, - for stdin
                               capture_output=True, input=code, text=True, check=True)
        radon_output = json.loads(process.stdout)
        if type(radon_output) is dict:
            metrics["avg_complexity"] = 0
        else:
          metrics["avg_complexity"] = sum(int(item['complexity']) for item in radon_output["-"]) / len(radon_output["-"]) if radon_output else 0
    except subprocess.CalledProcessError as e:
        metrics["avg_complexity"] = 0
        print(f"Radon cc command error: {e}")
    except json.JSONDecodeError as e:
        metrics["avg_complexity"] = 0
        print(f"Radon JSON decode error: {e}")
    except FileNotFoundError as e:
        metrics["avg_complexity"] = 0
        print(f"Radon cc not found: {e}. Is Radon installed?")

    # 3. Code Quality (Basic)
    metrics["code_length"] = len(code)

    # 4. Algorithmic Similarity (AST-based, if canonical solution is provided)
    if canonical_solution_ast:
        try:
            metrics["ast_similarity"] = 1.0- float(ast_similarity(code, canonical_solution_ast)) # Use custom similarity function

        except Exception as e:
            metrics["ast_similarity"] = 0
            print(f"AST comparison error: {e}")
    else:
        metrics["ast_similarity"] = 0

    return metrics

# Example usage

#code1 = """
#def foo(x):
#    return x + 1
#"""

#code2 = """
#def bar(y):
 #   return y - 1
#"""

#a= evaluate_code(code1, code2)
#print(a)


In [None]:
# Store evaluation results
base_eval_results = []
fine_tuned_eval_results = []

for eval_item, non_fine_tuned, fine_tuned in zip(dataset, base_results, fine_tuned_results):
    prompt = non_fine_tuned['prompt']
    canonical = extract_python_code(eval_item['response'])

    base_pass = evaluate_code(non_fine_tuned["code"], canonical)
    fine_tuned_pass = evaluate_code(fine_tuned["code"], canonical)
    # Store results
    base_eval_results.append({"prompt": prompt, "code": non_fine_tuned["code"], **base_pass})
    fine_tuned_eval_results.append({"prompt": prompt, "code": fine_tuned["code"], **fine_tuned_pass})

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
        "message-id": "E0001"
    }
]


pylint_json_output['[\n    {\n        "type": "error",\n        "module": "tmp06i0a9oi",\n        "obj": "",\n        "line": 27,\n        "column": 22,\n        "endLine": null,\n        "endColumn": null,\n        "path": "/tmp/tmp06i0a9oi.py",\n        "symbol": "syntax-error",\n        "message": "Parsing failed: \'invalid decimal literal (tmp06i0a9oi, line 27)\'",\n        "message-id": "E0001"\n    }\n]', '\n']
AST comparison error: invalid decimal literal (<unknown>, line 27)
[
    {
        "type": "error",
        "module": "tmpc0s6htm6",
        "obj": "",
        "line": 1,
        "column": 10,
        "endLine": null,
        "endColumn": null,
        "path": "/tmp/tmpc0s6htm6.py",
        "symbol": "syntax-error",
        "message": "Parsing failed: 'invalid syntax (tmpc0s6htm6, line 1)'",
        "message-id": "E0001"
    }
]


pylint_json_output['[\n    {\n        "t

In [None]:
import pandas as pd
from tabulate import tabulate

# Convert the evaluation results to DataFrames
base_df = pd.DataFrame(base_eval_results)
fine_tuned_df = pd.DataFrame(fine_tuned_eval_results)

# Filter rows based on the specified conditions
base_df_filtered = base_df[
    (base_df["syntactic_correct"] == True) |
    (base_df["pylint_errors"] == 0) |
    (base_df["ast_similarity"] > 0.8)
]

fine_tuned_df_filtered = fine_tuned_df[
    (fine_tuned_df["syntactic_correct"] == True) |
    (fine_tuned_df["pylint_errors"] == 0) |
    (fine_tuned_df["ast_similarity"] > 0.8)
]

# Customize columns to display (including 'prompt' and 'code')
columns_to_display = ["prompt", "code", "syntactic_correct", "pylint_errors", "pylint_warnings", "avg_complexity", "code_length", "ast_similarity"]

# Display Base Model Results using Tabulate (filtered)
print("Base Model Results (filtered):")
if not base_df_filtered.empty:  # Check if the filtered DataFrame is empty
    print(tabulate(base_df_filtered[columns_to_display], headers='keys', tablefmt='grid', showindex=False))
else:
    print("No results to display (no rows match the criteria).")

# Display Fine-Tuned Model Results using Tabulate (filtered)
print("\nFine-Tuned Model Results (filtered):")
if not fine_tuned_df_filtered.empty:
    print(tabulate(fine_tuned_df_filtered[columns_to_display], headers='keys', tablefmt='grid', showindex=False))
else:
    print("No results to display (no rows match the criteria).")

# Display lengths of the filtered DataFrames
print(f"\nLength of Base Model Results (filtered): {len(base_df_filtered)}")
print(f"Length of Fine-Tuned Model Results (filtered): {len(fine_tuned_df_filtered)}")

Base Model Results (filtered):
No results to display (no rows match the criteria).

Fine-Tuned Model Results (filtered):
+------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+---------------------+-----------------+-------------------+------------------+---------------+------------------+
| def flip_case(string: str) -> str:                                                                               | def flip_case(string: str) -> str:                                                                               | True                |               0 |                 5 |                0 |           868 |      3.04046e-12 |
|     """ For a given string, flip lowercase characters to uppercase and uppercase to lowercase.                   |     """ For a given string, flip lowercase characters to uppercase and