<a href="https://colab.research.google.com/github/matteraggi/FineTuningAI/blob/main/Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#ciao
#matteraggi

In [None]:
import inspect

from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

In [None]:
# Base model for the fine-tuning run (e.g. GPT-2, LLaMA-2, StarCoder).
# StarCoder2 is published on the Hub as size-specific checkpoints
# (starcoder2-3b / -7b / -15b); the smallest one is used here.
model_name = "bigcode/starcoder2-3b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding so fixed-length padding
# during tokenization has a token to pad with.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the fine-tuning dataset (e.g. stack-exchange, code datasets).
# Only the first 1% of the train split is loaded to keep the run small.
ds = load_dataset("bigcode/self-oss-instruct-sc2-exec-filter-50k", split="train[:1%]")
# Hold out 10% for evaluation; the fixed seed makes the split (and therefore
# the reported eval metrics) reproducible across runs.
train_test_split = ds.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

In [None]:
# Dataset preprocessing
def preprocess_function(examples):
    """Tokenize a batch of responses into fixed-length causal-LM features.

    Args:
        examples: Batch mapping column names to lists of values; only
            ``examples["response"]`` is read.

    Returns:
        A dict with ``input_ids`` and ``attention_mask`` (each sequence
        truncated/padded to 128 tokens) plus ``labels``: a copy of
        ``input_ids`` where padded positions are replaced by -100.
    """
    tokenized = tokenizer(examples["response"], truncation=True, padding="max_length", max_length=128)
    # A causal LM only returns a loss when it receives labels. Padding is masked
    # through the attention mask rather than by token id because the pad token
    # is the same as the EOS token; -100 is the index the loss ignores.
    labels = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": labels,
    }

# Tokenize both splits in batches and drop the raw dataset columns, so only the
# features returned by preprocess_function are left. The training arguments in
# this notebook set remove_unused_columns=False, so any raw text/list column
# left in place would be handed to the data collator and the model.
# NOTE: each statement consumes the raw columns; reload the dataset before
# re-running this cell.
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=eval_dataset.column_names,
)

In [None]:
# LoRA configuration (optional, reduces the memory needed for fine-tuning)
# NOTE(review): target_modules is left to PEFT's per-architecture defaults;
# set it explicitly if PEFT reports that it cannot infer them for this model.
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    # Declares the task so PEFT builds its causal-LM wrapper for the model.
    task_type="CAUSAL_LM",
)
model = AutoModelForCausalLM.from_pretrained(model_name)
# get_peft_model is the supported entry point: it injects the adapters and
# returns the PEFT wrapper that matches the configured task type.
model = get_peft_model(model, lora_config)

In [None]:
# Training configuration
# Newer transformers releases name this argument `eval_strategy`, older ones only
# accept `evaluation_strategy`; use whichever the installed version defines.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    # Keep the dataset columns as they are instead of letting the Trainer prune
    # those it cannot match against the model's forward signature.
    remove_unused_columns=False,
    # Name the label key explicitly: the Trainer cannot reliably infer it
    # through a PEFT-wrapped model, and it is needed to report an eval loss.
    label_names=["labels"],
    # Run evaluation at the end of every epoch.
    **{_eval_strategy_arg: "epoch"},
)

In [None]:
# Trainer: ties together the model, the training arguments and both dataset splits.
trainer_inputs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
}
trainer = Trainer(**trainer_inputs)

In [None]:
# Start the fine-tuning run
trainer.train()