In [1]:
import pandas as pd
df = pd.read_csv("/kaggle/input/dataset-ml-qa/dataset_QA.csv")

In [2]:
! pip install transformers peft datasets bitsandbytes accelerate
! pip install evaluate bert-score
! pip install rouge_score

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0->peft)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=1.13.0->peft)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=1.13.0->peft)
  Downloading nvidia_cu

In [3]:
df.head(2)

Unnamed: 0,question,answer
0,What is the goal of this machine learning tech...,To help you or your team work on a machine lea...
1,What does the guide assume about the reader's ...,It assumes the reader has taken a machine lear...


In [4]:
import evaluate

from transformers import TrainerCallback
import pandas as pd
import os

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # basic string cleaning
    preds = [p.strip() for p in preds]
    refs = [r.strip() for r in refs]

    results = {}
    results.update(accuracy.compute(predictions=preds, references=refs))
    results.update(f1.compute(predictions=preds, references=refs, average="macro"))
    results.update(bleu.compute(predictions=[[p.split()] for p in preds], references=[[r.split()] for r in refs]))
    results.update(rouge.compute(predictions=preds, references=refs, use_stemmer=True))
    results.update(bertscore.compute(predictions=preds, references=refs, lang="en"))

    return {
        "accuracy": results["accuracy"],
        "f1": results["f1"],
        "bleu": results["bleu"],
        "rouge1": results["rouge1"],
        "rougeL": results["rougeL"],
        "bertscore_f1": sum(results["bertscore_f1"]) / len(results["bertscore_f1"])
    }

class CSVLogger(TrainerCallback):
    def __init__(self, path="metrics_log.csv"):
        self.path = path
        self.logs = []

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics:
            metrics["epoch"] = state.epoch
            self.logs.append(metrics)
            pd.DataFrame(self.logs).to_csv(self.path, index=False)


2025-06-07 08:30:53.121513: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749285053.302807      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749285053.358907      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [5]:
from transformers import AutoTokenizer
from datasets import Dataset
from datasets import DatasetDict


data = [{"text": "Hello, how are you?"}, {"text": "I am learning QLoRA fine-tuning."}]
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-mini-instruct", trust_remote_code=True)

def format_qa(example):
    prompt = f"Q: {example['question']}\nA: {example['answer']}"
    return {"text": prompt}

dataset = Dataset.from_pandas(df[["question", "answer"]])
formatted_dataset = dataset.map(format_qa)
tokenized_dataset = formatted_dataset.map(lambda e: tokenizer(e["text"], truncation=True, padding="max_length", max_length=512), batched=True)
tokenized_dataset = tokenized_dataset.map(lambda e: {"labels": e["input_ids"]})

# 90% train, 10% test
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

tokenizer_config.json:   0%|          | 0.00/2.93k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.91M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/249 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

Map:   0%|          | 0/2592 [00:00<?, ? examples/s]

Map:   0%|          | 0/2592 [00:00<?, ? examples/s]

Map:   0%|          | 0/2592 [00:00<?, ? examples/s]

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import bitsandbytes as bnb  # make sure bitsandbytes is installed
from peft import LoraConfig, get_peft_model

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-4-mini-instruct",
    load_in_4bit=True,       # <-- enable 4-bit quantization
    device_map="auto",
    trust_remote_code=True
)

# Configure LoRA
lora_config = LoraConfig(
    r=256,
    lora_alpha=32,
    target_modules=["qkv_proj"],  # example for some models
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

config.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/10.9k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/54.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-4-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.77G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [7]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
! export CUDA_LAUNCH_BLOCKING=1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
from transformers import TrainingArguments
from transformers import Trainer
import torch
torch.cuda.empty_cache()
training_args = TrainingArguments(
    output_dir="/kaggle/working/phi4-lora-checkpoints",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_steps=50,
    save_strategy="epoch",
    learning_rate=2e-4,
    logging_dir="/kaggle/working/logs",  # for tensorboard logs
    log_level="info",                    # verbose log level
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    compute_metrics=None,
)

trainer.train()


  trainer = Trainer(
You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: question, text, answer. If question, text, answer are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2,332
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 4
  Total optimization steps = 435
  Number of trainable para

Step,Training Loss
50,0.9663
100,0.2435
150,0.2238
200,0.2174


Saving model checkpoint to /kaggle/working/phi4-lora-checkpoints/checkpoint-146
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-4-mini-instruct/snapshots/5a149550068a1eb93398160d8953f5f56c3603e9/config.json
Model config Phi3Config {
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-4-mini-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-4-mini-instruct--modeling_phi3.Phi3ForCausalLM",
    "AutoTokenizer": "microsoft/Phi-4-mini-instruct--Xenova/gpt-4o"
  },
  "bos_token_id": 199999,
  "embd_pdrop": 0.0,
  "eos_token_id": 199999,
  "full_attn_mod": 1,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "interpolate_factor": 1,
  "lm_head_bias": false,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "phi3",
  "num_att

In [None]:
trainer.compute_metrics = compute_metrics
trainer.evaluate(eval_dataset=test_dataset)