# Setup

Make sure you use Kaggle's P100 GPU. This notebook has not been tested with any other GPU.

In [40]:
import os
import wandb
from kaggle_secrets import UserSecretsClient

# --- Weights & Biases credentials --------------------------------------------
# Read the API key from Kaggle's secret store, mirror it into the process
# environment, and log in. Any failure is reported rather than raised, so the
# configuration below is always defined.
try:
    user_secrets = UserSecretsClient()
    wandb_api_key = os.environ["WANDB_API_KEY"] = user_secrets.get_secret(
        "WANDB_API_KEY"
    )
    wandb.login(key=wandb_api_key)
except Exception as err:
    print("WANDB_API_KEY not set or failed to load.")
    print("Reason:", err)
    print("In Kaggle, add it via Add-ons → Secrets → Add Secret.")

# --- Catalogues of available choices -----------------------------------------
# Base checkpoints, ordered by size: index 0 is 1B, 1 is 3B, 2 is 8B.
MODEL_UIDS = [
    "unsloth/llama-3.2-1B-bnb-4bit",
    "unsloth/llama-3.2-3B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
]

# Individual SFT datasets.
DATASET_SFT_UIDS = [
    "allenai/tulu-3-sft-personas-math-grade",
    "allenai/tulu-3-sft-personas-math",
    "allenai/tulu-3-sft-personas-instruction-following",
    "allenai/tulu-3-sft-personas-algebra",
    "allenai/tulu-3-sft-personas-code",
]

# The combined SFT mixture, kept as its own single-entry list.
DATASET_SFT_MIXTURE_UIDS = ["allenai/tulu-3-sft-mixture"]

# Task names handed to lm_eval via --tasks in the evaluation cell.
LM_EVAL_UIDS = [
    "hellaswag",
    "gsm8k",
    "arc_easy",
    "truthfulqa",
    "winogrande",
    "humaneval",
]

# --- Active selection ----------------------------------------------------------
# Pick the model by index into MODEL_UIDS.
MODEL_UID = MODEL_UIDS[0]
# Point this at whichever dataset list you are currently working with ...
DATASET_UIDS = DATASET_SFT_UIDS
# ... and pick one dataset from it by index.
DATASET_UID = DATASET_UIDS[3]

# --- Debug-sized run -------------------------------------------------------------
# Both are deliberately tiny; raise them once the pipeline runs end to end.
MAX_STEPS = 1  # passed to SFTConfig(max_steps=...) in the fine-tuning cell
LIMIT = 1  # passed to lm_eval as --limit in the evaluation cell

print("Config done...")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Config done...


In [5]:
!git clone https://github.com/EleutherAI/lm-evaluation-harness.git
!pip install -e ./lm-evaluation-harness/.
!pip install unsloth transformers datasets wandb pandas

fatal: destination path 'lm-evaluation-harness' already exists and is not an empty directory.
Obtaining file:///kaggle/working/lm-evaluation-harness
  Installing build dependencies ... done
  Checking if build backend supports build_editable ... done
  Getting requirements to build editable ... done
  Preparing editable metadata (pyproject.toml) ... done
Collecting evaluate (from lm_eval==0.4.8)
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting jsonlines (from lm_eval==0.4.8)
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting pytablewriter (from lm_eval==0.4.8)
  Downloading pytablewriter-1.2.1-py3-none-any.whl.metadata (38 kB)
Collecting rouge-score>=0.0.4 (from lm_eval==0.4.8)
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... done
Collecting sacrebleu>=1.5.0 (from lm_eval==0.4.8)
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)


# Fine-Tuning

In [6]:
from unsloth import FastLanguageModel
import wandb
from transformers import BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
import torch
from datasets import load_dataset
import json

# Short names (the part after the last "/") used for local directories and
# for the W&B run name.
model_name = MODEL_UID.rsplit("/", 1)[-1]
dataset_name = DATASET_UID.rsplit("/", 1)[-1]

# Load the selected 4-bit checkpoint together with its tokenizer.
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_UID,
    load_in_4bit=True,
    dtype=None,
    max_seq_length=2048,
)

# Keep a local copy of the untouched base model and tokenizer under
# ./<model_name>; the evaluation cell points lm_eval at this directory.
base_model.save_pretrained(model_name)
tokenizer.save_pretrained(model_name)

# Wrap the base model with LoRA adapters on the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    base_model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,  # any value is supported, but 0 is the optimized path
    bias="none",  # any value is supported, but "none" is the optimized path
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # True or "unsloth"; the original note credits "unsloth" with ~30% less
    # VRAM and room for 2x larger batches on very long contexts.
    use_gradient_checkpointing="unsloth",
    # random_state = 3407,
    max_seq_length=2048,
    use_rslora=False,  # rank-stabilized LoRA is available but not used
    loftq_config=None,  # LoftQ is available but not used
)

# NOTE(review): this replaces the quantization metadata on a model that was
# already loaded in 4-bit. Confirm these values (double quant off, nf4, fp16
# compute) match how the checkpoint was actually quantized.
model.config.quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the training split and show the first record's fields and messages as
# a quick sanity check of the schema.
train_dataset = load_dataset(DATASET_UID, split="train")
print(train_dataset[0].keys())
print(json.dumps(train_dataset[0]["messages"], indent=2))

def formatting_func(examples):
    """Flatten every conversation in a batch into one training string.

    ``examples["messages"]`` is read as a list of conversations, each a list
    of message dicts. Only the ``content`` field is used: each message is
    stripped, the pieces are joined with newlines, and the result is stripped
    again. Role labels are not included in the text.

    Returns ``{"text": [...]}`` with one string per conversation.
    """
    texts = []
    for convo in examples["messages"]:
        joined = "\n".join(message["content"].strip() for message in convo)
        # The outer strip drops newlines left at the edges by empty messages.
        texts.append(joined.strip())
    return {"text": texts}


# Add the "text" column (one flattened string per conversation) that
# SFTConfig(dataset_text_field="text") reads below.
train_dataset = train_dataset.map(formatting_func, batched=True)

# Hyperparameters are defined once so the values logged to W&B cannot drift
# from the values the trainer actually uses.
learning_rate = 2e-4
per_device_batch_size = 2
gradient_accumulation_steps = 4
run_name = f"{model_name}_{dataset_name}"
finetuned_dir = f"{model_name}_finetuned_{dataset_name}"

# The setup cell tolerates a missing W&B key, so this cell does too: .get()
# avoids a KeyError, and logging is switched off instead of aborting the run.
wandb_key = os.environ.get("WANDB_API_KEY")
use_wandb = bool(wandb_key)
if use_wandb:
    wandb.login(key=wandb_key)
    wandb.init(
        project="pm-pt",
        name=run_name,
        config={
            "model": MODEL_UID,
            "dataset": DATASET_UID,
            "max_steps": MAX_STEPS,
            "learning_rate": learning_rate,
            "batch_size": per_device_batch_size,
            "gradient_accumulation_steps": gradient_accumulation_steps,
        },
    )
else:
    print("WANDB_API_KEY is not set; training without Weights & Biases logging.")

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    args=SFTConfig(
        dataset_text_field="text",
        max_seq_length=2048,  # same value the model was loaded with
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=5,
        max_steps=MAX_STEPS,
        # "none" disables every logging integration when no key is available.
        report_to="wandb" if use_wandb else "none",
        run_name=run_name,
        output_dir="outputs",
        optim="adamw_8bit",
    ),
)

# Always close the W&B run, and mark it as failed if training is interrupted
# or raises, so no run is left open or reported as successful by mistake.
try:
    trainer.train()
except BaseException:
    # BaseException also covers KeyboardInterrupt (kernel interrupt).
    if use_wandb:
        wandb.finish(exit_code=1)
    raise
else:
    if use_wandb:
        wandb.finish()

# Save the fine-tuned model and the tokenizer side by side; the evaluation
# cell loads this directory through lm_eval's peft= argument.
model.save_pretrained(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-04-30 09:18:27.736298: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746004707.914467      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746004707.970988      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.3: Fast Llama patching. Transformers: 4.51.1.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 6.0. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

Unsloth 2025.4.3 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


README.md:   0%|          | 0.00/627 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/36.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

dict_keys(['id', 'prompt', 'messages'])
[
  {
    "content": "Alex is a supportive and organized partner who helps their significant other, Jamie, design a system to boost productivity while managing household tasks and work projects. They decide to allocate their time between these tasks using a quadratic model to maximize efficiency. \n\n1. Alex and Jamie determine that the time, in hours, they should spend on household tasks each week can be modeled by the quadratic equation \\( h(t) = -2t^2 + 8t + 5 \\), where \\( t \\) is the number of weeks since they started using the new system. Determine the number of weeks, \\( t \\), after which they should expect to spend the maximum time on household tasks. Also, find the maximum time they should spend on household tasks in a week.\n\n2. To ensure balanced productivity, Alex and Jamie agree to spend at least 15 hours per week on work projects. If the time spent on work projects can be modeled by the inequality \\( w(t) = 3t + 2 \\geq 15 \\

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/20000 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 20,000 | Num Epochs = 1 | Total steps = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 11,272,192/1,000,000,000 (1.13% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,0.9967


0,1
train/epoch,▁▁
train/global_step,▁▁
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
total_flos,50099016327168.0
train/epoch,0.0004
train/global_step,1.0
train/grad_norm,0.22184
train/learning_rate,0.0
train/loss,0.9967
train_loss,0.99665
train_runtime,16.1452
train_samples_per_second,0.496
train_steps_per_second,0.062


('llama-3.2-1B-bnb-4bit_finetuned_tulu-3-sft-personas-algebra/tokenizer_config.json',
 'llama-3.2-1B-bnb-4bit_finetuned_tulu-3-sft-personas-algebra/special_tokens_map.json',
 'llama-3.2-1B-bnb-4bit_finetuned_tulu-3-sft-personas-algebra/tokenizer.json')

# Evaluation

In [70]:
import os
import subprocess

# Recompute the short names so this cell only depends on the config cell.
model_name = MODEL_UID.split("/")[-1]
dataset_name = DATASET_UID.split("/")[-1]

base_path = f"./{model_name}"
peft_path = f"./{model_name}_finetuned_{dataset_name}"

# Both directories are written by the fine-tuning cell. Check for them up
# front so a missing one is reported clearly before lm_eval is launched.
for required_dir in (base_path, peft_path):
    if not os.path.isdir(required_dir):
        raise FileNotFoundError(
            f"{required_dir} does not exist; run the fine-tuning cell first."
        )

# NOTE(review): presumably required by the humaneval task's code-execution
# metric, together with --confirm_run_unsafe_code below. Confirm.
os.environ["HF_ALLOW_CODE_EVAL"] = "1"

tasks_str = ",".join(LM_EVAL_UIDS)

# An argument list (no shell) keeps paths and task names free of quoting issues.
command = [
    "lm_eval",
    "--model",
    "hf",
    "--model_args",
    f"pretrained={base_path},peft={peft_path}",
    "--tasks",
    tasks_str,
    "--confirm_run_unsafe_code",
    "--device",
    "cuda",
    "--batch_size",
    "auto",
    "--limit",
    str(LIMIT),
]

# check=True raises CalledProcessError on a non-zero exit code, so a failed
# evaluation can no longer pass unnoticed.
subprocess.run(command, check=True)

2025-04-30 10:18:47.586715: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746008327.609704    1839 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746008327.616519    1839 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
100%|██████████| 1/1 [00:00<00:00, 6842.26it/s]
100%|██████████| 1/1 [00:00<00:00, 949.80it/s]
100%|██████████| 1/1 [00:00<00:00, 660.10it/s]
100%|██████████| 1/1 [00:00<00:00, 693.85it/s]
100%|██████████| 1/1 [00:00<00:00, 2031.14it/s]
100%|██████████| 1/1 [00:00<00:00, 1997.29it/s]
100%|██████████| 1/1 [00:00<00:00, 247.74it/s]
100%|██████████| 1/1 [00:00<00:00, 1028.77it/s]
Running loglikelihood requests: 100%|██████████| 18/18 [0

Passed argument batch_size = auto:1. Detecting largest batch size
Determined largest batch size: 64


Running generate_until requests: 100%|██████████| 3/3 [01:10<00:00, 23.43s/it]


Passed argument batch_size = auto. Detecting largest batch size
Determined Largest batch size: 1


fatal: not a git repository (or any parent up to mount point /kaggle)
Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).


hf (pretrained=./llama-3.2-1B-bnb-4bit,peft=./llama-3.2-1B-bnb-4bit_finetuned_tulu-3-sft-personas-algebra), gen_kwargs: (None), limit: 1.0, num_fewshot: None, batch_size: auto (64)
|    Tasks     |Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
|--------------|------:|----------------|-----:|-----------|---|-----:|---|------|
|arc_easy      |      1|none            |     0|acc        |↑  |0.0000|±  |   N/A|
|              |       |none            |     0|acc_norm   |↑  |1.0000|±  |   N/A|
|gsm8k         |      3|flexible-extract|     5|exact_match|↑  |0.0000|±  |   N/A|
|              |       |strict-match    |     5|exact_match|↑  |0.0000|±  |   N/A|
|hellaswag     |      1|none            |     0|acc        |↑  |0.0000|±  |   N/A|
|              |       |none            |     0|acc_norm   |↑  |1.0000|±  |   N/A|
|humaneval     |      1|create_test     |     0|pass@1     |   |0.0000|±  |   N/A|
|truthfulqa_gen|      3|none            |     0|bleu_acc   |↑  |0.0000|±

CompletedProcess(args=['lm_eval', '--model', 'hf', '--model_args', 'pretrained=./llama-3.2-1B-bnb-4bit,peft=./llama-3.2-1B-bnb-4bit_finetuned_tulu-3-sft-personas-algebra', '--tasks', 'hellaswag,gsm8k,arc_easy,truthfulqa,winogrande,humaneval', '--confirm_run_unsafe_code', '--device', 'cuda', '--batch_size', 'auto', '--limit', '1'], returncode=0)

In [71]:
from itertools import product

# pandas was never imported anywhere in the notebook, so this cell raised
# NameError on a fresh kernel; import it where it is used.
import pandas as pd

# Name the output file once so the printed message always matches the file
# that is actually written.
EVAL_TEMPLATE_PATH = "empty_eval_results.xlsx"

# One row per (model, dataset) combination, one empty column per eval task.
columns = ["model_uid", "dataset_uid"] + LM_EVAL_UIDS

model_dataset_pairs = list(product(MODEL_UIDS, DATASET_UIDS))

# Build every record first and construct the frame once, instead of growing
# it row by row with .loc. dict.fromkeys gives each task column a None value.
records = [
    {
        "model_uid": model_uid,
        "dataset_uid": dataset_uid,
        **dict.fromkeys(LM_EVAL_UIDS),
    }
    for model_uid, dataset_uid in model_dataset_pairs
]
empty_eval_df = pd.DataFrame(records, columns=columns)

empty_eval_df.to_excel(EVAL_TEMPLATE_PATH, index=False)

print(f"Created {EVAL_TEMPLATE_PATH}")

Created empty eval_results.xlsx
