In [7]:
# Standard library
import gc
import importlib
import json
import os
import subprocess
import sys
from pathlib import Path
from typing import List, Optional, Union

# Third-party. unsloth stays ahead of trl/transformers, as in the original cell
# (unsloth asks to be imported first so its patches apply).
from unsloth import FastLanguageModel, is_bfloat16_supported

from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_from_disk
import torch
import wandb

# Project-local (original relative order preserved)
from tuning.data.train_dataset import get_train_dataset
from tuning.training.config_training import ModelLoadConfig, LoraConfig, SFTRunConfig, PTRunConfig, DPOTrainingConfig, TrainingArgumentsConfig, PassAtKConfig, sft_batch_size, effective_batch_size
from tuning.training.perplexity_callback import PerplexityStoppingCallback
from tuning.training.passk_callback import PassAtKStoppingCallback
from tuning.utils.utils import apply_chat_template, chat_template_func
from tuning.config import DATASETS_DIR, HF_MODEL_MAP
from tuning.training.config_training import DatasetConfig, SFTRunConfig
from tuning.config import MODELS_DIR
from tuning.training.sft_training import train_model_sft
from tuning.training.dpo_training import train_model_dpo

In [2]:
# Dev-loop helper: re-import the pass@k callback and DPO training modules so that
# edits made on disk are picked up without restarting the kernel.
import importlib

# Import the submodules explicitly so `importlib.reload` does not depend on a
# prior `from ... import ...` having loaded them as a side effect.
import tuning.training.passk_callback
import tuning.training.dpo_training

# Same reload order as before: the callback module first, then DPO training.
importlib.reload(tuning.training.passk_callback)
importlib.reload(tuning.training.dpo_training)

# Rebind the imported names to the reloaded definitions; names imported before
# a reload keep pointing at the old objects.
from tuning.training.passk_callback import PassAtKStoppingCallback
from tuning.training.dpo_training import train_model_dpo

In [3]:
# Experiment-wide constants read by the configuration and training cells.
MODEL = "llama3-8B"  # key into HF_MODEL_MAP
total_train_size = 4096  # alternative noted by the author: 29980 (presumably the full set - TODO confirm)

# Perplexity-threshold sweeps that have been tried. Previously these were three
# consecutive assignments to `perplexity_thresholds`, of which only the last
# took effect; keeping them as named presets makes the active choice explicit.
PERPLEXITY_THRESHOLD_SWEEPS = {
    "fine": [
        7.0, 6.0, 5.75, 5.5, 5.25, 5.0, 4.75, 4.5, 4.25, 4.0,
        3.9, 3.8, 3.7, 3.6, 3.55, 3.5, 3.45, 3.4, 3.35, 3.3,
        3.25, 3.2, 3.15, 3.1,
    ],
    "coarse": [6.0, 5.0, 4.0, 3.75, 3.5, 3.25, 3.0],
    "short": [4.0, 2.0],
}

# Copy the preset so that in-place changes to the list passed downstream cannot
# alter the preset table.
perplexity_thresholds = list(PERPLEXITY_THRESHOLD_SWEEPS["short"])


In [4]:
# --- SFT run definition -------------------------------------------------------
# Dataset slice used for supervised fine-tuning.
dataset_config = DatasetConfig(
    dataset="tuluif",
    dataset_type="sft",
    train_size=total_train_size,  # 29980
)

# Run description: training only, no inference or evaluation.
run_config = SFTRunConfig(
    dataset_config=dataset_config,
    model_name_hf=HF_MODEL_MAP[MODEL],  # HuggingFace model name, not a local path
    model_name=MODEL,  # base model name, used to build the output directory
    do_training=True,
    do_inference=False,
    do_evaluation=False,
)

# Pass@k settings; per the author's note these exist only to watch ifeval
# pass@1 while training runs.
passk_config = PassAtKConfig(
    target_pass_at_k=[0.3, 0.4, 0.5, 0.6],
    k_values=[1],
    n_samples=1,
    num_prompts=100,
    temperature=0.7,
    strict=True,
    enabled=True,
)

# --- Model / optimisation settings --------------------------------------------
# Each config starts from its class defaults; only the listed fields are overridden.
lora_config = LoraConfig()

model_load_config = ModelLoadConfig()
model_load_config.max_seq_length = 4096

training_args = TrainingArgumentsConfig()
training_args.eval_steps = 32
training_args.per_device_train_batch_size = 16
training_args.gradient_accumulation_steps = 1


In [None]:
# Run supervised fine-tuning inside a Weights & Biases run. Using the run as a
# context manager finishes it when the block exits, including on an exception.
with wandb.init(
    name=run_config.run_name,
    project="tuning",
    reinit=True,
    # Log the run configuration up front so it is recorded even if training
    # crashes early. A shallow copy is passed rather than the live __dict__.
    config=dict(vars(run_config)) if hasattr(run_config, "__dict__") else {},
) as run:
    # Later cells use `model`, `tokenizer`, `trainer` and `callbacks`.
    model, tokenizer, trainer, callbacks = train_model_sft(
        run_config=run_config,
        lora_config=lora_config,
        model_load_config=model_load_config,
        training_args=training_args,
        perplexity_thresholds=perplexity_thresholds,
    )

Getting train dataset for run config: llama3-8B_sft-tuluif-4096
Checking for dataset at /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/data/datasets/sft-tuluif-4096
Dataset already exists at /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/data/datasets/sft-tuluif-4096
Sampled dataset: DatasetDict({
    train: Dataset({
        features: ['id', 'prompt', 'messages', 'constraints'],
        num_rows: 4096
    })
    test: Dataset({
        features: ['id', 'prompt', 'messages', 'constraints'],
        num_rows: 200
    })
})
Example training row: {'id': 'personas_IF_se11a0hu4d0lnuekd4d6hmmb', 'prompt': 'Identify and list three groundbreaking research papers in the field of artificial intelligence from the past five years. The papers should be recognized for their innovative contributions and have received significant citations. Provide a brief summary of each paper, not exceeding 50 words per summary.', 'messages': [{'content': 'You are a helpful assi

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'prompt', 'messages', 'constraints', 'text'],
        num_rows: 4096
    })
    test: Dataset({
        features: ['id', 'prompt', 'messages', 'constraints', 'text'],
        num_rows: 200
    })
})
{'id': 'personas_IF_se11a0hu4d0lnuekd4d6hmmb', 'prompt': 'Identify and list three groundbreaking research papers in the field of artificial intelligence from the past five years. The papers should be recognized for their innovative contributions and have received significant citations. Provide a brief summary of each paper, not exceeding 50 words per summary.', 'messages': [{'content': 'You are a helpful assistant who is an expert at responding to prompts by carefully following the given instructions', 'role': 'system'}, {'content': 'Identify and list three groundbreaking research papers in the field of artificial intelligence from the past five years. The papers should be recognized for their innovative contributions and have rece

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,096 | Num Epochs = 2 | Total steps = 512
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 1 x 1) = 16
 "-____-"     Trainable parameters = 83,886,080 of 8,114,147,328 (1.03% trained)


[PerplexityCallback] on_train_begin: model_name=llama3-8B


Step,Training Loss,Validation Loss
32,1.2122,1.256714
64,1.0232,1.083082
96,0.9453,1.038077
128,0.9989,1.016148
160,0.9714,1.002002
192,1.0022,0.994574
224,0.9774,0.982577
256,0.9641,0.97533
288,0.8231,0.972143
320,0.9859,0.970879



[PerplexityCallback] Step 32, Data Points 512: PPL = 4.1853

[PerplexityCallback] Step 64, Data Points 1024: PPL = 3.5144
[PerplexityCallback] Sweetspot threshold 4.0 reached!
[PerplexityCallback] Saving sweetspot checkpoint to /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-8B_ppl-4.00_sft-512
Found HuggingFace hub cache directory: /home/shougan/.cache/huggingface/hub


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Checking cache directory for required files...


Unsloth: Copying 4 files from cache to `/home/shougan/projects/aip-fredashi/shougan/balance-budg


Successfully copied all 4 files from cache to `/home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-8B_ppl-4.00_sft-512`


Unsloth: Preparing safetensor model files: 100%|███████████████| 4/4 [00:00<00:00, 58661.59it/s]
Unsloth: Merging weights into 16bit: 100%|████████████████████████| 4/4 [00:43<00:00, 10.90s/it]


Unsloth: Merge process complete. Saved to `/home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-8B_ppl-4.00_sft-512`
[PerplexityCallback] Sweetspot checkpoint saved with metadata at /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models_metadata/llama3-8B_ppl-0202_1259.json
[PerplexityCallback] Launching DPO job with data points 7168 at checkpoint /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-8B_ppl-4.00_sft-512
[PerplexityCallback] Remaining thresholds: [2.0]
[PerplexityCallback] Continuing training to next threshold: 2.0

[PerplexityCallback] Step 96, Data Points 1536: PPL = 3.3164

[PerplexityCallback] Step 128, Data Points 2048: PPL = 3.2553

[PerplexityCallback] Step 160, Data Points 2560: PPL = 3.2811

[PerplexityCallback] Step 192, Data Points 3072: PPL = 3.2609

[PerplexityCallback] Step 224, Data Points 3584: PPL = 3.1216

[PerplexityCallback] Step 256, Data Points 4096: PPL = 3.0879

[PerplexityCal

In [8]:
# Load the checkpoint records written during SFT. The file is read as JSON
# Lines: one JSON object per line.
# NOTE(review): this absolute path pins one specific run on one machine. The
# original cell had a disabled alternative that read the path from
# `callbacks[-1].metadata_path`; confirm that attribute exists and prefer it
# when the SFT cell has been run in this kernel.
metadata_file = "/home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models_metadata/llama3-8B_ppl-0202_1259.json"

with open(metadata_file, "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a trailing newline); json.loads raises on them.
    checkpoints = [json.loads(line) for line in f if line.strip()]

print(checkpoints)


[{'threshold_type': 'perplexity', 'threshold_value': 4.0, 'global_step': 64, 'checkpoint_path': '/home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-8B_ppl-4.00_sft-512', 'data_points_seen': 512}]


In [9]:
!nvidia-smi
import gc
del model, tokenizer, trainer, callbacks # this deletes the references to such objects
gc.collect() # then we force the GC
torch.cuda.empty_cache() # and we release the GPU CUDA cache
# torch.cuda.reset_cuda_context() #  for a clean state
!nvidia-smi

Mon Feb  2 13:53:47 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.148.08             Driver Version: 570.148.08     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA H100 80GB HBM3          On  |   00000000:CB:00.0 Off |                    0 |
| N/A   38C    P0            120W /  700W |     525MiB /  81559MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

NameError: name 'model' is not defined

In [10]:
def run_dpo_from_checkpoint(checkpoint, lora_config):
    """Run DPO on the data budget left after one SFT checkpoint.

    All DPO-specific configs are built as locals so they do not overwrite the
    SFT cell's `run_config`, `training_args`, `model_load_config` or
    `dataset_config`.

    Args:
        checkpoint: One metadata record; must provide "checkpoint_path" and
            "data_points_seen".
        lora_config: LoRA settings forwarded to `train_model_dpo`.

    Returns:
        True if a DPO run was started and completed, False if it was skipped
        because no data budget remained.
    """
    sft_checkpoint_name = Path(checkpoint["checkpoint_path"]).name
    sft_points_seen = checkpoint["data_points_seen"]

    # The DPO budget is whatever part of the total budget SFT did not consume.
    dpo_train_size = total_train_size - sft_points_seen
    if dpo_train_size <= 0:
        # A checkpoint can report at least as many points as the budget; a
        # zero or negative train size is not a meaningful dataset request.
        print(
            f"Skipping DPO for {sft_checkpoint_name}: "
            f"data_points_seen={sft_points_seen} leaves no budget "
            f"out of total_train_size={total_train_size}"
        )
        return False

    dpo_model_load_config = ModelLoadConfig()
    dpo_training_args = DPOTrainingConfig()
    dpo_training_args.eval_steps = 250

    # Describes the SFT stage this DPO run starts from; `dynamic_path` carries
    # the checkpoint directory name.
    sft_run_config = SFTRunConfig(
        dataset_config=DatasetConfig(
            dataset="tuluif",
            dataset_type="sft",
            train_size=sft_points_seen,
            dynamic_path=sft_checkpoint_name,
        ),
        model_name=MODEL,
        model_name_hf=HF_MODEL_MAP[MODEL],
        task_name="ifeval",
    )
    dpo_run_config = PTRunConfig(
        dataset_config=DatasetConfig(
            dataset="tuluif",
            dataset_type="pt",
            train_size=dpo_train_size,
        ),
        model_name=MODEL,
        sft_run_config=sft_run_config,
        task_name="ifeval",
        pft_method="dpo",
        do_training=True,
    )

    # NOTE(review): the original cell also built a PassAtKConfig here, but the
    # `passk_config=` / `perplexity_thresholds=` arguments were disabled, so it
    # was never used. Re-add it together with those arguments if needed.
    try:
        model, tokenizer, trainer = train_model_dpo(
            run_config=dpo_run_config,
            lora_config=lora_config,
            model_load_config=dpo_model_load_config,
            training_args=dpo_training_args,
        )
        del model, tokenizer, trainer
    finally:
        # Best-effort release of GPU memory, also when training raised
        # (for example CUDA out of memory).
        gc.collect()
        torch.cuda.empty_cache()
    return True


# Only the most recent checkpoint is trained on. Slicing in the loop header
# leaves `checkpoints` itself untouched for other cells.
for checkpoint in checkpoints[-1:]:
    run_dpo_from_checkpoint(checkpoint, lora_config)


Per device train batch size: 2
Getting train dataset for run config: llama3-8B_llama3-8B_ppl-4.00_sft-512_pt-tuluif-3584
Checking for dataset at /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/data/datasets/pt-tuluif-3584
Dataset already exists at /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/data/datasets/pt-tuluif-3584
Sampled dataset: DatasetDict({
    train: Dataset({
        features: ['id', 'prompt', 'constraints', 'chosen', 'rejected', 'chonsen_model', 'rejected_model', 'system_message'],
        num_rows: 3584
    })
    test: Dataset({
        features: ['id', 'prompt', 'constraints', 'chosen', 'rejected', 'chonsen_model', 'rejected_model', 'system_message'],
        num_rows: 200
    })
})
Example training row: {'id': 'personas_IF_dqxglsux2n8jeu59qktlnesh', 'prompt': 'Name two famous equestrian events that are part of the international jumping circuit, and format your answer by choosing one from these options: lowercase, UPPERCASE, Title 

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Unsloth: Will map <|im_end|> to EOS = <|im_end|>.


{'chonsen_model': 'gpt-4o',
 'chosen': '<|im_start|>assistant\n'
           'LONGINES GLOBAL CHAMPIONS TOUR, ROLEX GRAND SLAM OF SHOW '
           'JUMPING<|im_end|>\n',
 'constraints': ['punctuation:use no comma', 'format:choose one from options'],
 'id': 'personas_IF_dqxglsux2n8jeu59qktlnesh',
 'prompt': '<|im_start|>system\n'
           'You are a helpful assistant who is an expert at responding to '
           'prompts by carefully following the given instructions<|im_end|>\n'
           '<|im_start|>user\n'
           'Name two famous equestrian events that are part of the '
           'international jumping circuit, and format your answer by choosing '
           'one from these options: lowercase, UPPERCASE, Title '
           'Case.<|im_end|>\n'
           '<|im_start|>assistant\n',
 'rejected': '<|im_start|>assistant\n'
             '- longines global champions tour\n'
             '- FEI WORLD CUP JUMPING<|im_end|>\n',
 'rejected_model': 'gpt-4o',
 'system_message': 'You are 

Unsloth 2025.10.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Model loaded - <class 'peft.peft_model.PeftModelForCausalLM'>
Allocated: 15.30 GB
Cached: 15.31 GB


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,584 | Num Epochs = 2 | Total steps = 56
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 8 x 1) = 128
 "-____-"     Trainable parameters = 83,886,080 of 8,114,147,328 (1.03% trained)
[34m[1mwandb[0m: Currently logged in as: [33mshougan[0m ([33mshougan-university-of-waterloo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


OutOfMemoryError: CUDA out of memory. Tried to allocate 480.00 MiB. GPU 0 has a total capacity of 79.19 GiB of which 455.00 MiB is free. Including non-PyTorch memory, this process has 78.74 GiB memory in use. Of the allocated memory 77.93 GiB is allocated by PyTorch, and 82.67 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)