In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported

from trl import SFTTrainer
from transformers import TrainingArguments
from tuning.data.train_dataset import get_train_dataset
from tuning.training.config_training import ModelLoadConfig, LoraConfig, SFTRunConfig, PTRunConfig, DPOTrainingConfig, TrainingArgumentsConfig, PassAtKConfig, sft_batch_size, effective_batch_size
from tuning.training.perplexity_callback import PerplexityStoppingCallback
from tuning.training.passk_callback import PassAtKStoppingCallback
from tuning.utils.utils import apply_chat_template, chat_template_func
import json
import sys
from datasets import load_from_disk
from typing import List, Optional, Union
from pathlib import Path
from tuning.config import DATASETS_DIR, HF_MODEL_MAP
import os
from tuning.training.config_training import DatasetConfig, SFTRunConfig
from tuning.config import MODELS_DIR
from tuning.training.sft_training import train_model_sft
from tuning.training.dpo_training import train_model_dpo
import torch

import subprocess
import importlib

In [23]:
# Development helper: pick up on-disk edits to the callback / DPO modules
# without restarting the kernel.
import importlib

import tuning.training.dpo_training
import tuning.training.passk_callback

# Reload the callback module first, then the DPO module, so the DPO module is
# re-executed after the callback module has been refreshed.
importlib.reload(tuning.training.passk_callback)
importlib.reload(tuning.training.dpo_training)

# Re-bind the notebook-level names to the freshly reloaded objects.
from tuning.training.passk_callback import PassAtKStoppingCallback
from tuning.training.dpo_training import train_model_dpo

In [2]:
# Key of the base model; used to look up HF_MODEL_MAP and to name output runs.
MODEL = "llama3-8B"

# Example budget for the experiment: the SFT dataset is capped at this size and
# each DPO run gets `total_train_size - data_points_seen` examples.
total_train_size = 8192  # 29980 was the previous value (presumably the full dataset size -- confirm)

# Full perplexity sweep, kept for reference. It used to be assigned to
# `perplexity_thresholds` and then immediately overwritten, which made it look
# active when it was not. Assign it to `perplexity_thresholds` to run the sweep.
FULL_PERPLEXITY_THRESHOLDS = [
    7.0, 6.0, 5.75, 5.5, 5.25, 5.0, 4.75, 4.5, 4.25, 4.0,
    3.9, 3.8, 3.7, 3.6, 3.55, 3.5, 3.45, 3.4, 3.35, 3.3,
    3.25, 3.2, 3.15, 3.1,
]

# Thresholds actually used by this notebook run (short smoke-test list).
perplexity_thresholds = [7.0, 6.0]

In [10]:
# ---- SFT stage configuration ----

# SFT split of the "tuluif" dataset, capped at the overall example budget.
dataset_config = DatasetConfig(
    dataset="tuluif",
    dataset_type="sft",
    train_size=total_train_size,  # 29980
)

# Training-only SFT run: inference and evaluation are switched off.
run_config = SFTRunConfig(
    dataset_config=dataset_config,
    model_name_hf=HF_MODEL_MAP[MODEL],  # HuggingFace model name, not a local path
    model_name=MODEL,  # base model name used to build the output directory
    do_training=True,
    do_inference=False,
    do_evaluation=False,
)

# pass@1 monitoring settings for ifeval (only meant for watching the metric
# during training). NOTE(review): the SFT training call in this notebook does
# not receive this object -- confirm whether that is intended.
passk_config = PassAtKConfig(
    target_pass_at_k=[0.3, 0.4, 0.5, 0.6],
    k_values=[1],
    n_samples=1,
    num_prompts=100,
    temperature=0.7,
    strict=True,
    enabled=True,
)

# LoRA, model-loading and trainer settings start from their defaults; only the
# maximum sequence length is overridden.
lora_config = LoraConfig()
model_load_config = ModelLoadConfig()
model_load_config.max_seq_length = 4096
training_args = TrainingArgumentsConfig()

In [4]:
# Run the SFT stage. `callbacks` is kept because a later cell reads the
# checkpoint metadata path from its last element.
model, tokenizer, trainer, callbacks = train_model_sft(
    run_config=run_config,
    lora_config=lora_config,
    model_load_config=model_load_config,
    training_args=training_args,
    perplexity_thresholds=perplexity_thresholds,
)

Getting train dataset for run config: llama3-8B_sft-tuluif-8192
Checking for dataset at /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/data/datasets/sft-tuluif-8192
Dataset already exists at /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/data/datasets/sft-tuluif-8192
Sampled dataset: DatasetDict({
    train: Dataset({
        features: ['id', 'prompt', 'messages', 'constraints'],
        num_rows: 8192
    })
    test: Dataset({
        features: ['id', 'prompt', 'messages', 'constraints'],
        num_rows: 200
    })
})
Example training row: {'id': 'personas_IF_9u3mcrurksv7hypq3xlppyba', 'prompt': 'Compile a detailed summary of the most recent case discussed on the podcast. Include the title of the episode wrapped in double angular brackets, i.e. <<title>>, and quote at least one statement made by the host during the discussion.', 'messages': [{'content': 'You are a helpful assistant who is an expert at responding to prompts by carefully followin

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Unsloth: Will map <|im_end|> to EOS = <|end_of_text|>.


DatasetDict({
    train: Dataset({
        features: ['id', 'prompt', 'messages', 'constraints', 'text'],
        num_rows: 8192
    })
    test: Dataset({
        features: ['id', 'prompt', 'messages', 'constraints', 'text'],
        num_rows: 200
    })
})
{'id': 'personas_IF_9u3mcrurksv7hypq3xlppyba', 'prompt': 'Compile a detailed summary of the most recent case discussed on the podcast. Include the title of the episode wrapped in double angular brackets, i.e. <<title>>, and quote at least one statement made by the host during the discussion.', 'messages': [{'content': 'You are a helpful assistant who is an expert at responding to prompts by carefully following the given instructions', 'role': 'system'}, {'content': 'Compile a detailed summary of the most recent case discussed on the podcast. Include the title of the episode wrapped in double angular brackets, i.e. <<title>>, and quote at least one statement made by the host during the discussion.', 'role': 'user'}, {'content': "I'm

Unsloth 2025.10.1 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


[PerplexityCallback] Initialized with perplexity_thresholds=[6.0, 7.0]
[PerplexityCallback] Training will stop at final threshold: 7.0
[PerplexityCallback] num_samples=200
[PerplexityCallback] Test dataset size: 200
Dataset sample 1st one: {'id': 'personas_IF_o3dewf33wk895zom2dbfye5u', 'prompt': 'I want to create a podcast where I explore the themes and motifs found in surreal dystopian literature, both mine and others. This podcast will dive deep into the artistic and philosophical underpinnings of this genre, aiming to become a leading voice in literary criticism. Can you guide me on how to effectively launch and grow this podcast? Include at least 3 **bold text** sections such as: **bold text 1**, **bold text 2**, etc., and incorporate a famous quote about dystopian literature.', 'messages': [{'content': 'You are a helpful assistant who is an expert at responding to prompts by carefully following the given instructions', 'role': 'system'}, {'content': 'I want to create a podcast whe

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 8,192 | Num Epochs = 2 | Total steps = 1,024
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 16 x 1) = 16
 "-____-"     Trainable parameters = 83,886,080 of 8,114,147,328 (1.03% trained)
[34m[1mwandb[0m: Currently logged in as: [33mshougan[0m ([33mshougan-university-of-waterloo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[PerplexityCallback] on_train_begin: model_name=llama3-8B
Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
20,1.5453,1.988706
40,1.2469,1.402444


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient



[PerplexityCallback] Step 20, Data Points 320: PPL = 8.6229

[PerplexityCallback] Step 40, Data Points 640: PPL = 4.3168
[PerplexityCallback] Sweetspot threshold 6.0 reached!
[PerplexityCallback] Saving sweetspot checkpoint to /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-8B_ppl-6.00_sft-640
Found HuggingFace hub cache directory: /home/shougan/.cache/huggingface/hub


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Checking cache directory for required files...


Unsloth: Copying 4 files from cache to `/home/shougan/projects/aip-fredashi/shougan/bala


Successfully copied all 4 files from cache to `/home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-8B_ppl-6.00_sft-640`


Unsloth: Preparing safetensor model files: 100%|███████| 4/4 [00:00<00:00, 65281.00it/s]
Unsloth: Merging weights into 16bit: 100%|████████████████| 4/4 [01:05<00:00, 16.26s/it]


Unsloth: Merge process complete. Saved to `/home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-8B_ppl-6.00_sft-640`
[PerplexityCallback] Sweetspot checkpoint saved with metadata at /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models_metadata/llama3-8B_ppl-0126_0131.json
[PerplexityCallback] Launching DPO job with data points 7552 at checkpoint /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-8B_ppl-6.00_sft-640
[PerplexityCallback] Final threshold 6.0 reached! Stopping training.
Found HuggingFace hub cache directory: /home/shougan/.cache/huggingface/hub


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Checking cache directory for required files...


Unsloth: Copying 4 files from cache to `/home/shougan/projects/aip-fredashi/shougan/bala


Successfully copied all 4 files from cache to `/home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-8B_sft-tuluif-8192`


Unsloth: Preparing safetensor model files: 100%|███████| 4/4 [00:00<00:00, 74565.40it/s]
Unsloth: Merging weights into 16bit: 100%|████████████████| 4/4 [01:07<00:00, 16.92s/it]


Unsloth: Merge process complete. Saved to `/home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-8B_sft-tuluif-8192`


In [None]:
# NOTE(review): assumes the perplexity callback is the last element returned by
# train_model_sft -- confirm against that function.
ppl_callback = callbacks[-1]
metadata_file = ppl_callback.metadata_path

# The metadata file holds one JSON object per line (one per saved checkpoint).
# Blank lines are skipped so a stray empty line does not crash json.loads, and
# the file is read as UTF-8 regardless of the machine's locale.
with open(metadata_file, "r", encoding="utf-8") as f:
    checkpoints = [json.loads(line) for line in f if line.strip()]
print(checkpoints)


[{'threshold_type': 'perplexity', 'threshold_value': 6.0, 'global_step': 40, 'checkpoint_path': '/home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-8B_ppl-6.00_sft-640', 'data_points_seen': 640}]


In [None]:
import gc  # imported here so this cell works without touching the import cell

# Drop the notebook's references to the SFT objects, then force a garbage
# collection pass: objects that are part of reference cycles are only released
# by the cycle collector, and empty_cache() can only hand back memory whose
# tensors have actually been freed.
del model, tokenizer, trainer
gc.collect()
torch.cuda.empty_cache()

In [None]:
import gc  # imported here so this cell works without touching the import cell

# Launch one DPO run per saved SFT checkpoint, giving each run whatever is left
# of the example budget after the SFT examples that checkpoint already consumed.
#
# NOTE: this loop rebinds model_load_config, training_args, dataset_config,
# run_config and passk_config, replacing the SFT-stage objects of the same
# names. Re-run the SFT configuration cell before re-running the SFT cell.
for checkpoint in checkpoints:
    model_name = Path(checkpoint["checkpoint_path"]).name

    # Remaining budget for DPO. The SFT stage can see more examples than
    # total_train_size (it runs for more than one epoch), so this can be zero
    # or negative; such checkpoints are skipped instead of requesting an
    # empty / negative-sized dataset.
    data = total_train_size - checkpoint["data_points_seen"]
    if data <= 0:
        print(
            f"Skipping DPO for {model_name}: no training budget left "
            f"(data_points_seen={checkpoint['data_points_seen']}, "
            f"total_train_size={total_train_size})"
        )
        continue

    # Fresh default configs for every run so nothing leaks between checkpoints.
    model_load_config = ModelLoadConfig()
    training_args = DPOTrainingConfig()
    training_args.eval_steps = 5

    # Preference ("pt") split of tuluif, sized to the remaining budget.
    dataset_config = DatasetConfig(
        dataset = "tuluif",
        dataset_type = "pt",
        train_size = data,
    )

    # Describes the SFT checkpoint that DPO starts from; dynamic_path carries
    # the checkpoint directory name.
    sft_run_config = SFTRunConfig(
        dataset_config = DatasetConfig(
            dataset = "tuluif",
            dataset_type = "sft",
            train_size = checkpoint["data_points_seen"],
            dynamic_path = model_name
        ),
        model_name = MODEL,
        model_name_hf = HF_MODEL_MAP[MODEL],
        task_name = "ifeval"
    )
    run_config = PTRunConfig(
        dataset_config = dataset_config,
        # model_name_hf = HF_MODEL_MAP[MODEL],
        model_name = MODEL,
        sft_run_config = sft_run_config,
        task_name = "ifeval",
        pft_method = "dpo",
        do_training = True
    )

    # pass@1 monitoring on ifeval. NOTE(review): the target of 1.2 is above the
    # maximum possible pass@k of 1.0, so it presumably never triggers a stop and
    # the callback only reports the metric -- confirm against PassAtKStoppingCallback.
    passk_config = PassAtKConfig(
        target_pass_at_k=[1.2],
        k_values=[1],
        n_samples=1,
        num_prompts=50,
        temperature=0.7,
        strict=True,
        enabled=True,
    )
    model, tokenizer, trainer = train_model_dpo(
        run_config = run_config,
        lora_config = lora_config,
        model_load_config = model_load_config,
        training_args = training_args,
        passk_config = passk_config,
        # perplexity_thresholds= [0.1] # dummy value to periodically check perplexities too
    )

    # Release this run's objects before the next checkpoint is loaded; the
    # gc pass frees reference cycles so empty_cache() can return their memory.
    del model, tokenizer, trainer
    gc.collect()
    torch.cuda.empty_cache()





Per device train batch size: 1
Getting train dataset for run config: llama3-8B_llama3-8B_ppl-6.00_sft-640_pt-tuluif-7552
Checking for dataset at /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/data/datasets/pt-tuluif-7552
Dataset already exists at /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/data/datasets/pt-tuluif-7552
Sampled dataset: DatasetDict({
    train: Dataset({
        features: ['id', 'prompt', 'constraints', 'chosen', 'rejected', 'chonsen_model', 'rejected_model', 'system_message'],
        num_rows: 7552
    })
    test: Dataset({
        features: ['id', 'prompt', 'constraints', 'chosen', 'rejected', 'chonsen_model', 'rejected_model', 'system_message'],
        num_rows: 200
    })
})
Example training row: {'id': 'personas_IF_dqxglsux2n8jeu59qktlnesh', 'prompt': 'Name two famous equestrian events that are part of the international jumping circuit, and format your answer by choosing one from these options: lowercase, UPPERCASE, Title 

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

{'chonsen_model': 'gpt-4o',
 'chosen': '<|im_start|>assistant\n'
           'LONGINES GLOBAL CHAMPIONS TOUR, ROLEX GRAND SLAM OF SHOW '
           'JUMPING<|im_end|>\n',
 'constraints': ['punctuation:use no comma', 'format:choose one from options'],
 'id': 'personas_IF_dqxglsux2n8jeu59qktlnesh',
 'prompt': '<|im_start|>system\n'
           'You are a helpful assistant who is an expert at responding to '
           'prompts by carefully following the given instructions<|im_end|>\n'
           '<|im_start|>user\n'
           'Name two famous equestrian events that are part of the '
           'international jumping circuit, and format your answer by choosing '
           'one from these options: lowercase, UPPERCASE, Title '
           'Case.<|im_end|>\n'
           '<|im_start|>assistant\n',
 'rejected': '<|im_start|>assistant\n'
             '- longines global champions tour\n'
             '- FEI WORLD CUP JUMPING<|im_end|>\n',
 'rejected_model': 'gpt-4o',
 'system_message': 'You are 

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,552 | Num Epochs = 2 | Total steps = 944
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 16
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 16 x 1) = 16
 "-____-"     Trainable parameters = 83,886,080 of 8,114,147,328 (1.03% trained)


[PassAtKCallback] on_train_begin: model_name=PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096, padding_idx=128004)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
      

Step,Training Loss,Validation Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
5,0.6953,0.691851,0.000649,-0.003869,0.57,0.004519,-344.048309,-361.884674,-1.462493,-1.469434,0,0,0
10,0.719,0.694471,0.001362,0.002344,0.465,-0.000982,-344.041168,-361.82251,-1.462374,-1.469163,No Log,No Log,No Log


[PassAtKCallback] Saving model to /tmp/tmp6tn699vo...
Detected local model directory: /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-8B_ppl-6.00_sft-640
Found HuggingFace hub cache directory: /home/shougan/.cache/huggingface/hub


Unsloth: Preparing safetensor model files:  25%|███████▎                     | 1/4 [00:02<00:07,  2.67s/it]

Copied model-00001-of-00004.safetensors from local model directory


Unsloth: Preparing safetensor model files:  50%|██████████████▌              | 2/4 [00:06<00:07,  3.54s/it]

Copied model-00002-of-00004.safetensors from local model directory


Unsloth: Preparing safetensor model files:  75%|█████████████████████▊       | 3/4 [00:19<00:07,  7.84s/it]

Copied model-00003-of-00004.safetensors from local model directory


Unsloth: Preparing safetensor model files: 100%|█████████████████████████████| 4/4 [00:23<00:00,  5.83s/it]


Copied model-00004-of-00004.safetensors from local model directory


Unsloth: Merging weights into 16bit: 100%|███████████████████████████████████| 4/4 [01:23<00:00, 20.85s/it]


Unsloth: Merge process complete. Saved to `/tmp/tmp6tn699vo`
[PassAtKCallback] Moving training model to CPU...
[PassAtKCallback] Loading model with vLLM from /tmp/tmp6tn699vo...
LLM UTILISATION IS 0.8
INFO 01-26 02:53:08 [config.py:841] This model supports multiple tasks: {'classify', 'reward', 'generate', 'embed'}. Defaulting to 'generate'.
INFO 01-26 02:53:08 [config.py:1472] Using max model len 131072
INFO 01-26 02:53:08 [config.py:2285] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 01-26 02:53:28 [__init__.py:244] Automatically detected platform cuda.
INFO 01-26 02:53:32 [core.py:526] Waiting for init message from front-end.
INFO 01-26 02:53:32 [core.py:69] Initializing a V1 LLM engine (v0.9.2) with config: model='/tmp/tmp6tn699vo', speculative_config=None, tokenizer='/tmp/tmp6tn699vo', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=13107

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:09<00:29,  9.67s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:12<00:10,  5.44s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:24<00:08,  8.38s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:40<00:00, 11.63s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:40<00:00, 10.16s/it]



INFO 01-26 02:54:15 [default_loader.py:272] Loading weights took 40.82 seconds
INFO 01-26 02:54:15 [gpu_model_runner.py:1801] Model loading took 15.0006 GiB and 41.277051 seconds
INFO 01-26 02:54:28 [backends.py:508] Using cache directory: /home/shougan/.cache/vllm/torch_compile_cache/55237772d8/rank_0_0/backbone for vLLM's torch.compile
INFO 01-26 02:54:28 [backends.py:519] Dynamo bytecode transform time: 12.63 s
INFO 01-26 02:54:33 [backends.py:181] Cache the graph of shape None for later use
INFO 01-26 02:55:03 [backends.py:193] Compiling a graph for general shape takes 34.35 s
INFO 01-26 02:55:21 [monitor.py:34] torch.compile takes 46.98 s in total
INFO 01-26 02:55:22 [gpu_worker.py:232] Available KV cache memory: 19.20 GiB
INFO 01-26 02:55:22 [kv_cache_utils.py:716] GPU KV cache size: 157,280 tokens
INFO 01-26 02:55:22 [kv_cache_utils.py:720] Maximum concurrency for 131,072 tokens per request: 1.20x


Capturing CUDA graph shapes: 100%|█████████████| 67/67 [00:23<00:00,  2.87it/s]


INFO 01-26 02:55:46 [gpu_model_runner.py:2326] Graph capturing finished in 23 secs, took 0.55 GiB
INFO 01-26 02:55:46 [core.py:172] init engine (profile, create kv cache, warmup model) took 90.61 seconds
[PassAtKCallback] Generating 50 prompts x 1 samples...


Adding requests:   0%|          | 0/50 [00:00<?, ?it/s]

Processed prompts:   0%|        | 0/50 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



[PassAtKCallback] Moving training model back to GPU...
[PassAtKCallback] Evaluating responses...

[PassAtKCallback] Step 5, Data Points 80: pass@1=0.1600 (strict, 50 prompts)
[PassAtKCallback] Saving model to /tmp/tmpcdf91sej...
Detected local model directory: /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-8B_ppl-6.00_sft-640
Found HuggingFace hub cache directory: /home/shougan/.cache/huggingface/hub


Unsloth: Preparing safetensor model files:  25%|███████▎                     | 1/4 [00:30<01:32, 30.91s/it]

Copied model-00001-of-00004.safetensors from local model directory


Unsloth: Preparing safetensor model files:  50%|██████████████▌              | 2/4 [01:02<01:03, 31.54s/it]

Copied model-00002-of-00004.safetensors from local model directory


Unsloth: Preparing safetensor model files:  75%|█████████████████████▊       | 3/4 [01:35<00:32, 32.08s/it]

Copied model-00003-of-00004.safetensors from local model directory


Unsloth: Preparing safetensor model files: 100%|█████████████████████████████| 4/4 [01:42<00:00, 25.62s/it]


Copied model-00004-of-00004.safetensors from local model directory


Unsloth: Merging weights into 16bit: 100%|███████████████████████████████████| 4/4 [01:14<00:00, 18.70s/it]


Unsloth: Merge process complete. Saved to `/tmp/tmpcdf91sej`
[PassAtKCallback] Moving training model to CPU...
[PassAtKCallback] Loading model with vLLM from /tmp/tmpcdf91sej...
LLM UTILISATION IS 0.8
INFO 01-26 03:01:40 [config.py:841] This model supports multiple tasks: {'classify', 'reward', 'generate', 'embed'}. Defaulting to 'generate'.
INFO 01-26 03:01:40 [config.py:1472] Using max model len 131072
INFO 01-26 03:01:40 [config.py:2285] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 01-26 03:02:00 [__init__.py:244] Automatically detected platform cuda.
INFO 01-26 03:02:04 [core.py:526] Waiting for init message from front-end.
INFO 01-26 03:02:04 [core.py:69] Initializing a V1 LLM engine (v0.9.2) with config: model='/tmp/tmpcdf91sej', speculative_config=None, tokenizer='/tmp/tmpcdf91sej', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=13107

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:11<00:34, 11.47s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:14<00:12,  6.25s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:24<00:07,  7.99s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:40<00:00, 11.42s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:40<00:00, 10.20s/it]



INFO 01-26 03:02:47 [default_loader.py:272] Loading weights took 41.00 seconds
INFO 01-26 03:02:47 [gpu_model_runner.py:1801] Model loading took 15.0006 GiB and 41.455702 seconds
INFO 01-26 03:03:01 [backends.py:508] Using cache directory: /home/shougan/.cache/vllm/torch_compile_cache/4e49688955/rank_0_0/backbone for vLLM's torch.compile
INFO 01-26 03:03:01 [backends.py:519] Dynamo bytecode transform time: 13.01 s
INFO 01-26 03:03:05 [backends.py:181] Cache the graph of shape None for later use
INFO 01-26 03:03:35 [backends.py:193] Compiling a graph for general shape takes 34.16 s
INFO 01-26 03:03:53 [monitor.py:34] torch.compile takes 47.16 s in total
INFO 01-26 03:03:54 [gpu_worker.py:232] Available KV cache memory: 19.20 GiB
INFO 01-26 03:03:55 [kv_cache_utils.py:716] GPU KV cache size: 157,280 tokens
INFO 01-26 03:03:55 [kv_cache_utils.py:720] Maximum concurrency for 131,072 tokens per request: 1.20x


Capturing CUDA graph shapes: 100%|█████████████| 67/67 [00:22<00:00,  3.01it/s]


INFO 01-26 03:04:17 [gpu_model_runner.py:2326] Graph capturing finished in 22 secs, took 0.55 GiB
INFO 01-26 03:04:17 [core.py:172] init engine (profile, create kv cache, warmup model) took 89.61 seconds
[PassAtKCallback] Generating 50 prompts x 1 samples...


Adding requests:   0%|          | 0/50 [00:00<?, ?it/s]

Processed prompts:   0%|        | 0/50 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



[PassAtKCallback] Moving training model back to GPU...
[PassAtKCallback] Evaluating responses...

[PassAtKCallback] Step 10, Data Points 160: pass@1=0.1000 (strict, 50 prompts)
[PassAtKCallback] Saving model to /tmp/tmp2weal79z...
Detected local model directory: /home/shougan/projects/aip-fredashi/shougan/balance-budget/tuning/models/llama3-8B_ppl-6.00_sft-640
Found HuggingFace hub cache directory: /home/shougan/.cache/huggingface/hub


Unsloth: Preparing safetensor model files:  25%|███████▎                     | 1/4 [00:30<01:30, 30.10s/it]

Copied model-00001-of-00004.safetensors from local model directory


Unsloth: Preparing safetensor model files:  50%|██████████████▌              | 2/4 [01:01<01:01, 30.88s/it]

Copied model-00002-of-00004.safetensors from local model directory


Unsloth: Preparing safetensor model files:  75%|█████████████████████▊       | 3/4 [01:34<00:31, 32.00s/it]

Copied model-00003-of-00004.safetensors from local model directory


Unsloth: Preparing safetensor model files: 100%|█████████████████████████████| 4/4 [01:41<00:00, 25.49s/it]


Copied model-00004-of-00004.safetensors from local model directory


Unsloth: Merging weights into 16bit: 100%|███████████████████████████████████| 4/4 [01:17<00:00, 19.48s/it]


Unsloth: Merge process complete. Saved to `/tmp/tmp2weal79z`
[PassAtKCallback] Moving training model to CPU...
[PassAtKCallback] Loading model with vLLM from /tmp/tmp2weal79z...
LLM UTILISATION IS 0.8
INFO 01-26 03:10:13 [config.py:841] This model supports multiple tasks: {'classify', 'reward', 'generate', 'embed'}. Defaulting to 'generate'.
INFO 01-26 03:10:13 [config.py:1472] Using max model len 131072
INFO 01-26 03:10:13 [config.py:2285] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 01-26 03:10:33 [__init__.py:244] Automatically detected platform cuda.
INFO 01-26 03:10:37 [core.py:526] Waiting for init message from front-end.
INFO 01-26 03:10:37 [core.py:69] Initializing a V1 LLM engine (v0.9.2) with config: model='/tmp/tmp2weal79z', speculative_config=None, tokenizer='/tmp/tmp2weal79z', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=13107

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:09<00:27,  9.30s/it]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:11<00:10,  5.26s/it]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:22<00:07,  7.60s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:35<00:00,  9.88s/it]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:35<00:00,  8.88s/it]



INFO 01-26 03:11:15 [default_loader.py:272] Loading weights took 35.66 seconds
INFO 01-26 03:11:15 [gpu_model_runner.py:1801] Model loading took 15.0006 GiB and 36.097880 seconds
INFO 01-26 03:11:28 [backends.py:508] Using cache directory: /home/shougan/.cache/vllm/torch_compile_cache/e98dde76e6/rank_0_0/backbone for vLLM's torch.compile
INFO 01-26 03:11:28 [backends.py:519] Dynamo bytecode transform time: 12.63 s
INFO 01-26 03:11:32 [backends.py:181] Cache the graph of shape None for later use
