In [4]:
%%capture
import os
os.environ["UNSLOTH_VLLM_STANDBY"] = "1" # [NEW] Extra 30% context lengths!
if "COLAB_" not in "".join(os.environ.keys()):
    # If you're not in Colab, just use pip install or uv pip install
    !pip install unsloth vllm
else:
    pass # For Colab / Kaggle, we need extra instructions hidden below \/

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [3]:
import torch
# Optional: verify it's using the right GPU
print("CUDA device count:", torch.cuda.device_count())
print("Current device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(0))

CUDA device count: 1
Current device: 0
Device name: NVIDIA RTX A6000


In [1]:
import torch

# Set device to GPU 2
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")


cuda:2


In [5]:
# import os
# # os.environ["CUDA_VISIBLE_DEVICES"] = "1"  # or "2"
# device = torch.device("cuda:1")

from unsloth import FastLanguageModel, is_bfloat16_supported
import torch

max_seq_length = 512
lora_rank = 16

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-4",
    max_seq_length = max_seq_length,
    load_in_4bit = True,
    fast_inference = False,  # safer for now
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.9,
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    target_modules = ["gate_proj", "up_proj", "down_proj"],
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)




  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 09-08 10:49:00 [__init__.py:241] Automatically detected platform cuda.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.9.1: Fast Llama patching. Transformers: 4.56.1. vLLM: 0.10.1.1.
   \\   /|    NVIDIA RTX A6000. Num GPUs = 1. Max memory: 47.529 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.99it/s]
Not an error, but Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Not an error, but Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.9.1 patched 40 layers with 0 QKV layers, 0 O layers and 40 MLP layers.


In [6]:
import re
from datasets import load_dataset, Dataset

# Load and prep dataset
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

def extract_xml_answer(text: str) -> str:
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

# uncomment middle messages for 1-shot prompting
def get_gsm8k_questions(split = "train") -> Dataset:
    data = load_dataset('openai/gsm8k', 'main')[split] # type: ignore
    data = data.map(lambda x: { # type: ignore
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    }) # type: ignore
    return data # type: ignore

dataset = get_gsm8k_questions()

# Reward functions
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content']
    extracted_responses = [extract_xml_answer(r) for r in responses]
    print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]

def int_reward_func(completions, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    return [0.5 if r.isdigit() else 0.0 for r in extracted_responses]

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text) -> float:
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n</answer>\n")[-1])*0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        count -= (len(text.split("\n</answer>")[-1]) - 1)*0.001
    return count

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]

In [6]:
dataset = get_gsm8k_questions()
print(f"Number of samples in dataset: {len(dataset)}")


Number of samples in dataset: 7473


In [7]:
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    use_vllm = False, # use vLLM for fast inference!
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "paged_adamw_8bit",
    logging_steps = 1,
    per_device_train_batch_size = 12,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 6, # Decrease if out of memory
    max_prompt_length = 256,
    max_completion_length = 200,
    num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = 100,
    save_steps = 250,
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "outputs",
)

In [9]:

trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 12 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (12 x 1 x 1) = 12
 "-____-"     Trainable parameters = 44,236,800 of 14,703,744,000 (0.30% trained)


-------------------- Question:
A concert ticket costs $40. Mr. Benson bought 12 tickets and received a 5% discount for every ticket bought that exceeds 10. How much did Mr. Benson pay in all? 
Answer:
476 
Response:
<reasoning>
1. Identify the cost for the first 10 tickets without any discount. Each ticket costs $40, so:
   \[
   \text{Cost of 10 tickets} = 10 \times 40 = 400 \text{ dollars}
   \]

2. Calculate the cost for the additional 2 tickets, which receive a 5% discount:
   - The discount is 5% of $40:
     \[
     \text{Discount per ticket} = 0.05 \times 40 = 2 \text{ dollars}
     \]
   - The discounted price per ticket is:
     \[
     \text{Discounted price per ticket} = 40 - 2 = 38 \text{ dollars}
     \]
   - Total cost for the 2 discounted tickets is:
     \[
     \text{Cost of 2 discounted tickets} = 2 \times 38 = 76 \text{ dollars 
Extracted:
<reasoning>
1. Identify the cost for the first 10 tickets without any discount. Each ticket costs $40, so:
   \[
   \text{Cost of

Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,kl,rewards / xmlcount_reward_func / mean,rewards / xmlcount_reward_func / std,rewards / soft_format_reward_func / mean,rewards / soft_format_reward_func / std,rewards / strict_format_reward_func / mean,rewards / strict_format_reward_func / std,rewards / int_reward_func / mean,rewards / int_reward_func / std,rewards / correctness_reward_func / mean,rewards / correctness_reward_func / std
1,0.0,0.125,0.0,200.0,200.0,200.0,1.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.125,0.0,200.0,200.0,200.0,1.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,-0.052167,0.09532,198.0,178.0,200.0,0.833333,188.0,178.0,198.0,0.0,-0.052167,0.191586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.0,0.104667,0.049806,197.333344,168.0,200.0,0.916667,168.0,168.0,168.0,0.0,0.104667,0.070437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.114583,0.025516,200.0,200.0,200.0,1.0,0.0,0.0,0.0,0.000291,0.114583,0.036084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.267,0.518917,195.75,158.0,200.0,0.833333,174.5,158.0,191.0,0.000173,0.058667,0.111281,0.0,0.0,0.0,0.0,0.041667,0.144338,0.166667,0.57735
7,0.0,0.095,0.098324,199.166672,190.0,200.0,0.916667,190.0,190.0,190.0,0.000268,0.095,0.116794,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.630083,0.641608,196.916672,186.0,200.0,0.75,187.666672,186.0,190.0,0.000334,0.005083,0.209362,0.0,0.0,0.0,0.0,0.125,0.226134,0.5,0.904534
9,0.0,0.09125,0.08267,200.0,200.0,200.0,1.0,0.0,0.0,0.0,0.000113,0.09125,0.116913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0625,0.051031,200.0,200.0,200.0,1.0,0.0,0.0,0.0,0.000272,0.0625,0.065279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


-------------------- Question:
Janet pays $40/hour for 3 hours per week of clarinet lessons and $28/hour for 5 hours a week of piano lessons. How much more does she spend on piano lessons than clarinet lessons in a year? 
Answer:
1040 
Response:
<reasoning>
First, we need to calculate Janet's weekly costs for both clarinet and piano lessons.

1. **Clarinet Lessons:**
   - Cost per hour: $40
   - Hours per week: 3
   - Weekly cost for clarinet lessons = $40/hour * 3 hours/week = $120/week

2. **Piano Lessons:**
   - Cost per hour: $28
   - Hours per week: 5
   - Weekly cost for piano lessons = $28/hour * 5 hours/week = $140/week

Next, we calculate the annual cost for each type of lesson by multiplying the weekly cost by the number of weeks in a year (typically 52 weeks).

3. **Annual Cost for Clarinet Lessons:**
   - Weekly cost = $120
   - Annual cost = $120/week * 52 weeks/year = $6,240/year

4. **Annual Cost for 
Extracted:
<reasoning>
First, we need to calculate Janet's weekly cost

TrainOutput(global_step=100, training_loss=3.113401421650508e-07, metrics={'train_runtime': 4128.518, 'train_samples_per_second': 0.291, 'train_steps_per_second': 0.024, 'total_flos': 0.0, 'train_loss': 3.113401421650508e-07})

In [None]:
# text = tokenizer.apply_chat_template([
#     {"role" : "user", "content" : "Which is bigger? 9.11 or 9.9?"},
# ], tokenize = False, add_generation_prompt = True)

# from vllm import SamplingParams
# sampling_params = SamplingParams(
#     temperature = 0.8,
#     top_p = 0.95,
#     max_tokens = 1024,
# )
# output = model.fast_generate(
#     [text],
#     sampling_params = sampling_params,
#     lora_request = None,
# )[0].outputs[0].text

# output

text = tokenizer.apply_chat_template([
    {"role" : "user", "content" : "A pencil costs $1 and a notebook costs $2.How much do 3 pencils and 2 notebooks cost?"},
], tokenize = False, add_generation_prompt = True)


import torch

input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)

output_ids = model.generate(
    input_ids,
    temperature=0.8,
    top_p=0.95,
    max_new_tokens=1024,
    do_sample=True,
)

output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(output)


userA pencil costs $1 and a notebook costs $2.How much do 3 pencils and 2 notebooks cost?assistantTo find the total cost of 3 pencils and 2 notebooks, you can calculate the cost of each separately and then add them together.

The cost of 3 pencils:
\[ 3 \text{ pencils} \times \$1 \text{ per pencil} = \$3 \]

The cost of 2 notebooks:
\[ 2 \text{ notebooks} \times \$2 \text{ per notebook} = \$4 \]

Now, add the costs together:
\[ \$3 + \$4 = \$7 \]

So, 3 pencils and 2 notebooks cost \$7.


In [13]:
# model.save_lora("grpo_saved_lora")

model.save_pretrained("grpo_saved_lora")


In [None]:
text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : "Which is bigger? 9.11 or 9.9?"},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = model.load_lora("grpo_saved_lora"),
)[0].outputs[0].text

output

In [None]:
print(output)

In [None]:
# Merge to 16bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters
if False:
    model.save_pretrained("model")
    tokenizer.save_pretrained("model")
if False:
    model.push_to_hub("hf/model", token = "")
    tokenizer.push_to_hub("hf/model", token = "")
