In [1]:
import os
# Process-wide environment switches, set in the first cell ahead of the
# library imports that follow in later cells.
os.environ.update({
    'HF_HUB_ENABLE_HF_TRANSFER': '0',
    'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True',
    "TOKENIZERS_PARALLELISM": "false",
})

In [2]:
from unsloth import FastLanguageModel, PatchFastRL
# Run Unsloth's PatchFastRL for the "GRPO" algorithm, handing it FastLanguageModel.
# In this notebook the `trl` import only happens later, in the trainer-config cell.
# NOTE(review): presumably the patch has to be applied before that import — confirm.
PatchFastRL("GRPO", FastLanguageModel)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 02-09 13:19:19 __init__.py:190] Automatically detected platform cuda.


In [3]:
from unsloth import is_bfloat16_supported
import torch
# Sequence budget and adapter size shared by the loader and the LoRA wrapper.
max_seq_length = 512  # raise to leave room for longer reasoning traces
lora_rank = 32  # higher rank gives the adapter more capacity but trains slower

# Base model and tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Llama-3.2-1B-Instruct",
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # switch to False for 16-bit LoRA
    fast_inference=True,  # turns on vLLM-backed generation
    max_lora_rank=lora_rank,
    gpu_memory_utilization=0.6,  # lower this on out-of-memory errors
)

# Projection layers that receive LoRA adapters.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]  # drop these first if memory is tight
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

# Wrap the base model with LoRA adapters (suggested ranks: 8, 16, 32, 64, 128).
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=attention_projections + mlp_projections,
    lora_alpha=lora_rank,
    use_gradient_checkpointing="unsloth",  # enables long-context finetuning
    random_state=3407,
)

==((====))==  Unsloth 2025.2.5: Fast Llama patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.684 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: vLLM loading unsloth/llama-3.2-1b-instruct-unsloth-bnb-4bit with actual GPU utilization = 59.33%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 23.68 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 512. Num Sequences = 256.
Unsloth: vLLM's KV Cache can use up to 12.95 GB. Also swap space = 6 GB.
INFO 02-09 13:19:31 config.py:542] This model supports multiple tasks: {'reward', 'embed', 'classify', 'generate', 'score'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes config using kwargs = {'load_in_8bit': False, 'load_in_4bit': True, 'bnb_4bit_compute_dtype': 'bfloa



INFO 02-09 13:19:34 weight_utils.py:252] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 02-09 13:19:35 model_runner.py:1115] Loading model weights took 1.0453 GB
INFO 02-09 13:19:35 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 02-09 13:19:36 worker.py:267] Memory profiling takes 0.77 seconds
INFO 02-09 13:19:36 worker.py:267] the current vLLM instance can use total_gpu_memory (23.68GiB) x gpu_memory_utilization (0.59) = 14.05GiB
INFO 02-09 13:19:36 worker.py:267] model weights take 1.05GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 1.18GiB; the rest of the memory reserved for KV Cache is 11.77GiB.
INFO 02-09 13:19:37 executor_base.py:110] # CUDA blocks: 24102, # CPU blocks: 12288
INFO 02-09 13:19:37 executor_base.py:115] Maximum concurrency for 512 tokens per request: 753.19x
INFO 02-09 13:19:40 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:15<00:00,  2.19it/s]

INFO 02-09 13:19:56 model_runner.py:1562] Graph capturing finished in 16 secs, took 0.45 GiB
INFO 02-09 13:19:56 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 20.39 seconds



Unsloth 2025.2.5 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [4]:
import re
from datasets import load_dataset, Dataset

# Load and prep dataset
# System message placed in front of every GSM8K question by get_gsm8k_questions
# (and reused by the post-training inference cell). It spells out the
# <reasoning>/<answer> layout that the format reward functions check for.
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

# str.format template for the same layout, with {reasoning} and {answer} slots.
# NOTE(review): not referenced by any other visible cell — confirm it is still needed.
XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

def extract_xml_answer(text: str) -> str:
    """Return the stripped text between the last ``<answer>`` and the next ``</answer>``.

    If the opening tag is absent the whole input is treated as the candidate
    answer; if the closing tag is absent everything after the opening tag is
    kept. Surrounding whitespace is removed in every case.
    """
    # Everything after the final opening tag (or the full text if there is none).
    _, _, after_open = text.rpartition("<answer>")
    # Everything before the first closing tag that follows.
    inside, _, _ = after_open.partition("</answer>")
    return inside.strip()

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

def get_gsm8k_questions(split = "train") -> Dataset:
    """Load one split of ``openai/gsm8k`` ("main" config) in chat form.

    Every row gains two fields:
      * ``prompt`` - a system message (SYSTEM_PROMPT) followed by the question
        as the user message;
      * ``answer`` - the text after ``####`` in the reference solution, or
        ``None`` if that marker is missing (overwrites the original column).
    """
    def to_chat_example(example):
        # Build the two-message conversation and the bare reference answer.
        return {
            'prompt': [
                {'role': 'system', 'content': SYSTEM_PROMPT},
                {'role': 'user', 'content': example['question']},
            ],
            'answer': extract_hash_answer(example['answer']),
        }

    gsm8k = load_dataset('openai/gsm8k', 'main') # type: ignore
    return gsm8k[split].map(to_chat_example) # type: ignore

# Training data: the default "train" split.
dataset = get_gsm8k_questions()

# Reward functions
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Score 2.0 for each completion whose extracted answer equals the reference.

    ``completions`` is indexed as ``completion[0]['content']`` and ``answer``
    is zipped against it position by position. As a side effect, the first
    sample of the batch (question, reference, raw response, extracted answer)
    is printed for inspection.
    """
    responses = list(map(lambda completion: completion[0]['content'], completions))
    question = prompts[0][-1]['content']
    extracted_responses = list(map(extract_xml_answer, responses))
    # Debug trace of the first sample only.
    print('-'*20, f"Question:\n{question}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    rewards = []
    for guess, truth in zip(extracted_responses, answer):
        rewards.append(2.0 if guess == truth else 0.0)
    return rewards

def int_reward_func(completions, **kwargs) -> list[float]:
    """Score 0.5 for each completion whose extracted answer passes ``str.isdigit``.

    Anything else (including an empty extraction) scores 0.0.
    """
    rewards = []
    for completion in completions:
        candidate = extract_xml_answer(completion[0]['content'])
        rewards.append(0.5 if candidate.isdigit() else 0.0)
    return rewards

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that follow the exact line-by-line XML layout.

    The whole completion must be, with nothing before or after it:

        <reasoning>\\n ... \\n</reasoning>\\n<answer>\\n ... \\n</answer>\\n

    Args:
        completions: sequence of completions, each indexed as
            ``completion[0]["content"]`` to obtain the generated text.
        **kwargs: ignored; accepted so the trainer can pass extra columns.

    Returns:
        One float per completion: 0.5 when the layout matches, else 0.0.
    """
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL lets `.` cross newlines; without it any reasoning or answer
    # spanning more than one line could never match and was never rewarded.
    matches = [re.match(pattern, r, flags=re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward completions that open with a reasoning block followed by an answer block.

    Looser than the strict variant: any content (including newlines) may sit
    inside the tags, only whitespace may separate the two blocks, and trailing
    text after ``</answer>`` is tolerated. The completion must still start
    with ``<reasoning>`` because ``re.match`` anchors at position 0.

    Args:
        completions: sequence of completions, each indexed as
            ``completion[0]["content"]`` to obtain the generated text.
        **kwargs: ignored; accepted so the trainer can pass extra columns.

    Returns:
        One float per completion: 0.5 when the layout matches, else 0.0.
    """
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    # re.DOTALL is required: the prompted layout puts a newline right after each
    # opening tag, and without the flag `.` stops at the first newline, so a
    # correctly formatted completion could never match.
    matches = [re.match(pattern, r, flags=re.DOTALL) for r in responses]
    return [0.5 if match else 0.0 for match in matches]

def count_xml(text) -> float:
    """Give partial credit for each XML tag that appears exactly once.

    Each of ``<reasoning>\\n``, ``\\n</reasoning>\\n``, ``\\n<answer>\\n`` and
    ``\\n</answer>`` earns 0.125 when its count in ``text`` is exactly one.
    The two answer-tag checks also subtract 0.001 per character of the last
    segment obtained by splitting on the closing tag (the second check
    discounts one character from that length).
    """
    tag_bonus = 0.125
    per_char_penalty = 0.001
    score = 0.0

    if text.count("<reasoning>\n") == 1:
        score += tag_bonus

    if text.count("\n</reasoning>\n") == 1:
        score += tag_bonus

    if text.count("\n<answer>\n") == 1:
        score += tag_bonus
        tail = text.split("\n</answer>\n")[-1]
        score -= len(tail)*per_char_penalty

    if text.count("\n</answer>") == 1:
        score += tag_bonus
        tail = text.split("\n</answer>")[-1]
        score -= (len(tail) - 1)*per_char_penalty

    return score

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    """Apply ``count_xml`` to the text of every completion, in order."""
    scores = []
    for completion in completions:
        scores.append(count_xml(completion[0]["content"]))
    return scores

In [5]:
from trl import GRPOConfig, GRPOTrainer
# Pick the mixed-precision mode once: bf16 where supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = GRPOConfig(
    # Generation backend
    use_vllm=True,  # sample completions through vLLM
    # Optimizer and schedule
    learning_rate=5e-6,
    adam_beta1=0.9,
    adam_beta2=0.99,
    weight_decay=0.1,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
    max_grad_norm=0.1,
    # Precision
    bf16=use_bf16,
    fp16=not use_bf16,
    # Batching
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # higher values give smoother updates
    num_generations=6,  # decrease if out of memory
    # Sequence lengths
    max_prompt_length=256,
    max_completion_length=200,
    # Run length and checkpointing; for a full pass over the data, use
    # num_train_epochs = 1 in place of max_steps.
    max_steps=250,
    save_steps=250,
    # Logging and output
    logging_steps=1,
    report_to="none",  # can be pointed at Weights & Biases
    output_dir="outputs",
)

torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch


In [None]:
# Reward functions handed to the trainer, in this order.
reward_functions = [
    xmlcount_reward_func,
    soft_format_reward_func,
    strict_format_reward_func,
    int_reward_func,
    correctness_reward_func,
]

trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=reward_functions,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 4
\        /    Total batch size = 4 | Total steps = 250
 "-____-"     Number of trainable parameters = 22,544,384


-------------------- Question:
Ahmed and Emily are having a contest to see who can get the best grade in the class. There have been 9 assignments and Ahmed has a 91 in the class. Emily has a 92. The final assignment is worth the same amount as all the other assignments. Emily got a 90 on the final assignment. What is the minimum grade Ahmed needs to get to beat Emily if all grades are whole numbers? 
Answer:
100 
Response:
To solve this, we'll first assume Emily's grade is the minimum grade she needs to win - 94 (since she has a 92 and needs to beat it for the contest) and Ahmed's grade is 92.

She needs -4 points to not lose the contest, as (91 - 94) = -3. So Ahmed must get 1.

Since the final assignment is worth half the total grade, and it's worth 3 units, to get an average of 94, Ahmed needs to get 3 * 3 = 9 in the final assignment.

Therefore, Ahmed needs to get a minimum of M/9 = 3 in the final assignment for Emily to lose. 

Since 3 is 3/4 of 10, or 3.75, the minimum Ahmed needs

Step,Training Loss,reward,reward_std,completion_length,kl
1,0.0,0.0,0.0,150.583338,0.0
2,0.0,0.0,0.0,135.166672,0.0
3,0.0,0.0,0.0,137.625004,0.0
4,0.0,0.0,0.0,151.916676,0.0
5,0.0,0.0,0.0,163.58334,0.0
6,0.0,0.0,0.0,131.750002,0.0
7,0.0,0.0,0.0,148.708336,0.0
8,0.0,0.0,0.0,170.833336,0.0
9,0.0,0.0,0.0,172.58334,0.0
10,0.0,0.0,0.0,123.500002,0.0


-------------------- Question:
In a graveyard, there are 20 skeletons.  Half of these skeletons are adult women, and the remaining number are split evenly between adult men and children.  If an adult woman has 20 bones in their body, and a male has 5 more than this, and a child has half as many as an adult woman, how many bones are in the graveyard? 
Answer:
375 
Response:
The number of skeletons that are adult women is 20 / 2 = 10 skeletons.

The total number of adult women is 10 * 3 = 30 skeletons.

The number of skeletons that are adult men is 20 * 3 = 60 skeletons.

There are 20 skeletons that are children.  Since the problem states that the skeletons are evenly split between children and men, there is 20 / 2 = 10 children skeletons.

The total number of adult bones is 30 + 60 + 10 = 100 skeletons.

The total number of bones is 20 * 2 = 40 bones.

The number of bones in the graveyard is 40. 
Extracted:
The number of skeletons that are adult women is 20 / 2 = 10 skeletons.

The tota

In [None]:
# Baseline sample: user message only, no system prompt, no LoRA adapter.
messages = [
    {"role" : "user", "content" : "Calculate pi."},
]
text = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(temperature = 0.8, top_p = 0.95, max_tokens = 1024)

# lora_request = None, i.e. no adapter is passed for this generation.
generations = model.fast_generate(
    [text],
    sampling_params = sampling_params,
    lora_request = None,
)
output = generations[0].outputs[0].text

print(output)

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.83s/it, est. speed input: 21.27 toks/s, output: 258.56 toks/s]

Approximating pi, which is an irrational number, is a complex calculation that involves using various mathematical methods. Here's an approximate calculation using the Leibniz formula for pi, which is a series of digits that converges to pi:

1. The Leibniz formula for pi is: pi = 1 - (1/4) + (1/9) - (1/41) + (1/92) - ...

This formula is based on the concept of an infinite series that involves the sum of an infinite number of terms. The series is:

1 - (1/4) + (1/9) - (1/41) + (1/92) - ...

The first term is 1, and the subsequent terms are calculated using fractions with denominators that are multiples of the previous term's denominator. The sum of these fractions is then calculated to find the approximate value of pi.

To calculate pi using this method, we need to sum the infinite series:

pi ≈ 1 - (1/4) + (1/9) - (1/41) + (1/92) - ...

Using a calculator or a computer, we can sum the first few terms of the series to get an approximate value for pi:

pi ≈ 3.14159265359

However, this




In [None]:
# Save the LoRA weights under "grpo_saved_lora"; the next inference cell
# loads them back from the same path with model.load_lora.
model.save_lora("grpo_saved_lora")

In [None]:
# Sample with the saved LoRA adapter and the training-time system prompt.
messages = [
    {"role" : "system", "content" : SYSTEM_PROMPT},
    {"role" : "user", "content" : "Calculate pi."},
]
text = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(temperature = 0.8, top_p = 0.95, max_tokens = 1024)

# Load the adapter written by model.save_lora and pass it to the generation call.
saved_lora = model.load_lora("grpo_saved_lora")
generations = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = saved_lora,
)
output = generations[0].outputs[0].text

print(output)

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.12s/it, est. speed input: 54.49 toks/s, output: 228.66 toks/s]

To calculate pi, we can use a mathematical formula:

pi (π) = 3.14159 (and it goes on infinitely)

We can also use a more precise value for pi, known as the Monte Carlo method, which is:

pi ≈ 3.141592653589793

This method uses a series of random numbers to simulate the movement of a ball in a sphere and measure its circumference. It's a method that has been used for centuries and is still widely used today.

Here's a simple Python code snippet to calculate pi using the Monte Carlo method:

```python
import random

def calculate_pi(num_iterations):
    num_points = 10**6
    points_inside_circle = 0
    for _ in range(num_iterations:
        x = random.uniform(-1, 1)
        y = random.uniform(-1, 1)
        distance = x**2 + y**2
        if distance < 1:
            points_inside_circle += 1
    pi_estimate = 4 * points_inside_circle / num_points
    return pi_estimate

print(calculate_pi(1000000))
```

When you run this code, you should see that the estimate of pi is approximately 3




In [None]:
# Export the model and tokenizer to "model" with the "merged_16bit" save method
# (the run log below reports the 4-bit and LoRA weights being merged to 16-bit).
model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
# Optional Hub upload, left disabled: fill in a real repo id and token before enabling.
# model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 70.77 out of 124.96 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 16/16 [00:00<00:00, 101.36it/s]

Unsloth: Saving tokenizer...




 Done.
Done.
