In [1]:
import os
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '0'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import torch
import gc

torch.cuda.empty_cache()
gc.collect()

0

In [3]:
from unsloth import FastLanguageModel, PatchFastRL
PatchFastRL("GRPO", FastLanguageModel)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 02-11 00:56:12 __init__.py:190] Automatically detected platform cuda.


In [4]:
from unsloth import is_bfloat16_supported
import torch
max_seq_length = 512 # Can increase for longer reasoning traces
lora_rank = 128 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-2b-it-bnb-4bit",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    dtype = None,
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.6, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = lora_rank,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 0,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

==((====))==  Unsloth 2025.2.5: Fast Gemma patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.684 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: vLLM loading unsloth/gemma-2b-it-bnb-4bit with actual GPU utilization = 59.33%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 23.68 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 512. Num Sequences = 224.
Unsloth: vLLM's KV Cache can use up to 11.63 GB. Also swap space = 6 GB.
INFO 02-11 00:56:23 config.py:542] This model supports multiple tasks: {'score', 'embed', 'reward', 'generate', 'classify'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes config using kwargs = {'load_in_8bit': False, 'load_in_4bit': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_q



INFO 02-11 00:56:25 weight_utils.py:252] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 02-11 00:56:26 model_runner.py:1115] Loading model weights took 1.9409 GB
INFO 02-11 00:56:26 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 02-11 00:56:27 worker.py:267] Memory profiling takes 0.99 seconds
INFO 02-11 00:56:27 worker.py:267] the current vLLM instance can use total_gpu_memory (23.68GiB) x gpu_memory_utilization (0.59) = 14.05GiB
INFO 02-11 00:56:27 worker.py:267] model weights take 1.94GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 2.06GiB; the rest of the memory reserved for KV Cache is 10.00GiB.
INFO 02-11 00:56:27 executor_base.py:110] # CUDA blocks: 36391, # CPU blocks: 21845
INFO 02-11 00:56:27 executor_base.py:115] Maximum concurrency for 512 tokens per request: 1137.22x
INFO 02-11 00:56:31 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory erro

Capturing CUDA graph shapes: 100%|██████████| 31/31 [00:15<00:00,  2.04it/s]

INFO 02-11 00:56:46 model_runner.py:1562] Graph capturing finished in 15 secs, took 0.46 GiB
INFO 02-11 00:56:46 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 20.17 seconds



Unsloth 2025.2.5 patched 18 layers with 18 QKV layers, 18 O layers and 18 MLP layers.


In [5]:
import re
from datasets import load_dataset, Dataset

# Load and prep dataset
SYSTEM_PROMPT = """
You are required to accurately answer an arithmetic reasoning question. Solve using a step-by-step sequence of calculations, showing the reasoning behind each step.
Provide a short overview of your reasoning and an answer (less than 100 characters total) in the following structured format, with a line break (new line character) immediately following each tag and stop after </answer>: 
<reasoning>
[Your step-by-step reasoning leading to the answer]
</reasoning>
<answer>
[The final numerical answer, as an integer or float]
</answer>
<eos>

Example of the desired output:
<reasoning>
First, I calculated the area of the rectangle by multiplying length and width. Then, I divided the area by 2 to find half the area. This half area represents the answer.
</reasoning>
<answer>
12.5
</answer>
<eos>

Follow this format precisely, stopping completely with <eos> token after the new line following closing </answer> tag.
"""

XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

def extract_xml_answer(text: str) -> str:
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

# uncomment middle messages for 1-shot prompting
def get_gsm8k_questions(split = "train") -> Dataset:
    data = load_dataset('openai/gsm8k', 'main')[split] # type: ignore
    data = data.map(lambda x: { # type: ignore
        'prompt': [
            {'role': 'user', 'content': SYSTEM_PROMPT + "\n" + x['question']},
        ],
        'answer': extract_hash_answer(x['answer'])
    }) # type: ignore
    return data # type: ignore

dataset = get_gsm8k_questions()

scaling_down = 3.

# Reward functions
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content'].split("\n")[-1]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    if hash(q) % 10 == 0:
        print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]

def int_reward_func(completions, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    return [0.5 / scaling_down if r.isdigit() else 0.0 for r in extracted_responses]

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 / scaling_down if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 / scaling_down if match else 0.0 for match in matches]

def count_xml(text) -> float:
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n</answer>\n")[-1])*0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        count -= (len(text.split("\n</answer>")[-1]) - 1)*0.001
    return count / scaling_down

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

In [6]:
from trl import GRPOConfig, GRPOTrainer

training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "paged_adamw_8bit",
    logging_steps = 1,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 6, # Decrease if out of memory
    max_prompt_length = 512,
    max_completion_length = 256,
    num_train_epochs = 2, # Set to 1 for a full training run
    save_steps = 500,
    max_grad_norm = 0.1,
    report_to = "tensorboard",
    logging_dir = "logs/runs",
    output_dir = "outputs",
)

torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch


In [7]:
CHAT_TEMPLATE = """
{%- for message in messages %}
    {{- '<|' + message['role'] + '|>\n' }}
    {{- message['content'] + eos_token }}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|assistant|>\n' }}
{%- endif %}
"""

if tokenizer.chat_template is None:
    tokenizer.chat_template = CHAT_TEMPLATE # set the attribute

In [8]:
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 1
\        /    Total batch size = 1 | Total steps = 14,946
 "-____-"     Number of trainable parameters = 156,893,184


Step,Training Loss,reward,reward_std,completion_length,kl
1,0.0,0.244222,0.024925,83.666672,0.0
2,0.0,0.25,0.0,71.0,0.0
3,0.0,0.243056,0.01701,84.666672,0.000218
4,0.0,0.243056,0.01701,102.5,0.000237
5,0.0,0.235611,0.022311,93.666672,0.000169
6,0.0,0.195611,0.061304,104.666672,0.000272
7,0.0,0.231444,0.017765,69.166672,0.000167
8,0.0,0.244611,0.0132,90.0,7.7e-05
9,0.0,0.583333,0.816497,65.833336,0.000242
10,0.0,0.575556,0.820519,67.833336,0.000171


-------------------- Question:
Boston had .5 feet of snow on the first day of winter.  The next day they got an additional 8 inches.  Over the next 2 days, 2 inches of the snow melted.  On the fifth day, they received another 2 times the amount of snow they received on the first day.  How many feet of snow do they now have? 
Answer:
2 
Response:
<reasoning>
First, I added 8 inches to the original amount of snow.
Then I added 2 inches to the total amount of snow.
Last, I divided the final total to the number of days to get the amount of snow they have now.
</reasoning>
<answer>1.5 feet
</answer> 
Extracted:
1.5 feet
-------------------- Question:
Janet has a business selling custom collars for dogs and cats. If it takes 18 inches of nylon to make a dog collar and 10 inches to make a cat collar, how much nylon does she need to make 9 dog collars and 3 cat collars? 
Answer:
192 
Response:
<reasoning>
To find the total amount of nylon needed, we can simply add the quantities of nylon requi



-------------------- Question:
Julia is performing in her high school musical this weekend and her family wants to come to the show. Tickets are $12 for adults and $10 for children. If her mom, dad, grandma, and three little sisters come to the show, how much will the total be for their tickets? 
Answer:
66 
Response:
<reasoning>
First, I calculated the cost for the adults at 12 per ticket. Then, I added the costs for the children at 10 per ticket for a total of 12 + 10 = 22.
</reasoning>
<answer>22
</answer> 
Extracted:
22
-------------------- Question:
Stacy is a high school Calculus teacher.  She assigns 45 problems for homework.  There are twice as many multiple choice problems as free response, and 7 more free response than true/false.  How many true/false questions are there? 
Answer:
6 
Response:
<reasoning>
There are 45 - 2 = 43 multiple choice questions and 7 + 4 = 11 free response questions.
</reasoning>
<answer>43
</answer> 
Extracted:
43
-------------------- Question:
Alexi



-------------------- Question:
A small store made 50 posters to advertise their products. Two-fifths of them are small posters while half of them are medium posters. The rest are large posters. How many large posters are there? 
Answer:
5 
Response:
<reasoning>
First, I calculated the total number of posters (50).
Then, I calculated that 2/5 of 50 are small posters, which is 10.
Finally, I calculated that half of 10 is 5, so there are 5 large posters.
</reasoning>
<answer>55
</answer> 
Extracted:
55
-------------------- Question:
The moon has a surface area that is 1/5 that of Earth. The surface area of the Earth is 200 square acres. The land on the moon is worth 6 times that of the land on the Earth. If the total value of all the land on the earth is 80 billion dollars, what is the total value in billions of all the land on the moon? 
Answer:
96 
Response:
<reasoning>
The surface area of the moon is 1/5 * 200 sq miles = 40 sq miles.
The land on the moon is 6 * 40 sq miles = 240 sq mil

TrainOutput(global_step=14946, training_loss=0.0033848514304839843, metrics={'train_runtime': 55399.7518, 'train_samples_per_second': 0.27, 'train_steps_per_second': 0.27, 'total_flos': 0.0, 'train_loss': 0.0033848514304839843})

In [60]:
%load_ext tensorboard
%tensorboard --logdir logs/runs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [29]:
text = tokenizer.apply_chat_template([
    {"role" : "user", "content" : SYSTEM_PROMPT + "\n" + "Which is bigger? 1.11 or 1.9?"},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)
output = model.fast_generate(
    [text],
    sampling_params = sampling_params,
    lora_request = None,
)[0].outputs[0].text

print(output)

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.20it/s, est. speed input: 534.72 toks/s, output: 160.63 toks/s]

<reasoning>
First, 1.11 is smaller than 1.9.
Then, 1.9 - 1.11 = 0.79.
Therefore, 1.11 is smaller than 1.9.
</reasoning>
<answer>1.11</answer>





In [3]:
model.save_lora("grpo_gemma_saved_lora_2b")

NameError: name 'model' is not defined

In [27]:
text = tokenizer.apply_chat_template([
    {"role" : "user", "content" : SYSTEM_PROMPT + "\n" + "Which is bigger? 1.11 or 1.9?"},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = model.load_lora("grpo_gemma_saved_lora"),
)[0].outputs[0].text

print(output)

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.19it/s, est. speed input: 533.77 toks/s, output: 87.86 toks/s]

<reasoning>
First, 1.11 is less than 1.9, so it is smaller.
</reasoning>
<answer>1
</answer>





In [20]:
model.save_pretrained_merged("unsloth_gemma-2-2b-r1-reasoning_2b", tokenizer, save_method = "merged_16bit")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 69.5 out of 124.96 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 18/18 [00:00<00:00, 81.80it/s]


Unsloth: Saving tokenizer... Done.
Done.


In [21]:
model.save_pretrained_merged("unsloth_gemma-2-2b-r1-reasoning_2b_lora", tokenizer, save_method = "lora",)

Unsloth: Saving tokenizer... Done.
 Done.h: Saving model...


In [14]:
test_data = get_gsm8k_questions(split = "test")

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [26]:
from tqdm import tqdm
import random

def evaluate_model(model, tokenizer, dataset, runs=3, temperature=0.8, top_p=0.95, max_tokens=1024, seed=42):
    random.seed(seed)
    correct_answer = 0
    correct_format = 0
    
    for k, item in enumerate(tqdm(dataset)):
        prompt = item['prompt']
        answer = item['answer']
    
        text = tokenizer.apply_chat_template(prompt, tokenize = False, add_generation_prompt = True)
    
        sampling_params = SamplingParams(
            temperature = temperature,
            top_p = top_p,
            max_tokens = max_tokens,
        )

        for _ in range(runs):
            output = model.fast_generate(
                [text],
                sampling_params = sampling_params,
                lora_request = None,
                 use_tqdm=False,
            )[0].outputs[0].text
        
            correct_answer += int(answer in output)
            correct_format += count_xml(output)

    scaling = 1 / (len(dataset) * runs)
    answer_accuracy = correct_answer * scaling
    format_accuracy = correct_format * scaling

    print(f"Percentage of correct answers: {answer_accuracy:.3f}")
    print(f"Score of correct formats: {format_accuracy:.3f}")

    return {
        "answer_accuracy": answer_accuracy,
        "format_accuracy": format_accuracy,
    }

accuracy_scores = []
for k in range(10):
    results = evaluate_model(model, tokenizer, test_data, runs=1, seed=k)
    accuracy_scores.append(results["answer_accuracy"])

print(accuracy_scores)

100%|██████████| 1319/1319 [12:42<00:00,  1.73it/s]


Percentage of correct answers: 0.217
Percentage of correct formats: 0.067


100%|██████████| 1319/1319 [12:42<00:00,  1.73it/s]


Percentage of correct answers: 0.218
Percentage of correct formats: 0.065


100%|██████████| 1319/1319 [12:46<00:00,  1.72it/s]


Percentage of correct answers: 0.212
Percentage of correct formats: 0.065


100%|██████████| 1319/1319 [12:44<00:00,  1.73it/s]


Percentage of correct answers: 0.224
Percentage of correct formats: 0.065


100%|██████████| 1319/1319 [12:38<00:00,  1.74it/s]


Percentage of correct answers: 0.220
Percentage of correct formats: 0.064


100%|██████████| 1319/1319 [12:44<00:00,  1.73it/s]


Percentage of correct answers: 0.215
Percentage of correct formats: 0.064


100%|██████████| 1319/1319 [12:42<00:00,  1.73it/s]


Percentage of correct answers: 0.220
Percentage of correct formats: 0.065


100%|██████████| 1319/1319 [12:43<00:00,  1.73it/s]


Percentage of correct answers: 0.223
Percentage of correct formats: 0.065


100%|██████████| 1319/1319 [12:41<00:00,  1.73it/s]


Percentage of correct answers: 0.216
Percentage of correct formats: 0.064


100%|██████████| 1319/1319 [12:46<00:00,  1.72it/s]

Percentage of correct answers: 0.231
Percentage of correct formats: 0.064
[0.21683093252463986, 0.21758908263836238, 0.21228203184230476, 0.22365428354814254, 0.21986353297952996, 0.2145564821834723, 0.21986353297952996, 0.22289613343442002, 0.21607278241091735, 0.2312357846853677]





In [35]:
baseline_scores = [0.175, 0.170, 0.180]
accuracy_scores = [0.21683093252463986, 0.21758908263836238, 0.21228203184230476, 0.22365428354814254, 0.21986353297952996, 0.2145564821834723, 0.21986353297952996, 0.22289613343442002, 0.21607278241091735, 0.2312357846853677]

In [55]:
import numpy as np
from statsmodels.stats.weightstats import ztest

z_statistic, p_value = ztest(accuracy_scores, baseline_scores, value=0)

print(f"Mean accuracy: {np.mean(accuracy_scores):.3f} (std : {np.std(accuracy_scores):.3f})")
print(f"Mean baseline: {np.mean(baseline_scores):.3f} (std : {np.std(baseline_scores):.3f})")
print(f"Z-statistic: {z_statistic:.5f}")
print(f"P-value: {p_value:.5f}")

Mean accuracy: 0.219 (std : 0.005)
Mean baseline: 0.175 (std : 0.004)
Z-statistic: 12.61128
P-value: 0.00000


In [56]:
from huggingface_hub import login, whoami
token = os.environ.get('HF_TOKEN')
login(token=token)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [59]:
model.push_to_hub_merged("lmassaron/unsloth_gemma-2b-r1-reasoning-16bit", tokenizer, save_method="lora", token=token)

Unsloth: Saving LoRA adapters. Please wait...


README.md:   0%|          | 0.00/578 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/628M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved lora model to https://huggingface.co/lmassaron/unsloth_gemma-2b-r1-reasoning-16bit
