In [1]:
import os
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '0'
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import torch
import gc

torch.cuda.empty_cache()
gc.collect()

0

In [3]:
from unsloth import FastLanguageModel, PatchFastRL
PatchFastRL("GRPO", FastLanguageModel)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 02-10 15:08:34 __init__.py:190] Automatically detected platform cuda.


In [4]:
from unsloth import is_bfloat16_supported
import torch
max_seq_length = 512 # Can increase for longer reasoning traces
lora_rank = 128 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    #model_name = "unsloth_gemma-2-2b-r1-reasoning",
    model_name = "unsloth/gemma-2b-it-bnb-4bit",
    #model_name = "jaydiaz2023/gemma-2b-instruct-reasoning",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    dtype = None,
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.6, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = lora_rank,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 0,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

==((====))==  Unsloth 2025.2.5: Fast Gemma patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.684 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: vLLM loading unsloth/gemma-2b-it-bnb-4bit with actual GPU utilization = 59.33%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 23.68 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 512. Num Sequences = 224.
Unsloth: vLLM's KV Cache can use up to 11.63 GB. Also swap space = 6 GB.
INFO 02-10 15:08:44 config.py:542] This model supports multiple tasks: {'embed', 'reward', 'generate', 'classify', 'score'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes config using kwargs = {'load_in_8bit': False, 'load_in_4bit': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_q



INFO 02-10 15:08:46 weight_utils.py:252] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 02-10 15:08:47 model_runner.py:1115] Loading model weights took 1.9409 GB
INFO 02-10 15:08:47 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 02-10 15:08:49 worker.py:267] Memory profiling takes 1.01 seconds
INFO 02-10 15:08:49 worker.py:267] the current vLLM instance can use total_gpu_memory (23.68GiB) x gpu_memory_utilization (0.59) = 14.05GiB
INFO 02-10 15:08:49 worker.py:267] model weights take 1.94GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 2.06GiB; the rest of the memory reserved for KV Cache is 10.00GiB.
INFO 02-10 15:08:49 executor_base.py:110] # CUDA blocks: 36391, # CPU blocks: 21845
INFO 02-10 15:08:49 executor_base.py:115] Maximum concurrency for 512 tokens per request: 1137.22x
INFO 02-10 15:08:52 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory erro

Capturing CUDA graph shapes: 100%|██████████| 31/31 [00:15<00:00,  2.02it/s]

INFO 02-10 15:09:08 model_runner.py:1562] Graph capturing finished in 15 secs, took 0.46 GiB
INFO 02-10 15:09:08 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 20.34 seconds



Unsloth 2025.2.5 patched 18 layers with 18 QKV layers, 18 O layers and 18 MLP layers.


In [5]:
import re
from datasets import load_dataset, Dataset

# Load and prep dataset
SYSTEM_PROMPT = """
"Provide a short response (less than 100 characters) in the following structured format, with a line break (new line character) immediately following each tag and stop after </answer>: 
<reasoning>
[Your step-by-step reasoning leading to the answer]
</reasoning>
<answer>
[The final numerical answer, as an integer or float]
</answer>
<eos>

Example of the desidered output:
<reasoning>
First, I calculated the area of the rectangle by multiplying length and width. Then, I divided the area by 2 to find half the area. This half area represents the answer.
</reasoning>
<answer>
12.5
</answer>
<eos>

Follow this format precisely, stopping completely with <eos> token after the new line following closing </answer> tag.
"""

XML_COT_FORMAT = """\
<reasoning>
{reasoning}
</reasoning>
<answer>
{answer}
</answer>
"""

def extract_xml_answer(text: str) -> str:
    answer = text.split("<answer>")[-1]
    answer = answer.split("</answer>")[0]
    return answer.strip()

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

# uncomment middle messages for 1-shot prompting
def get_gsm8k_questions(split = "train") -> Dataset:
    data = load_dataset('openai/gsm8k', 'main')[split] # type: ignore
    data = data.map(lambda x: { # type: ignore
        'prompt': [
            {'role': 'user', 'content': SYSTEM_PROMPT},
            {'role': 'assistant', 'content': x['question']}
        ],
        'answer': extract_hash_answer(x['answer'])
    }) # type: ignore
    return data # type: ignore

dataset = get_gsm8k_questions()

scaling_down = 1.

# Reward functions
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    q = prompts[0][-1]['content']
    extracted_responses = [extract_xml_answer(r) for r in responses]
    if hash(q) % 10 == 0:
        print('-'*20, f"Question:\n{q}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
    return [2.0 if r == a else 0.0 for r, a in zip(extracted_responses, answer)]

def int_reward_func(completions, **kwargs) -> list[float]:
    responses = [completion[0]['content'] for completion in completions]
    extracted_responses = [extract_xml_answer(r) for r in responses]
    return [0.5 / scaling_down if r.isdigit() else 0.0 for r in extracted_responses]

def strict_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"^<reasoning>\n.*?\n</reasoning>\n<answer>\n.*?\n</answer>\n$"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 / scaling_down if match else 0.0 for match in matches]

def soft_format_reward_func(completions, **kwargs) -> list[float]:
    """Reward function that checks if the completion has a specific format."""
    pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
    responses = [completion[0]["content"] for completion in completions]
    matches = [re.match(pattern, r) for r in responses]
    return [0.5 / scaling_down if match else 0.0 for match in matches]

def count_xml(text) -> float:
    count = 0.0
    if text.count("<reasoning>\n") == 1:
        count += 0.125
    if text.count("\n</reasoning>\n") == 1:
        count += 0.125
    if text.count("\n<answer>\n") == 1:
        count += 0.125
        count -= len(text.split("\n</answer>\n")[-1])*0.001
    if text.count("\n</answer>") == 1:
        count += 0.125
        count -= (len(text.split("\n</answer>")[-1]) - 1)*0.001
    return count / scaling_down

def xmlcount_reward_func(completions, **kwargs) -> list[float]:
    contents = [completion[0]["content"] for completion in completions]
    return [count_xml(c) for c in contents]

In [6]:
from trl import GRPOConfig, GRPOTrainer

training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "paged_adamw_8bit",
    logging_steps = 1,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 6, # Decrease if out of memory
    max_prompt_length = 256,
    max_completion_length = 200,
    num_train_epochs = 1, # Set to 1 for a full training run
    save_steps = 250,
    max_grad_norm = 0.1,
    report_to = "tensorboard",
    logging_dir = "logs/runs",
    output_dir = "outputs",
)

torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch


In [7]:
CHAT_TEMPLATE = """
{%- for message in messages %}
    {{- '<|' + message['role'] + '|>\n' }}
    {{- message['content'] + eos_token }}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|assistant|>\n' }}
{%- endif %}
"""

if tokenizer.chat_template is None:
    tokenizer.chat_template = CHAT_TEMPLATE # set the attribute

In [8]:
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        xmlcount_reward_func,
        soft_format_reward_func,
        strict_format_reward_func,
        int_reward_func,
        correctness_reward_func,
    ],
    args = training_args,
    train_dataset = dataset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 7,473 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 1
\        /    Total batch size = 1 | Total steps = 7,473
 "-____-"     Number of trainable parameters = 156,893,184


Step,Training Loss,reward,reward_std,completion_length,kl
1,-0.0,0.7625,0.038635,69.833336,0.0
2,-0.0,0.697167,0.104147,59.0,0.0
3,0.0,0.649667,0.235152,70.0,0.000174
4,0.0,0.75,0.0,98.5,0.000172
5,0.0,0.6045,0.200054,92.333336,0.000223
6,0.0,0.535,0.27335,76.5,0.000277
7,0.0,0.673833,0.077548,69.0,6.3e-05
8,0.0,0.767,0.035231,64.333336,0.000122
9,0.0,0.756333,0.073271,70.5,0.0002
10,0.0,1.4685,0.99925,59.0,0.000165


-------------------- Question:
Terry's mom brought home 4 different colored dogs from the shelter. The brown dog weighs 4 pounds. The black dog weighs 1 pound more than the brown dog. The white dog weighs twice as much as the brown dog. The grey dog weighs 2 pounds less than the black dog. What's the average weight of all the dogs? 
Answer:
5 
Response:
<reasoning>
First, I found the weights of the individual dogs: brown (4 pounds), black (5 pounds), white (8 pounds), and gray (2 pounds).
Then, I calculated the average weight as (4 + 5 + 8 + 2) / 4 = 5 pounds.
</reasoning>
<answer>5</answer> 
Extracted:
5
-------------------- Question:
Jason has a moray eel that eats 20 guppies a day and 5 betta fish who each eat 7 guppies a day. How many guppies per day does she need to buy? 
Answer:
55 
Response:
<reasoning>
First, Jason has 20 guppies. He eats 20 guppies a day, so he needs 20 guppies to feed the moray eel. Then, 5 of Jason's guppies eat 7 guppies each day, so he needs 35 guppies to 



-------------------- Question:
A taco truck buys 100 pounds of beef.  They use .25 pounds of beef per taco.  If they sell each taco for $2 and each taco takes $1.5 to make how much profit did they make if they used all the beef? 
Answer:
200 
Response:
<reasoning>
First, the truck buys 100 pounds of beef.  They use .25 pounds of beef per taco.  If they sell each taco for $2 and each taco takes $1.5 to make, they made $200 ($2 x 100) by selling the beef.
</reasoning>
<answer>200
</answer> 
Extracted:
200
-------------------- Question:
Karl sells clothing in his store. He sells a T-shirt that costs $5, some pants that cost $4, and some skirts that cost $6, he also sells some refurbished t-shirts that cost half the original price. How much is his total income if he sold two T-shirts, one pair of pants, four skirts, and six refurbished T-shirts? 
Answer:
53 
Response:
<reasoning>
First, Karl sells a T-shirt for $5, a pair of pants for $4, and four skirts for $6, totaling $5 + 4 + 4 = $15. 



-------------------- Question:
Granger has five times as many spots as his brother, Cisco.  But Cisco has 5 less than half as many spots as his cousin, Rover.  If Rover has 46 spots, how many spots do Granger and Cisco have combined? 
Answer:
108 
Response:
<reasoning>
Granger has 5 * 5 = 25 spots.
Cisco has 5 - 2 = 3 spots.
Half of 25 is 12.5.
Rover has 46.
Therefore, Granger and Cisco have 25 + 3 = 28 spots combined.
</reasoning>
<answer>28
</answer> 
Extracted:
28
-------------------- Question:
In 5 years, Joey will be as old as Beth is now. If Joey is 9 now, how old was Joey when Beth was Joey's age now? 
Answer:
4 
Response:
<reasoning>
First, Joey's current age is 9. If Beth is 15 now, Joey's age was 6 when Beth was 15. Therefore, his age 5 years ago was 10.
</reasoning>
<answer>10
</answer> 
Extracted:
10
-------------------- Question:
Meso can type 15 pages in 5 minutes. Tyler can type the same 15 pages in 3 minutes.  How many minutes would it take Meso and Tyler to type 40 pag



-------------------- Question:
Annabelle collected a weekly allowance of $30. She spent a third of it buying junk food, then spent another $8 on sweets. Out of guilt she decides to save the rest. How much did she save? 
Answer:
12 
Response:
<reasoning>
First, Annabelle collected a weekly allowance of $30. She spent a third of it buying junk food, then spent another $8 on sweets. Out of guilt she decides to save the rest, which is $30 - ($30 x 1/3) = $10.
</reasoning>
<answer>10
</answer> 
Extracted:
10
-------------------- Question:
Alex gets paid $500 a week and 10% of his weekly income is deducted as tax. He also pays his weekly water bill for $55 and gives away another 10% of his weekly income as a tithe. How much money does Alex have left? 
Answer:
345 
Response:
<reasoning>
First, he gets paid $500 a week. Then, 10% of his weekly income is deducted as tax, which is 0.10*$500 = $50. He also pays his weekly water bill for $55. Finally, he gives away another 10% of his weekly income



-------------------- Question:
When Jayson is 10 his dad is four times his age and his mom is 2 years younger than his dad.  How old was Jayson's mom when he was born? 
Answer:
28 
Response:
<reasoning>
First, Dad's age is 4x. Mom's age is (4x - 2).
Therefore, Mom's age when Jayson was born was 2 less than Dad's age.
</reasoning>
<answer>28
</answer> 
Extracted:
28
-------------------- Question:
Coral reads 30 pages of a book on night 1, and 2 less than twice that on night 2.  Night 3 Coral reads 3 more pages than the sum of the first two nights.  How many total pages did Coral read in the 3 nights? 
Answer:
179 
Response:
<reasoning>
First, Coral read 30 pages of a book on night 1.
Then, 2 less than twice that on night 2.
This means Coral read 2 * 30 = 60 pages on night 2.
Finally, Coral read 3 more than the sum of the first two nights, 30 + 2 = 32 pages on night 3.
</reasoning>
<answer>32
</answer> 
Extracted:
32
-------------------- Question:
Mario's salary increased by 40% to $4000



-------------------- Question:
Without factoring in the cost of depreciation for his car John made $30,000 doing Uber.  When he finally traded in the car he bought for $18,000 he got $6000 back.  What was his profit from driving Uber? 
Answer:
18000 
Response:
<reasoning>
First, he made $30,000 from Uber. Then, he traded the car for $18,000, resulting in a profit of $2,000.
</reasoning>
<answer>125</answer> 
Extracted:
125
-------------------- Question:
Chip takes 2 pages of notes every day, 5 days a week, for each of his 5 classes.  His notebook paper comes in packs of 100 sheets of paper per pack.  After 6 weeks, how many packs of notebook paper will Chip use? 
Answer:
3 
Response:
<reasoning>
First, Chip takes 2 pages of notes every day.  5 days a week, he takes 2 pages/day * 5 days/wk = 10 pages/wk.  6 weeks of taking 10 pages/wk results in 60 pages taken.
</reasoning>
<answer>60
</answer> 
Extracted:
60
-------------------- Question:
There are 10 6-ounces of glasses that are only 



-------------------- Question:
At the park, Naomi saw people riding 7 regular bikes and 11 children’s bikes.  Regular bikes have 2 wheels and kid’s bikes have 4 wheels.  How many wheels did Naomi see at the park? 
Answer:
58 
Response:
<reasoning>
First, I counted the number of regular bikes (7).
Then, I counted the number of children’s bikes (11).
Subtract the regular bikes from the total bikes to find the number of children’s bikes: 7 - 11 = 4.
Therefore, Naomi saw 7 regular bikes and 11 children’s bikes at the park.
</reasoning>
<answer>11
</answer> 
Extracted:
11
-------------------- Question:
After violating the group's policies, 20 members of a Facebook group were removed by the group's admin. If each member posted 50 messages per day in the group and the number of members in the group was 150 before the admin removed some, calculate the total number of messages the remaining members would send in a week. 
Answer:
45500 
Response:
<reasoning>
First, I distributed the total number



-------------------- Question:
Big Dig Mining Company mines three different types of ore: copper, iron, and nickel. Across all their mines, 10% of their output is nickel, 60% is iron, and the rest is copper. They mine 720 tons of nickel a day. How many tons of copper does Big Dig Mining Company mine daily? 
Answer:
360 
Response:
<reasoning>
First, I calculated the total amount of ore mined: 10% x 720 = 0.10 x 720 = 72 tons of nickel. Then, I divided the total amount by 2 to find half the area: 72 ÷ 2 = 36 tons of iron. Finally, I subtracted the amount of nickel and iron from the total to find the amount of copper mined: 720 - 0.10 x 720 - 0.60 x 720 = 128 tons of copper.
</reasoning>
<answer>128
</answer> 
Extracted:
128
-------------------- Question:
In Dana's senior high school class there were 200 students.  60% of the students were female, and 50% of the females were brunettes.  If 50% of the female brunettes were under 5 feet tall, then how many female brunettes in Dana's high sc



-------------------- Question:
Claire wants to make 2 cakes for her mother. Two packages of flour are required for making a cake. If 1 package of flour is $3, how much does she pay for the flour that is enough to make 2 cakes? 
Answer:
12 
Response:
<reasoning>
First, I bought 2 packages of flour for $3 each, which is $6. Then, I divided 6 by 2 to find that she needs 3 packages of flour to make 2 cakes.
</reasoning>
<answer>15
</answer> 
Extracted:
15
-------------------- Question:
Janet filmed a new movie that is 60% longer than her previous 2-hour long movie.  Her previous movie cost $50 per minute to film, and the newest movie cost twice as much per minute to film as the previous movie.  What was the total amount of money required to film Janet's entire newest film? 
Answer:
1920 
Response:
<reasoning>
First, Janet filmed a new movie that is 60% longer than her previous 2-hour long movie. 

Let x be the cost of her previous movie per minute.

Then, the cost of her new movie was 2x.




-------------------- Question:
Trip wanted to watch the new action movie in theaters.  An evening ticket cost $10 and a large popcorn & drink combo would cost him an additional $10.  He noticed on their website, they had a special offer.  From 12 noon to 3 pm, save 20% off tickets and 50% off any food combos.  How much money could Trip save by going to the earlier movie? 
Answer:
7 
Response:
<reasoning>
First, I found the price of the ticket and popcorn & drink combo. Then I subtracted the discount from the total cost.
</reasoning>
<answer>40
</answer> 
Extracted:
40
-------------------- Question:
Penn operates an ice cream stall for a week. On the first day, she made $10. Every day after that, she made $4 more than the previous day. How much money did Penn make after 5 days? 
Answer:
90 
Response:
<reasoning>
First, I made $10 on the first day. Each subsequent day, I made $4 more than the previous day. 5 days later, I made $10 + 4(5) = $14.
</reasoning>
<answer>14</answer>
 
Extracte



TrainOutput(global_step=7473, training_loss=0.0027202775428372896, metrics={'train_runtime': 24269.8978, 'train_samples_per_second': 0.308, 'train_steps_per_second': 0.308, 'total_flos': 0.0, 'train_loss': 0.0027202775428372896})

In [9]:
%load_ext tensorboard
%tensorboard --logdir logs/runs

In [17]:
text = tokenizer.apply_chat_template([
    {"role" : "user", "content" : SYSTEM_PROMPT},
    {"role" : "assistant", "content" : "Which is bigger? 1.11 or 1.9?"},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams
sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)
output = model.fast_generate(
    [text],
    sampling_params = sampling_params,
    lora_request = None,
)[0].outputs[0].text

print(output)

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  3.07it/s, est. speed input: 649.72 toks/s, output: 160.11 toks/s]

<reasoning>
First, I calculated the difference between 1.11 and 1.9. This difference represents the absolute difference between the two numbers.
</reasoning>
<answer>
0.8
</answer>





In [11]:
model.save_lora("grpo_gemma_saved_lora_2a")



In [12]:
text = tokenizer.apply_chat_template([
    {"role" : "user", "content" : SYSTEM_PROMPT},
    {"role" : "assistant", "content" : "Which is bigger? 9.11 or 9.9?"},
], tokenize = False, add_generation_prompt = True)

from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature = 0.8,
    top_p = 0.95,
    max_tokens = 1024,
)
output = model.fast_generate(
    text,
    sampling_params = sampling_params,
    lora_request = model.load_lora("grpo_gemma_saved_lora"),
)[0].outputs[0].text

print(output)

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  1.75it/s, est. speed input: 369.83 toks/s, output: 103.41 toks/s]

<reasoning>
First, 9.11 is rounded down to 9. Then, 9.9 is rounded up to 10. Therefore, 9.11 is bigger.
</reasoning>
<answer>10
</answer>





In [13]:
model.save_pretrained_merged("unsloth_gemma-2-2b-r1-reasoning_2a", tokenizer, save_method = "merged_16bit",)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 71.49 out of 124.96 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 18/18 [00:00<00:00, 78.82it/s]


Unsloth: Saving tokenizer... Done.
Done.


In [None]:
test_data = get_gsm8k_questions(split = "test")

In [16]:
from tqdm import tqdm
import random

def evaluate_model(model, tokenizer, dataset, runs=3, temperature=0.8, top_p=0.95, max_tokens=1024, seed=42):
    random.seed(seed)
    correct_answer = 0
    correct_format = 0
    
    for k, item in enumerate(tqdm(dataset)):
        prompt = item['prompt']
        answer = item['answer']
    
        text = tokenizer.apply_chat_template(prompt, tokenize = False, add_generation_prompt = True)
    
        sampling_params = SamplingParams(
            temperature = temperature,
            top_p = top_p,
            max_tokens = max_tokens,
        )

        for _ in range(runs):
            output = model.fast_generate(
                [text],
                sampling_params = sampling_params,
                lora_request = None,
                 use_tqdm=False,
            )[0].outputs[0].text
        
            correct_answer += int(answer in output)
            correct_format += count_xml(output)

    scaling = 1 / (len(dataset) * runs)
    answer_accuracy = correct_answer * scaling
    format_accuracy = correct_format * scaling

    print(f"Percentage of correct answers: {answer_accuracy:.3f}")
    print(f"Percentage of correct formats: {format_accuracy:.3f}")

    return {
        "answer_accuracy": answer_accuracy,
        "format_accuracy": format_accuracy,
    }

results = evaluate_model(model, tokenizer, test_data, runs=1)

100%|██████████| 1319/1319 [10:59<00:00,  2.00it/s]

Percentage of correct answers: 0.182
Percentage of correct formats: 0.209



