In [1]:
import os
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '0'
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import torch
import gc

torch.cuda.empty_cache()
gc.collect()

0

In [3]:
import re
from datasets import load_dataset, Dataset

def extract_hash_answer(text: str) -> str | None:
    if "####" not in text:
        return None
    return text.split("####")[1].strip()

def get_gsm8k_questions(split = "test") -> Dataset:
    data = load_dataset('openai/gsm8k', 'main')[split] 
    data = data.map(lambda x: {
        'prompt': [
            {'role': 'user', 'content': x['question']},
        ],
        'answer': extract_hash_answer(x['answer'])
    }) 
    return data

In [4]:
test_data = get_gsm8k_questions(split = "test")

In [5]:
from tqdm import tqdm
import random
from vllm import SamplingParams

def count_xml(text) -> float:
    count = 0.0
    if text.count("<reasoning>") == 1:
        count += 0.125
    if text.count("</reasoning>") == 1:
        count += 0.125
    if text.count("<answer>") == 1:
        count += 0.125
        count -= len(text.split("<answer>")[-1].split("</answer>")[0])*0.001
    if text.count("</answer>") == 1:
        count += 0.125
        count -= (len(text.split("</answer>")[-1]) - 1)*0.001
    return count
    
from tqdm import tqdm
import random

def evaluate_model(model, tokenizer, dataset, runs=3, temperature=0.8, top_p=0.95, max_tokens=1024, seed=42):
    random.seed(seed)
    correct_answer = 0
    correct_format = 0
    
    for k, item in enumerate(tqdm(dataset)):
        prompt = item['prompt']
        answer = item['answer']
    
        text = tokenizer.apply_chat_template(prompt, tokenize = False, add_generation_prompt = True)
    
        sampling_params = SamplingParams(
            temperature = temperature,
            top_p = top_p,
            max_tokens = max_tokens,
        )

        for _ in range(runs):
            output = model.fast_generate(
                [text],
                sampling_params = sampling_params,
                lora_request = None,
                 use_tqdm=False,
            )[0].outputs[0].text
        
            correct_answer += int(answer in output)
            correct_format += count_xml(output)

    scaling = 1 / (len(dataset) * runs)
    answer_accuracy = correct_answer * scaling
    format_accuracy = correct_format * scaling

    print(f"Percentage of correct answers: {answer_accuracy:.3f}")
    print(f"Score of correct formats: {format_accuracy:.3f}")

    return {
        "answer_accuracy": answer_accuracy,
        "format_accuracy": format_accuracy,
    }

INFO 02-12 00:53:34 __init__.py:190] Automatically detected platform cuda.


In [6]:
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [7]:
from unsloth import is_bfloat16_supported
import torch
max_seq_length = 256 # Can increase for longer reasoning traces

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "google/gemma-2-2b-it",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    dtype = None,
    gpu_memory_utilization = 0.5, # Reduce if out of memory
)

==((====))==  Unsloth 2025.2.5: Fast Gemma2 patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.684 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: vLLM loading unsloth/gemma-2-2b-it-bnb-4bit with actual GPU utilization = 49.44%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 23.68 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 256. Num Sequences = 224.
Unsloth: vLLM's KV Cache can use up to 9.25 GB. Also swap space = 6 GB.
INFO 02-12 00:53:44 config.py:542] This model supports multiple tasks: {'generate', 'classify', 'embed', 'reward', 'score'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes config using kwargs = {'load_in_8bit': False, 'load_in_4bit': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_



INFO 02-12 00:53:46 weight_utils.py:252] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 02-12 00:53:48 model_runner.py:1115] Loading model weights took 2.1024 GB
INFO 02-12 00:53:48 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 02-12 00:53:49 worker.py:267] Memory profiling takes 1.23 seconds
INFO 02-12 00:53:49 worker.py:267] the current vLLM instance can use total_gpu_memory (23.68GiB) x gpu_memory_utilization (0.49) = 11.71GiB
INFO 02-12 00:53:49 worker.py:267] model weights take 2.10GiB; non_torch_memory takes 0.06GiB; PyTorch activation peak memory takes 2.06GiB; the rest of the memory reserved for KV Cache is 7.49GiB.
INFO 02-12 00:53:49 executor_base.py:110] # CUDA blocks: 4721, # CPU blocks: 3780
INFO 02-12 00:53:49 executor_base.py:115] Maximum concurrency for 256 tokens per request: 295.06x
INFO 02-12 00:53:52 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error oc

Capturing CUDA graph shapes: 100%|██████████| 31/31 [00:16<00:00,  1.88it/s]

INFO 02-12 00:54:08 model_runner.py:1562] Graph capturing finished in 16 secs, took 0.57 GiB
INFO 02-12 00:54:08 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 20.75 seconds





In [8]:
accuracy_scores = []
for k in range(1):
    results = evaluate_model(model, tokenizer, test_data, runs=1, seed=k)
    accuracy_scores.append(results["answer_accuracy"])

print(accuracy_scores)

100%|██████████| 1319/1319 [22:49<00:00,  1.04s/it]

Percentage of correct answers: 0.600
Score of correct formats: 0.000
[0.6004548900682335]





In [9]:
temperature=0.0
top_p=0.95
max_tokens=1024

prompt = [{'role': 'user', 'content': test_data[1]["question"]}] 
text = tokenizer.apply_chat_template(prompt, tokenize = False, add_generation_prompt = True)
    
sampling_params = SamplingParams(
    temperature = temperature,
    top_p = top_p,
    max_tokens = max_tokens,
)

output = model.fast_generate(
    [text],
    sampling_params = sampling_params,
    use_tqdm=False,
)[0].outputs[0].text

print(output)

Here's how to solve the problem:

* **White fiber:** The robe needs half as much white fiber as blue fiber, so it needs 2 bolts / 2 = 1 bolt of white fiber.
* **Total fiber:**  The robe needs 2 bolts of blue + 1 bolt of white = 3 bolts of fiber in total. 

**Answer:** It takes a total of 3 bolts of fiber to make the robe. 



In [10]:
del(model)

In [11]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "grpo_gemma_saved_lora_2b",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    dtype = None,
    gpu_memory_utilization = 0.5, # Reduce if out of memory
)

==((====))==  Unsloth 2025.2.5: Fast Gemma patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.684 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: vLLM loading unsloth/gemma-2b-it-bnb-4bit with actual GPU utilization = 27.25%
Unsloth: Your GPU has CUDA compute capability 8.6 with VRAM = 23.68 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 256. Num Sequences = 192.
Unsloth: vLLM's KV Cache can use up to 4.18 GB. Also swap space = 6 GB.
INFO 02-12 01:17:20 config.py:542] This model supports multiple tasks: {'generate', 'classify', 'embed', 'reward', 'score'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes config using kwargs = {'load_in_8bit': False, 'load_in_4bit': True, 'bnb_4bit_compute_dtype': 'bfloat16', 'bnb_4bit_qua

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 02-12 01:17:23 model_runner.py:1115] Loading model weights took 1.9369 GB
INFO 02-12 01:17:24 worker.py:267] Memory profiling takes 0.49 seconds
INFO 02-12 01:17:24 worker.py:267] the current vLLM instance can use total_gpu_memory (23.68GiB) x gpu_memory_utilization (0.27) = 6.45GiB
INFO 02-12 01:17:24 worker.py:267] model weights take 1.94GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 1.76GiB; the rest of the memory reserved for KV Cache is 2.76GiB.
INFO 02-12 01:17:24 executor_base.py:110] # CUDA blocks: 10054, # CPU blocks: 21845
INFO 02-12 01:17:24 executor_base.py:115] Maximum concurrency for 256 tokens per request: 628.38x
INFO 02-12 01:17:27 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_uti

Capturing CUDA graph shapes: 100%|██████████| 27/27 [00:13<00:00,  2.06it/s]

INFO 02-12 01:17:40 model_runner.py:1562] Graph capturing finished in 13 secs, took 0.13 GiB
INFO 02-12 01:17:40 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 17.42 seconds



Unsloth 2025.2.5 patched 18 layers with 18 QKV layers, 18 O layers and 18 MLP layers.


In [None]:
accuracy_scores = []
for k in range(1):
    results = evaluate_model(model, tokenizer, test_data, runs=1, seed=k)
    accuracy_scores.append(results["answer_accuracy"])

print(accuracy_scores)

  1%|          | 7/1319 [00:02<09:07,  2.40it/s]

In [None]:
temperature=0.0
top_p=0.95
max_tokens=1024

prompt = [{'role': 'user', 'content': test_data[1]["question"]}] 
text = tokenizer.apply_chat_template(prompt, tokenize = False, add_generation_prompt = True)
    
sampling_params = SamplingParams(
    temperature = temperature,
    top_p = top_p,
    max_tokens = max_tokens,
)

output = model.fast_generate(
    [text],
    sampling_params = sampling_params,
    use_tqdm=False,
)[0].outputs[0].text

print(output)