In [2]:
%%capture
# Skip restarting message in Colab
import sys; modules = list(sys.modules.keys())
for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None

!pip install unsloth vllm
!pip install --upgrade pillow

# SFT

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
max_seq_length = 2048 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name = "unsloth/Phi-4",
    model_name="unsloth/Phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.7, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["gate_proj", "up_proj", "down_proj",],
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-11 06:21:48 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.9: Fast Llama patching. Transformers: 4.48.3. vLLM: 0.7.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/phi-3.5-mini-instruct-bnb-4bit with actual GPU utilization = 69.2%
Unsloth: Your GPU has CUDA compute capability 8.0 with VRAM = 39.56 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 320.
Unsloth: vLLM's KV Cache can use up to 24.67 G

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-11 06:22:12 model_runner.py:1115] Loading model weights took 2.1371 GB
INFO 03-11 06:22:12 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-11 06:22:16 worker.py:267] Memory profiling takes 2.65 seconds
INFO 03-11 06:22:16 worker.py:267] the current vLLM instance can use total_gpu_memory (39.56GiB) x gpu_memory_utilization (0.69) = 27.37GiB
INFO 03-11 06:22:16 worker.py:267] model weights take 2.14GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 0.39GiB; the rest of the memory reserved for KV Cache is 24.76GiB.
INFO 03-11 06:22:16 executor_base.py:111] # cuda blocks: 4225, # CPU blocks: 1024
INFO 03-11 06:22:16 executor_base.py:116] Maximum concurrency for 2048 tokens per request: 33.01x
INFO 03-11 06:22:20 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error o

Capturing CUDA graph shapes: 100%|██████████| 43/43 [00:57<00:00,  1.34s/it]

INFO 03-11 06:23:18 model_runner.py:1562] Graph capturing finished in 57 secs, took 1.09 GiB
INFO 03-11 06:23:18 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 65.32 seconds



Not an error, but Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Not an error, but Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.3.9 patched 32 layers with 0 QKV layers, 0 O layers and 32 MLP layers.


In [None]:
from datasets import load_from_disk
from google.colab import drive

drive.mount('/content/drive')
sft_train = load_from_disk("/content/drive/MyDrive/cs234/sft_train_20k")
counts = {}
for ex in sft_train:
  counts[ex['task_type']] = counts.get(ex['task_type'], 0) + 1
print(counts)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
{'verifiable_math': 14576, 'verifiable_code': 1432, 'code_output_prediction': 1762, 'llm_judgeable_groundtruth_similarity': 2230}


In [None]:
from trl import apply_chat_template

processed_train = sft_train.map(apply_chat_template,
                                fn_kwargs={"tokenizer": tokenizer},
                                num_proc=10,
                                remove_columns=sft_train.features)

In [None]:
from trl import SFTConfig, SFTTrainer

train_config = SFTConfig(
    bf16=is_bfloat16_supported(),      # True
    fp16=not is_bfloat16_supported(),  # False
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    log_level="info",
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    optim="paged_adamw_8bit",
    num_train_epochs=1,
    max_steps=-1,
    output_dir="./checkpoint_dir",
    overwrite_output_dir=True,
    remove_unused_columns=True,
    save_steps=10000,
    save_total_limit=1,
    seed=712,
    warmup_ratio=0.2,
    dataset_text_field="text",
    packing=True,
    report_to="none"
)

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    args=train_config,
    train_dataset=processed_train,
)

trainer.train(resume_from_checkpoint=True)

PyTorch: setting up devices
PyTorch: setting up devices


Tokenizing to ["text"] (num_proc=12):   0%|          | 0/20000 [00:00<?, ? examples/s]

Packing train dataset (num_proc=12):   0%|          | 0/20000 [00:00<?, ? examples/s]

Using auto half precision backend
Loading model from ./checkpoint_dir/checkpoint-2431.
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 19,451 | Num Epochs = 1 | Total steps = 2,431
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 69,206,016/2,078,346,240 (3.33% trained)
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 1
  Continuing training from global step 2431
  Will skip the first 1 epochs then the first 0 batches in the first epoch.


Training completed. Do not forget to share your model on huggingface.co/models =)




Step,Training Loss


TrainOutput(global_step=2431, training_loss=0.0, metrics={'train_runtime': 0.0139, 'train_samples_per_second': 1403802.862, 'train_steps_per_second': 175448.293, 'total_flos': 2.365688085573796e+17, 'train_loss': 0.0})

In [None]:
# import shutil

# shutil.copytree("./checkpoint_dir", "/content/drive/MyDrive/cs234/unsloth_sft_ckpts/phi-3.5-mini-instruct/")

In [None]:
# model.save_pretrained("/content/drive/MyDrive/cs234/unsloth_sft_ckpts/loras/phi-3.5-mini-instruct") # Local saving

# SFT Eval

In [None]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
max_seq_length = 2048 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name = "unsloth/Phi-4",
    model_name="/content/drive/MyDrive/cs234/unsloth_sft_ckpts/phi-3.5-mini-instruct/checkpoint-2431",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.7, # Reduce if out of memory
)

FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2025.3.9: Fast Llama patching. Transformers: 4.48.3. vLLM: 0.7.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/phi-3.5-mini-instruct-bnb-4bit with actual GPU utilization = 18.18%
Unsloth: Your GPU has CUDA compute capability 8.0 with VRAM = 39.56 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 192.
Unsloth: vLLM's KV Cache can use up to 4.49 GB. Also swap space = 6 GB.
INFO 03-11 06:46:48 config.py:549] This model supports multiple tasks: {'generate', 'embed', 'reward', 'score', 'classify'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes config u

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-11 06:46:53 model_runner.py:1115] Loading model weights took 2.1129 GB
INFO 03-11 06:46:54 worker.py:267] Memory profiling takes 0.80 seconds
INFO 03-11 06:46:54 worker.py:267] the current vLLM instance can use total_gpu_memory (39.56GiB) x gpu_memory_utilization (0.18) = 7.19GiB
INFO 03-11 06:46:54 worker.py:267] model weights take 2.11GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 0.23GiB; the rest of the memory reserved for KV Cache is 4.85GiB.
INFO 03-11 06:46:54 executor_base.py:111] # cuda blocks: 827, # CPU blocks: 1024
INFO 03-11 06:46:54 executor_base.py:116] Maximum concurrency for 2048 tokens per request: 6.46x
INFO 03-11 06:46:59 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utiliza

Capturing CUDA graph shapes: 100%|██████████| 27/27 [00:39<00:00,  1.45s/it]

INFO 03-11 06:47:38 model_runner.py:1562] Graph capturing finished in 39 secs, took 0.72 GiB
INFO 03-11 06:47:38 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 45.34 seconds





PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
              (k_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
              (v_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
              (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
              (rotary_emb): LlamaRotaryEmbedding()
            )
            (mlp): LlamaMLP(
              (gate_proj): lora.Linear(
                (base_layer): Linear4bit(in_features=3072, out_features=8192, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
              

In [None]:
text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : "You are an expert assistant at estimating market size, TAM, and other relevant statistics. Please box your final answer as just a single number without any units, e.g. \\boxed{{1.0}}. We will interpret your answer as being in dollars."},
    {"role": "user", "content": "What is the total market size for commercial lithographic printing in the US?"},
], tokenize = False)

text += "\n<|assistant|>\n<think>"
text = text.replace('\n<|endoftext|>', '')

print(text)

input_ids = tokenizer.encode(text, return_tensors='pt').to("cuda")
print(type(input_ids))
print(input_ids)

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

_ = model.generate(input_ids, streamer=text_streamer, max_new_tokens=4096, pad_token_id=tokenizer.eos_token_id)

<|system|>
You are an expert assistant at estimating market size, TAM, and other relevant statistics. Please box your final answer as just a single number without any units, e.g. \boxed{{1.0}}. We will interpret your answer as being in dollars.<|end|>
<|user|>
What is the total market size for commercial lithographic printing in the US?<|end|>
<|assistant|>
<think>
<class 'torch.Tensor'>
tensor([[32006,   887,   526,   385, 17924, 20255,   472,  4844,  1218,  9999,
          2159, 29892,   323,  5194, 29892,   322,   916,  8018, 13964, 29889,
          3529,  3800,   596,  2186,  1234,   408,   925,   263,  2323,  1353,
          1728,   738, 10340, 29892,   321, 29889, 29887, 29889,   320,  1884,
           287,  6224, 29896, 29889, 29900, 27243,  1334,   674,  6613,   596,
          1234,   408,  1641,   297, 17208, 29889, 32007, 32010,  1724,   338,
           278,  3001,  9999,  2159,   363, 12128,   301,   389, 12122, 14010,
           297,   278,  3148, 29973, 32007, 32001,   529

KeyboardInterrupt: 

'<|system|>\nYou are an expert assistant at estimating market size, TAM, and other relevant statistics. Please box your final answer as just a single number without any units, e.g. \\boxed{{1.0}}. We will interpret your answer as being in dollars.<|end|>\n<|user|>\nWhat is the total market size for commercial lithographic printing in the US?<|end|>\n<|assistant|>\n<think>'

# GRPO

In [2]:
from google.colab import drive

import pandas as pd

drive.mount('/content/drive')
raw_rl_data = pd.read_excel("/content/drive/MyDrive/cs234/market_stats.xlsx", usecols="A:D")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from datasets import Dataset

def transform(row):
  system_prompt = (
    "For the following questions, please make estimates based on your knowledge only; do not consult outside sources. Respond in the following format:\n<think> ... reasoning process here ... </think>\n<answer> ... </answer>\n"
  )

  user_template = "You are an expert assistant at estimating market size, TAM, and other relevant statistics. Please box your final answer as just a single number without any units, e.g. \\boxed{{1.0}}. We will interpret your answer as being in {unit}.\n{prompt}"
  # "\n<|assistant|>\n<think>" -- will we need to manually add this? hm.

  return {
      "prompt": [
          {"role": "system", "content": system_prompt},
          {"role": "user", "content": user_template.format(unit=row["Unit"], prompt=row['Prompt'])}
      ],
      "ground_truth": row["Value"]
  }


rl_data = Dataset.from_pandas(raw_rl_data).map(transform).remove_columns(["Prompt", "Unit", "Value", "Source"])

Map:   0%|          | 0/1285 [00:00<?, ? examples/s]

In [4]:
rl_data[0]

{'prompt': [{'content': 'For the following questions, please make estimates based on your knowledge only; do not consult outside sources. Respond in the following format:\n<think> ... reasoning process here ... </think>\n<answer> ... </answer>\n',
   'role': 'system'},
  {'content': 'You are an expert assistant at estimating market size, TAM, and other relevant statistics. Please box your final answer as just a single number without any units, e.g. \\boxed{1.0}. We will interpret your answer as being in dollars.\nWhat is the total revenue of nuclear power providers in the US?',
   'role': 'user'}],
 'ground_truth': 37900000000.0}

In [5]:
# max_length = max([len(tokenizer.encode(row[0]['content'])) + len(tokenizer.encode(row[1]['content'])) for row in rl_data['prompt']])
# max_length

In [6]:
import numpy as np
import re

def accuracy_reward(completions, ground_truth, **kwargs):
  rewards = []
  for message_obj, gt in zip(completions, ground_truth):
    completion = message_obj[0]['content']
    match_obj = re.match(r"[\s\S]*<answer>(.*?)</answer>.*", completion)
    if match_obj is not None:
      raw_answer = match_obj.group(1)
      try:
        extracted_answer = float(re.sub(r"[^\d\.]", "", raw_answer))
        rewards.append(
            float((abs(extracted_answer - float(gt)) / float(gt)) <= 0.05)
        )
      except:
        rewards.append(0.0)
    else:
        rewards.append(0.0)
  return rewards


def score(x):
    # Approx. equal to 1 at |x|=0
    # Approx. equal to 0 at |x|=1
    # Assign higher reward as we get closer to |x|=0
    # R(0.5) = 0.04, R(0.25) = 0.46, R(0.1) = 0.88, ...
    # return 1 / np.sqrt(2 * np.pi * 0.1592) * np.exp(-0.5 * (x / 0.2) ** 2)
    return 1 / np.sqrt(2 * np.pi * 0.1592) * np.exp(-0.5 * (x / 0.3) ** 2)


def smoother_accuracy(completions, ground_truth, **kwargs):
  rewards = []
  for message_obj, gt in zip(completions, ground_truth):
    completion = message_obj[0]['content']
    match_obj = re.match(r"[\s\S]*<answer>(.*?)</answer>.*", completion)
    if match_obj is not None:
      raw_answer = match_obj.group(1)
      try:
        extracted_answer = float(re.sub(r"[^\d\.]", "", raw_answer))
        rewards.append(
            2 * score(float(abs(extracted_answer - float(gt))) / float(gt))
        )
      except Exception as e:
        print(e)
        rewards.append(0.0)
    else:
        rewards.append(0.0)
  return rewards


def format_reward_func(completions, ground_truth, **kwargs):
  total = 0
  pattern = r"^[\s]*<think>.*?</think>[\s]*<answer>.*?</answer>[\s]*$"
  at_least_once = [1 if re.match(pattern, completion[0]['content'], re.S) else 0 for completion in completions]
  exactly_once = [1 if len(re.findall("<think>", completion[0]['content'], re.S)) > 1 else 0 for completion in completions]
  # return [((gte1 + eq1) - 1) / 5 for gte1, eq1 in zip(at_least_once, exactly_once)]
  return [(gte1 + eq1) / 2 for gte1, eq1 in zip(at_least_once, exactly_once)]


def length_penalty(completions, ground_truth, **kwargs):
  return [-len(cmpl[0]['content']) / 105400 for cmpl in completions]

In [7]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
max_seq_length = 512 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.7, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["gate_proj", "up_proj", "down_proj",],
)

model.load_lora("/content/drive/MyDrive/cs234/unsloth_sft_ckpts/loras/phi-3.5-mini-instruct");

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-13 01:14:02 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.9: Fast Llama patching. Transformers: 4.48.3. vLLM: 0.7.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/phi-3.5-mini-instruct-bnb-4bit with actual GPU utilization = 69.2%
Unsloth: Your GPU has CUDA compute capability 8.0 with VRAM = 39.56 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 512. Num Sequences = 320.
Unsloth: vLLM's KV Cache can use up to 24.67 GB

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-13 01:14:32 model_runner.py:1115] Loading model weights took 2.1371 GB
INFO 03-13 01:14:32 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-13 01:14:35 worker.py:267] Memory profiling takes 2.66 seconds
INFO 03-13 01:14:35 worker.py:267] the current vLLM instance can use total_gpu_memory (39.56GiB) x gpu_memory_utilization (0.69) = 27.37GiB
INFO 03-13 01:14:35 worker.py:267] model weights take 2.14GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 0.38GiB; the rest of the memory reserved for KV Cache is 24.77GiB.
INFO 03-13 01:14:36 executor_base.py:111] # cuda blocks: 4226, # CPU blocks: 1024
INFO 03-13 01:14:36 executor_base.py:116] Maximum concurrency for 512 tokens per request: 132.06x
INFO 03-13 01:14:40 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error o

Capturing CUDA graph shapes: 100%|██████████| 43/43 [00:59<00:00,  1.38s/it]

INFO 03-13 01:15:40 model_runner.py:1562] Graph capturing finished in 60 secs, took 1.09 GiB
INFO 03-13 01:15:40 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 67.49 seconds



Not an error, but Unsloth cannot patch Attention layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Not an error, but Unsloth cannot patch O projection layer with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.3.9 patched 32 layers with 0 QKV layers, 0 O layers and 32 MLP layers.


In [8]:
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 1e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "paged_adamw_8bit",
    logging_steps = 1,
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 1, # Increase to 4 for smoother training
    num_generations = 4, # Decrease if out of memory
    max_prompt_length = 256,
    max_completion_length = 1024,
    num_train_epochs = 1, # Set to 1 for a full training run
    max_steps = -1,
    save_steps = 10000,
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    output_dir = "./grpo_ckpts",
)

trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[smoother_accuracy, format_reward_func, length_penalty],
    args=training_args,
    train_dataset=rl_data,
)
trainer.train()

Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 1 to the `num_generations` of 4


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,285 | Num Epochs = 1 | Total steps = 1,285
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 1 x 1) = 4
 "-____-"     Trainable parameters = 69,206,016/2,078,346,240 (3.33% trained)
Unsloth: Input IDs of length 513 > the model's max sequence length of 512.
We shall truncate it ourselves. It's imperative if you correct this issue first.


Step,Training Loss,reward,reward_std,completion_length,kl,rewards / smoother_accuracy,rewards / format_reward_func,rewards / length_penalty
1,-0.0,0.488623,0.004255,389.0,0.0,0.005798,0.5,-0.017175
2,0.0,0.362433,0.252431,391.0,0.0,0.003875,0.375,-0.016442
3,0.0,-0.006573,0.010164,372.75,0.0,0.006776,0.0,-0.013349
4,-0.0,0.631742,0.276601,378.0,0.0,0.146191,0.5,-0.01445
5,-0.0,0.483359,0.000186,384.0,0.0,0.0,0.5,-0.016641
6,-0.0,0.360323,0.249033,385.0,0.0,0.0,0.375,-0.014677
7,0.0,0.363598,0.249286,300.5,0.000771,0.001933,0.375,-0.013335
8,-0.0,0.237094,0.286914,337.75,0.0,0.0,0.25,-0.012906
9,5.2116,0.237203,0.289108,388.0,130.289047,0.001933,0.25,-0.01473
10,0.0069,0.50897,0.381352,387.0,0.171503,0.149331,0.375,-0.015361


Unsloth: Will smartly offload gradients to save VRAM!
could not convert string to float: '.'
could not convert string to float: '..'
could not convert string to float: '.1.0.'
could not convert string to float: '3300000000100.000010.0133000000'
could not convert string to float: '.'
could not convert string to float: '..'
could not convert string to float: ''
could not convert string to float: '..'
could not convert string to float: '.'
could not convert string to float: '.'
could not convert string to float: ''
could not convert string to float: '.'
could not convert string to float: ''
could not convert string to float: '.'
could not convert string to float: '.'
could not convert string to float: ''
could not convert string to float: '.'
could not convert string to float: '10000.231013.10'
could not convert string to float: '....'
could not convert string to float: '......'
could not convert string to float: '.'
could not convert string to float: '2.53700087.5.'
could not convert str

Step,Training Loss,reward,reward_std,completion_length,kl,rewards / smoother_accuracy,rewards / format_reward_func,rewards / length_penalty
1,-0.0,0.488623,0.004255,389.0,0.0,0.005798,0.5,-0.017175
2,0.0,0.362433,0.252431,391.0,0.0,0.003875,0.375,-0.016442
3,0.0,-0.006573,0.010164,372.75,0.0,0.006776,0.0,-0.013349
4,-0.0,0.631742,0.276601,378.0,0.0,0.146191,0.5,-0.01445
5,-0.0,0.483359,0.000186,384.0,0.0,0.0,0.5,-0.016641
6,-0.0,0.360323,0.249033,385.0,0.0,0.0,0.375,-0.014677
7,0.0,0.363598,0.249286,300.5,0.000771,0.001933,0.375,-0.013335
8,-0.0,0.237094,0.286914,337.75,0.0,0.0,0.25,-0.012906
9,5.2116,0.237203,0.289108,388.0,130.289047,0.001933,0.25,-0.01473
10,0.0069,0.50897,0.381352,387.0,0.171503,0.149331,0.375,-0.015361


could not convert string to float: '.'
could not convert string to float: '..'
could not convert string to float: '0.130000000.0013000.'
could not convert string to float: '20.735.1'
could not convert string to float: '.'
could not convert string to float: '.'
could not convert string to float: ''
could not convert string to float: ''
could not convert string to float: ''
could not convert string to float: ''
could not convert string to float: '.'
could not convert string to float: '1.590.10.05'
could not convert string to float: ''
could not convert string to float: '.'
could not convert string to float: '..50000..'
could not convert string to float: '.'
could not convert string to float: ''
could not convert string to float: ''
could not convert string to float: '.'
could not convert string to float: '...25'
could not convert string to float: '1.0.'
could not convert string to float: '..500000.'
could not convert string to float: '...'
could not convert string to float: ''
could not 

TrainOutput(global_step=1285, training_loss=4334.7821935846005, metrics={'train_runtime': 16641.028, 'train_samples_per_second': 0.077, 'train_steps_per_second': 0.077, 'total_flos': 0.0, 'train_loss': 4334.7821935846005})

In [11]:
path_specifier = "2smooth0.3-positive-formats-in-01-cosine-scheduler-lr1e-6--rewards-synth-only-sft-phi-3.5-mini-instruct"

In [20]:
# !mv ./grpo_ckpts/trainer_state.json ./grpo_ckpts/checkpoint-920/trainer_state.json

In [10]:
# torch.save(trainer.scheduler, "./grpo_ckpts/checkpoint-920/scheduler.pt")

In [None]:
model.save_pretrained(f"/content/drive/MyDrive/cs234/unsloth_sft_ckpts/rl_loras/{path_specifier}") # Local saving

In [None]:
import matplotlib.pyplot as plt
import json

def moving_average(data, window_size):
    return np.convolve(data, np.ones(window_size) / window_size, mode='same')

reward_metrics = [entry['reward'] for entry in trainer.state.log_history if 'reward' in entry]
reward_std_metrics = [entry['reward_std'] for entry in trainer.state.log_history if 'reward_std' in entry]

n_samples = 4
errors = [std / np.sqrt(n_samples) for std in reward_std_metrics]

window_size = 20
smoothed_rewards = moving_average(reward_metrics, window_size)
smoothed_stds = moving_average(reward_std_metrics, window_size)
smoothed_errors = moving_average(errors, window_size)
smoothed_lb = [val - err for val, err in zip(smoothed_rewards, smoothed_errors)]
smoothed_ub = [val + err for val, err in zip(smoothed_rewards, smoothed_errors)]

plt.plot(range(len(rl_data)), smoothed_rewards)
plt.fill_between(range(len(rl_data)), smoothed_lb, smoothed_ub, alpha=0.5)
plt.xlabel("Step")
plt.ylabel("Reward")
plt.title(f"Smoothed rewards over time (W={window_size})")

plt.savefig(f"/content/drive/MyDrive/cs234/{path_specifier}_smoothed_rewards.png")

with open(f"/content/drive/MyDrive/cs234/{path_specifier}_metrics.json", "w") as f:
  json.dump({
      "rewards": reward_metrics,
      "stds": reward_std_metrics,
  }, f)

In [None]:
plt.clf()
plt.plot(range(len(rl_data)), reward_metrics)
plt.xlabel("Step")
plt.ylabel("Reward")
plt.title(f"Rewards over time")
plt.savefig(f"/content/drive/MyDrive/cs234/{path_specifier}_rewards.png")

# GRPO Eval

In [3]:
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch

max_seq_length = 2048 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name="/content/drive/MyDrive/cs234/unsloth_rl_ckpts/2smooth0.3-lr1e-6-augmented-sft-ckpt--rewards-phi-3.5-mini-instruct/checkpoint-1285",
    model_name="unsloth/phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.7, # Reduce if out of memory
)

FastLanguageModel.for_inference(model)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 03-13 19:13:58 __init__.py:207] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.10: Fast Llama patching. Transformers: 4.48.3. vLLM: 0.7.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/phi-3.5-mini-instruct-bnb-4bit with actual GPU utilization = 69.2%
Unsloth: Your GPU has CUDA compute capability 8.0 with VRAM = 39.56 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 320.
Unsloth: vLLM's KV Cache can use up to 24.67 

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-13 19:14:23 model_runner.py:1115] Loading model weights took 2.1371 GB
INFO 03-13 19:14:23 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 03-13 19:14:26 worker.py:267] Memory profiling takes 2.72 seconds
INFO 03-13 19:14:26 worker.py:267] the current vLLM instance can use total_gpu_memory (39.56GiB) x gpu_memory_utilization (0.69) = 27.37GiB
INFO 03-13 19:14:26 worker.py:267] model weights take 2.14GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 0.39GiB; the rest of the memory reserved for KV Cache is 24.76GiB.
INFO 03-13 19:14:27 executor_base.py:111] # cuda blocks: 4225, # CPU blocks: 1024
INFO 03-13 19:14:27 executor_base.py:116] Maximum concurrency for 2048 tokens per request: 33.01x
INFO 03-13 19:14:31 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error o

Capturing CUDA graph shapes: 100%|██████████| 43/43 [00:59<00:00,  1.39s/it]

INFO 03-13 19:15:31 model_runner.py:1562] Graph capturing finished in 60 secs, took 1.09 GiB
INFO 03-13 19:15:31 llm_engine.py:436] init engine (profile, create kv cache, warmup model) took 67.69 seconds





tokenizer_config.json:   0%|          | 0.00/3.37k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((

In [27]:
### SANITY CHECK ###
text = tokenizer.apply_chat_template([
    {"role" : "system", "content" : "You are an expert assistant at estimating market size, TAM, and other relevant statistics. Please box your final answer as just a single number without any units, e.g. \\boxed{{1.0}}. We will interpret your answer as being in dollars."},
    {"role": "user", "content": "What is the total market size for commercial lithographic printing in the US?"},
], tokenize = False)

text += "\n<|assistant|>\n<think>"
text = text.replace('\n<|endoftext|>', '')

print(text)

input_ids = tokenizer.encode(text, return_tensors='pt').to("cuda")
print(type(input_ids))
print(input_ids)

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

_ = model.generate(input_ids, streamer=text_streamer, max_new_tokens=4096, pad_token_id=tokenizer.eos_token_id)

<|system|>
You are an expert assistant at estimating market size, TAM, and other relevant statistics. Please box your final answer as just a single number without any units, e.g. \boxed{{1.0}}. We will interpret your answer as being in dollars.<|end|>
<|user|>
What is the total market size for commercial lithographic printing in the US?<|end|>
<|assistant|>
<think>
<class 'torch.Tensor'>
tensor([[32006,   887,   526,   385, 17924, 20255,   472,  4844,  1218,  9999,
          2159, 29892,   323,  5194, 29892,   322,   916,  8018, 13964, 29889,
          3529,  3800,   596,  2186,  1234,   408,   925,   263,  2323,  1353,
          1728,   738, 10340, 29892,   321, 29889, 29887, 29889,   320,  1884,
           287,  6224, 29896, 29889, 29900, 27243,  1334,   674,  6613,   596,
          1234,   408,  1641,   297, 17208, 29889, 32007, 32010,  1724,   338,
           278,  3001,  9999,  2159,   363, 12128,   301,   389, 12122, 14010,
           297,   278,  3148, 29973, 32007, 32001,   529

In [4]:
from google.colab import drive

import pandas as pd

drive.mount('/content/drive')
raw_rl_data = pd.read_excel("/content/drive/MyDrive/cs234/market_stats.xlsx", usecols="A:D")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
from datasets import Dataset

SYSTEM_PROMPT = """
You are an expert assistant at estimating market size, TAM, and other relevant statistics. We will interpret your answer as being in {unit}.
Respond in the following format:
<think>
...
</think>
<answer>
...
</answer>
"""

def transform(row):
  return {
      "prompt": [
          {"role": "system", "content": SYSTEM_PROMPT.format(unit=row["Unit"])},
          {"role": "user", "content": row['Prompt']}
      ],
      "ground_truth": row["Value"]
  }


rl_data = Dataset.from_pandas(raw_rl_data).map(transform).remove_columns(["Prompt", "Unit", "Value", "Source"])

Map:   0%|          | 0/1285 [00:00<?, ? examples/s]

In [6]:
import numpy as np
import re

def score(x):
    # Approx. equal to 1 at |x|=0
    # Approx. equal to 0 at |x|=1
    # Assign higher reward as we get closer to |x|=0
    # R(0.5) = 0.04, R(0.25) = 0.46, R(0.1) = 0.88, ...
    # return 1 / np.sqrt(2 * np.pi * 0.1592) * np.exp(-0.5 * (x / 0.2) ** 2)
    return 1 / np.sqrt(2 * np.pi * 0.1592) * np.exp(-0.5 * (x / 0.3) ** 2)


def smoother_accuracy_from_text(completion, reference):
    match_obj = re.match(r"[\s\S]*<answer>(.*?)</answer>.*", completion, re.S)
    if match_obj is not None:
      raw_answer = match_obj.group(1)
      try:
        extracted_answer = float(re.sub(r"[^\d\.]", "", raw_answer))
        return 2 * score(float(abs(extracted_answer - float(reference))) / float(reference))
      except Exception as e:
        print(e)
        return 0.0
    else:
        return 0.0

In [7]:
### SANITY CHECK ###
from tqdm import tqdm
rewards = []
for idx, row in tqdm(enumerate(rl_data)):
  text = tokenizer.apply_chat_template(row['prompt'], tokenize = False)
  text += "\n<|assistant|>\n<think>"
  text = text.replace('\n<|endoftext|>', '')

  input_ids = tokenizer.encode(text, return_tensors='pt').to("cuda")

  response = tokenizer.decode(model.generate(input_ids, max_new_tokens=4096, pad_token_id=tokenizer.eos_token_id)[0])
  rewards.append(smoother_accuracy_from_text(response, float(row['ground_truth'])))

0it [00:00, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
1it [00:10, 10.21s/it]

could not convert string to float: '991.2118.8118.8.'


2it [02:55, 101.60s/it]

could not convert string to float: '...'


3it [05:42, 131.15s/it]

could not convert string to float: '2023....'


4it [08:27, 144.53s/it]

could not convert string to float: '...'


5it [11:11, 151.86s/it]

could not convert string to float: '...'


6it [13:56, 156.33s/it]

could not convert string to float: '...'


7it [16:41, 159.11s/it]

could not convert string to float: '...'


8it [19:26, 160.90s/it]

could not convert string to float: '..1..2..3..4....'


9it [22:11, 162.25s/it]

could not convert string to float: '...'


10it [24:57, 163.29s/it]

could not convert string to float: '.....'


11it [25:10, 117.32s/it]

could not convert string to float: '...'


12it [27:55, 131.98s/it]

could not convert string to float: '...'


13it [30:41, 142.10s/it]

could not convert string to float: '...'


14it [33:26, 148.98s/it]

could not convert string to float: '...'


15it [36:10, 153.71s/it]

could not convert string to float: '...'


16it [38:55, 157.04s/it]

could not convert string to float: '1..2...3.0..4...'


17it [41:41, 159.63s/it]

could not convert string to float: '.......'


19it [45:01, 135.16s/it]

could not convert string to float: '...'


20it [47:46, 144.07s/it]

could not convert string to float: '.....'


21it [50:24, 148.17s/it]

could not convert string to float: '1..2..3..4....'


22it [53:01, 151.05s/it]

could not convert string to float: '...'


23it [55:39, 153.05s/it]

could not convert string to float: '...'


24it [58:17, 154.55s/it]

could not convert string to float: '.1..2..3..4.....'


25it [1:00:55, 155.49s/it]

could not convert string to float: '1..2..3....'


26it [1:03:33, 156.25s/it]

could not convert string to float: '...'


27it [1:06:11, 156.69s/it]

could not convert string to float: '...'


28it [1:08:50, 157.37s/it]

could not convert string to float: '...'


29it [1:11:35, 159.78s/it]

could not convert string to float: '.1.....2....3....0.'


30it [1:11:47, 115.49s/it]

could not convert string to float: '...'


31it [1:14:32, 130.44s/it]

could not convert string to float: '...'


32it [1:17:18, 141.11s/it]

could not convert string to float: '...'


33it [1:20:04, 148.46s/it]

could not convert string to float: '...'


34it [1:22:50, 153.59s/it]

could not convert string to float: '...'


35it [1:25:35, 157.05s/it]

could not convert string to float: '...'


36it [1:28:20, 159.51s/it]

could not convert string to float: '...'


37it [1:31:05, 161.28s/it]

could not convert string to float: '...'


38it [1:33:51, 162.58s/it]

could not convert string to float: '...'


39it [1:34:02, 117.08s/it]

could not convert string to float: '100.150.10000.10015010000150000000150.'


40it [1:36:47, 131.44s/it]

could not convert string to float: '........'


41it [1:39:32, 141.59s/it]

could not convert string to float: '2023....'


42it [1:42:17, 148.65s/it]

could not convert string to float: '2023..1..2..3...'


43it [1:45:02, 153.62s/it]

could not convert string to float: '.'


44it [1:47:48, 157.09s/it]

could not convert string to float: '...'


45it [1:50:33, 159.68s/it]

could not convert string to float: '....'


46it [1:53:20, 161.67s/it]

could not convert string to float: '2023......'


47it [1:56:05, 162.80s/it]

could not convert string to float: '2023....'


48it [1:58:51, 163.59s/it]

could not convert string to float: '...'


49it [2:01:35, 163.94s/it]

could not convert string to float: '..1..2..3..10550..'


50it [2:04:21, 164.46s/it]

could not convert string to float: '..'


51it [2:07:06, 164.70s/it]

could not convert string to float: '...'


52it [2:09:52, 165.02s/it]

could not convert string to float: '...'


53it [2:12:38, 165.26s/it]

could not convert string to float: '.'


54it [2:15:23, 165.22s/it]

could not convert string to float: '..'


55it [2:18:09, 165.51s/it]

could not convert string to float: '2023...'


56it [2:20:55, 165.47s/it]

could not convert string to float: '.'


57it [2:23:40, 165.34s/it]

could not convert string to float: '..'


58it [2:26:25, 165.43s/it]

could not convert string to float: '2023...'


59it [2:29:11, 165.56s/it]

could not convert string to float: '...'


60it [2:31:56, 165.41s/it]

could not convert string to float: '..'


61it [2:34:42, 165.48s/it]

could not convert string to float: '2023....'


62it [2:37:27, 165.36s/it]

could not convert string to float: '..'


63it [2:40:12, 165.20s/it]

could not convert string to float: '..'


64it [2:42:57, 165.32s/it]

could not convert string to float: '1......2....3..4..5..6..7...'


65it [2:45:44, 165.64s/it]

could not convert string to float: '..'


66it [2:48:29, 165.46s/it]

could not convert string to float: '2023......'


67it [2:51:14, 165.38s/it]

could not convert string to float: '..'


68it [2:53:59, 165.44s/it]

could not convert string to float: '2023......'


69it [2:56:46, 165.63s/it]

could not convert string to float: '1...2...3......'


70it [2:59:31, 165.55s/it]

could not convert string to float: '1....2..3..4...'


71it [3:02:16, 165.38s/it]

could not convert string to float: '...'


72it [3:05:01, 165.41s/it]

could not convert string to float: '.1..2....3...'


73it [3:07:46, 165.21s/it]

could not convert string to float: '..'


74it [3:10:31, 165.14s/it]

could not convert string to float: '..'


75it [3:13:17, 165.29s/it]

could not convert string to float: '2023......'


76it [3:16:02, 165.18s/it]

could not convert string to float: '2023....'


77it [3:18:48, 165.51s/it]

could not convert string to float: '2023....'


78it [3:21:33, 165.43s/it]

could not convert string to float: '1..2..3..4.0.5...'


79it [3:24:19, 165.54s/it]

could not convert string to float: '.1....2..3...'


80it [3:27:05, 165.73s/it]

could not convert string to float: '2023.....'


81it [3:27:38, 125.76s/it]

could not convert string to float: '2023......'


82it [3:30:23, 137.58s/it]

could not convert string to float: '...'


83it [3:33:08, 145.99s/it]

could not convert string to float: '...'


84it [3:35:54, 151.91s/it]

could not convert string to float: '...'


84it [3:37:43, 155.52s/it]


KeyboardInterrupt: 

In [8]:
sum(rewards)/len(rewards)

0.0

In [9]:
# TRAINED PHI-3.5 MINI MODEL
# 0.21617820773568455

# RAW: 0 after 85 steps