# GRPO RL Training

In [1]:
!pip install transformers trl



In [2]:
# Warning control: install a blanket "ignore" filter so that anything raised
# through the `warnings` module is suppressed in the cell outputs below.
import warnings
warnings.filterwarnings('ignore')

## Import libraries

In [3]:
import torch
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from trl import GRPOTrainer, GRPOConfig
from datasets import load_dataset, Dataset
import re
import pandas as pd
from tqdm import tqdm
import os

os.environ["WANDB_DISABLED"] = "true"

## Some Utility Functions

In [4]:
def generate_responses(model,
                       tokenizer,
                       user_message=None,
                       system_message=None,
                       max_new_tokens=300,
                       full_message=None):
    """Generate one greedy-decoded assistant reply for a chat prompt.

    Args:
        model: Causal language model exposing `.device` and `.generate(...)`.
        tokenizer: Tokenizer exposing `apply_chat_template`, `__call__`,
            `decode` and `eos_token_id`.
        user_message: Text of the user turn. Used only when `full_message`
            is empty or None.
        system_message: Optional system prompt placed before the user turn.
            Ignored when `full_message` is used.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        full_message: Optional ready-made list of `{"role", "content"}`
            dicts. When non-empty it is used as the conversation as-is.

    Returns:
        The decoded completion only (prompt tokens are sliced off and special
        tokens skipped), with surrounding whitespace stripped.

    Raises:
        ValueError: If neither `full_message` nor `user_message` is provided.
    """
    # Format chat using tokenizer's chat template
    if full_message:
        messages = full_message
    else:
        if user_message is None:
            # Previously this silently produced a user turn whose content was
            # None; fail loudly instead of prompting the model with garbage.
            raise ValueError(
                "generate_responses requires either `user_message` or `full_message`."
            )
        messages = []
        if system_message:
            messages.append({"role": "system", "content": system_message})
        messages.append({"role": "user", "content": user_message})

    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Keep only the tokens generated after the prompt.
    input_len = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][input_len:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return response

def test_model_with_questions(model, tokenizer, questions,
                              system_message=None, title="Model Output"):
    """Print the model's reply to each question under a titled banner.

    Args:
        model: Model forwarded to `generate_responses`.
        tokenizer: Tokenizer forwarded to `generate_responses`.
        questions: Iterable of user prompts, queried one at a time.
        system_message: Optional system prompt used for every question.
        title: Text shown in the `=== ... ===` banner printed first.
    """
    print(f"\n=== {title} ===")
    # Questions are numbered from 1 in the printed output.
    for i, question in enumerate(questions, start=1):
        response = generate_responses(
            model,
            tokenizer,
            user_message=question,
            system_message=system_message,
        )
        print(f"\nModel Input {i}:\n{question}\nModel Output {i}:\n{response}\n")

def load_model_and_tokenizer(model_name,
                             use_gpu = False):
    """Load a causal LM and its tokenizer, filling in missing tokenizer settings.

    Args:
        model_name: Identifier or path passed to both `from_pretrained` calls.
        use_gpu: When True, the model is moved to the "cuda" device.

    Returns:
        A `(model, tokenizer)` tuple. The tokenizer is guaranteed to have a
        pad token (falling back to its EOS token) and a chat template
        (falling back to the simple System/User/Assistant template below).
    """
    # Plain-text template used only when the tokenizer ships without one.
    fallback_template = """{% for message in messages %}
                {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
                {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
                {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>
                {% endif %}
                {% endfor %}"""

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    if use_gpu:
        model.to("cuda")

    # Reuse EOS for padding when no dedicated pad token exists.
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    if not tokenizer.chat_template:
        tokenizer.chat_template = fallback_template

    return model, tokenizer



def display_dataset(dataset, num_examples=3):
    """Show the first few conversations of a chat dataset as a table.

    Args:
        dataset: Indexable, sized collection whose rows have a `messages`
            list of `{"role", "content"}` dicts.
        num_examples: Maximum number of leading rows to show (default 3).
            Fewer rows are shown when the dataset is shorter.

    Side effects:
        Sets the pandas option `display.max_colwidth` to None (so long
        strings are not truncated) and renders the table with `display`.
    """
    def _first_content(messages, role):
        # First message with the given role; empty string when the role is absent.
        return next((m['content'] for m in messages if m['role'] == role), "")

    # Clamp to the dataset size so short datasets do not raise IndexError.
    rows = []
    for i in range(min(num_examples, len(dataset))):
        example = dataset[i]
        rows.append({
            'User Prompt': _first_content(example['messages'], 'user'),
            'Assistant Response': _first_content(example['messages'], 'assistant'),
        })

    # Display as table
    df = pd.DataFrame(rows)
    pd.set_option('display.max_colwidth', None)  # Avoid truncating long strings
    display(df)

## Prepare the evaluation dataset for math: GSM8K

In [5]:
# Use the GPU only when CUDA is actually available, so the notebook also runs
# end-to-end on CPU-only machines instead of failing when the model is moved
# to "cuda". Set to False manually to force a CPU run.
USE_GPU = torch.cuda.is_available()

# System prompt sent with every question; it asks for the final numeric answer
# inside \boxed{}, which is the pattern the reward function searches for.
SYSTEM_PROMPT = (
    "You are a helpful assistant that solves problems step-by-step. "
    "Always include the final numeric answer inside \\boxed{}."
)

In [6]:
def reward_func(completions, ground_truth, **kwargs):
    r"""Binary reward: 1.0 when the first \boxed{...} answer equals the label.

    Args:
        completions: List of completions; each is a list of message dicts and
            the text is read from `completion[0]['content']`.
        ground_truth: Expected answers, aligned with `completions`. An entry
            of None (no reference answer) always scores 0.0.
        **kwargs: Accepted and ignored.

    Returns:
        A list of floats (1.0 or 0.0), one per completion.

    Notes:
        Both sides are normalised before comparison (surrounding whitespace
        stripped, thousands-separator commas removed), so `\boxed{ 2,520 }`
        matches a label of "2520". A completion without a `\boxed{...}`
        never earns a reward.
    """
    def _normalize(value):
        return str(value).strip().replace(",", "")

    rewards = []
    for completion, gt in zip(completions, ground_truth):
        # Regular expression to capture content inside \boxed{}
        match = re.search(r"\\boxed\{(.*?)\}", completion[0]['content'])
        if match is None or gt is None:
            rewards.append(0.0)
            continue
        # Reward 1 if the content is the same as the ground truth, 0 otherwise
        rewards.append(1.0 if _normalize(match.group(1)) == _normalize(gt) else 0.0)
    return rewards

In [7]:
# Sanity check: a completion whose \boxed{} value equals the label should
# receive a reward of 1.0.
sample_pred = [[{"role": "assistant",
                 "content": r"...Calculating the answer. \boxed{72}"}]]
ground_truth = ["72"]
reward = reward_func(sample_pred, ground_truth)
print(f"Positive Sample Reward: {reward}")

Positive Sample Reward: [1.0]


In [8]:
# Sanity check: a completion whose \boxed{} value differs from the label
# should receive a reward of 0.0.
sample_pred = [[{"role": "assistant",
                 "content": r"...Calculating the answer \boxed{71}"}]]
ground_truth = ["72"]
reward = reward_func(sample_pred, ground_truth)
print(f"Negative Sample Reward: {reward}")

Negative Sample Reward: [0.0]


## Load the Evaluation Dataset

In [9]:
# Number of evaluation examples to sample from the test set.
data_num = 5

# Read the evaluation data from a local parquet file into a DataFrame.
df = pd.read_parquet("grpo-test-ds.parquet")

In [10]:
# Seed the sampling so the same evaluation subset is drawn on every run;
# otherwise accuracy numbers are not comparable across kernel restarts.
eval_dataset = Dataset.from_pandas(df.sample(data_num, random_state=23))
sample_df = eval_dataset.to_pandas()
display(sample_df)

Unnamed: 0,question,answer,__index_level_0__
0,Lloyd has an egg farm. His chickens produce 25...,"In a week, Lloyd's egg farm produces 252 x 7 =...",50
1,Charlie has boots that are five times the size...,"If Sophie wears size five shoes, Charlie wears...",983
2,Mia and Emma are currently 16 years apart in a...,"If Mia is 40 years old, Emma is 40+16 = <<40+1...",1277
3,James decides to buy birthday candles for his ...,His younger son is 12-4=<<12-4=8>>8 years old\...,563
4,Joseph had 3 times as many notebooks as Martha...,"Since she bought 5 more to have 7, she origina...",897


In [11]:
def post_processing(example):
    """Convert a raw question/answer row into a GRPO-ready example.

    Args:
        example: Mapping with a `question` string and an `answer` string
            whose final result follows a `####` marker.

    Returns:
        The same mapping with two added keys:
        - `ground_truth`: the final number as a string with thousands
          separators removed (e.g. "2520000"), or None when no `####`
          number is found.
        - `prompt`: a two-message chat (system prompt + user question).
    """
    # Capture the whole number after "####", including thousands separators
    # and an optional decimal part. Matching only `\d+` would cut
    # "2,520,000" down to "2" and "2.5" down to "2".
    match = re.search(r"####\s*(-?\d[\d,]*(?:\.\d+)?)", example["answer"])
    example["ground_truth"] = match.group(1).replace(",", "") if match else None
    example["prompt"] = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": example["question"]}
    ]
    return example
# Add ground_truth/prompt to every row, then drop the raw text columns.
eval_dataset = eval_dataset.map(post_processing).remove_columns(["question", "answer"])


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [12]:
# Preview at most five processed rows; clamping to the dataset length keeps
# this cell working when fewer than five examples were sampled.
sample_df = eval_dataset.select(range(min(5, len(eval_dataset)))).to_pandas()
display(sample_df)

Unnamed: 0,__index_level_0__,ground_truth,prompt
0,50,294,[{'content': 'You are a helpful assistant that...
1,983,15,[{'content': 'You are a helpful assistant that...
2,1277,48,[{'content': 'You are a helpful assistant that...
3,563,12,[{'content': 'You are a helpful assistant that...
4,897,1,[{'content': 'You are a helpful assistant that...


## Load the model and evaluate

In [13]:
# Load the untrained instruct model to measure baseline accuracy before GRPO.
model, tokenizer = load_model_and_tokenizer("Qwen/Qwen2.5-0.5B-Instruct", USE_GPU)

In [14]:
# Store predictions and ground truths
all_preds = []
all_labels = []

for example in tqdm(eval_dataset):
    input_prompt = example["prompt"]
    ground_truth = example["ground_truth"]
    # Run the model to generate an answer
    with torch.no_grad():
        response = generate_responses(model, tokenizer,
                                      full_message = input_prompt)
    all_preds.append([{"role": "assistant", "content": response}])
    all_labels.append(ground_truth)
    print(response)
    print("___________________________________________________________________")
    print("Ground truth: ", ground_truth)
    print("___________________________________________________________________")

# 3. Evaluate using reward_func
rewards = reward_func(all_preds, all_labels)

# 4. Report accuracy
accuracy = sum(rewards) / len(rewards)
print(f"Evaluation Accuracy: {accuracy:.2%}")
del model, tokenizer

  0%|          | 0/5 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
 20%|██        | 1/5 [00:14<00:56, 14.16s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


To determine how much Lloyd makes from selling his eggs, we need to follow these steps:

1. Calculate the number of dozens of eggs produced in one week.
2. Determine the revenue generated from selling those dozens of eggs.

First, let's find out how many dozens of eggs Lloyd produces in one week. Since there are 7 days in a week, we divide the total number of eggs by 7:
\[
\text{Number of dozens} = \frac{252}{7}
\]
Performing the division gives us:
\[
\frac{252}{7} = 36
\]
So, Lloyd produces 36 dozens of eggs in one week.

Next, we calculate the revenue from selling these dozens of eggs. Each dozen costs $2, so the revenue is:
\[
\text{Revenue} = 36 \times 2 = 72
\]

Therefore, the amount of money Lloyd makes on eggs per week is:
\[
\boxed{72}
\]
___________________________________________________________________
Ground truth:  294
___________________________________________________________________


 40%|████      | 2/5 [00:29<00:45, 15.15s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


To determine the average size of shoes worn by Charlie and Sophie, we need to follow these steps:

1. Identify the size of Sophie's boots.
2. Calculate the size of Charlie's boots based on the given relationship.
3. Find the total size of both shoes combined.
4. Divide the total size by 2 to find the average size.

Let's start with the first step. The problem states that Sophie wears size five boots. Therefore, the size of Sophie's boots is 5.

Next, we know that Charlie's boots are five times the size of Sophie's boots. So, we calculate the size of Charlie's boots as follows:
\[ \text{Size of Charlie's boots} = 5 \times 5 = 25 \]

Now, we find the total size of both shoes combined by adding the sizes of Charlie's and Sophie's boots:
\[ \text{Total size} = 5 + 25 = 30 \]

Finally, we find the average size of the two shoes by dividing the total size by 2:
\[ \text{Average size} = \frac{30}{2} = 15 \]

Thus, the average size of the shoes worn by Charlie and Sophie is \(\boxed{15}\).
____

 60%|██████    | 3/5 [00:43<00:28, 14.34s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


To find the average of Mia's and Emma's ages, we first need to determine their current ages.

Given:
- Mia is 40 years old.
- Mia is younger than Emma.

Since Mia is older than Emma, Emma must be 40 - 40 = 0 years old.

Now, we have two ages: Mia (40) and Emma (0).

The average of these two ages is calculated as follows:

\[
\text{Average} = \frac{\text{Mia's age} + \text{Emma's age}}{2}
\]

Substituting the given values:

\[
\text{Average} = \frac{40 + 0}{2} = \frac{40}{2} = 20
\]

Therefore, the average of Mia's and Emma's ages is 20 years.
___________________________________________________________________
Ground truth:  48
___________________________________________________________________


 80%|████████  | 4/5 [00:50<00:11, 11.61s/it]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


To determine how much James spends on candles, we need to follow these steps:

1. **Identify the age difference between the two sons:**
   - The first son is 12 years old.
   - The second son is 4 years younger than the first son.
   - Therefore, the second son's age is:
     \[
     12 - 4 = 8 \text{ years old}
     \]

2. **Calculate the total number of candles needed:**
   - Each candle pack contains 5 candles.
   - There are 2 sons, so the total number of candles required is:
     \[
     2 \times 5 = 10 \text{ packs}
     \]

3. **Determine the cost per pack:**
   - Each pack costs $3.

4. **Calculate the total cost:**
   - Multiply the number of packs by the cost per pack:
     \[
     10 \text{ packs} \times 3 \text{ dollars/pack} = 30 \text{ dollars}
     \]

Thus, the total amount James spends on candles is \(\boxed{30}\).
___________________________________________________________________
Ground truth:  12
___________________________________________________________________


100%|██████████| 5/5 [00:55<00:00, 11.06s/it]

Let's denote the number of notebooks Martha originally had as \( M \). According to the problem, Joseph initially had 3 times as many notebooks as Martha, so Joseph had \( 3M \) notebooks.

After buying 5 more notebooks, Martha's total number of notebooks became \( M + 5 \), and Joseph's total number of notebooks became \( 3M + 5 \).

The difference between Joseph's new total and Martha's original total is:

\[ (3M + 5) - (M + 5) = 2M \]

This means that after buying 5 more notebooks, Joseph has 2 notebooks more than Martha. Therefore, the answer is:

\[
\boxed{2}
\]
___________________________________________________________________
Ground truth:  1
___________________________________________________________________
Evaluation Accuracy: 20.00%





## Loading the training dataset

In [15]:
# Number of training examples drawn from the training parquet file.
train_data_num = 100

train_df = pd.read_parquet("grpo-train-ds.parquet")

# Seeded sample -> add ground_truth/prompt -> drop the raw text columns.
train_dataset = (
    Dataset.from_pandas(train_df.sample(train_data_num, random_state=23))
    .map(post_processing)
    .remove_columns(["question", "answer"])
)

# Without a GPU, keep only the first 10 examples.
if not USE_GPU:
    train_dataset = train_dataset.select(range(10))

print(train_dataset[0])

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

{'__index_level_0__': 1264, 'ground_truth': '2', 'prompt': [{'content': 'You are a helpful assistant that solves problems step-by-step. Always include the final numeric answer inside \\boxed{}.', 'role': 'system'}, {'content': 'A monitor is 21 inches by 12 inches.  There are 100 dots per inch.  How many total pixels are there?', 'role': 'user'}]}


In [16]:
# Number of training examples that will be fed to the trainer.
len(train_dataset)

100

## GRPO Training

In [18]:
import torch
import gc

#del model, tokenizer

#torch.cuda.empty_cache()
#gc.collect()

In [19]:
# GRPO training configuration.
config = GRPOConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_generations=4, # Can set as high as 64 or 128
    num_train_epochs=1,
    learning_rate=5e-6,
    logging_steps=5,
    report_to="none",  # disable external experiment trackers
    # Honour USE_GPU in the trainer as well: with USE_GPU=False the whole run
    # stays on CPU instead of being moved to an available CUDA/MPS device.
    # `use_cpu` replaces the deprecated `no_cuda` argument.
    use_cpu=not USE_GPU,
)

<div style="background-color:#fff6ff; padding:13px; border-width:3px; border-color:#efe6ef; border-style:solid; border-radius:6px">
<p> 💻 &nbsp; <b>Note:</b> We're performing GRPO on a small model <code>HuggingFaceTB/SmolLM2-135M-Instruct</code> and a smaller training dataset to ensure the full training process can run on limited computational resources. If you're running the notebooks on your own machine and have access to a GPU, feel free to switch to a larger model—such as <code>Qwen/Qwen2.5-0.5B-Instruct</code>—to perform full GRPO and reproduce the results shown above.</p>
</div>

In [20]:
# Load a fresh copy of the model and tokenizer for GRPO training.
# If this cell hangs or the kernel restarts during training, skip the earlier
# cells that load the 0.5B model for baseline evaluation and run this one directly.

model, tokenizer = load_model_and_tokenizer("Qwen/Qwen2.5-0.5B-Instruct", USE_GPU)

In [21]:
# Build the GRPO trainer from the loaded model, the config defined above,
# the \boxed{}-matching reward function and the processed training set.
grpo_trainer = GRPOTrainer(
    model=model,
    args=config,
    reward_funcs=reward_func,
    train_dataset=train_dataset,
)

# Run training; the returned TrainOutput is shown as the cell result.
grpo_trainer.train()

Step,Training Loss
5,0.0
10,0.0
15,0.0
20,-0.0
25,-0.0
30,0.0
35,0.0
40,-0.0
45,-0.0
50,-0.0


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TrainOutput(global_step=50, training_loss=-8.940696716308593e-10, metrics={'train_runtime': 1243.3773, 'train_samples_per_second': 0.08, 'train_steps_per_second': 0.04, 'total_flos': 0.0, 'train_loss': -8.940696716308593e-10})

## Store the final model

In [None]:
# Persist the trained model to ./grpo_model1.
# NOTE(review): the results cell loads "./grpo_model" (no trailing "1") —
# presumably a separately provided checkpoint; confirm the paths are intended to differ.
grpo_trainer.save_model("./grpo_model1")

## Results of the fully trained Qwen model

<hr>

**Note:** Due to limited computational resources, we used a small model and dataset for GRPO training. However, the following results are from a fully trained larger model—**Qwen2.5-0.5B**—to demonstrate the complete outcome of the GRPO process. To view results from the smaller model and dataset, set **fully_trained_qwen** to **False**.

In [None]:
# Choose which model to evaluate:
#   True  -> load model and tokenizer from the "./grpo_model" directory
#            (NOTE(review): presumably a pre-trained checkpoint shipped with
#            the notebook; it is not the "./grpo_model1" path saved above — confirm)
#   False -> use the model held by the trainer from this session; the
#            `tokenizer` variable from the training section is reused as-is.
fully_trained_qwen = True
if fully_trained_qwen:
    final_model, tokenizer = load_model_and_tokenizer("./grpo_model", USE_GPU)
else:
    final_model = grpo_trainer.model

In [None]:
# Post-training evaluation: generate an answer for every evaluation prompt
# with the final model and score it against the ground truth.

# Store predictions and ground truths
all_preds = []
all_labels = []

for example in tqdm(eval_dataset):
    input_prompt = example["prompt"]
    ground_truth = example["ground_truth"]
    # Run the model to generate an answer. generate_responses already runs
    # generation under torch.no_grad(), so no extra context manager is needed.
    response = generate_responses(final_model, tokenizer,
                                  full_message = input_prompt)
    # reward_func expects each completion as a list of message dicts.
    all_preds.append([{"role": "assistant", "content": response}])
    all_labels.append(ground_truth)
    print(response)
    print("___________________________________________________________________")
    print("Ground truth: ", ground_truth)
    print("___________________________________________________________________")

# Evaluate using reward_func
rewards = reward_func(all_preds, all_labels)

# Report accuracy (an empty evaluation set reports 0 instead of dividing by zero)
accuracy = sum(rewards) / len(rewards) if rewards else 0.0
print(f"Evaluation Accuracy: {accuracy:.2%}")