### Online Reinforcement Learning (RL) using Group Relative Policy Optimization (GRPO)

In [None]:
#### import libraries
import torch
from transformers import TrainingArguments, AutoModelForCausalLM, AutoTokenizer
from trl import  GRPOTrainer, GRPOConfig
from datasets import load_dataset, Dataset
import re
import pandas as pd
from tqdm import tqdm


##### Helper functions

In [None]:
def generate_responses(model, tokenizer, full_message=None, max_new_tokens=500):
    """Generate a single assistant reply for a chat conversation.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer paired with ``model``; must provide
            ``apply_chat_template``.
        full_message: Conversation handed to the chat template (in this
            notebook, a list of ``{"role", "content"}`` dicts).
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation only (prompt tokens removed, special tokens
        skipped), stripped of surrounding whitespace.
    """
    # Render the conversation to plain text, ending with the generation prompt
    # so the model continues as the assistant.
    chat_text = tokenizer.apply_chat_template(
        full_message,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    # Tokenize and move the tensors to the model's device (e.g. the GPU).
    encoded = tokenizer(chat_text, return_tensors="pt").to(model.device)

    # Sampling is disabled, so decoding is deterministic. The EOS token also
    # serves as the padding token during generation.
    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # For real workloads a dedicated inference engine (vLLM, SGLang or
    # TensorRT) is recommended over plain `generate`.
    with torch.no_grad():
        sequences = model.generate(**encoded, **generation_kwargs)

    # The output starts with the prompt tokens; keep only what comes after.
    prompt_length = encoded["input_ids"].shape[1]
    new_token_ids = sequences[0][prompt_length:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()


In [None]:
def test_model_with_questions(model, tokenizer, questions,
                             system_message=None, title="Model Output"):
    """Run the model on each question and display prompts next to replies.

    Args:
        model: Causal language model forwarded to ``generate_responses``.
        tokenizer: Tokenizer forwarded to ``generate_responses``.
        questions: Iterable of user questions (plain strings).
        system_message: Optional system prompt placed before every question.
        title: Banner text printed above the results table.

    Returns:
        None. The results are rendered as a pandas DataFrame via ``display``.
    """
    print(f"\n==== {title} ====\n")
    rows = []
    for question in questions:
        # Build a proper chat conversation. The question must be wrapped as a
        # user turn, and the system prompt belongs in the conversation rather
        # than in the positional `max_new_tokens` slot.
        messages = []
        if system_message:
            messages.append({"role": "system", "content": system_message})
        messages.append({"role": "user", "content": question})

        response = generate_responses(model, tokenizer, full_message=messages)
        rows.append({"User Prompt": question, "Assistant Response": response})
    df = pd.DataFrame(rows)
    pd.set_option('display.max_colwidth', None)  # avoid truncating long text
    display(df)


In [None]:
def load_model_and_tokenizer(model_name, use_gpu=True):
    """Load a causal LM and its tokenizer by model name.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        use_gpu: When true, the model is moved to the ``"cuda"`` device.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer is guaranteed to have a
        chat template and a pad token after this call.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    if use_gpu:
        model.to("cuda")
    # If the tokenizer ships without a chat template, install a minimal one.
    # Jinja block tags must close with `%}` (no space); `% }` is a template
    # syntax error.
    if not tokenizer.chat_template:
        tokenizer.chat_template = """{% for message in messages %}
            {% if message['role'] == 'system' %}System: {{ message['content'] }}\n
            {% elif message['role'] == 'user' %}User: {{ message['content'] }}\n
            {% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>\n
            {% endif %}
            {% endfor %}"""
    # Fall back to the EOS token when no dedicated pad token is defined.
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer


#### Prepare the evaluation dataset for math: GSM8K


In [None]:
# Run the model (and training) on the GPU when True, on the CPU when False.
USE_GPU = True
# System prompt used for every example. Adjacent string literals are joined
# with no separator, so the first sentence needs its own trailing space.
SYSTEM_PROMPT = (
    "You are helpful assistant that solves problems step-by-step. "
    "Always include the final numeric answer inside \\boxed{}."
)

In [None]:
def reward_func(completions, ground_truth, **kwargs):
    """Score completions by exact match of the first ``\\boxed{...}`` answer.

    Args:
        completions: One entry per sample. Each entry is either a list of chat
            messages (the first message's ``"content"`` is scored) or a plain
            string.
        ground_truth: Expected answers, aligned with ``completions``.
        **kwargs: Extra columns supplied by the caller; ignored.

    Returns:
        A list of floats: ``1.0`` where the boxed answer equals the ground
        truth after normalization, ``0.0`` otherwise. A completion without a
        ``\\boxed{...}`` answer, or a missing (``None``) ground truth, always
        scores ``0.0``.
    """
    def _normalize(value):
        # Ignore surrounding whitespace and thousands separators so that
        # "\boxed{ 1,000 }" matches a ground truth of "1000".
        return str(value).strip().replace(",", "")

    rewards = []
    for completion, expected in zip(completions, ground_truth):
        text = completion if isinstance(completion, str) else completion[0]['content']
        # Capture the content of the first \boxed{...} in the completion.
        match = re.search(r"\\boxed\{(.*?)\}", text)
        if match is None or expected is None:
            # A missing answer never counts as correct, even when the ground
            # truth happens to be empty.
            rewards.append(0.0)
            continue
        is_correct = _normalize(match.group(1)) == _normalize(expected)
        rewards.append(1.0 if is_correct else 0.0)
    return rewards

In [None]:
# Sanity check: the boxed answer equals the label.
ground_truth = ["72"]
sample_pred = [
    [{"role": "assistant", "content": r"... Calculating the answer. \boxed{72}"}],
]
reward = reward_func(sample_pred, ground_truth)
print(f"Positive Sample Reward: {reward}")

In [None]:
# Sanity check: the boxed answer differs from the label.
ground_truth = ["72"]
sample_pred = [
    [{"role": "assistant", "content": r"... Calculating the answer. \boxed{71}"}],
]
reward = reward_func(sample_pred, ground_truth)
print(f"Negative Sample Reward: {reward}")


#### Load the Evaluation Dataset

In [None]:
# Configure how pandas renders DataFrames: never truncate cell text, and set
# the column limit option to 0.
pd.set_option('display.max_columns', 0)
pd.set_option('display.max_colwidth', None)

# Evaluate on the first `data_num` rows of the GSM8K test split.
data_num = 5
gsm8k_test_split = load_dataset("openai/gsm8k", "main")["test"]
eval_dataset = gsm8k_test_split.select(range(data_num))

# Preview the raw rows before any post-processing.
sample_df = eval_dataset.to_pandas()
display(sample_df)

In [None]:
def post_processing(example):
    """Turn a raw GSM8K row into a chat prompt plus a ground-truth answer.

    Args:
        example: Mapping with a ``"question"`` string and an ``"answer"``
            string whose final answer follows a ``####`` marker.

    Returns:
        The same mapping, updated in place with:
        ``"ground_truth"``: the number after ``####`` with thousands
        separators removed, or ``None`` when no such number is found;
        ``"prompt"``: a two-message conversation (system prompt, then the
        question as the user turn).
    """
    # Allow thousands separators and an optional decimal part, so that
    # "#### 1,125" yields "1125" instead of being truncated to "1".
    match = re.search(r"####\s*(-?[\d,]+(?:\.\d+)?)", example["answer"])
    example["ground_truth"] = match.group(1).replace(",", "") if match else None
    example["prompt"] = [
        {
            "role": "system",
            "content": SYSTEM_PROMPT
        },
        {
            "role": "user",
            "content": example["question"]
        }
    ]
    return example
# Add the `prompt` and `ground_truth` columns, then drop the raw columns.
eval_dataset = eval_dataset.map(post_processing)
eval_dataset = eval_dataset.remove_columns(["question", "answer"])

In [None]:
# Preview up to five processed rows. Capping at the dataset length keeps this
# cell working when the evaluation set has fewer than five rows.
preview_size = min(5, len(eval_dataset))
sample_df = eval_dataset.select(range(preview_size)).to_pandas()
display(sample_df)

### Load the model and evaluate

In [None]:
# Load the model and tokenizer that are evaluated in the next cell.
model, tokenizer = load_model_and_tokenizer(
    "Qwen/Qwen2.5-0.5B-Instruct",
    use_gpu=USE_GPU,
)

In [None]:
# 1. Collect predictions and ground truths
all_preds = []
all_labels = []

for example in tqdm(eval_dataset):
    input_prompt = example["prompt"]
    ground_truth = example["ground_truth"]
    # 2. Run the model to generate an answer
    with torch.no_grad():
        response = generate_responses(model, tokenizer, full_message=input_prompt)
    # Wrap the reply as a one-message conversation, the shape reward_func reads.
    all_preds.append([
        {
            "role": "assistant",
            "content": response
        }
    ])
    all_labels.append(ground_truth)

# 3. Evaluate using reward_func
rewards = reward_func(all_preds, all_labels)

# 4. Report accuracy (0.0 for an empty evaluation set instead of dividing by zero)
accuracy = sum(rewards) / len(rewards) if rewards else 0.0

df = pd.DataFrame({"predictions": all_preds, "ground_truth": all_labels, "rewards": rewards})
display(df)
print(f"Evaluation Accuracy: {accuracy:.2%}")
# Drop these references; the model is loaded again before training.
del model, tokenizer


### Loading the training dataset

In [None]:
dataset = load_dataset("openai/gsm8k", "main")
train_dataset = dataset["train"]

# Keep the demo small: 20 examples on CPU, 30 on GPU. The two limits are
# mutually exclusive; applying both in sequence would ask for 30 rows from a
# 20-row dataset on CPU. Subsetting first also avoids mapping rows that are
# discarded anyway.
subset_size = 30 if USE_GPU else 20
train_dataset = train_dataset.select(range(min(subset_size, len(train_dataset))))

# Add the `prompt` and `ground_truth` columns and drop the raw columns
train_dataset = train_dataset.map(post_processing)
train_dataset = train_dataset.remove_columns(["question", "answer"])

print(train_dataset[0])

#### GRPO Training

In [None]:
# GRPO training hyperparameters.
config = GRPOConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_generations=4, # Can set as high as 64 or 128
    num_train_epochs=1,
    learning_rate=5e-6,
    logging_steps=2,
    # `no_cuda=True` disables the GPU, so it must be the negation of USE_GPU.
    no_cuda=not USE_GPU,
)

In [None]:
## If this block hangs or the kernel restarts during training, skip the
## earlier cell that loads the 0.5B model for evaluation.
model, tokenizer = load_model_and_tokenizer(
    "Qwen/Qwen2.5-0.5B-Instruct",
    USE_GPU,
)

# Wire the model, reward function, config and training data into the trainer.
grpo_trainer = GRPOTrainer(
    model=model,
    reward_funcs=reward_func,
    args=config,
    train_dataset=train_dataset,
)

grpo_trainer.train()

### Results of the fully trained Qwen Model

In [None]:
model = grpo_trainer.model
# Switch to evaluation mode: training has just finished, and layers such as
# dropout should be disabled while generating answers.
model.eval()

# 1. Collect predictions and ground truths
all_preds = []
all_labels = []

for example in tqdm(eval_dataset):
    input_prompt = example["prompt"]
    ground_truth = example["ground_truth"]
    # 2. Run the model to generate an answer
    with torch.no_grad():
        response = generate_responses(model, tokenizer, full_message=input_prompt)
    # Wrap the reply as a one-message conversation, the shape reward_func reads.
    all_preds.append([
        {
            "role": "assistant",
            "content": response
        }
    ])
    all_labels.append(ground_truth)
# 3. Evaluate using reward_func
rewards = reward_func(all_preds, all_labels)
# 4. Report accuracy (0.0 for an empty evaluation set instead of dividing by zero)
accuracy = sum(rewards) / len(rewards) if rewards else 0.0

df = pd.DataFrame({"predictions": all_preds, "ground_truth": all_labels, "rewards": rewards})
display(df)
print(f"Evaluation Accuracy: {accuracy:.2%}")
del model, tokenizer



