To run this, press "*Runtime*" and press "*Run all*" on a **free** Tesla T4 Google Colab instance!
<div class="align-center">
<a href="https://unsloth.ai/"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
<a href="https://discord.gg/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord button.png" width="145"></a>
<a href="https://docs.unsloth.ai/"><img src="https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true" width="125"></a></a> Join Discord if you need help + ⭐ <i>Star us on <a href="https://github.com/unslothai/unsloth">Github</a> </i> ⭐
</div>

To install Unsloth on your own computer, follow the installation instructions in our documentation [here](https://docs.unsloth.ai/get-started/installing-+-updating).

You will learn how to do [data prep](#Data), how to [train](#Train), how to [run the model](#Inference), & [how to save it](#Save)

Visit our docs for all our [model uploads](https://docs.unsloth.ai/get-started/all-our-models) and [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).


### News

**Read our [blog post](https://unsloth.ai/blog/r1-reasoning) for guidance on training a reasoning model.** The GRPO notebook is inspired by [@shxf0072](https://x.com/shxf0072/status/1886085377146180091), [@Teknium1](https://x.com/Teknium1/status/1885077369142337550), and [@willccbb](https://gist.github.com/willccbb/4676755236bb08cab5f4e54a0475d6fb).

Visit our docs for all our [model uploads](https://docs.unsloth.ai/get-started/all-our-models) and [notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks).


### Installation

In [1]:
%%capture
# Skip restarting message in Colab
import sys; modules = list(sys.modules.keys())
for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None

!pip install unsloth vllm
!pip install --upgrade pillow
!pip install transformers==4.31.0
!pip install rouge_score bert_score datasets evaluate scikit-learn
# If you are running this notebook on local, you need to install `diffusers` too
!pip install diffusers
# Temporarily install a specific TRL nightly version
!pip install git+https://github.com/huggingface/trl.git@e95f9fb74a3c3647b86f251b7e230ec51c64b72b

In [None]:
from dotenv import load_dotenv
import os

# Read key/value pairs from a .env file into os.environ; later cells in this
# notebook read HF_TOKEN from os.environ.
load_dotenv()

True

### Unsloth

Use `PatchFastRL` before all functions to patch GRPO and other RL algorithms!

In [None]:
import os
from unsloth import FastLanguageModel, PatchFastRL

# Set temporary directory for unsloth
# NOTE(review): this is assigned after `unsloth` has already been imported in
# this cell — confirm the variable is still honoured when set this late.
os.environ["UNSLOTH_TEMP_DIR"] = "/tmp/unsloth"

# Apply the GRPO patch up front: this cell runs before the model is loaded and
# before `trl`'s GRPOConfig/GRPOTrainer are imported further down.
PatchFastRL("GRPO", FastLanguageModel)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 02-13 21:34:55 __init__.py:190] Automatically detected platform cuda.


2025-02-13 21:34:55,597	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


### Dataset

We will use the Medical O1 Reasoning SFT dataset, available on Hugging Face as [FreedomIntelligence/medical-o1-reasoning-SFT](https://huggingface.co/datasets/FreedomIntelligence/medical-o1-reasoning-SFT). From the authors: This dataset is used to fine-tune HuatuoGPT-o1, a medical LLM designed for advanced medical reasoning. This dataset is constructed using GPT-4o, which searches for solutions to verifiable medical problems and validates them through a medical verifier.

In [None]:
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split

SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

def _to_prompt_record(x):
    """Build the chat-style ``prompt`` and reference ``answer`` fields for one row."""
    return {
        'prompt': [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': x['Question']}
        ],
        'answer': x['Response']  # The reference answer
    }


def get_medical_questions(split="train") -> "Dataset | tuple[Dataset, Dataset, Dataset]":
    """Load the English medical-o1 reasoning data and add GRPO prompt fields.

    Every returned row keeps its original columns and gains ``prompt`` (a
    system message holding ``SYSTEM_PROMPT`` followed by the user question)
    and ``answer`` (the row's ``Response``).

    Args:
        split: Name of the hub split to load.

    Returns:
        For ``split == "train"``: a ``(train, eval, test)`` tuple of datasets,
        where 99% of the rows go to train and the remaining 1% is divided
        evenly between eval and test. The partition is reproducible
        (``random_state=42``).
        For any other split: that split as a single mapped dataset.
    """
    # Load the raw dataset from the hub
    data = load_dataset('FreedomIntelligence/medical-o1-reasoning-SFT', 'en')[split]

    if split != "train":
        # Nothing to partition; just add the prompt/answer columns
        return data.map(_to_prompt_record)

    # sklearn's splitter works on DataFrames, so round-trip through pandas
    df = data.to_pandas()

    # Hold out 1% of the rows ...
    train_df, temp_df = train_test_split(df, test_size=0.01, random_state=42)
    # ... and divide that held-out 1% evenly into eval and test (~0.5% each)
    eval_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

    # Convert back to Hugging Face Datasets and add the prompt/answer columns
    return tuple(
        Dataset.from_pandas(part).map(_to_prompt_record)
        for part in (train_df, eval_df, test_df)
    )

# To get the three splits, call:
train_dataset, eval_dataset, test_dataset = get_medical_questions(split="train")
train_dataset, eval_dataset, test_dataset

Map: 100%|██████████| 25117/25117 [00:01<00:00, 24626.03 examples/s]
Map: 100%|██████████| 127/127 [00:00<00:00, 22961.19 examples/s]
Map: 100%|██████████| 127/127 [00:00<00:00, 23791.89 examples/s]


(Dataset({
     features: ['Question', 'Complex_CoT', 'Response', '__index_level_0__', 'prompt', 'answer'],
     num_rows: 25117
 }),
 Dataset({
     features: ['Question', 'Complex_CoT', 'Response', '__index_level_0__', 'prompt', 'answer'],
     num_rows: 127
 }),
 Dataset({
     features: ['Question', 'Complex_CoT', 'Response', '__index_level_0__', 'prompt', 'answer'],
     num_rows: 127
 }))

Load up `Qwen 2.5 3B Instruct`, and set parameters

In [5]:
import os
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch


# Shared hyper-parameters: max_seq_length is passed to from_pretrained and
# lora_rank is reused for max_lora_rank, r and lora_alpha below.
max_seq_length = 1024 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower

base_model_name = "Qwen/Qwen2.5-3B-Instruct"

# Load the base model and tokenizer in 4-bit with vLLM-backed generation enabled.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = base_model_name,
    max_seq_length = max_seq_length,
    load_in_4bit = True, # False for LoRA 16bit
    fast_inference = True, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.5, # Reduce if out of memory
)

# Wrap the base model with LoRA adapters on the attention (q/k/v/o) and MLP
# (gate/up/down) projections; lora_alpha is set equal to the rank.
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ], # Remove QKVO if out of memory
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Enable long context finetuning
    random_state = 3407,
)

==((====))==  Unsloth 2025.2.5: Fast Qwen2 patching. Transformers: 4.48.3.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.641 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: vLLM loading unsloth/qwen2.5-3b-instruct-unsloth-bnb-4bit with actual GPU utilization = 49.19%
Unsloth: Your GPU has CUDA compute capability 8.9 with VRAM = 23.64 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 1024. Num Sequences = 224.
Unsloth: vLLM's KV Cache can use up to 9.21 GB. Also swap space = 6 GB.
INFO 02-13 21:35:11 config.py:542] This model supports multiple tasks: {'generate', 'score', 'embed', 'reward', 'classify'}. Defaulting to 'generate'.
Unsloth: vLLM Bitsandbytes config using kw



INFO 02-13 21:35:13 weight_utils.py:252] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.19it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.18it/s]

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.02it/s]
Loading safetensors checkpoint shards: 100% Completed | 1/1 [00:00<00:00,  2.02it/s]



INFO 02-13 21:35:15 model_runner.py:1115] Loading model weights took 2.2160 GB
INFO 02-13 21:35:15 punica_selector.py:18] Using PunicaWrapperGPU.
INFO 02-13 21:35:16 worker.py:267] Memory profiling takes 0.87 seconds
INFO 02-13 21:35:16 worker.py:267] the current vLLM instance can use total_gpu_memory (23.64GiB) x gpu_memory_utilization (0.49) = 11.63GiB
INFO 02-13 21:35:16 worker.py:267] model weights take 2.22GiB; non_torch_memory takes 0.08GiB; PyTorch activation peak memory takes 1.23GiB; the rest of the memory reserved for KV Cache is 8.11GiB.
INFO 02-13 21:35:16 executor_base.py:110] # CUDA blocks: 14763, # CPU blocks: 10922
INFO 02-13 21:35:16 executor_base.py:115] Maximum concurrency for 1024 tokens per request: 230.67x
INFO 02-13 21:35:21 model_runner.py:1434] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error

Capturing CUDA graph shapes: 100%|██████████| 31/31 [00:19<00:00,  1.61it/s]

INFO 02-13 21:35:40 model_runner.py:1562] Graph capturing finished in 19 secs, took 0.73 GiB
INFO 02-13 21:35:40 llm_engine.py:431] init engine (profile, create kv cache, warmup model) took 25.36 seconds



sh: 0: getcwd() failed: No such file or directory
Unsloth 2025.2.5 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


### Reward Functions
<a name="Data"></a>

All reward functions were generated by DeepSeek.

In [6]:
import re

# ------ Correctness Reward (Fixed Parameter Order) ------
def correctness_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """
    Score each completion 1.0 if its answer exactly matches the reference, else 0.0.

    The system prompt asks the model to put its final answer inside
    ``<answer>...</answer>``. When that block is present, its stripped
    contents are compared with the stripped reference; otherwise (or in
    addition) the whole stripped completion is compared. A completion that
    follows the requested format can therefore earn the reward.

    Args:
        prompts: Unused; accepted for the reward-function call signature.
        completions: One entry per sample; the text is read from
            ``completion[0]['content']``.
        answer: Reference answers, aligned with ``completions``.
        **kwargs: Ignored extra columns.

    Returns:
        One reward per (completion, answer) pair. Malformed entries score 0.0.
    """
    rewards = []
    for completion, ans in zip(completions, answer):
        try:
            response = completion[0]['content'].strip()
            reference = ans.strip()
            # Non-greedy + DOTALL: take the first <answer> block, which may span lines
            tagged = re.search(r"<answer>(.*?)</answer>", response, flags=re.DOTALL)
            candidate = tagged.group(1).strip() if tagged else response
            # Accept either the tagged answer or the full response matching the reference
            reward = 1.0 if reference in (candidate, response) else 0.0
        except (KeyError, IndexError, TypeError, AttributeError):
            reward = 0.0  # Penalize malformed responses
        rewards.append(reward)

    return rewards

# ------ Enhanced Perplexity Reward (Optimized) ------
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Initialize perplexity model once (add device management)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
perplexity_model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(DEVICE)
perplexity_tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')

def calculate_perplexity(texts: list[str]) -> list[float]:
    """Return one perplexity score per input text, in input order.

    Each text is scored on its own with the module-level
    ``perplexity_tokenizer`` / ``perplexity_model``: the text's token ids are
    used as both inputs and labels, and the score is ``exp(loss)``.

    Empty or whitespace-only texts, and texts whose scoring raises, receive a
    fixed penalty of 1000.0, so the result always has exactly one entry per
    input.

    Args:
        texts: Strings to score.

    Returns:
        A list of floats with ``len(texts)`` entries.
    """
    penalty = 1000.0  # High value => low fluency reward downstream
    perplexities = []
    for text in texts:
        if not text.strip():
            # Record the penalty for this item and keep going; returning here
            # would drop the remaining texts and hand back a bare float.
            perplexities.append(penalty)
            continue

        try:
            encodings = perplexity_tokenizer(text, return_tensors='pt').to(DEVICE)
            with torch.no_grad():
                outputs = perplexity_model(**encodings, labels=encodings.input_ids)
            perplexities.append(torch.exp(outputs.loss).item())
        except Exception:
            # Best-effort scoring: any failure on one text becomes a penalty
            # rather than aborting the whole reward computation.
            perplexities.append(penalty)

    return perplexities

def coherence_fluency_reward_func(completions, **kwargs) -> list[float]:
    """Reward fluency as the reciprocal of each completion's perplexity.

    The text of every completion is taken from ``completion[0]['content']``,
    stripped, scored with ``calculate_perplexity``, and turned into
    ``1 / (perplexity + 1e-6)``.
    """
    texts = []
    for completion in completions:
        first_message = completion[0]
        texts.append(first_message['content'].strip())

    scores = calculate_perplexity(texts)

    # The small epsilon keeps the division defined if a perplexity is zero
    return list(map(lambda ppl: 1 / (ppl + 1e-6), scores))


def combined_reward_func(prompts, completions, answer, **kwargs) -> list[float]:
    """Blend the correctness and fluency rewards into one score per sample.

    Each score is ``0.7 * correctness + 0.3 * fluency``, so correctness
    carries the larger weight. Extra keyword arguments are accepted but not
    forwarded to the underlying reward functions.
    """
    correctness_scores = correctness_reward_func(prompts, completions, answer)
    fluency_scores = coherence_fluency_reward_func(completions)

    blended = []
    for correct, fluent in zip(correctness_scores, fluency_scores):
        blended.append(0.7 * correct + 0.3 * fluent)
    return blended

<a name="Train"></a>
### Train the model

Now set up GRPO Trainer and all configurations!

In [7]:
from trl import GRPOConfig, GRPOTrainer
import os

# Hub repository that receives the training checkpoints (see hub_model_id / push_to_hub below)
checkpoint_repo = "matthewchung74/Qwen2.5_3B-GRPO-medical-reasoning-checkpoints"

# Set HuggingFace cache directory to avoid cluttering project directory
# NOTE(review): Hugging Face libraries were already imported by earlier cells —
# confirm that changing HF_HOME this late still has an effect.
os.environ['HF_HOME'] = os.path.join(os.path.expanduser('~'), '.cache/huggingface')

# Define batch parameters
per_device_train_batch_size = 2
gradient_accumulation_steps = 4

# Full-pass step count: dataset size divided by the effective batch size
# (per-device batch size x gradient accumulation steps) ...
total_steps = len(train_dataset) // (per_device_train_batch_size * gradient_accumulation_steps)
# ... which is immediately overridden: this run is capped at 20 steps.
total_steps = 20
# Likewise, the 10-checkpoint setting is overridden to 2.
num_checkpoints = 10
num_checkpoints = 2

training_args = GRPOConfig(
    use_vllm = True, # use vLLM for fast inference!
    learning_rate = 5e-6,
    adam_beta1 = 0.9,
    adam_beta2 = 0.99,
    weight_decay = 0.1,
    warmup_ratio = 0.1,
    lr_scheduler_type = "cosine",
    optim = "paged_adamw_8bit",
    logging_steps = 1,
    # Exactly one of bf16 / fp16 is enabled, depending on hardware support
    bf16 = is_bfloat16_supported(),
    fp16 = not is_bfloat16_supported(),
    per_device_train_batch_size = per_device_train_batch_size,
    gradient_accumulation_steps = gradient_accumulation_steps, # Currently 4 (set above)
    num_generations = 6, # Decrease if out of memory
    max_prompt_length = 256,
    max_completion_length = 300,
    max_steps = total_steps, # 20 with the override above, not a full pass over the data
    save_steps = total_steps // num_checkpoints, # 20 // 2 = save every 10 steps
    max_grad_norm = 0.1,
    report_to = "none", # Can use Weights & Biases
    # Local checkpoints go under ~/.cache/huggingface/<checkpoint_repo>
    output_dir = os.path.join(os.path.expanduser('~'), '.cache/huggingface', checkpoint_repo),
    save_strategy = "steps",
    hub_model_id = checkpoint_repo,
    push_to_hub = True,
)

torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. In order to use Torch DDP, launch your script with `python -m torch.distributed.launch


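And let's run the trainer! As training progresses, the cell output shows a table of rewards like the sample below. The goal is to see the `reward` column increase!

You might have to wait 150 to 200 steps for any action. You'll probably get 0 reward for the first 100 steps. Please be patient! Note that the configuration cell above caps training at `max_steps = 20`, so raise that value for a real run.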

| Step | Training Loss | reward    | reward_std | completion_length | kl       |
|------|---------------|-----------|------------|-------------------|----------|
| 1    | 0.000000      | 0.125000  | 0.000000   | 200.000000        | 0.000000 |
| 2    | 0.000000      | 0.072375  | 0.248112   | 200.000000        | 0.000000 |
| 3    | 0.000000      | -0.079000 | 0.163776   | 182.500000        | 0.000005 |


In [8]:
# Build the GRPO trainer around the LoRA-wrapped model. The only reward signal
# is combined_reward_func (0.7 * correctness + 0.3 * fluency).
trainer = GRPOTrainer(
    model = model,
    processing_class = tokenizer,
    reward_funcs = [
        combined_reward_func,
    ],
    args = training_args,
    train_dataset = train_dataset,
    # NOTE(review): the split named `test_dataset` serves as the trainer's
    # evaluation set here, while the later inference and metrics cells sample
    # from `eval_dataset` — the two names are swapped relative to their use.
    eval_dataset = test_dataset,
)
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 25,117 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 20
 "-____-"     Number of trainable parameters = 119,734,272
sh: 0: getcwd() failed: No such file or directory


FileNotFoundError: [Errno 2] No such file or directory: 'grpo_trainer_lora_model'

Now we load the LoRA and test:

In [None]:
# Take the first held-out sample (this notebook draws its test samples from `eval_dataset`)
first_test_sample = eval_dataset[0]

# Render the system + user messages into a single prompt string that ends
# with the assistant generation prefix
text = tokenizer.apply_chat_template(
    first_test_sample['prompt'],
    tokenize=False,
    add_generation_prompt=True
)

from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature=0.2,
    top_p=0.95,
    max_tokens=1024,
)

# Write the trained adapter to disk before loading it: nothing earlier in the
# notebook creates this directory, so load_lora would fail without this step.
lora_dir = "grpo_saved_lora"
model.save_lora(lora_dir)

# Generate with the saved adapter applied and keep the first candidate's text
output = model.fast_generate(
    text,
    sampling_params=sampling_params,
    lora_request=model.load_lora(lora_dir),
)[0].outputs[0].text

# The prompt is [system, user], so the user question sits at index 1
user_question = first_test_sample['prompt'][1]['content']

print("\n===== User Question (Ground Truth) =====")
print(user_question)  # Prints the user question

print("\n===== Reference Answer (Ground Truth) =====")
print(first_test_sample['answer'])  # Prints the correct reference answer

print("\n===== Model Output (Generated Response) =====")
print(output)  # Prints the model's generated response

Our reasoning model is much better - it's not always correct, since we only trained it for an hour or so - it'll be better if we extend the sequence length and train for longer!

<a name="Save"></a>
### Saving to float16 for VLLM

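We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens. Note that the cell below uses `save_method="lora"`, so it uploads only the LoRA adapter; switch to `merged_16bit` if you want merged float16 weights for vLLM.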

In [None]:
import os

user_name = "matthewchung74"
model_name = "Qwen2.5_3B-GRPO-medical-reasoning"

# Upload the LoRA adapter (save_method="lora") and tokenizer to
# <user_name>/<model_name>, authenticating with the HF_TOKEN environment variable.
hub_repo_id = f"{user_name}/{model_name}"
model.push_to_hub_merged(
    hub_repo_id,
    tokenizer,
    save_method="lora",
    token=os.environ["HF_TOKEN"],
)

## 📊 Model Evaluation on Medical QA Dataset

### 🔹 Overview
This code evaluates the performance of a language model on the **Medical-O1-Reasoning-SFT** dataset by generating responses to test questions and comparing them against ground-truth answers using standard NLP evaluation metrics.

### 🔹 How It Works
1. **Loads the test dataset**: Retrieves question-answer pairs for evaluation.
2. **Generates responses using the model**: The model is prompted with test questions formatted using a predefined chat template.
3. **Computes evaluation metrics**:
   - **ROUGE Score**: Measures overlap between generated and reference text.
   - **BLEU Score**: Measures n-gram similarity between model output and reference.
   - **BERTScore**: Uses deep learning embeddings to assess semantic similarity.
4. **Aggregates and displays evaluation results**.

### 🔹 Evaluation Metrics
| Metric      | Description |
|-------------|-------------|
| **ROUGE**   | Measures overlap between reference and generated text using recall-based F1 scores. Good for summarization tasks. |
| **BLEU**    | Compares n-grams (word sequences) between generated and reference text, focusing on precision. Good for translation tasks. |
| **BERTScore** | Uses transformer embeddings to compare semantic meaning rather than exact word overlap. More robust for open-ended generation. |

### 🔹 Expected Output
After running the code, you will see three scores:
- **ROUGE Scores** (ROUGE-1, ROUGE-2, ROUGE-L)
- **BLEU Score** (ranges from 0 to 1; higher is better)
- **BERTScore F1** (semantic similarity; typically between 0.5 - 1.0)

### 🔹 How to Interpret Results
- **Higher scores** indicate better alignment between model predictions and ground-truth answers.
- If **ROUGE and BLEU are low**, the model may struggle with **word overlap**.
- If **BERTScore is high but ROUGE is low**, the model may be **paraphrasing well but not matching exact words**.

### 🔹 Next Steps
- **If performance is poor**, consider:
  - Improving **prompt design** to get better model responses.
  - **Fine-tuning the model** on more relevant medical datasets.
  - Adjusting **sampling parameters** (e.g., `temperature`, `top_p`) for more diverse or deterministic responses.

---


Manually Clear VRAM

In [None]:
import gc
import torch
# Release cached CUDA memory, run a garbage-collection pass, then release the
# cache again so anything freed by the collection is returned as well.
# NOTE(review): objects still referenced by notebook variables (e.g. `model`,
# `trainer`) are not freed by this — confirm whether they should be deleted first.
torch.cuda.empty_cache()
gc.collect()
torch.cuda.empty_cache()

In [None]:
from unsloth import FastLanguageModel, PatchFastRL
# Re-apply the GRPO patch before the model is loaded again in the next cell
PatchFastRL("GRPO", FastLanguageModel)

In [None]:
# Inference code
from unsloth import FastLanguageModel
from vllm import SamplingParams
import os

# Context length for evaluation; note this is shorter than the 1024 used when
# the model was loaded for training earlier in the notebook.
max_seq_length = 512

# Base checkpoint plus the Hub repository (<user_name>/<model_name>) holding the adapter
base_model_name = "Qwen/Qwen2.5-3B-Instruct"
user_name = "matthewchung74"
model_name = "Qwen2.5_3B-GRPO-medical-reasoning"

# Load model and tokenizer (4-bit, vLLM-backed generation); HF_TOKEN must be
# present in the environment or the lookup below raises KeyError.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    fast_inference=True,
    token=os.environ["HF_TOKEN"],
    offload_folder="./offload"
)

# Load LoRA weights; the resulting request object is passed to fast_generate
# by generate_response below.
lora_request = model.load_lora(f"{user_name}/{model_name}")

# Prepare model for inference
model = FastLanguageModel.for_inference(model)

def generate_response(prompt):
    """Generate the adapter-augmented model's reply to one chat-format prompt.

    Args:
        prompt: Chat messages accepted by ``tokenizer.apply_chat_template``.

    Returns:
        The stripped text of the first generated candidate.
    """
    # Low-temperature nucleus sampling, capped at 512 new tokens
    decoding = SamplingParams(temperature=0.2, top_p=0.95, max_tokens=512)

    # Render the messages to a plain string ending with the generation prefix
    rendered = tokenizer.apply_chat_template(
        prompt, tokenize=False, add_generation_prompt=True
    )

    results = model.fast_generate(
        rendered, sampling_params=decoding, lora_request=lora_request
    )
    first_candidate = results[0].outputs[0]
    return first_candidate.text.strip()

In [None]:
# Evaluation code
import evaluate
from tqdm import tqdm
import numpy as np

# Initialize metrics
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")
bertscore_metric = evaluate.load("bertscore")

# Select a few rows for a quick debugging run. Rows are read straight from the
# Dataset so each `prompt` stays a plain list of message dicts (a pandas
# round-trip would turn it into a NumPy object array), and the sample count is
# clamped in case the split has fewer rows than requested.
num_eval_samples = min(3, len(eval_dataset))
eval_subset = eval_dataset.select(range(num_eval_samples))

predictions = []
references = []

# Generate one prediction per sample and collect the matching reference answer
for sample in tqdm(eval_subset, total=len(eval_subset), desc="Evaluating"):
    predictions.append(generate_response(sample["prompt"]))
    references.append(sample["answer"].strip())

# Calculate metrics
rouge_scores = rouge_metric.compute(predictions=predictions, references=references)
bertscore_results = bertscore_metric.compute(predictions=predictions, references=references, lang="en")
# BLEU takes a list of reference lists, one list per prediction
bleu_score = bleu_metric.compute(
    predictions=predictions,
    references=[[ref] for ref in references]
)

# BERTScore returns per-sample F1 values; report their mean
bertscore_f1 = np.mean(bertscore_results["f1"])

print("\n--- Evaluation Results ---")
print(f"ROUGE Scores: {rouge_scores}")
print(f"BLEU Score: {bleu_score['bleu']}")
print(f"BERTScore F1: {bertscore_f1:.4f}")
