## Install the required libraries

In [1]:
%%capture
!pip install -U --no-deps transformers datasets sentence-transformers tqdm bitsandbytes huggingface_hub unsloth unsloth_zoo trl langdetect

## Load Model and Tokenizer in your notebook

In [2]:
# Use Unsloth for loading models in 4 , 8 or 16 bit. It will reduce the ram usage. And will also provide faster training

from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True # 4bit quantization to reduce memory usage.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.6: Fast Llama patching. Transformers: 4.55.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Add adapter layers and specify their dimensions using PEFT technique

In [3]:
# Use parameter efficient fine tuning. 
# This will add adapter layers in both attention and feed forward neural networks which will be trained for new dataset.
# Rest of the layers will be frozen.

# Here 'r' is the rank of the matrices and lora_alpha is the hyperparameter which multiplies with the trainable parameter matrices. 
# Usually lora_alpha = r, or = 2r.

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Rank of the matrix. Could be 8, 16, 32, 64 or 128 ...

    # Modules assiggned for training. For lower memory start with only "q_proj", "k_proj", "v_proj"

    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],


    lora_alpha = 32,
    
    lora_dropout = 0,
    bias = "none",

    use_gradient_checkpointing = "unsloth", 
    random_state = 3807,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.8.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Use alpaca prompt style technique to provide instructions to the model 

In [4]:
# Choosing alpaca style prompt. You can choose any other prompt as well.

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Tells the model where to stop.

# Obtain the "text" column which consists of (Physics problem + Step-by-step reasoning + Correct answer + EOS_TOKEN). It is useful for SFT (not for GRPO)

def formatting_prompts_func(examples):
    instructions = examples["Question"]                 # Physics problem
    inputs       = examples["Text Reasoning Trace"]     # Step-by-step reasoning
    outputs      = examples["Final Answer"]             # Correct answer

    texts = []
    for instruction, input_text, output in zip(instructions, inputs, outputs):

        text = alpaca_prompt.format(instruction.strip(), input_text.strip(), output.strip()) + EOS_TOKEN
        texts.append(text)

    return {"text": texts}

from datasets import load_dataset
dataset = load_dataset("multimodal-reasoning-lab/Physics", split="train")
dataset = dataset.map(formatting_prompts_func, batched=True)

In [None]:
dataset

Dataset({
    features: ['Question', 'Text Reasoning Trace', 'Final Answer', 'problem_image_1', 'problem_image_2', 'reasoning_image_1', 'reasoning_image_2', 'reasoning_image_3', 'reasoning_image_4', 'text'],
    num_rows: 7090
})

## Keeping only the text data and remove the images from the dataset

In [5]:
dataset = dataset.remove_columns([
    'problem_image_1',
    'problem_image_2',
    'reasoning_image_1',
    'reasoning_image_2',
    'reasoning_image_3',
    'reasoning_image_4',
])

In [6]:
dataset

Dataset({
    features: ['Question', 'Text Reasoning Trace', 'Final Answer', 'text'],
    num_rows: 7090
})

## Taking the dataset which has length < 500 to avoid filling up the memory

In [6]:
# The "text" column will be passed to the model. 

dataset = dataset.map(
    lambda example: {"text_length": len(example["text"].split())}
)
dataset = dataset.filter(
    lambda example: example["text_length"] < 500
)

In [7]:
# Dataset length is reduced from 7090 to 260. 
# It is not an issue, the model can still learn to generate the style of the desired output.

dataset

Dataset({
    features: ['Question', 'Text Reasoning Trace', 'Final Answer', 'text', 'text_length'],
    num_rows: 260
})

In [8]:
## These columns are already converted to "text" column, so now they are not required
dataset = dataset.remove_columns(
    ['Question', 'Text Reasoning Trace', 'Final Answer', 'text_length']
)

## Supervised fine-tuning

In [9]:
# Using trl library for Supervised fine-tuning.

from trl import SFTConfig, SFTTrainer

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 50,
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc. It helps in visualizing the training.
    ),
)

In [10]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 260 | Num Epochs = 2 | Total steps = 50
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,1.0845
2,1.1029
3,1.0554
4,1.037
5,0.8973
6,0.7894
7,0.6718
8,0.5925
9,0.5193
10,0.4786


In [12]:
# Test the inference of the trained model.

FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Next word of the fibonnaci sequence.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # output. Blank because model will add its output here.
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nNext word of the fibonnaci sequence.\n\n### Input:\n1, 1, 2, 3, 5, 8\n\n### Response:\n13<|end_of_text|>']

## Load the dataset for GRPO

In [23]:
from datasets import load_dataset

# Load Physics dataset 

dataset = load_dataset(
    "multimodal-reasoning-lab/Physics",
    split="train",  
)

dataset = dataset.filter(lambda ex: len(ex["Question"].split()) < 500)


dataset = dataset.select(range(min(5000, len(dataset))))

# Define GRPO formatting function. It Convert Physics dataset samples into GRPO-ready format.

def formatting_prompts_for_grpo(examples):

    prompt = (
        f"Below is an instruction and an input. Provide the best possible answer.\n\n"
        f"### Instruction:\n{examples['Question'].strip()}\n\n"
        f"### Input:\n{examples['Text Reasoning Trace'].strip()}\n\n"
        f"### Response:"
    )
    reference = examples["Final Answer"].strip()
    return {"prompt": prompt, "reference": reference}

In [24]:
grpo_dataset = dataset.map(formatting_prompts_for_grpo)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [13]:
# For GRPO we only need "prompt" and "reference". Therefore, we can remove other columns.

dataset = dataset.remove_columns([
    col for col in dataset.column_names
    if col not in ["Question", "Text Reasoning Trace", "Final Answer", "text_length"]
])

In [22]:
len(dataset)

7090

## Define Reward function 

### The reward designing is one of the most difficult part. This is because the reasoning includes many steps, and even though the final answer is incorrect it does not mean the entire resoning is wrong. So the complete training of the model depends only on the reward function. One can try different approaches.

In [25]:
import re
# "sentence_bleu" will provide scores or "rewards" based on how much the generated answer is close to the ground truth: "reference"
from nltk.translate.bleu_score import sentence_bleu

def reward_fn(prompts, completions, completion_ids=None, **kwargs):

    references = kwargs.get("references", [""] * len(completions))
    rewards = []

    for idx, (sample, ref) in enumerate(zip(completions, references)):
        score = 0.0
        debug_info = {}
        ref = ref.strip()
        sample = sample.strip()

        if ref:
            numeric_ref = re.findall(r"[-+]?\d*\.\d+|\d+", ref)
            numeric_sample = re.findall(r"[-+]?\d*\.\d+|\d+", sample)
            if numeric_ref and numeric_sample:
                try:
                    if abs(float(numeric_ref[-1]) - float(numeric_sample[-1])) < 1e-3:
                        score += 1.0

                except:
                    debug_info["correctness"] = 0.0
            elif ref.lower() in sample.lower():
                score += 1.0

        else:
            if re.search(r"\d", sample):
                score += 0.2

        if any(kw in sample.lower() for kw in ["because", "therefore", "hence"]):
            score += 0.5


        if re.search(r"(Reasoning:|Step-by-step reasoning:)", sample) and \
           re.search(r"(Final Answer:)", sample):
            score += 0.5

        if ref:
            bleu = sentence_bleu([ref.split()], sample.split())
            score += bleu * 0.5
        else:
            length_penalty = max(0, 1 - abs(len(sample.split()) - 50) / 50)
            score += 0.3 * length_penalty

        if "completely unrelated" in sample.lower():
            score -= 1.0

        rewards.append(score)

    return rewards

In [None]:
dataset

Dataset({
    features: ['prompt', 'reference'],
    num_rows: 7090
})

In [None]:
dataset['reference'][0]

'(a) The distances traveled by the blocks from their initial resting points as a function of time are:\n- For the block of mass M:\n  x₁(t) = (1/6)(4A + μg)t²\n- For the block of mass 2M, assuming it moves (i.e., A > μg/2):\n  x₂(t) = (1/6)(2A - μg)t²\n  If A ≤ μg/2, the block of mass 2M does not move, so x₂(t) = 0.\n\n(b) The block of mass 2M will remain stationary if its calculated acceleration is less than or equal to zero. This occurs when:\nA ≤ μg / 2\nThe maximum acceleration for which the 2M block remains stationary is A = μg/2.\n\nNo, there is no case for A > 0 in which the block of mass 2M moves to the right. The tension force from the string always pulls it to the left, and friction can only oppose motion, not initiate it in the rightward direction.'

In [None]:
print(sum(1 for r in dataset['reference'] if not r.strip()), "/", len(dataset))

0 / 7090


## GRPO training using trl library

### I only trained upto 200 timesteps but one can do it for long to get better results. For 200 timesteps it takes 1.5 hours on A100 GPU.

In [26]:
from trl import GRPOConfig, GRPOTrainer

args = GRPOConfig(
    output_dir="outputs-grpo",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=5e-6,
    logging_steps=5,
    num_generations=4,
    max_prompt_length=512,
    max_completion_length=256,
    loss_type="dr_grpo",
    scale_rewards=False,
)

trainer = GRPOTrainer(
    model=model,                 
    processing_class=tokenizer,  
    reward_funcs=[reward_fn],    # must return a scalar reward
    train_dataset=grpo_dataset   # IterableDataset with "prompt", "reference"
)

trainer.train()


Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 2 to the `num_generations` of 4
Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 4 to the `num_generations` of 8


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 5,000 | Num Epochs = 3 | Total steps = 7,500
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 2 x 1) = 16
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmrmohit254[0m ([33mmrmohit254-talentsprint[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,reward,reward_std,completions / mean_length,completions / min_length,completions / max_length,completions / clipped_ratio,completions / mean_terminated_length,completions / min_terminated_length,completions / max_terminated_length,kl,entropy,rewards / reward_fn / mean,rewards / reward_fn / std
1,0.0005,0.313125,0.183643,208.9375,38.0,256.0,0.75,67.75,38.0,111.0,0.497453,0,0.313125,0.177477
2,0.0005,0.2815,0.183957,231.0,50.0,256.0,0.8125,122.666672,50.0,250.0,0.475979,No Log,0.2815,0.178934
3,0.0006,0.263375,0.086396,196.3125,42.0,256.0,0.6875,65.0,42.0,102.0,0.613213,No Log,0.263375,0.092433
4,0.0005,0.482625,0.268592,142.5,41.0,256.0,0.375,74.400002,41.0,137.0,0.540869,No Log,0.482625,0.266226
5,0.0004,0.39425,0.183756,140.8125,22.0,256.0,0.3125,88.454552,22.0,246.0,0.384775,No Log,0.39425,0.198859
6,0.0005,0.351375,0.18019,153.625,31.0,256.0,0.5,51.25,31.0,77.0,0.541739,No Log,0.351375,0.233511
7,0.0006,0.2615,0.10253,194.4375,44.0,256.0,0.6875,59.0,44.0,98.0,0.554981,No Log,0.2615,0.10022
8,0.0004,0.387875,0.190237,96.0,19.0,256.0,0.25,42.666668,19.0,96.0,0.408046,No Log,0.387875,0.192637
9,0.0004,0.305125,0.138179,161.875,27.0,256.0,0.4375,88.666664,27.0,234.0,0.405808,No Log,0.305125,0.147174
10,0.0007,0.358375,0.279753,219.9375,30.0,256.0,0.8125,63.666668,30.0,98.0,0.722891,No Log,0.358375,0.276007


KeyboardInterrupt: 

### Sample output from the trained model

In [42]:
sample = {
    "prompt": """Below is an instruction and an input. Provide the best possible answer.

### Instruction:
What is the escape velocity from Earth?

### Input:
Reason using known physical constants and give the final value.

### Response:"""
}

In [44]:
from IPython.display import Markdown

inputs = tokenizer(sample["prompt"], return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=500,
    do_sample=True,     # sampling helps avoid repetition
    top_p=0.9,
    temperature=0.7
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Below is an instruction and an input. Provide the best possible answer.

### Instruction:
What is the escape velocity from Earth?

### Input:
Reason using known physical constants and give the final value.

### Response: 
The escape velocity is determined by the gravitational force between the object and the Earth. The gravitational force is given by the equation $F_g=G\frac{m_1m_2}{r^2}$, where $G$ is the gravitational constant, $m_1$ and $m_2$ are the masses of the two objects, and $r$ is the distance between them. For an object to escape Earth's gravitational pull, it must have a velocity greater than this escape velocity. Therefore, the escape velocity can be calculated as:
$v_{\text{escape}} = \sqrt{\frac{2GM}{r}}$
Plugging in the known values: $G = 6.67 \times 10^{-11} \text{ m}^3 \text{ kg}^{-1} \text{ s}^{-2}$, $M = 5.972 \times 10^{24} \text{ kg}$ (mass of Earth), and $r = 6.378 \times 10^6 \text{ m}$ (radius of Earth), we get:
$v_{\text{escape}} = \sqrt{\frac{2(6.67 \times 10

In [58]:
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
display(Markdown(answer))

Below is an instruction and an input. Provide the best possible answer.

### Instruction:
What is the escape velocity from Earth?

### Input:
Reason using known physical constants and give the final value.

### Response: 
The escape velocity is determined by the gravitational force between the object and the Earth. The gravitational force is given by the equation $F_g=G\frac{m_1m_2}{r^2}$, where $G$ is the gravitational constant, $m_1$ and $m_2$ are the masses of the two objects, and $r$ is the distance between them. For an object to escape Earth's gravitational pull, it must have a velocity greater than this escape velocity. Therefore, the escape velocity can be calculated as:
$v_{\text{escape}} = \sqrt{\frac{2GM}{r}}$
Plugging in the known values: $G = 6.67 \times 10^{-11} \text{ m}^3 \text{ kg}^{-1} \text{ s}^{-2}$, $M = 5.972 \times 10^{24} \text{ kg}$ (mass of Earth), and $r = 6.378 \times 10^6 \text{ m}$ (radius of Earth), we get:
$v_{\text{escape}} = \sqrt{\frac{2(6.67 \times 10^{-11})(5.972 \times 10^{24})}{6.378 \times 10^6}}$
$v_{\text{escape}} \approx 11.2 \text{ km/s}$