In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# The model ID for Llama 3.2 3B Instruct
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Configure 4-bit quantization to save memory
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the model with the specified quantization config
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # quantization_config=quantization_config,
    device_map="auto", # Automatically map model layers to available devices
)

2025-07-04 13:20:26.632562: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
from datasets import Dataset

# A simple dataset of prompts
# prompts_data = [
#     {"prompt": "Explain the concept of photosynthesis in a simple way."},
#     {"prompt": "What are the main differences between Python lists and tuples?"},
#     {"prompt": "Write a short, encouraging note to someone starting a new project."}
# ]

prompts_data = [
    {"prompt": "Explain the concept of photosynthesis in a simple way."},
    {"prompt": "What are the main differences between Python lists and tuples?"},
    {"prompt": "Write a short, encouraging note to someone starting a new project."},
    {"prompt": "Describe the water cycle using simple language."},
    {"prompt": "What is the importance of cybersecurity in today’s world?"},
    {"prompt": "Summarize the plot of 'Romeo and Juliet' in a few sentences."},
    {"prompt": "List three effective ways to manage stress."},
    {"prompt": "Explain how a credit score works."},
    {"prompt": "Write a thank-you message to a mentor."},
    {"prompt": "What are some beginner-friendly programming languages?"},
    {"prompt": "Give tips for staying productive while working from home."},
    {"prompt": "What causes seasons to change throughout the year?"},
    {"prompt": "Compare renewable and non-renewable energy sources."},
    {"prompt": "What are some common interview questions and good responses?"},
    {"prompt": "Write a simple explanation of blockchain technology."},
    {"prompt": "How does exercise benefit mental health?"},
    {"prompt": "What are some best practices for creating strong passwords?"},
    {"prompt": "Describe the structure of a plant cell."},
    {"prompt": "Write a motivational quote for students preparing for exams."},
    {"prompt": "What is the role of the United Nations?"}
]


# Convert the list of dictionaries to a Hugging Face Dataset object
train_dataset = Dataset.from_list(prompts_data)

print(train_dataset)
# Expected output:
# Dataset({
#     features: ['prompt'],
#     num_rows: 3
# })

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


Dataset({
    features: ['prompt'],
    num_rows: 20
})


In [12]:
from pprint import pprint

# Pretty print the first 5 rows
for i in range(2):
    pprint(train_dataset[i])
    print("-" * 50)


{'prompt': 'Explain the concept of photosynthesis in a simple way.'}
--------------------------------------------------
{'prompt': 'What are the main differences between Python lists and tuples?'}
--------------------------------------------------


In [13]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
print(tokenizer.special_tokens_map)


{'bos_token': '<|begin_of_text|>', 'eos_token': '<|eot_id|>'}


In [None]:
def length_reward_func(completions, **kwargs):
    """
    A simple reward function that scores responses based on their length.

    Args:
        completions (list of str): A list of responses generated by the model.
        **kwargs: The trainer passes other arguments  here, which we ignore.

    Returns:
        list of float: A list of reward scores for each completion.
    """
    # The function returns a list of scores, one for each completion
    return [float(len(c)) for c in completions]

In [7]:
from trl import GRPOTrainer, GRPOConfig
from peft import LoraConfig

# PEFT configuration for LoRA
# This tells the trainer to only train a small set of adapter weights
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

# GRPO training configuration
grpo_config = GRPOConfig(
    output_dir="./grpo_llama3.2_finetuned",
    beta=0.1,  # The KL-divergence regularization coefficient
    max_prompt_length=256,
    max_completion_length=512,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    learning_rate=5e-5,
    logging_steps=5,
    report_to="tensorboard", # Set to "wandb" or "tensorboard" for experiment tracking
    num_generations=2,
)

# Initialize the trainer
trainer = GRPOTrainer(
    model=model,
    args=grpo_config,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    reward_funcs=[length_reward_func], # Pass our reward function in a list
    peft_config=peft_config,
)

# Start the fine-tuning process
print("Starting GRPO fine-tuning...")
# trainer.train()
print("Fine-tuning complete!")

# Save the trained adapter model
# trainer.save_model("./grpo_llama3.2_finetuned")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting GRPO fine-tuning...
Fine-tuning complete!


## Testing the model

In [8]:
from peft import PeftModel

# Reload the base model to merge with the adapter
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # quantization_config=quantization_config,
    device_map="auto",
)

# Load the LoRA adapter and merge it with the base model
model = PeftModel.from_pretrained(base_model, "./grpo_llama3.2_finetuned")
model = model.merge_and_unload()

print("\n--- Testing the fine-tuned model ---")

# Create a test prompt
test_prompt = "What are the main differences between Python lists and tuples?"

# Format the prompt using the model's chat template
messages = [
    {"role": "user", "content": test_prompt}
]

# The tokenizer will apply the chat template and prepare the input
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")

# Generate a response
outputs = model.generate(inputs, max_new_tokens=200, do_sample=True, temperature=0.7)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(response)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Testing the fine-tuned model ---
system

Cutting Knowledge Date: December 2023
Today Date: 04 Jul 2025

user

What are the main differences between Python lists and tuples?assistant

Python lists and tuples are both data structures that can store multiple values, but they have several key differences:

1. **Immutability**: Tuples are immutable, meaning their contents cannot be modified after creation. Lists, on the other hand, are mutable, allowing elements to be added or removed.

2. **Syntax**: Tuples are defined using parentheses `()` and elements are separated by commas. Lists are defined using square brackets `[]` and elements are also separated by commas.

3. **Performance**: Tuples are generally faster than lists because they are immutable, which allows Python to optimize them more efficiently.

4. **Memory Usage**: Tuples use less memory than lists because they do not need to store the additional metadata required for mutable objects.

5. **Indexing**: Both tuples and list

In [8]:
# Before trainer.train() and after model/tokenizer setup

# Select a few example prompts from your dataset
example_prompts = [
    train_dataset[0]["prompt"],
    train_dataset[1]["prompt"],
    train_dataset[2]["prompt"],
    # Add more as needed
]

print("--- Initial Model Responses (before training) ---")
for i, prompt in enumerate(example_prompts):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Generate a completion. Adjust max_new_tokens as needed, but keep it consistent
    # with your max_completion_length from GRPOConfig for fair comparison.
    # Set pad_token_id to eos_token_id for cleaner generation with some models
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=grpo_config.max_completion_length,
        do_sample=True, # Use sampling if you want more varied responses
        temperature=0.7, # Adjust temperature for creativity vs. determinism
        top_p=0.9,       # Top-p sampling
        pad_token_id=tokenizer.eos_token_id # Important for avoiding long padding
    )
    generated_text = tokenizer.decode(generated_ids[0, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    print(f"\nPrompt {i+1}: {prompt}")
    print(f"Generated Text {i+1} (Length: {len(generated_text.split())} words):") # Simple word count
    print(generated_text)
    print("-" * 50)

# Then proceed with your trainer setup and trainer.train()

--- Initial Model Responses (before training) ---

Prompt 1: Explain the concept of photosynthesis in a simple way.
Generated Text 1 (Length: 192 words):
 Photosynthesis is the process by which plants, algae, and some bacteria convert light energy from the sun into chemical energy in the form of organic compounds, such as glucose. This process is essential for life on Earth, as it provides the energy and organic compounds needed to support the food chain.

Here's a simple explanation of photosynthesis:

**Step 1: Light absorption**
Plants, algae, and some bacteria absorb light energy from the sun through specialized pigments such as chlorophyll.

**Step 2: Water absorption**
These organisms absorb water from the soil through their roots.

**Step 3: Carbon dioxide absorption**
They also absorb carbon dioxide from the air.

**Step 4: Energy conversion**
The light energy is converted into chemical energy, which is stored in the form of glucose (a type of sugar).

**Step 5: Oxygen release*

In [9]:
# After trainer.train() and trainer.save_model()

from peft import PeftModel

# Reload the base model (if you unloaded it or if you want a clean slate)
# If your 'model' variable still holds the trained PEFT model, you can skip this reload.
# Assuming you saved to "./grpo_llama3.2_finetuned"
# You might need to reload the base model first, then load the adapter on top.

# Base model without adapter (if you need it clean)
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # quantization_config=quantization_config,
    device_map="auto",
)
# Then load the adapter
trained_model = PeftModel.from_pretrained(base_model, "./grpo_llama3.2_finetuned")

# OR, if 'model' object is still your trained PEFT model, just use it directly:
trained_model = model # Assuming 'model' variable still contains the trained PEFT model

print("\n--- Fine-Tuned Model Responses (after training) ---")
for i, prompt in enumerate(example_prompts): # Use the same example prompts
    inputs = tokenizer(prompt, return_tensors="pt").to(trained_model.device)
    generated_ids = trained_model.generate(
        **inputs,
        max_new_tokens=grpo_config.max_completion_length,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id
    )
    generated_text = tokenizer.decode(generated_ids[0, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    print(f"\nPrompt {i+1}: {prompt}")
    print(f"Generated Text {i+1} (Length: {len(generated_text.split())} words):") # Simple word count
    print(generated_text)
    print("-" * 50)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]


--- Fine-Tuned Model Responses (after training) ---

Prompt 1: Explain the concept of photosynthesis in a simple way.
Generated Text 1 (Length: 272 words):
 Photosynthesis is the process by which plants, algae, and some bacteria convert light energy from the sun into chemical energy in the form of glucose, a type of sugar that serves as a source of energy for the plant. This process occurs in specialized organelles called chloroplasts, which contain pigments such as chlorophyll that absorb light energy.
The overall equation for photosynthesis is:
6 CO2 + 6 H2O + light energy → C6H12O6 (glucose) + 6 O2
In simpler terms, plants take in carbon dioxide and water, use sunlight to convert them into glucose and oxygen, and release the oxygen into the air as a byproduct.
Here's a step-by-step explanation of the photosynthesis process:
1. Light absorption: Chlorophyll and other pigments in the chloroplast absorb light energy from the sun.
2. Water absorption: Plants absorb water from the soil 