<a href="https://colab.research.google.com/github/pastrop/kaggle/blob/master/GRPO_Study_Ntbk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GRPO Study Notebook

In [1]:
%%capture
!pip install torch transformers

# Finetuning LLM with the GRPO

generating training examples using LLM

In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn as nn # Import the torch.nn module and alias it as nn
import torch.optim as optim # Import the torch.optim module and alias it as optim
import torch.nn.functional as F

In [8]:
# Load GPT model & tokenizer
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Define a question (prompt)
prompt = "What people like or dislike about working out?"

# Tokenize the input
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

# Generate multiple responses using sampling
num_responses = 5  # Generate 5 different completions
responses = model.generate(
    input_ids,
    max_length=50,
    do_sample=True,  # Enables sampling instead of greedy decoding
    top_k=50,  # Consider top 50 tokens at each step
    top_p=0.9,  # Nucleus sampling: keeps top tokens contributing to 90% probability
    temperature=0.7,  # Controls randomness (lower = more deterministic)
    num_return_sequences=num_responses  # Generates multiple responses
)

# Decode and print responses
#decoded_responses = [tokenizer.decode(output, skip_special_tokens=True) for output in responses]
#Decode $ remove the prompt &print
decoded_responses = [
    tokenizer.decode(output[len(input_ids[0]):], skip_special_tokens=True).strip()
    for output in responses
]
for i, response in enumerate(decoded_responses):
    print(f"Response {i+1}: {response}")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Response 1: Well, it's like the idea of being in the gym, or your girlfriend or boyfriend, or whatever. But if you're doing it that way, and you're having a great time and
Response 2: People like to work out. You have to be very careful and you have to be very careful about what you say. You have to be very careful about what you say. And I think that
Response 3: I don't think I have an answer for that.

The last thing you want to do is to give up.

It's a really good idea to do it for a few
Response 4: Workout is a great way to get your body to feel good. I am not saying that it is not possible to get fit and get better, but it is very important to get your body
Response 5: I think it's a really cool thing to do. I think people like to work out. I think people like to be physically active. I think people like to eat. I think people like


**Using a trained sentiment model as a Reward Model**

In [None]:
# Load sentiment classifier
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
reward_model = AutoModelForSequenceClassification.from_pretrained(model_name)
reward_tokenizer = AutoTokenizer.from_pretrained(model_name)

def get_reward_scores(responses):
    """Scores responses using a sentiment classifier."""
    inputs = reward_tokenizer(responses, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():  # No gradient calculation needed
        outputs = reward_model(**inputs)
    scores = torch.softmax(outputs.logits, dim=1)[:, 1]  # Probability of "positive" class
    return scores.detach()  # Detach to prevent computation graph tracking

In [16]:
#Test Run for the Reward Model (This is a test cell just to make sure get_reward_scores works properly)
'''
#Example responses
responses = [
    "I love exercising, it makes me feel amazing!",
    "Exercise is okay, but it's tiring.",
    "I hate exercising, it's the worst!"
]
'''
# Get reward scores
reward_scores = get_reward_scores(decoded_responses)

# Print scores
for i, (decoded_response, score) in enumerate(zip(decoded_responses, reward_scores)):
    print(f"Response {i+1}: {decoded_response}\nScore: {score.item():.3f}\n")


Response 1: Well, it's like the idea of being in the gym, or your girlfriend or boyfriend, or whatever. But if you're doing it that way, and you're having a great time and
Score: 1.000

Response 2: People like to work out. You have to be very careful and you have to be very careful about what you say. You have to be very careful about what you say. And I think that
Score: 0.035

Response 3: I don't think I have an answer for that.

The last thing you want to do is to give up.

It's a really good idea to do it for a few
Score: 0.868

Response 4: Workout is a great way to get your body to feel good. I am not saying that it is not possible to get fit and get better, but it is very important to get your body
Score: 0.999

Response 5: I think it's a really cool thing to do. I think people like to work out. I think people like to be physically active. I think people like to eat. I think people like
Score: 1.000



Define a Policy Networks

In [21]:
# Define a simple policy network that scores responses
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)  # Output: scalar score for each response
        )

    def forward(self, x):
        return self.fc(x).squeeze(-1)  # Ensure output is [batch_size] shape

# Instantiate the policy network
policy_net = PolicyNetwork(input_dim=768)  # GPT-2 embedding size
policy_optimizer = optim.Adam(policy_net.parameters(), lr=1e-5)

training policy network with KL term

In [None]:
def compute_kl_divergence(original_logits, new_logits):
    """Compute KL divergence between the original GPT-2 outputs and the policy network outputs."""
    original_probs = F.softmax(original_logits, dim=-1)
    new_probs = F.softmax(new_logits, dim=-1)
    kl_div = F.kl_div(new_probs.log(), original_probs, reduction="batchmean")  # KL(P || Q)
    return kl_div

def train_policy_network_with_kl(responses, reward_scores, beta=0.01):
    """Updates the policy network using GRPO-style policy gradients with KL divergence."""

    # Get response embeddings from GPT-2
    response_inputs = gpt_tokenizer(responses, return_tensors="pt", padding=True, truncation=True)
    response_embeddings = gpt_model.transformer.wte(response_inputs["input_ids"]).mean(dim=1)  # Mean token embedding

    # Get logits from the original GPT-2 model (before policy updates)
    with torch.no_grad():
        original_logits = gpt_model(response_inputs["input_ids"]).logits.mean(dim=1)

    # Get predicted scores from the policy network
    predicted_scores = policy_net(response_embeddings).squeeze()

    # Compute Advantage (A = reward - baseline)
    baseline = reward_scores.mean()
    advantage = reward_scores - baseline

    # Compute policy loss (GRPO-style clipped loss)
    clip_ratio = 0.2
    policy_loss = -torch.min(
        predicted_scores * advantage,
        torch.clamp(predicted_scores, 1 - clip_ratio, 1 + clip_ratio) * advantage
    ).mean()

    # Compute KL divergence loss
    new_logits = policy_net(response_embeddings)  # Policy's logits
    kl_loss = compute_kl_divergence(original_logits, new_logits)

    # Final loss with KL penalty
    total_loss = policy_loss + beta * kl_loss

    # Backpropagation
    policy_optimizer.zero_grad()
    total_loss.backward()
    policy_optimizer.step()

    return total_loss.item(), kl_loss.item()

# Training loop with KL regularization
num_epochs = 5
for epoch in range(num_epochs):
    responses = generate_responses(prompt)
    reward_scores = get_reward_scores(responses)
    loss, kl = train_policy_network_with_kl(responses, reward_scores)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {loss:.4f}, KL Divergence: {kl:.4f}")


# Fine-Tuning an LLM with a Policy Network
Below is a simplified PyTorch example that fine-tunes an LLM (e.g., GPT) using a trained policy.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the LLM (e.g., GPT-2 for simplicity)
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Define a simple policy network that scores responses
class PolicyNetwork(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1)  # Output: scalar score for each response
        )

    def forward(self, x):
        return self.fc(x)

# Instantiate the policy network
policy_net = PolicyNetwork(input_dim=768)  # GPT's embedding size
optimizer = optim.Adam(model.parameters(), lr=1e-5)

# Simulated training batch
texts = ["What is the capital of France?", "How does photosynthesis work?"]
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

# Step 1: Generate responses from the LLM
with torch.no_grad():
    outputs = model.generate(inputs["input_ids"], max_length=50)

# Step 2: Get response embeddings (used as input to policy network)
response_embeddings = model.transformer.wte(outputs)  # Word token embeddings
response_scores = policy_net(response_embeddings.mean(dim=1))  # Score each response

# Step 3: Compute policy loss (GRPO-style clipped loss)
baseline = response_scores.mean()  # Baseline for advantage calculation
advantage = response_scores - baseline  # Compute advantage function
policy_loss = -torch.min(response_scores * advantage, torch.clamp(response_scores, 0.8, 1.2) * advantage).mean()

# Step 4: Backpropagate the loss to fine-tune the LLM
optimizer.zero_grad()
policy_loss.backward()
optimizer.step()

print(f"Policy Loss: {policy_loss.item()}")


# Policy & Reward Models
*random examples*

 (this is a untrained neural net used as an example, it will give random rewards) to Score Responses

In [None]:
class RewardModel(nn.Module):
    """Simple reward model that scores responses."""
    def __init__(self, embedding_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(embedding_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 1),  # Output: a single score per response
            nn.Sigmoid()  # Normalize score between 0 and 1
        )

    def forward(self, response_embedding):
        return self.fc(response_embedding)

# Instantiate reward model
reward_model = RewardModel(embedding_dim=768)  # Assuming GPT's embeddings

# Convert responses to embeddings
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
response_inputs = tokenizer(decoded_responses, return_tensors="pt", padding=True, truncation=True)
response_embeddings = model.transformer.wte(response_inputs["input_ids"]).mean(dim=1)  # Averaging token embeddings

# Score each response
reward_scores = reward_model(response_embeddings).squeeze()

# Print scores
for i, (response, score) in enumerate(zip(decoded_responses, reward_scores.tolist())):
    print(f"Response {i+1}: {response} | Score: {score:.3f}")

toy example of the policy network

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class PolicyNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
            nn.Softmax(dim=-1)
        )

    def forward(self, state):
        return self.fc(state)

def compute_advantage(rewards, values, gamma=0.99):
    """Computes advantage estimates using reward and value function."""
    returns = []
    advs = []
    G = 0
    for t in reversed(range(len(rewards))):
        G = rewards[t] + gamma * G  # Compute return
        returns.insert(0, G)
        advs.insert(0, G - values[t])  # Advantage = Return - Value Estimate
    return torch.tensor(advs, dtype=torch.float32)

# Hyperparameters
epsilon = 0.2
gamma = 0.99
learning_rate = 0.01

# Sample data (dummy example)
state_dim = 4
action_dim = 2
states = torch.rand((5, state_dim))  # 5 sample states
actions = torch.tensor([0, 1, 0, 1, 0])  # Actions taken
old_probs = torch.tensor([0.4, 0.6, 0.5, 0.7, 0.5])  # Old policy probabilities
rewards = [1, 0, 1, 1, 0]  # Rewards received
values = [0.5, 0.4, 0.6, 0.7, 0.3]  # Value estimates

# Initialize network and optimizer
policy_net = PolicyNetwork(state_dim, action_dim)
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)

# Compute advantage
advantages = compute_advantage(rewards, values, gamma)

# Compute new policy probabilities
new_probs = policy_net(states).gather(1, actions.view(-1, 1)).squeeze()

# Compute probability ratio
ratios = new_probs / old_probs

# Compute clipped and unclipped loss
clipped_ratios = torch.clamp(ratios, 1 - epsilon, 1 + epsilon)
loss = -torch.min(ratios * advantages, clipped_ratios * advantages).mean()

# Perform gradient ascent
optimizer.zero_grad()
loss.backward()
optimizer.step()

print(f"Policy Loss: {loss.item()}")


Train the Policy with GRPO
Train the policy network to predict higher scores for better responses.
Update it using a reinforcement learning algorithm like GRPO.
Example: Training the Policy Network

Using the Policy for Action Selection
Once the policy is trained, it can be used to select actions in the environment.

Example: Action Selection in a Trained Policy

In [None]:
import torch

def select_action(policy_net, state):
    """Selects an action based on the trained policy."""
    with torch.no_grad():  # No gradients needed for inference
        action_probs = policy_net(state)
        action = torch.multinomial(action_probs, 1)  # Sample from the policy distribution
    return action.item()

# Example usage:
state = torch.rand((1, 4))  # Example state (assuming 4D input)
action = select_action(policy_net, state)
print(f"Selected Action: {action}")
