In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig
from trl import PPOTrainer, PPOConfig
from datasets import load_dataset

from transformers import AutoModelForCausalLM, AutoTokenizer

from transformers import BitsAndBytesConfig

model_name = "Qwen/Qwen2.5-0.5B-Instruct"

# Load the policy model (Qwen2.5-0.5B-Instruct) with 4-bit weights.
# The quantization flag is passed through a BitsAndBytesConfig object; passing
# `load_in_4bit=True` directly to `from_pretrained` is the deprecated spelling.
policy_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    torch_dtype="auto",
)

# Two separate tokenizer objects are kept on purpose: `policy_tokenizer` is
# reconfigured (padding side / pad token) by the training cells, while
# `tokenizer` stays untouched for the single-prompt sanity checks.
policy_tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Unquantized copy of the same checkpoint, used for the sanity check below and
# for parameter counting in the next cell.
model = AutoModelForCausalLM.from_pretrained(model_name)

# Sanity check: generate from the unmodified model.
input_text = "What is the capital of France?"
inputs = tokenizer(input_text, return_tensors="pt")
input_ids = inputs["input_ids"]
# Pass the attention mask along with the ids and bound the output explicitly
# instead of relying on the generation config's default length limit.
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0]))


In [None]:
# Tally parameter counts for `model` in a single pass:
# every element counts toward the total, and those with requires_grad=True
# also count toward the trainable figure.
total_params = 0
trainable_params = 0
for weight in model.parameters():
    n_elements = weight.numel()
    total_params += n_elements
    if weight.requires_grad:
        trainable_params += n_elements

print(f"Total Parameters: {total_params}")
print(f"Trainable Parameters: {trainable_params}")


In [None]:
# LoRA adapter hyperparameters for the causal-LM policy.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],  # module names are architecture-specific
    r=8,                # adapter rank (smaller = fewer trainable weights)
    lora_alpha=16,      # scaling factor
    lora_dropout=0.05,
    bias="none",
)

# Wrap the policy model with the adapters described above.
# Note: this rebinds `policy_model`, so re-running the cell wraps it again.
policy_model = get_peft_model(policy_model, lora_config)

# Report how many parameters the wrapped model will actually train.
policy_model.print_trainable_parameters()


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig

reward_model_name = "OpenAssistant/reward-model-deberta-v3-large"

# 4-bit NF4 quantization with float16 compute
# (load_in_8bit=True is the alternative if 8-bit is preferred).
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Tokenizer and quantized sequence-classification model for scoring responses.
reward_tokenizer = AutoTokenizer.from_pretrained(reward_model_name)

reward_model = AutoModelForSequenceClassification.from_pretrained(
    reward_model_name,
    quantization_config=bnb_config,
    device_map="auto",
)


In [None]:
# 4. Load Prompts
# Take the `instruction` field of the first 1000 rows of the Alpaca train split.
dataset = load_dataset("tatsu-lab/alpaca", split="train")
prompt_rows = dataset.select(range(1000))
prompts = [row["instruction"] for row in prompt_rows]

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")



In [None]:
import torch
import torch.nn.functional as F
from torch.optim import Adam

# Step 0: Setup
device = "cuda" if torch.cuda.is_available() else "cpu"

# Optimizer only for LoRA trainable parameters
optimizer = Adam(filter(lambda p: p.requires_grad, policy_model.parameters()), lr=1e-4)

# Running-average reward baseline, subtracted from rewards to form advantages
baseline_reward = 0.0
gamma = 0.99  # decay factor of the running baseline

clip_eps = 0.2  # PPO clip range
batch_size = 4  # prompts per rollout / gradient step

# Decoder-only generation: pad on the left so new tokens directly follow the prompt.
policy_tokenizer.padding_side = "left"
policy_tokenizer.pad_token = policy_tokenizer.eos_token
# The reward model is an encoder classifier whose head pools the first position,
# so it must be right-padded; with left padding shorter rows would be scored
# from a pad token instead of [CLS].
reward_tokenizer.padding_side = "right"

pad_id = policy_tokenizer.pad_token_id

# Make batches of prompts
for batch_idx in range(0, len(prompts), batch_size):

    batch_prompts = prompts[batch_idx:batch_idx+batch_size]

    # 1. Tokenize batch of prompts
    inputs = policy_tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

    # 2. Generate batch of responses with dropout disabled, so sampling and the
    #    "old" log-probs come from the deterministic policy.
    policy_model.eval()
    full_output = policy_model.generate(**inputs, max_new_tokens=128, do_sample=True, pad_token_id=pad_id)

    # 3. Split prompt from response. Left padding gives every row the same
    #    prompt width, so a single column index separates the two.
    prompt_lengths = inputs.input_ids.shape[1]
    generated_tokens = full_output[:, prompt_lengths:]

    # Mask of real response tokens: everything up to and including the first
    # pad/EOS token in each row; later positions are padding added by generate().
    is_pad = generated_tokens.eq(pad_id)
    pads_before = is_pad.long().cumsum(dim=1) - is_pad.long()
    response_mask = pads_before.eq(0)
    full_attention_mask = torch.cat(
        [inputs.attention_mask, response_mask.to(inputs.attention_mask.dtype)], dim=1
    )

    # 4. Decode responses
    responses_text = policy_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    # 5. Compute rewards. The reward model scores (question, answer) pairs, so
    #    the prompt is passed alongside the response. No gradients are needed.
    with torch.no_grad():
        reward_inputs = reward_tokenizer(batch_prompts, responses_text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        reward_logits = reward_model(**reward_inputs).logits
    # squeeze(-1) keeps the batch dimension even when the batch has one row.
    reward_scores = reward_logits.squeeze(-1).float()  # shape: (batch_size,)

    # 6. Compute advantages
    advantages = reward_scores - baseline_reward  # vectorized

    # 7. Compute old logprobs (no gradient). Logits at position t predict token
    #    t+1, hence the one-step shift in the slice.
    with torch.no_grad():
        full_logits = policy_model(input_ids=full_output, attention_mask=full_attention_mask).logits
        log_probs = F.log_softmax(full_logits[:, prompt_lengths-1:-1, :].float(), dim=-1)
        old_log_probs = torch.gather(log_probs, 2, generated_tokens.unsqueeze(-1)).squeeze(-1)

    # 8. Forward current policy again, this time in train mode with gradients
    policy_model.train()
    current_logits = policy_model(input_ids=full_output, attention_mask=full_attention_mask).logits
    current_log_probs = F.log_softmax(current_logits[:, prompt_lengths-1:-1, :].float(), dim=-1)
    current_chosen_log_probs = torch.gather(current_log_probs, 2, generated_tokens.unsqueeze(-1)).squeeze(-1)

    # 9. Calculate ratio
    log_ratio = current_chosen_log_probs - old_log_probs
    ratio = torch.exp(log_ratio)

    # 10. PPO clipped loss, averaged over real response tokens only
    unclipped_loss = -advantages.unsqueeze(1) * ratio
    clipped_loss = -advantages.unsqueeze(1) * torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps)
    token_mask = response_mask.to(ratio.dtype)
    per_token_loss = torch.max(unclipped_loss, clipped_loss)
    policy_loss = (per_token_loss * token_mask).sum() / token_mask.sum().clamp(min=1)

    # 11. Optimize
    optimizer.zero_grad()
    policy_loss.backward()
    optimizer.step()

    # 12. Update baseline reward
    baseline_reward = gamma * baseline_reward + (1 - gamma) * reward_scores.mean().item()

    print(f"Batch {batch_idx//batch_size}: Avg Reward={reward_scores.mean().item():.5f}, Avg Advantage={advantages.mean().item():.5f}, Loss={policy_loss.item():.5f}")
    print("-" * 50)


In [None]:
# Sanity check after training: generate from the LoRA-wrapped policy.
input_text = "What is the capital of France?"
# The training cell leaves the policy in train mode; switch to eval so LoRA
# dropout does not perturb the sampled output.
policy_model.eval()
inputs = tokenizer(input_text, return_tensors="pt").to(device)
input_ids = inputs["input_ids"]
# Pass the attention mask together with the ids and bound the output
# explicitly rather than relying on the default generation length.
output = policy_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0]))

In [None]:
# Display how many prompts are available to the training loops.
len(prompts)

In [None]:
import torch
import torch.nn.functional as F
from torch.optim import Adam
from tqdm import tqdm 

# Setup
device = "cuda" if torch.cuda.is_available() else "cpu"

# Optimizer over the trainable (LoRA) parameters only
optimizer = Adam(filter(lambda p: p.requires_grad, policy_model.parameters()), lr=1e-4)

# Decoder-only generation: pad on the left so new tokens directly follow the prompt.
policy_tokenizer.padding_side = "left"
policy_tokenizer.pad_token = policy_tokenizer.eos_token
# The reward model is an encoder classifier whose head pools the first position,
# so it must be right-padded; with left padding shorter rows would be scored
# from a pad token instead of [CLS].
reward_tokenizer.padding_side = "right"

# PPO Hyperparameters
batch_size = 4
clip_eps = 0.2
gamma = 0.99  # decay factor of the running reward baseline
max_prompt_len = 512
max_gen_tokens = 128

# KL regularization parameters
beta_kl = 0.01        # initial KL penalty weight
target_kl = 0.02      # target KL divergence
kl_adapt = 1.5        # adaptation factor for beta

# Initialize reward baseline
baseline_reward = 0.0

pad_id = policy_tokenizer.pad_token_id

# Setup tqdm progress bar
pbar = tqdm(range(0, len(prompts), batch_size), desc="PPO Training", dynamic_ncols=True)

for batch_idx in pbar:

    batch_prompts = prompts[batch_idx:batch_idx+batch_size]

    # 1. Tokenize prompts
    inputs = policy_tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True, max_length=max_prompt_len).to(device)

    # 2. Generate responses with dropout disabled, so sampling and the "old"
    #    log-probs come from the deterministic policy.
    policy_model.eval()
    full_output = policy_model.generate(**inputs, max_new_tokens=max_gen_tokens, do_sample=True, pad_token_id=pad_id)

    # 3. Split prompt from response. Left padding gives every row the same
    #    prompt width, so a single column index separates the two.
    prompt_len = inputs.input_ids.shape[1]
    generated_tokens = full_output[:, prompt_len:]

    # Mask of real response tokens: everything up to and including the first
    # pad/EOS token in each row; later positions are padding added by generate().
    is_pad = generated_tokens.eq(pad_id)
    pads_before = is_pad.long().cumsum(dim=1) - is_pad.long()
    response_mask = pads_before.eq(0)
    full_attention_mask = torch.cat(
        [inputs.attention_mask, response_mask.to(inputs.attention_mask.dtype)], dim=1
    )

    # 4. Decode responses
    responses_text = policy_tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

    # 5. Compute rewards. The reward model scores (question, answer) pairs, so
    #    the prompt is passed alongside the response. No gradients are needed.
    with torch.no_grad():
        reward_inputs = reward_tokenizer(batch_prompts, responses_text, return_tensors="pt", padding=True, truncation=True, max_length=max_prompt_len).to(device)
        reward_logits = reward_model(**reward_inputs).logits
    # squeeze(-1) keeps the batch dimension even when the batch has one row.
    reward_scores = reward_logits.squeeze(-1).float()  # (batch_size,)

    # 6. Compute advantages
    advantages = reward_scores - baseline_reward  # (batch_size,)

    # 7. Compute old logprobs (no gradient). Logits at position t predict token
    #    t+1, hence the one-step shift in the slice.
    with torch.no_grad():
        full_logits_old = policy_model(input_ids=full_output, attention_mask=full_attention_mask).logits
        log_probs_old = F.log_softmax(full_logits_old[:, prompt_len-1:-1, :].float(), dim=-1)
        old_log_probs = torch.gather(log_probs_old, 2, generated_tokens.unsqueeze(-1)).squeeze(-1)

    # 8. Forward pass current model, in train mode with gradients
    policy_model.train()
    full_logits_current = policy_model(input_ids=full_output, attention_mask=full_attention_mask).logits
    log_probs_current = F.log_softmax(full_logits_current[:, prompt_len-1:-1, :].float(), dim=-1)
    chosen_log_probs_current = torch.gather(log_probs_current, 2, generated_tokens.unsqueeze(-1)).squeeze(-1)

    # 9. Calculate ratio
    log_ratio = chosen_log_probs_current - old_log_probs
    ratio = torch.exp(log_ratio)
    token_mask = response_mask.to(ratio.dtype)

    # 10. Estimate KL(old || current) on the sampled tokens.
    #     The tokens were drawn from the old policy, so no extra p_old weight is
    #     applied; (ratio - 1) - log(ratio) is a non-negative per-token estimate.
    #     NOTE(review): with one gradient step per rollout the old and current
    #     policies differ only by dropout noise, so this term stays small.
    kl_per_token = (ratio - 1.0) - log_ratio
    kl_batch = (kl_per_token * token_mask).sum(dim=1).mean()  # per-sequence sum, batch mean

    # 11. PPO surrogate loss, averaged over real response tokens only
    unclipped_loss = -advantages.unsqueeze(1) * ratio
    clipped_loss = -advantages.unsqueeze(1) * torch.clamp(ratio, 1-clip_eps, 1+clip_eps)
    per_token_loss = torch.max(unclipped_loss, clipped_loss)
    ppo_loss = (per_token_loss * token_mask).sum() / token_mask.sum().clamp(min=1)

    # 12. Total loss = PPO loss + KL penalty
    total_loss = ppo_loss + beta_kl * kl_batch

    # 13. Backward and optimize
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()

    # 14. Update baseline reward
    baseline_reward = gamma * baseline_reward + (1 - gamma) * reward_scores.mean().item()

    # 15. Adapt beta_kl: raise the penalty when KL overshoots the target band,
    #     lower it when KL undershoots.
    kl_value = kl_batch.item()
    if kl_value > target_kl * 1.5:
        beta_kl *= kl_adapt
    elif kl_value < target_kl / 1.5:
        beta_kl /= kl_adapt

    # Update tqdm live bar
    pbar.set_postfix({
        "loss": f"{total_loss.item():.5f}",
        "reward": f"{reward_scores.mean().item():.5f}",
        "adv": f"{advantages.mean().item():.5f}",
        "kl": f"{kl_value:.5f}",
        "beta_kl": f"{beta_kl:.5f}"
    })


In [None]:
# Sanity check after the KL-regularized run: generate from the trained policy.
input_text = "What is the animal: dog or house?"
# The training cell leaves the policy in train mode; switch to eval so LoRA
# dropout does not perturb the sampled output.
policy_model.eval()
inputs = tokenizer(input_text, return_tensors="pt").to(device)
input_ids = inputs["input_ids"]
# Pass the attention mask together with the ids and bound the output
# explicitly rather than relying on the default generation length.
output = policy_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0]))