In [1]:
from trl import GRPOTrainer, GRPOConfig


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOTrainer, GRPOConfig
import torch


In [3]:
# Load the tokenizer and causal-LM weights for the checkpoint named below.
model_name = "tiiuae/falcon-rw-1b"  # Substitute for a small model (2B unavailable here)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Use the tokenizer's end-of-sequence id as the model's padding id.
# NOTE(review): the following cell assigns model.config.pad_token_id again after
# setting tokenizer.pad_token, so this line looks redundant — confirm before removing.
model.config.pad_token_id = tokenizer.eos_token_id


In [4]:
# Reuse the EOS token as the tokenizer's padding token, then copy the resulting
# pad id onto the model config so tokenizer and model agree on padding.
# Order matters: the second line reads the pad id produced by the first.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

In [5]:
def reward_function_1(prompts, completions, **kwargs):
    """Reward completions that contain the phrase "thank you".

    The match is a case-insensitive substring test on each completion.

    Args:
        prompts: Prompts corresponding to the completions; not used.
        completions: Iterable of completion strings to score.
        **kwargs: Extra keyword arguments; accepted and ignored.

    Returns:
        A list with one float per completion, in input order: 1.0 when the
        phrase is present, 0.0 otherwise.
    """
    scores = []
    for text in completions:
        mentions_thanks = "thank you" in text.lower()
        if mentions_thanks:
            scores.append(1.0)
        else:
            scores.append(0.0)
    return scores

def reward_function_2(prompts, completions, **kwargs):
    """Reward completions in proportion to their length.

    Each completion is stripped of leading/trailing whitespace and its
    character count is divided by 50. The score is not capped, so a
    completion longer than 50 characters scores above 1.0.

    Args:
        prompts: Prompts corresponding to the completions; not used.
        completions: Iterable of completion strings to score.
        **kwargs: Extra keyword arguments; accepted and ignored.

    Returns:
        A list with one float per completion, in input order.
    """
    rewards = []
    for text in completions:
        trimmed = text.strip()
        # Divide by 50 so a 50-character completion scores exactly 1.0.
        rewards.append(len(trimmed) / 50.0)
    return rewards

def combined_reward(prompts, completions, **kwargs):
    """Add the scores of reward_function_1 and reward_function_2 per completion.

    Args:
        prompts: Prompts corresponding to the completions; passed through to
            both reward functions.
        completions: Iterable of completion strings to score.
        **kwargs: Extra keyword arguments; accepted but not forwarded.

    Returns:
        A list with one float per completion, in input order, holding the
        element-wise sum of the two reward lists.
    """
    phrase_scores = reward_function_1(prompts, completions)
    length_scores = reward_function_2(prompts, completions)
    totals = []
    for phrase_score, length_score in zip(phrase_scores, length_scores):
        totals.append(phrase_score + length_score)
    return totals


In [1]:
from datasets import Dataset

# Three hand-written prompts that make up the whole training set.
prompts = [
    "Write a polite customer service reply.",
    "How do you make pasta?",
    "Give a motivational message to a student."
]

# Wrap the prompts in a Dataset with a single "prompt" column; this object is
# later passed to the trainer as train_dataset.
dataset = Dataset.from_dict({"prompt": prompts})


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Display the Dataset repr (column names and row count) as the cell output.
dataset

Dataset({
    features: ['prompt'],
    num_rows: 3
})

In [5]:
from pprint import pprint

# Pretty-print the first 2 rows, each followed by a 50-dash separator line.
# (The dataset only has 3 rows, so the range must stay at or below 3.)
for i in range(2):
    pprint(dataset[i])
    print("-" * 50)


{'prompt': 'Write a polite customer service reply.'}
--------------------------------------------------
{'prompt': 'How do you make pasta?'}
--------------------------------------------------


In [11]:
# Training configuration handed to GRPOTrainer as `args`.
# NOTE(review): per the TRL docs the two max_*_length values are token limits;
# a 10-token prompt cap is tight and may truncate longer prompts — confirm.
config1 = GRPOConfig(
    learning_rate=1e-5,
    max_prompt_length=10,
    max_completion_length=50,
    logging_steps = 1,
    num_train_epochs=4,
    # Alternatives left disabled: presumably max_steps caps the run by step
    # count instead of epochs, and report_to="none" turns off experiment
    # tracking — verify before switching.
    # max_steps = 4,
    # report_to="none",
    # NOTE(review): reporting to "wandb" presumably requires the wandb package
    # and a configured login in the kernel environment.
    report_to="wandb",
    output_dir="logs"
)


In [12]:
from peft import LoraConfig

# LoRA adapter settings, later passed to GRPOTrainer via `peft_config`.
# NOTE(review): "query_key_value" is presumably the attention projection module
# name in this Falcon checkpoint — verify against model.named_modules() when
# swapping to a different base model.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"]
)


In [13]:
# Assemble the GRPO trainer from the model, tokenizer, config, dataset and LoRA
# settings defined in earlier cells.
# The two reward functions are passed individually as a list; combined_reward
# is not used here.
# NOTE(review): per the TRL docs, scores from multiple reward functions are
# summed (optionally weighted) — confirm for the installed TRL version.
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[reward_function_1, reward_function_2],
    args=config1,
    train_dataset=dataset,
    peft_config=peft_config
)





No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss
1,-0.1393
2,0.0
3,0.0
4,-0.02
5,0.0
6,-0.2975
7,0.0
8,-0.2649


TrainOutput(global_step=8, training_loss=-0.09019962468184328, metrics={'train_runtime': 16.7427, 'train_samples_per_second': 0.717, 'train_steps_per_second': 0.478, 'total_flos': 0.0, 'train_loss': -0.09019962468184328})

: 