<a href="https://colab.research.google.com/github/llk010502/RL_lab/blob/main/RL_playground.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install with the %pip magic rather than `! pip`: the magic installs into the
# environment of the running kernel, whereas `!` runs pip in a subshell.
%pip install trl
%pip install -U bitsandbytes

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; renders a widget in the cell output.
# NOTE(review): presumably required because the Llama checkpoint loaded later
# in this notebook is access-gated — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## TRL--GRPO
* Load model: 6.0 / 40.0 GB
* During training: 31.9-39.4 / 40.0 GB

In [None]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator
from trl import GRPOConfig, GRPOTrainer
from peft import (
    TaskType,
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)

# prepare dataset: take the "train" split of trl-lib/tldr, then keep only its
# first 500 rows so the demo stays small
dataset = load_dataset("trl-lib/tldr", split="train")
dataset = dataset.select(range(500))

# define one or more reward functions
# simple example: length constraint
def reward_len(completions, **kwargs):
    """Reward completions whose character length is close to 20.

    Args:
        completions: One entry per sampled completion. An entry is either a
            plain string, or a list of message dicts that each carry a
            ``"content"`` key (conversational format); in the latter case the
            lengths of all message contents are summed.
        **kwargs: Any extra keyword arguments supplied by the caller; ignored.

    Returns:
        A list of floats, one per completion, equal to ``-abs(20 - length)``.
        The best possible reward is ``0.0`` (exactly 20 characters).
    """
    def _text_length(completion):
        # Plain-text completion: measure the string itself.
        if isinstance(completion, str):
            return len(completion)
        # Conversational completion: measure the message text rather than the
        # number of messages, which would give an almost constant reward.
        if completion and all(
            isinstance(message, dict) and "content" in message
            for message in completion
        ):
            return sum(len(message["content"]) for message in completion)
        # Anything else keeps the original behaviour: use its own length.
        return len(completion)

    # Always return floats, matching the "list of floats" reward contract.
    return [float(-abs(20 - _text_length(c))) for c in completions]

# load model and tokenizer from the same checkpoint id
model_name = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda")

tokenizer = AutoTokenizer.from_pretrained(model_name)
# reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# LoRA configuration (rank 16, alpha 16, dropout 0.05) for a causal LM
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    target_modules=[
        # q/k/v/o projections
        "q_proj", "k_proj", "v_proj", "o_proj",
        # up/gate/down projections
        "up_proj", "gate_proj", "down_proj",
    ],
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# training configuration (run without a PEFT adapter)
training_args = GRPOConfig(
    # --- bookkeeping / logging ---
    run_name="Llama-3.2-1B-Instruct-GRPO",
    output_dir="Llama-3.2-1B-Instruct-GRPO",
    report_to="wandb",
    logging_steps=5,
    # NOTE(review): save_steps (100) is larger than max_steps (10), so
    # presumably no intermediate checkpoint is written — confirm this is intended.
    save_steps=100,
    # --- optimisation ---
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    bf16=True,
    # --- batching / schedule ---
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    # The recorded run stopped at global_step=10, i.e. max_steps bounded the
    # run rather than num_train_epochs.
    num_train_epochs=1,
    max_steps=10,
    # --- GRPO sampling ---
    num_generations=4,  # number of generations for each query
    max_prompt_length=256,
    # NOTE(review): 786 may be a transposition of 768 — confirm.
    max_completion_length=786,
)
# Build the GRPO trainer. No peft_config is passed in this section; the LoRA
# variant lives in its own section of the notebook.
trainer = GRPOTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    train_dataset=dataset,
    reward_funcs=reward_len,
)

# Run training; the returned TrainOutput is shown as the cell result.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mll3713[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
5,0.1234
10,0.0449


TrainOutput(global_step=10, training_loss=0.08412548154592514, metrics={'train_runtime': 252.4023, 'train_samples_per_second': 0.158, 'train_steps_per_second': 0.04, 'total_flos': 0.0, 'train_loss': 0.08412548154592514})

## TRL--LoRA_GRPO
* Load model: 6.0 / 40.0 GB
* Initialize training: 23.6 / 40.0 GB


In [None]:
# training configuration (LoRA run); same hyper-parameters as the run above,
# only run_name / output_dir differ
training_args = GRPOConfig(
    # --- bookkeeping / logging ---
    run_name="Llama-3.2-1B-Instruct-GRPO-LoRA",
    output_dir="Llama-3.2-1B-Instruct-GRPO-LoRA",
    report_to="wandb",
    logging_steps=5,
    # NOTE(review): save_steps (100) is larger than max_steps (10), so
    # presumably no intermediate checkpoint is written — confirm this is intended.
    save_steps=100,
    # --- optimisation ---
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    bf16=True,
    # --- batching / schedule ---
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    # The recorded run stopped at global_step=10, i.e. max_steps bounded the
    # run rather than num_train_epochs.
    num_train_epochs=1,
    max_steps=10,
    # --- GRPO sampling ---
    num_generations=4,  # number of generations for each query
    max_prompt_length=256,
    # NOTE(review): 786 may be a transposition of 768 — confirm.
    max_completion_length=786,
)


# Start the LoRA run from fresh base weights.
#
# The `model` object defined in the loading cell is the same object the
# full fine-tuning section trains in place. Reusing it here would attach the
# LoRA adapters to already-updated weights, so this run would not be a clean
# LoRA-from-base experiment and its loss/memory figures would not be comparable.
import gc

import torch

# Drop references to the previous trainer/model (if those cells were run) and
# release cached GPU memory before loading a second copy of the weights.
trainer = None
model = None
gc.collect()
torch.cuda.empty_cache()

# Same loading call as the model-loading cell.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="cuda"
)

trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=reward_len,
    args=training_args,
    train_dataset=dataset,
    peft_config=peft_config # activate peft
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Currently logged in as: [33mll3713[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
5,0.3793
10,0.4149


TrainOutput(global_step=10, training_loss=0.39709086418151857, metrics={'train_runtime': 170.2145, 'train_samples_per_second': 0.235, 'train_steps_per_second': 0.059, 'total_flos': 0.0, 'train_loss': 0.39709086418151857})

## verl--PPO
* Model weights: approximately 0.5B params × 4 bytes × 3 = 6 GB
* Optimizer states: approximately 0.5B params × 4 bytes × 2 × 2 = 8 GB
* Gradient buffers: approximately 4 GB
* FSDP & engine buffers: approximately 5–7 GB
* RL caches (log-probs, values, advantages): approximately 3–4 GB

In [None]:
# Clone verl only when no checkout exists yet, so re-running this cell does not
# fail with "destination path 'verl' already exists".
! [ -d verl/.git ] || git clone https://github.com/volcengine/verl.git

Cloning into 'verl'...
remote: Enumerating objects: 10432, done.[K
remote: Counting objects: 100% (447/447), done.[K
remote: Compressing objects: 100% (333/333), done.[K
remote: Total 10432 (delta 326), reused 114 (delta 114), pack-reused 9985 (from 3)[K
Receiving objects: 100% (10432/10432), 5.41 MiB | 11.53 MiB/s, done.
Resolving deltas: 100% (7133/7133), done.


In [None]:
# %pip (instead of `! pip`) installs into the environment of the running kernel.
%pip install verl fastapi uvicorn flash-attn vllm

In [None]:
# %pip (instead of `! pip`) upgrades the packages in the running kernel's environment.
%pip install -U datasets fsspec huggingface_hub

In [None]:
import os

# Enter the cloned repository only if we are not already at its root.
# A bare `%cd verl/` is relative, so re-running the cell would try to descend
# one level further (presumably into the repo's own `verl/` package directory)
# and break every relative path used afterwards.
if not os.path.isfile("examples/data_preprocess/gsm8k.py"):
    %cd verl/

# Convert GSM8K to parquet files under ~/data/gsm8k (read by the training cells).
! python examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k

/content/verl
Creating parquet from Arrow format: 100% 8/8 [00:00<00:00, 166.40ba/s]
Creating parquet from Arrow format: 100% 2/2 [00:00<00:00, 269.08ba/s]


In [None]:
# test model loading
# Smoke test in a separate Python process: build a transformers
# text-generation pipeline for Qwen/Qwen2.5-0.5B-Instruct, the same checkpoint
# id the verl training commands in this notebook pass as the model path.
! python -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2.5-0.5B-Instruct')"

2025-05-25 21:27:43.742180: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-25 21:27:43.758707: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748208463.779643    3443 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748208463.785977    3443 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-25 21:27:43.806547: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [None]:
# verl PPO run on the preprocessed GSM8K parquet files.
# Overrides are grouped as: data -> actor/rollout/ref -> critic -> algorithm -> trainer.
# The actor/rollout/ref model and the critic are both given Qwen/Qwen2.5-0.5B-Instruct.
# stdout and stderr are merged and mirrored to verl_demo.log via tee.
! PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
    data.train_files=$HOME/data/gsm8k/train.parquet \
    data.val_files=$HOME/data/gsm8k/test.parquet \
    data.train_batch_size=256 \
    data.max_prompt_length=512 \
    data.max_response_length=256 \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
    critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \
    critic.optim.lr=1e-5 \
    critic.ppo_micro_batch_size_per_gpu=4 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.nnodes=1 \
    trainer.n_gpus_per_node=1 \
    trainer.logger=['console'] \
    trainer.val_before_train=False \
    trainer.default_hdfs_dir=null \
    trainer.save_freq=10 \
    trainer.test_freq=10 \
    trainer.total_epochs=1 2>&1 | tee verl_demo.log

### verl--GRPO (RAM usage: 14-16GB)
* Model weights: approximately 3 GB -- using bfloat16 this time (2 bytes per parameter)
* Optimizer states: approximately 8 GB
* Gradient buffers: approximately 2 GB
* FSDP & engine buffers: approximately 4 GB
* RL caches (log-probs, values, advantages): approximately 0.5 GB


#### parameter explanation
* Select the GRPO algorithm: algorithm.adv_estimator=grpo
* Actor-level KL penalty: actor_rollout_ref.actor.use_kl_loss=True
  (**Reparameterization needs extra calculation (low-variance trick, `kl_loss_type=low_var_kl`)**)
* No tensor parallelism: actor_rollout_ref.rollout.tensor_model_parallel_size=1
* GRPO trajectory group size: actor_rollout_ref.rollout.n=5
  

#### Custom reward function:
 custom_reward_function.path="/content/verl/reward_len.py" \

In [None]:
# customize reward function
# Source text of the reward module, assembled line by line. It defines
# compute_score(), which returns -abs(20 - len(solution_str)).
reward_code = (
    "\n"
    "def compute_score(data_source, solution_str, ground_truth, extra_info=None):\n"
    "    # reward generation length that is close to 20\n"
    "    return -abs(20 - len(solution_str))\n"
)

# Write the module into the current working directory.
with open("reward_len.py", "w") as reward_file:
    reward_file.write(reward_code)

In [None]:
import os

# Resolve the reward module's location against the current working directory
# and show it, so the absolute path can be checked.
reward_path = os.path.abspath("reward_len.py")
print(f"Absolute reward path: {reward_path}")

Absolute reward path: /content/verl/reward_len.py


In [None]:
# verl GRPO run (algorithm.adv_estimator=grpo) with the custom length reward.
#
# * custom_reward_function.path uses IPython's {variable} expansion to pass the
#   `reward_path` computed in the previous cell, instead of a hardcoded
#   /content/... path; run that cell first.
# * PYTHONUNBUFFERED=1 matches the PPO cell so output streams through tee live.
# * The log goes to verl_grpo_demo.log so this run does not overwrite the PPO
#   run's verl_demo.log.
! PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files=$HOME/data/gsm8k/train.parquet \
    data.val_files=$HOME/data/gsm8k/test.parquet \
    data.train_batch_size=256 \
    data.max_prompt_length=512 \
    data.max_response_length=256 \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    custom_reward_function.path="{reward_path}" \
    trainer.critic_warmup=0 \
    trainer.logger=['console'] \
    trainer.val_before_train=False \
    trainer.n_gpus_per_node=1 \
    trainer.nnodes=1 \
    trainer.save_freq=10 \
    trainer.test_freq=10 \
    trainer.total_epochs=1 2>&1 | tee verl_grpo_demo.log

2025-05-25 21:32:10,654	INFO worker.py:1879 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
[36m(TaskRunner pid=5907)[0m {'actor_rollout_ref': {'actor': {'checkpoint': {'contents': ['model',
[36m(TaskRunner pid=5907)[0m                                                              'optimizer',
[36m(TaskRunner pid=5907)[0m                                                              'extra']},
[36m(TaskRunner pid=5907)[0m                                  'clip_ratio': 0.2,
[36m(TaskRunner pid=5907)[0m                                  'clip_ratio_c': 3.0,
[36m(TaskRunner pid=5907)[0m                                  'clip_ratio_high': 0.2,
[36m(TaskRunner pid=5907)[0m                                  'clip_ratio_low': 0.2,
[36m(TaskRunner pid=5907)[0m                                  'entropy_coeff': 0,
[36m(TaskRunner pid=5907)[0m                                  'fsdp_config': {'fsdp_size': -1,
[36m(TaskRunner pid=5907)