In [1]:
!pip install git+https://github.com/huggingface/trl.git

Collecting git+https://github.com/huggingface/trl.git
  Cloning https://github.com/huggingface/trl.git to /tmp/pip-req-build-eyfrau_m
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/trl.git /tmp/pip-req-build-eyfrau_m
  Resolved https://github.com/huggingface/trl.git to commit 1b1242cc6522feb4eb063feb20097a79b11b127a
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: trl
  Building wheel for trl (pyproject.toml) ... [?25l[?25hdone
  Created wheel for trl: filename=trl-0.26.0.dev0-py3-none-any.whl size=491839 sha256=d2f9493bf865ef279652e824d2197d5d3ff7da8799a037131edf916783cd69d9
  Stored in directory: /tmp/pip-ephem-wheel-cache-0f3mtfxw/wheels/0e/8f/95/dfd1c9271445f7e7e2fcfd9dfdcc8fabf9adc68edd4f2ea5fd
Successfully built trl
Installing collected packages: trl
Successfully installed trl-0.2

In [2]:
!pip install -q -U trl accelerate bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# This notebook has been adapted from the PPO example found here:
# https://github.com/huggingface/trl/blob/1b1242cc6522feb4eb063feb20097a79b11b127a/examples/scripts/ppo/ppo_tldr.py

# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# /// script
# dependencies = [
#     "trl",
#     "peft",
#     "trackio",
#     "kernels",
# ]
# ///

import os
import shutil

import torch
from accelerate import PartialState
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    HfArgumentParser,
)

from trl import ModelConfig, ScriptArguments, get_kbit_device_map, get_peft_config, get_quantization_config
from trl.experimental.ppo import PPOConfig, PPOTrainer

  from trl.experimental.ppo import PPOConfig, PPOTrainer


In [4]:
from dataclasses import fields

# Sanity check of the installed TRL version: for each config dataclass, list
# the names of its fields and confirm that one option this notebook relies on
# is among them.
for config_cls, required_field in (
    (ScriptArguments, "dataset_name"),
    (PPOConfig, "eval_steps"),
    (ModelConfig, "load_in_4bit"),
):
    field_names = [field.name for field in fields(config_cls)]
    print(field_names)
    print(required_field in field_names)

['dataset_name', 'dataset_config', 'dataset_train_split', 'dataset_test_split', 'dataset_streaming', 'gradient_checkpointing_use_reentrant', 'ignore_bias_buffers']
True
['output_dir', 'overwrite_output_dir', 'do_train', 'do_eval', 'do_predict', 'eval_strategy', 'prediction_loss_only', 'per_device_train_batch_size', 'per_device_eval_batch_size', 'per_gpu_train_batch_size', 'per_gpu_eval_batch_size', 'gradient_accumulation_steps', 'eval_accumulation_steps', 'eval_delay', 'torch_empty_cache_steps', 'learning_rate', 'weight_decay', 'adam_beta1', 'adam_beta2', 'adam_epsilon', 'max_grad_norm', 'num_train_epochs', 'max_steps', 'lr_scheduler_type', 'lr_scheduler_kwargs', 'warmup_ratio', 'warmup_steps', 'log_level', 'log_level_replica', 'log_on_each_node', 'logging_dir', 'logging_strategy', 'logging_first_step', 'logging_steps', 'logging_nan_inf_filter', 'save_strategy', 'save_steps', 'save_total_limit', 'save_safetensors', 'save_on_each_node', 'save_only_model', 'restore_callback_states_from_c

In [5]:
# Enable logging in a Hugging Face Space.
# `setdefault` keeps any TRACKIO_SPACE_ID already present in the environment.
# NOTE(review): the recorded run of the training cell prompted for a wandb
# login, so an explicit reporting backend may also need to be configured for
# trackio to be used -- confirm.
os.environ.setdefault("TRACKIO_SPACE_ID", "trl-trackio")

# The upstream script reads its settings from the command line:
#   parser = HfArgumentParser((ScriptArguments, PPOConfig, ModelConfig))
#   script_args, training_args, model_args = parser.parse_args_into_dataclasses()
# A notebook has no command line, so the values are set as notebook-level
# variables here and the three dataclasses are constructed directly below.
# Trailing "upstream example: ..." comments record the value used by the
# reference invocation of the script where this notebook deviates from it.

# ModelConfig
model_name_or_path = "EleutherAI/pythia-70m-deduped"  # upstream example: "EleutherAI/pythia-1b-deduped"

# PPOConfig
# NOTE(review): the reward and SFT checkpoints below are still the 1b ones even
# though model_name_or_path and output_dir refer to the 70m model. The recorded
# run of the training cell ended in a CUDA out-of-memory error on a ~40 GiB GPU;
# presumably the size of these checkpoints contributes -- confirm.
reward_model_path = "cleanrl/EleutherAI_pythia-1b-deduped__reward__tldr"
output_dir = "/content/pythia-70m-deduped-tldr-preference-sft-trl-style-ppo"
eval_strategy = "steps"
learning_rate = 3e-6
per_device_train_batch_size = 1
gradient_accumulation_steps = 4  # upstream example: 64
total_episodes = 2  # upstream example: 30000
sft_model_path = "cleanrl/EleutherAI_pythia-1b-deduped__sft__tldr"
missing_eos_penalty = 1.0
response_length = 33  # upstream example: 53
eval_steps = 100
stop_token = "eos"

# ScriptArguments
dataset_name = "trl-internal-testing/tldr-preference-sft-trl-style"
dataset_test_split = "validation"

script_args = ScriptArguments(
    dataset_name=dataset_name,
    dataset_test_split=dataset_test_split
)

training_args = PPOConfig(
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    total_episodes=total_episodes,
    eval_strategy=eval_strategy,
    eval_steps=eval_steps,
    sft_model_path=sft_model_path,
    reward_model_path=reward_model_path,
    output_dir=output_dir,
    stop_token=stop_token,
    response_length=response_length,
    missing_eos_penalty=missing_eos_penalty,
    bf16=True
)

model_args = ModelConfig(
    model_name_or_path=model_name_or_path,
    load_in_4bit=True
)

# Reference invocations of the upstream script, kept for comparison with the
# values above. They are written as comments rather than as a bare string
# literal so that the cell does not echo them back as its output.
#
# python examples/scripts/ppo/ppo_tldr.py \
#     --dataset_name trl-internal-testing/tldr-preference-sft-trl-style \
#     --dataset_test_split validation \
#     --learning_rate 3e-6 \
#     --output_dir pythia-1b-deduped-tldr-preference-sft-trl-style-ppo \
#     --per_device_train_batch_size 1 \
#     --gradient_accumulation_steps 64 \
#     --total_episodes 30000 \
#     --model_name_or_path EleutherAI/pythia-1b-deduped \
#     --sft_model_path cleanrl/EleutherAI_pythia-1b-deduped__sft__tldr \
#     --reward_model_path cleanrl/EleutherAI_pythia-1b-deduped__reward__tldr \
#     --missing_eos_penalty 1.0 \
#     --stop_token eos \
#     --response_length 53 \
#     --eval_strategy steps \
#     --eval_steps 100
#
# accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml \
#     examples/scripts/ppo/ppo_tldr.py \
#     --dataset_name trl-internal-testing/tldr-preference-sft-trl-style \
#     --dataset_test_split validation \
#     --output_dir pythia-1b-deduped-tldr-preference-sft-trl-style-ppo \
#     --learning_rate 3e-6 \
#     --per_device_train_batch_size 16 \
#     --gradient_accumulation_steps 4 \
#     --total_episodes 1000000 \
#     --model_name_or_path EleutherAI/pythia-1b-deduped \
#     --sft_model_path cleanrl/EleutherAI_pythia-1b-deduped__sft__tldr \
#     --reward_model_path cleanrl/EleutherAI_pythia-1b-deduped__reward__tldr \
#     --local_rollout_forward_batch_size 16 \
#     --missing_eos_penalty 1.0 \
#     --stop_token eos \
#     --eval_strategy steps \
#     --eval_steps 100

'\npython examples/scripts/ppo/ppo_tldr.py     --dataset_name trl-internal-testing/tldr-preference-sft-trl-style     --dataset_test_split validation     --learning_rate 3e-6     --output_dir pythia-1b-deduped-tldr-preference-sft-trl-style-ppo     --per_device_train_batch_size 1     --gradient_accumulation_steps 64     --total_episodes 30000     --model_name_or_path EleutherAI/pythia-1b-deduped     --sft_model_path cleanrl/EleutherAI_pythia-1b-deduped__sft__tldr     --reward_model_path cleanrl/EleutherAI_pythia-1b-deduped__reward__tldr     --missing_eos_penalty 1.0     --stop_token eos     --response_length 53     --eval_strategy steps     --eval_steps 100\n\naccelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml     examples/scripts/ppo/ppo_tldr.py     --dataset_name trl-internal-testing/tldr-preference-sft-trl-style     --dataset_test_split validation     --output_dir pythia-1b-deduped-tldr-preference-sft-trl-style-ppo     --learning_rate 3e-6     --per_devic

In [6]:
def prepare_dataset(dataset, tokenizer):
    """Tokenize every prompt up front so that training only has to collate.

    For each example, only the first entry of its ``"messages"`` list is
    rendered through ``tokenizer.apply_chat_template`` (no padding, generation
    prompt appended). All columns of the input dataset are dropped, leaving
    ``input_ids`` and ``lengths`` (the number of entries in ``input_ids``).

    Args:
        dataset: Object exposing ``column_names`` and ``map``; each example
            must provide a ``"messages"`` sequence.
        tokenizer: Object exposing ``apply_chat_template``.
            NOTE(review): presumably it needs a chat template configured
            before this is called -- confirm.

    Returns:
        Whatever ``dataset.map`` returns for the tokenizing function.

    Note:
        The worker count is not a parameter; it is read from the
        notebook-level ``training_args.dataset_num_proc``.
    """

    def _encode_prompt(example):
        # Keep just the opening turn; later turns are not part of the prompt.
        first_turn = example["messages"][:1]
        prompt_ids = tokenizer.apply_chat_template(first_turn, padding=False, add_generation_prompt=True)
        return dict(input_ids=prompt_ids, lengths=len(prompt_ids))

    original_columns = dataset.column_names
    return dataset.map(_encode_prompt, remove_columns=original_columns, num_proc=training_args.dataset_num_proc)

In [7]:
# Start from a clean slate: delete any previous output directory
# (a missing directory is not an error).
shutil.rmtree(training_args.output_dir, ignore_errors=True)

################
# Model & Tokenizer
################
# "auto" / None are passed through unchanged; any other value is taken as the
# name of a torch dtype attribute and resolved on the torch module.
dtype = model_args.dtype if model_args.dtype in ["auto", None] else getattr(torch, model_args.dtype)
model_kwargs = dict(
    revision=model_args.model_revision,
    attn_implementation=model_args.attn_implementation,
    dtype=dtype,
)
quantization_config = get_quantization_config(model_args)
if quantization_config is not None:
    # Passing None would not be treated the same as omitting the argument, so we include it only when valid.
    model_kwargs["device_map"] = get_kbit_device_map()
    model_kwargs["quantization_config"] = quantization_config
# NOTE(review): `model_kwargs` is assembled above but not passed to any of the
# `from_pretrained` calls in this cell, so the dtype / quantization settings
# from `model_args` do not reach the models loaded here -- confirm whether
# that is intended.

# The tokenizer comes from `model_name_or_path`, while the model weights below
# come from the SFT / reward checkpoint paths in `training_args`.
tokenizer = AutoTokenizer.from_pretrained(
    model_args.model_name_or_path, padding_side="left", trust_remote_code=model_args.trust_remote_code
)
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
# Set a chat template for the tokenizer to enable apply_chat_template
tokenizer.chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] }}{% endif %}{% endfor %}"

# The value model and the reward model are two separate instances loaded from
# the same reward checkpoint.
value_model = AutoModelForSequenceClassification.from_pretrained(
    training_args.reward_model_path, trust_remote_code=model_args.trust_remote_code, num_labels=1
)
reward_model = AutoModelForSequenceClassification.from_pretrained(
    training_args.reward_model_path, trust_remote_code=model_args.trust_remote_code, num_labels=1
)
policy = AutoModelForCausalLM.from_pretrained(
    training_args.sft_model_path, trust_remote_code=model_args.trust_remote_code
)

# Without a PEFT config, a second copy of the SFT checkpoint is loaded as the
# reference policy; with one, no separate reference model is created here.
peft_config = get_peft_config(model_args)
if peft_config is None:
    ref_policy = AutoModelForCausalLM.from_pretrained(
        training_args.sft_model_path, trust_remote_code=model_args.trust_remote_code
    )
else:
    ref_policy = None

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/745 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.05G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.05G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [8]:
# Load the configured dataset, then select the training split and -- unless
# evaluation is switched off -- the evaluation split.
dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)
train_dataset = dataset[script_args.dataset_train_split]
if training_args.eval_strategy != "no":
    eval_dataset = dataset[script_args.dataset_test_split]
else:
    eval_dataset = None


def _prompt_fits(example):
    """Return True when the tokenized prompt has at most 512 tokens."""
    return example["lengths"] <= 512


# Compute that only on the main process for faster data processing.
# see: https://github.com/huggingface/trl/pull/1255
with PartialState().local_main_process_first():
    # Step 1: tokenize.
    train_dataset = prepare_dataset(train_dataset, tokenizer)
    if eval_dataset is not None:
        eval_dataset = prepare_dataset(eval_dataset, tokenizer)
    # Step 2: drop over-long prompts.
    train_dataset = train_dataset.filter(_prompt_fits, num_proc=training_args.dataset_num_proc)
    if eval_dataset is not None:
        eval_dataset = eval_dataset.filter(_prompt_fits, num_proc=training_args.dataset_num_proc)

# Spot-check the first training prompt: it must not already end with EOS.
first_prompt_ids = train_dataset[0]["input_ids"]
assert first_prompt_ids[-1] != tokenizer.eos_token_id, "The last token should not be an EOS token"


README.md:   0%|          | 0.00/780 [00:00<?, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/159M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/160M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/18.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/116722 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6447 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6553 [00:00<?, ? examples/s]

Map:   0%|          | 0/116722 [00:00<?, ? examples/s]

Map:   0%|          | 0/6447 [00:00<?, ? examples/s]

Filter:   0%|          | 0/116722 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6447 [00:00<?, ? examples/s]

In [9]:
################
# Training
################
# Collect everything the trainer needs in one place: the config, the tokenizer,
# the four models, both datasets and the optional PEFT config.
ppo_trainer_kwargs = dict(
    args=training_args,
    processing_class=tokenizer,
    model=policy,
    ref_model=ref_policy,
    reward_model=reward_model,
    value_model=value_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
)
trainer = PPOTrainer(**ppo_trainer_kwargs)
trainer.train()

# Write the trained model to the output directory; upload it to the Hub only
# when the training config asks for it.
trainer.save_model(training_args.output_dir)
if training_args.push_to_hub:
    trainer.push_to_hub(dataset_name=script_args.dataset_name)

trainer.generate_completions()

===training policy===


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmarc-bishara[0m ([33mbishara[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 28.88 MiB is free. Process 16723 has 39.52 GiB memory in use. Of the allocated memory 38.78 GiB is allocated by PyTorch, and 244.06 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)