Adapted from the Hugging Face TRL Quickstart

In [3]:
# imports
import torch
from transformers import GPT2Tokenizer

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer

In [4]:
# Load the policy model, a second copy used as the PPO reference model, and
# the matching tokenizer, all from the same pretrained checkpoint.
base_checkpoint = "gpt2"
model = AutoModelForCausalLMWithValueHead.from_pretrained(base_checkpoint)
model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(base_checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained(base_checkpoint)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

config.json: 100%|██████████| 665/665 [00:00<00:00, 1.82MB/s]
model.safetensors: 100%|██████████| 548M/548M [00:07<00:00, 68.8MB/s]
generation_config.json: 100%|██████████| 124/124 [00:00<00:00, 520kB/s]
pytorch_model.bin: 100%|██████████| 548M/548M [00:03<00:00, 144MB/s] 
vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 32.3MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 76.5MB/s]
tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 38.8MB/s]


In [19]:
# initialize trainer
# The training cell later in this notebook feeds exactly one (query, response)
# pair to each PPO step, so the batch holds a single example. mini_batch_size
# is pinned to 1 as well instead of being left to the PPOConfig default, so the
# batch can always be split into whole mini-batches.
# NOTE(review): the default mini_batch_size has differed between TRL releases;
# confirm against the installed version.
ppo_config = {"mini_batch_size": 1, "batch_size": 1}
config = PPOConfig(**ppo_config)
ppo_trainer = PPOTrainer(config, model, model_ref, tokenizer)



In [20]:
# Few-shot prompt: two worked Prompt/Response examples followed by the
# question the model is expected to complete.
query_txt = """Please provide the answer to the below prompt and respond with how confident you are that your response is accurate.  
For example:
Prompt:
1 + 1 = ?
Response:
I am very confident that 1 + 1 = 2
Prompt:
Who will be the next president?
Response:
I am unsure who will be the next president, but incumbents are most likely to win so I expect Joe Biden will be the next president.
Prompt:
10 + 10 = ?
Response:"""

# Tokenize the prompt into a PyTorch tensor, then move it onto the same device
# as the wrapped pretrained model.
query_tensor = tokenizer.encode(query_txt, return_tensors="pt")
query_tensor = query_tensor.to(model.pretrained_model.device)

In [23]:
# Sampling settings passed through to generation: sampling is enabled, padding
# uses the EOS token id, and at most 20 new tokens are produced.
generation_kwargs = dict(
    min_length=-1,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=20,
)

# Iterating the query tensor yields one entry per row along its first
# dimension; the trainer is handed that list and asked not to return the prompt.
queries = list(query_tensor)
response_tensor = ppo_trainer.generate(queries, return_prompt=False, **generation_kwargs)

# Decode the first generated sequence back into text.
response_txt = tokenizer.decode(response_tensor[0])

In [24]:
# Display the decoded response text as the cell output.
response_txt

'\nI am very confident that 10 + 10 = 8 = 12 =?\n\n10+6'

In [11]:
# Reward for the generated response: a one-element list holding a scalar
# tensor of 1.0 placed on the model's device.
# (Any reward source could be substituted here, e.g. human feedback or the
# output of another model.)
reward = [
    torch.tensor(1.0, device=model.pretrained_model.device),
]

In [14]:
# Run a single PPO optimisation step on one (query, response, reward) triple
# and keep the statistics it returns.
train_stats = ppo_trainer.step(
    queries=[query_tensor[0]],
    responses=[response_tensor[0]],
    scores=reward,
)

In [15]:
# Display the statistics returned by the PPO step as the cell output.
train_stats

{'objective/kl': 0.0,
 'objective/kl_dist': 0.0,
 'objective/logprobs': array([[-8.146128  , -2.2490625 , -2.660843  , -0.63041306, -1.6609842 ,
         -9.227812  , -3.3473172 , -5.8727584 , -3.7109416 , -0.02053279,
         -6.558813  , -1.7685627 , -6.5495687 , -2.253131  , -3.993826  ,
         -1.3066202 , -1.3901442 , -0.4830762 , -0.46233207, -5.5269938 ,
         -1.3268279 , -2.02606   , -7.9535437 , -0.25260922, -3.7766173 ,
         -5.2870355 ]], dtype=float32),
 'objective/ref_logprobs': array([[-8.146128  , -2.2490625 , -2.660843  , -0.63041306, -1.6609842 ,
         -9.227812  , -3.3473172 , -5.8727584 , -3.7109416 , -0.02053279,
         -6.558813  , -1.7685627 , -6.5495687 , -2.253131  , -3.993826  ,
         -1.3066202 , -1.3901442 , -0.4830762 , -0.46233207, -5.5269938 ,
         -1.3268279 , -2.02606   , -7.9535437 , -0.25260922, -3.7766173 ,
         -5.2870355 ]], dtype=float32),
 'objective/kl_coef': 0.2,
 'objective/entropy': 63.867313385009766,
 'ppo/mean_non