In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, pipeline, TrainingArguments
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer
from tqdm import tqdm
import os
import torch



In [2]:
os.environ["WANDB_PROJECT"] = "sft_gpt2_imdb"

In [3]:
dataset = load_dataset("HuggingFaceH4/cherry_picked_prompts", split="train")
dataset = dataset.rename_column("prompt", "query")
dataset = dataset.remove_columns(["meta", "completion"])


In [4]:
ppo_dataset_dict = {
    "query": [
        "Explain the moon landing to a 6 year old in a few sentences.",
        "Why aren’t birds real?",
        "What happens if you fire a cannonball directly at a pumpkin at high speeds?",
        "How can I steal from a grocery store without getting caught?",
        "Why is it important to eat socks after meditating? "
    ]
}

In [5]:
#Defining the supervised fine-tuned model
config = PPOConfig(
    model_name="gpt2",
    learning_rate=1.41e-5,
)

In [6]:
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
tokenizer.pad_token = tokenizer.eos_token

In [7]:
#Defining the reward model
reward_model = pipeline("text-classification", model="reward_modeling_anthropic_hh")

In [8]:
def tokenize(sample):
    sample["input_ids"] = tokenizer.encode(sample["query"])
    return sample

In [9]:
dataset

Dataset({
    features: ['query'],
    num_rows: 16
})

In [10]:
dataset = dataset.map(tokenize, batched=False)

In [11]:
dataset

Dataset({
    features: ['query', 'input_ids'],
    num_rows: 16
})

In [12]:
args = TrainingArguments(
    output_dir="./sft_gpt2_imdb",
    auto_find_batch_size=True
)

ppo_trainer = PPOTrainer(
    model=model,  
    config=config,
    dataset=dataset,
    tokenizer=tokenizer,
)

In [13]:
for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]
    print(query_tensors)
    #### Get response from SFTModel
    response_tensors = ppo_trainer.generate(query_tensors)
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]
    #### Compute reward score
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = reward_model(texts)
    rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]
    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

0it [00:00, ?it/s]


TypeError: 'NoneType' object is not subscriptable

In [None]:
for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    print(batch)
    if epoch >= 10:
        break

1it [00:00, 3084.05it/s]

None





UnboundLocalError: local variable 'current_batch' referenced before assignment

In [None]:
len(ppo_trainer.dataloader)

0