In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to local .py files are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
%pip install transformers trl wandb

Collecting trl
  Downloading trl-0.9.4-py3-none-any.whl.metadata (11 kB)
Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
     ---------------------------------------- 0.0/43.8 kB ? eta -:--:--
     ---------------------------------------- 43.8/43.8 kB 1.1 MB/s eta 0:00:00
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.4-py3-none-any.whl.metadata (7.9 kB)
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.3-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-none-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp310-none-win_amd64.whl.metadata (3.9 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl)
  Downloading shtab-1.7.1-py3-none-any.whl.metadata (7.3 kB)
Downloading trl-0.9.4-py3-none-any.whl (226 kB)
   ---------------------------------------- 0


[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import torch
from tqdm import tqdm
import pandas as pd

tqdm.pandas()

from transformers import pipeline, AutoTokenizer
from datasets import load_dataset

from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler




## Configuration

In [2]:
# PPO settings: the checkpoint to optimise, the optimiser step size, and the
# experiment tracker that training statistics are reported to.
ppo_settings = dict(
    model_name="lvwerra/gpt2-imdb",
    learning_rate=1.41e-5,
    log_with="wandb",
)
config = PPOConfig(**ppo_settings)

# Keyword arguments forwarded to every `sentiment_pipe(...)` call: return a
# score for each label, skip the output activation (scores stay unnormalised),
# and classify texts in batches of 16.
sent_kwargs = dict(
    return_all_scores=True,
    function_to_apply="none",
    batch_size=16,
)

In [3]:
import wandb

# Start a Weights & Biases run up front; `config.log_with` is set to "wandb",
# so training statistics are expected to be reported there.
# NOTE(review): presumably the trainer reuses this run rather than creating a second one - confirm.
wandb.init()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmiguel_kjh[0m ([33msiani-ai[0m). Use [1m`wandb login --relogin`[0m to force relogin


## Datasets

In [8]:
def build_dataset(config, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8):
    """
    Build the dataset of short query prompts used for training.

    The `train` split of `dataset_name` is loaded with `load_dataset`, its `text`
    column is renamed to `review`, and reviews of 200 characters or fewer are
    dropped. Each remaining review is tokenized and cut down to a randomly
    sampled number of leading tokens, which becomes the query. Customize this
    function to train the model on another dataset.

    Args:
        config:
            Object whose `model_name` attribute names the checkpoint the tokenizer
            is loaded from.
        dataset_name (`str`):
            The name of the dataset to be loaded.
        input_min_text_length (`int`):
            First argument given to `LengthSampler` for the query length in tokens.
        input_max_text_length (`int`):
            Second argument given to `LengthSampler` for the query length in tokens.

    Returns:
        dataset:
            The filtered dataset in `torch` format, extended with an `input_ids`
            column (the query token ids) and a `query` column (the decoded query
            text). Note that this is the dataset itself, not a dataloader.
    """
    query_tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    # The tokenizer gets its EOS token assigned as the padding token.
    query_tokenizer.pad_token = query_tokenizer.eos_token

    sample_query_length = LengthSampler(input_min_text_length, input_max_text_length)

    def is_long_review(example):
        # Keep only reviews with more than 200 characters.
        return len(example["review"]) > 200

    def add_query(example):
        # Tokenize the whole review, then keep a randomly sized prefix as the query.
        token_ids = query_tokenizer.encode(example["review"])
        example["input_ids"] = token_ids[: sample_query_length()]
        example["query"] = query_tokenizer.decode(example["input_ids"])
        return example

    reviews = load_dataset(dataset_name, split="train").rename_columns({"text": "review"})
    queries = reviews.filter(is_long_review, batched=False).map(add_query, batched=False)
    queries.set_format(type="torch")
    return queries

In [9]:
# Build the query dataset with the default dataset name and query-length bounds.
dataset = build_dataset(config)


def collator(data):
    """Regroup a list of per-sample dicts into one dict of per-key lists.

    The keys are taken from the first sample; every sample must provide them.
    Values are collected as plain Python lists in sample order (no stacking or
    padding), so entries of different lengths are fine.
    """
    first_sample = data[0]
    return {field: [sample[field] for sample in data] for field in first_sample}

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/24895 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1168 > 1024). Running this sequence through the model will result in indexing errors


## Load pre-trained GPT2 language models (STEP 1)


In [10]:
# The trainable policy and the reference model start from the same checkpoint;
# both are loaded with a value head. The tokenizer comes from that checkpoint
# too and uses its EOS token for padding.
policy_checkpoint = config.model_name

model = AutoModelForCausalLMWithValueHead.from_pretrained(policy_checkpoint)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(policy_checkpoint)

tokenizer = AutoTokenizer.from_pretrained(policy_checkpoint)
tokenizer.pad_token = tokenizer.eos_token

pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  return self.fget.__get__(instance, owner)()


## Initialize PPOTrainer

In [16]:
# Wire the policy, the reference model, the tokenizer and the query dataset
# into the PPO trainer; `collator` controls how samples are grouped into batches.
ppo_trainer = PPOTrainer(
    config=config,
    model=model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016666666666666666, max=1.0…

## Load DistilBERT sentiment classifier (STEP 2)

In [12]:
# Choose the device for the reward classifier. With several processes the
# accelerator's device is used as-is; in the single-process case an explicit
# GPU index (or "cpu") is passed instead, to avoid a `pipeline` bug.
single_process = ppo_trainer.accelerator.num_processes == 1
if not single_process:
    device = ppo_trainer.accelerator.device
elif torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [13]:
# Sanity check on a clearly negative review: the NEGATIVE score should come out
# higher than the POSITIVE one. Scores are unnormalised and can be negative.
text = "this movie was really bad!!"
sentiment_pipe(text, **sent_kwargs)



[[{'label': 'NEGATIVE', 'score': 2.3350484371185303},
  {'label': 'POSITIVE', 'score': -2.726576328277588}]]

In [14]:
# Sanity check on a clearly positive review: the POSITIVE score should come out
# higher than the NEGATIVE one. The POSITIVE score is what training uses as reward.
text = "this movie was really good!!"
sentiment_pipe(text, **sent_kwargs)



[[{'label': 'NEGATIVE', 'score': -2.294790267944336},
  {'label': 'POSITIVE', 'score': 2.557040214538574}]]

## RL optimization loop (STEP 3)

In [17]:
# Bounds handed to the sampler that picks how many new tokens each response gets.
output_min_length, output_max_length = 4, 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)

# Sampling settings shared by every generation call during training.
# NOTE(review): top_k=0.0 and top_p=1.0 presumably turn off top-k / nucleus
# filtering so tokens are sampled from the full distribution - confirm.
generation_kwargs = dict(
    min_length=-1,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,  # pad with the EOS token id
)

In [19]:
# One PPO update per batch of queries: generate a response for each query,
# score query + response with the sentiment classifier, then take a PPO step.
for batch in tqdm(ppo_trainer.dataloader):
    query_tensors = batch["input_ids"]

    #### Get response from gpt2
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        # Merge the sampled length into a per-call copy so the shared
        # `generation_kwargs` dict is not modified as a side effect of the loop.
        response = ppo_trainer.generate(query, **{**generation_kwargs, "max_new_tokens": gen_len})
        # Strip the prompt by its own length. Taking the last `gen_len` tokens
        # would pull prompt tokens into the response whenever generation stops
        # early at EOS and fewer than `gen_len` new tokens exist.
        # Assumes `query` is a 1-D tensor of token ids and that the prompt is
        # returned in front of the generated tokens.
        response_tensors.append(response.squeeze()[len(query):])
    batch["response"] = [tokenizer.decode(r) for r in response_tensors]

    #### Compute sentiment score
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    # The reward is the POSITIVE score, looked up by label so it does not
    # depend on the order in which the pipeline lists the labels.
    rewards = [
        torch.tensor(next(item["score"] for item in output if item["label"] == "POSITIVE"))
        for output in pipe_outputs
    ]

    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

  2%|▏         | 3/194 [00:35<39:44, 12.49s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 194/194 [49:41<00:00, 15.37s/it]


## Model inspection

In [22]:
# Sampling settings for the before/after comparison; the number of new tokens
# is supplied separately on each generate call.
gen_kwargs = dict(
    min_length=-1,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)

In [23]:
#### get a batch from the dataset
bs = 16
game_data = dict()
# `with_format` returns a pandas-formatted dataset object and leaves `dataset`
# itself in its existing format. Calling `set_format("pandas")` here would
# permanently reformat the dataset the PPO dataloader reads from, so the
# training cell could no longer be re-run afterwards.
df_batch = dataset.with_format("pandas")[:].sample(bs)
game_data["query"] = df_batch["query"].tolist()
query_tensors = df_batch["input_ids"].tolist()

response_tensors_ref, response_tensors = [], []

#### get response from gpt2 and gpt2_ref
for i in range(bs):
    gen_len = output_length_sampler()
    prompt = torch.tensor(query_tensors[i]).unsqueeze(dim=0).to(device)
    prompt_len = prompt.shape[1]
    # Same prompt and same length budget for both models; the reference model
    # generates first, then the trained model.
    for policy, responses in ((ref_model, response_tensors_ref), (model, response_tensors)):
        generated = policy.generate(prompt, max_new_tokens=gen_len, **gen_kwargs)
        # Strip the prompt by its own length: slicing the last `gen_len` tokens
        # would include prompt tokens whenever generation stops early at EOS.
        responses.append(generated.squeeze(0)[prompt_len:])

#### decode responses
game_data["response (before)"] = [tokenizer.decode(r) for r in response_tensors_ref]
game_data["response (after)"] = [tokenizer.decode(r) for r in response_tensors]

#### sentiment analysis of query/response pairs before/after
# The reward is the POSITIVE score, looked up by label so it does not depend on
# the order in which the pipeline lists the labels.
texts = [q + r for q, r in zip(game_data["query"], game_data["response (before)"])]
game_data["rewards (before)"] = [
    next(item["score"] for item in output if item["label"] == "POSITIVE")
    for output in sentiment_pipe(texts, **sent_kwargs)
]

texts = [q + r for q, r in zip(game_data["query"], game_data["response (after)"])]
game_data["rewards (after)"] = [
    next(item["score"] for item in output if item["label"] == "POSITIVE")
    for output in sentiment_pipe(texts, **sent_kwargs)
]

# store results in a dataframe
df_results = pd.DataFrame(game_data)
df_results



Unnamed: 0,query,response (before),response (after),rewards (before),rewards (after)
0,Now this is,"how ""brother"" and ""son"" react when the","a tease, but funny still. I loved it especially",1.350929,2.618836
1,I don't think I could,do justice to this selection like I am used t...,"rate this movie very well. It's a fun, atmosp...",-1.169286,2.673902
2,"Franco proves, once","again, that no amount of","again, if he can pull",-0.333157,1.091633
3,My roommate had bought,some splatterbox stuff and he took my pack,"this project, and it was such a great family",-0.261592,2.383694
4,As a,"game played in this way, it",'Vanita' the film directly,1.29823,0.079692
5,Rodney Dangerfield,seems to want to please the viewers with his ...,then directed a quite brilliantly hilarious s...,-0.122941,2.784786
6,This movie was,"my ""best"" film at the festival. My question",beautiful and inspiring. This unique cinemati...,2.1681,2.891832
7,This movie is wonderful,". The story, the",", winning a national Oscar",2.792063,2.818151
8,Twelve,episodes per series. I can't imagine why this...,is a wonderfully well-crafted film that is a ...,-0.754983,2.941937
9,Even the first 10 minutes of,"the movie should not be a bad thing, more so ...",this movie and I will definitely recommend it...,-0.02376,2.683607


In [24]:
# Compare the reward columns before and after PPO training, first by mean and
# then by median.
reward_columns = ["rewards (before)", "rewards (after)"]
reward_frame = df_results[reward_columns]

print("mean:")
display(reward_frame.mean())
print()
print("median:")
display(reward_frame.median())

mean:


rewards (before)    0.386966
rewards (after)     2.279158
dtype: float64


median:


rewards (before)   -0.073350
rewards (after)     2.678755
dtype: float64

## Save the trained model

In [25]:
# Save the trained model and its tokenizer under one directory / repository
# name and push both to the Hub. The tokenizer call stays last so its list of
# written files remains the cell output.
hub_repo_name = "gpt2-imdb-rlhf"
model.save_pretrained(hub_repo_name, push_to_hub=True)
tokenizer.save_pretrained(hub_repo_name, push_to_hub=True)

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

('gpt2-imdb-rlhf\\tokenizer_config.json',
 'gpt2-imdb-rlhf\\special_tokens_map.json',
 'gpt2-imdb-rlhf\\vocab.json',
 'gpt2-imdb-rlhf\\merges.txt',
 'gpt2-imdb-rlhf\\added_tokens.json',
 'gpt2-imdb-rlhf\\tokenizer.json')