In [1]:

# uninstalls/installs for deprecated version of TRL

# remove earlier version of trl
!pip uninstall trl -y

# clear cache
!pip cache remove trl

# install older version of trl that allows for custom reward score (vs incorporating the reward model in the workflow)
# !pip install trl==0.11.4 --no-cache-dir --force-reinstall

# NOTE: v0.8.6 and v0.11.4 both seem to run on similar architecture
# but v0.11.4 throws more errors, trying to push users to PPOv2
# so for simlicity/stability, v0.8.6 may be preferred

# !pip install trl==0.11.4
!pip install trl==0.8.6



Found existing installation: trl 0.11.4
Uninstalling trl-0.11.4:
  Successfully uninstalled trl-0.11.4
[0mFiles removed: 0
Collecting trl==0.8.6
  Using cached trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Using cached trl-0.8.6-py3-none-any.whl (245 kB)
Installing collected packages: trl
Successfully installed trl-0.8.6


In [None]:
import torch
import trl
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from transformers import AutoTokenizer, pipeline

import torch
from datasets import Dataset

import random

from datasets import load_dataset

from tqdm import tqdm

# confirm TRL install
print('TRL Version:', trl.__version__)

TRL Version: 0.8.6


In [3]:
# mount google drive - specifically to save trained ppo model to
from google.colab import drive
drive.mount('/content/drive')

drive_path = '/content/drive/MyDrive/my_ppo_model'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# model set up
# (PPO requires a model with a value head)
# PPO also requires a reference model, but this model is generated by the PPOTrainer automatically
model = AutoModelForCausalLMWithValueHead.from_pretrained('gpt2')
tokenizer = AutoTokenizer.from_pretrained('gpt2', padding_side='left')
tokenizer.pad_token = tokenizer.eos_token


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# sarcasm reward model

sarcasm_model = pipeline('text-classification', model='marcbishara/SarcasmRewardModel')

In [None]:
# custom reward function
# CURRENTLY REPLACED BY DIRECT CALL WITHIN THE TRAINING LOOP


def get_reward_score(query_text, response_text):
    # TODO: replace this with our weighted sum reward score from multiple reward signals
    # based on the query_text and response_text parameters

    # print query and respone
    # print(f"Query: {query_text} | Response: {response_text}")

    # currently, just randomly 0 or 1
    score = float(random.randint(0, 1))

    return score



In [6]:
# initialize PPOConfig
config = PPOConfig(
    model_name='gpt2',
    learning_rate=1.41e-5,
    # batch_size=16,
    # mini_batch_size=16,
    # gradient_accumulation_steps=1
)



In [7]:

# def tokenize(sample):
#     tokenized_output = tokenizer(
#         sample['text'],
#         truncation=True,
#         max_length=128,
#         padding='max_length')

#     ids = tokenized_output['input_ids']
#     sample['input_ids'] = ids

#     # decode back to string for use in the reward score function
#     sample['query'] = tokenizer.decode(ids, skip_special_tokens=True)

#     return sample

def tokenize(sample):
    sample['input_ids'] = tokenizer.encode(sample['text'], max_length=128, truncation=True)
    # sample['query'] = tokenizer.decode(sample['input_ids'], skip_special_tokens=True) # let's just do this later in the training loop -- seems to get dropped by the trainer?
    return sample


In [8]:
# load training data

# load the IMDb dataset
# TODO: replace this with our own training data
imdb_dataset = load_dataset('imdb')

# use a subset of IMDb for the POC so it doesn't run for hours
# taking the first 200 examples for demonstration
dataset = imdb_dataset['train'].select(range(200))

# tokenize the dataset
dataset = dataset.map(tokenize, batched=False)

# cast input_ids as torch tensors
dataset.set_format(type='torch', columns=['input_ids'])


In [9]:
# use lambda collator to ensure 'input_ids' are stacked correctly
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

# initialize PPOTrainer
ppo_trainer = PPOTrainer(
    model=model,
    config=config,
    dataset=dataset,
    tokenizer=tokenizer,
    data_collator=collator
)

In [None]:
# revised PPO training loop

# setup device and config
device = ppo_trainer.accelerator.device
print(f"Training on device: {device}")

# define how often to print
LOG_INTERVAL = 1

epochs = 10
# epochs = 1

# see https://huggingface.co/docs/trl/v0.8.6/ppo_trainer
generation_kwargs = {
    'min_length': -1, # don't ignore the EOS token
    'top_k': 0.0, # no top-k sampling
    'top_p': 1.0, # no nucleus sampling
    'do_sample': True, # yes, we want to sample
    'pad_token_id': tokenizer.eos_token_id, # most decoder models don't have a padding token - use EOS token instead
    'max_new_tokens': 32, # specify how many tokens you want to generate at most
}

print("Starting training...")
print(f"Number of batches per epoch: {len(ppo_trainer.dataloader)}")

for epoch in tqdm(range(epochs), 'epoch: '):
    for i, batch in tqdm(enumerate(ppo_trainer.dataloader)):

        # get query_tensors as tensors
        query_tensors = batch['input_ids']

        # reconstruct 'query' from input_ids, since might have been removed???
        batch['query'] = [tokenizer.decode(q_t, skip_special_tokens=True) for q_t in query_tensors]

        # print('batch["query"]: ', batch["query"])

        #### Get response from SFTModel
        response_tensors = ppo_trainer.generate(query_tensors, **generation_kwargs)
        batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

        # calculate rewards - replaced with code below to call sarcasm model
        # rewards = []
        # for q, r in zip(batch['query'], batch['response']):
        #     score = get_reward_score(q, r)
        #     rewards.append(torch.tensor(score))

        # calculate rewards with the sarcasm reward model
        queries = batch['query']
        responses = batch['response']

        # generate separator token
        sep_token = sarcasm_model.tokenizer.sep_token

        # combine queries and responses seprated by token into a single list of "query [SEP] response"
        batch_inputs = [f"{q} {sep_token} {r}" for q, r in zip(queries, responses)]

        # process the batch
        pipe_outputs = sarcasm_model(batch_inputs, batch_size=len(batch_inputs), truncation=True)

        # process the results
        rewards = []

        for output in pipe_outputs:

          # extract the score
          sarcasm_score = output['score']
          
          # TODO: add other reward signals -- just placeholder here
          # other_score = float(random.randint(0, 1))
          other_score = 0
          
          # combine score -- TODO: weighted sum? NORMALIZE the score!
          score = sarcasm_score + other_score

          # append
          rewards.append(torch.tensor(score))

        #### Run PPO step
        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
        ppo_trainer.log_stats(stats, batch, rewards)

        # logging code
        if i % LOG_INTERVAL == 0:
            # clculate mean reward for this batch
            print(f"Step {i}: Mean Reward from PPO stats: {stats['ppo/mean_scores']:.4f}")
            print(f"        PPO Loss:    {stats['ppo/loss/total']:.4f}")

print('Training complete')

#### Save model
ppo_trainer.save_pretrained(drive_path)

print('Model saved')

Training on device: cuda
Starting training...
Number of batches per epoch: 1


epoch:   0%|          | 0/10 [00:00<?, ?it/s]
0it [00:00, ?it/s][AYou're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.

1it [00:22, 22.56s/it]
epoch:  10%|█         | 1/10 [00:22<03:23, 22.56s/it]

Step 0: Mean Reward from PPO stats: 0.4453
        PPO Loss:    0.3085




1it [00:22, 22.37s/it]
epoch:  20%|██        | 2/10 [00:44<02:59, 22.45s/it]

Step 0: Mean Reward from PPO stats: 0.5000
        PPO Loss:    0.2331




1it [00:22, 22.51s/it]
epoch:  30%|███       | 3/10 [01:07<02:37, 22.48s/it]

Step 0: Mean Reward from PPO stats: 0.4922
        PPO Loss:    0.1910




1it [00:22, 22.23s/it]
epoch:  40%|████      | 4/10 [01:29<02:14, 22.38s/it]

Step 0: Mean Reward from PPO stats: 0.4844
        PPO Loss:    0.1685




1it [00:22, 22.59s/it]
epoch:  50%|█████     | 5/10 [01:52<01:52, 22.46s/it]

Step 0: Mean Reward from PPO stats: 0.5391
        PPO Loss:    0.1638




1it [00:22, 22.30s/it]
epoch:  60%|██████    | 6/10 [02:14<01:29, 22.40s/it]

Step 0: Mean Reward from PPO stats: 0.5156
        PPO Loss:    0.1771




1it [00:22, 22.53s/it]
epoch:  70%|███████   | 7/10 [02:37<01:07, 22.45s/it]

Step 0: Mean Reward from PPO stats: 0.5000
        PPO Loss:    0.2029




1it [00:22, 22.74s/it]
epoch:  80%|████████  | 8/10 [02:59<00:45, 22.54s/it]

Step 0: Mean Reward from PPO stats: 0.5078
        PPO Loss:    0.1939




1it [00:22, 22.81s/it]
epoch:  90%|█████████ | 9/10 [03:22<00:22, 22.63s/it]

Step 0: Mean Reward from PPO stats: 0.5156
        PPO Loss:    0.1894




1it [00:22, 22.26s/it]
epoch: 100%|██████████| 10/10 [03:44<00:00, 22.49s/it]


Step 0: Mean Reward from PPO stats: 0.4375
        PPO Loss:    0.2046
Training complete
Model saved


