# PPO Trainer for the Generally Sarcastic Transformer

## Packages

In [1]:

# uninstalls/installs for deprecated version of TRL

# remove earlier version of trl
!pip uninstall trl -y

# clear cache
!pip cache remove trl

# install older version of trl that allows for custom reward score (vs incorporating the reward model in the workflow)
# !pip install trl==0.11.4 --no-cache-dir --force-reinstall

# NOTE: v0.8.6 and v0.11.4 both seem to run on similar architecture
# but v0.11.4 throws more errors, trying to push users to PPOv2
# so for simlicity/stability, v0.8.6 may be preferred

!pip install trl==0.11.4
# !pip install trl==0.8.6



[0mFiles removed: 0
Collecting trl==0.11.4
  Downloading trl-0.11.4-py3-none-any.whl.metadata (12 kB)
Collecting tyro>=0.5.11 (from trl==0.11.4)
  Downloading tyro-0.9.35-py3-none-any.whl.metadata (12 kB)
Collecting shtab>=1.5.6 (from tyro>=0.5.11->trl==0.11.4)
  Downloading shtab-1.8.0-py3-none-any.whl.metadata (7.3 kB)
Downloading trl-0.11.4-py3-none-any.whl (316 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.6/316.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tyro-0.9.35-py3-none-any.whl (132 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.6/132.6 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading shtab-1.8.0-py3-none-any.whl (14 kB)
Installing collected packages: shtab, tyro, trl
Successfully installed shtab-1.8.0 trl-0.11.4 tyro-0.9.35


In [2]:
import torch
import trl
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from transformers import AutoTokenizer, pipeline, Pipeline, AutoModelForSequenceClassification

import torch
from datasets import Dataset

import random

from datasets import load_dataset

from tqdm import tqdm
import gc

In [3]:
# confirm TRL install
print('TRL Version:', trl.__version__)
assert trl.__version__ in ('0.11.4','0.8.6')

TRL Version: 0.11.4


## Config

### Logins

In [None]:
# mount google drive - specifically to save trained ppo model to
from google.colab import drive
drive.mount('/content/drive')

drive_path = '/content/drive/MyDrive/my_ppo_model'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Hugging face login
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# wandb configuration
import wandb
wandb.init()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmarc-bishara[0m ([33mbishara[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# initialize PPOConfig
config = PPOConfig(
    model_name='Zoe3324/gpt2-sft-10000data',
    learning_rate=1.41e-5,
    log_with="wandb",
    reward_model = 'marcbishara/SarcasmRewardModel',
    # batch_size=16,
    # mini_batch_size=16,
    # gradient_accumulation_steps=1
)





### Dataset

In [6]:
def build_sarcasm_dataset(
    config,
    dataset_name="marcbishara/sarcasm-on-reddit",
    split_name="ppo_train",
    min_text_length=10,
    num_of_rows=None
):

    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    ds = load_dataset(dataset_name, split=split_name)

    # Filter out short comments
    ds = ds.filter(lambda x: len(x["parent_comment"]) >= min_text_length)

    # Limit by number of rows if provided
    if num_of_rows is not None:
        ds = ds.select(range(num_of_rows))

    # Tokenize function
    # def tokenize(sample):
    #     sample["input_ids"] = tokenizer.encode(sample["parent_comment"])
    #     sample["query"] = tokenizer.decode(sample["input_ids"])
    #     return sample

    def tokenize(sample):
      enc = tokenizer(
          sample["parent_comment"],
          # padding="max_length",
          truncation=True,
          max_length=128,
          return_attention_mask=True
      )

      sample["input_ids"] = enc["input_ids"]
      sample["attention_mask"] = enc["attention_mask"]
      sample["query"] = tokenizer.decode(enc["input_ids"])
      return sample


    # Apply tokenization
    ds = ds.map(tokenize, batched=False)

    # Convert to torch tensors
    ds.set_format(type="torch")
    # ds.set_format(type="torch", columns=["input_ids", "attention_mask", "query"])

    return ds

In [7]:
dataset = build_sarcasm_dataset(config, num_of_rows=500)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/475 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

data/holdout-00000-of-00001.parquet:   0%|          | 0.00/18.2M [00:00<?, ?B/s]

data/sft_train-00000-of-00001.parquet:   0%|          | 0.00/49.1M [00:00<?, ?B/s]

data/sft_validation-00000-of-00001.parqu(…):   0%|          | 0.00/5.44M [00:00<?, ?B/s]

data/reward_train-00000-of-00001.parquet:   0%|          | 0.00/49.3M [00:00<?, ?B/s]

data/reward_validation-00000-of-00001.pa(…):   0%|          | 0.00/5.53M [00:00<?, ?B/s]

data/ppo_train-00000-of-00001.parquet:   0%|          | 0.00/49.4M [00:00<?, ?B/s]

data/ppo_validation-00000-of-00001.parqu(…):   0%|          | 0.00/5.51M [00:00<?, ?B/s]

Generating holdout split:   0%|          | 0/101083 [00:00<?, ? examples/s]

Generating sft_train split:   0%|          | 0/272922 [00:00<?, ? examples/s]

Generating sft_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Generating reward_train split:   0%|          | 0/272922 [00:00<?, ? examples/s]

Generating reward_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Generating ppo_train split:   0%|          | 0/272924 [00:00<?, ? examples/s]

Generating ppo_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Filter:   0%|          | 0/272924 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [8]:
# Smoke test on the dataset
sarcastic_lbls_cnt = dataset.filter(lambda x: x["label"] == 1).num_rows
print(f"Dataset length: {len(dataset)} with {round(sarcastic_lbls_cnt / len(dataset) * 100, 2)}% sarcastic comments")
print("Sample entry:")
print(dataset[15])

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

Dataset length: 500 with 49.8% sarcastic comments
Sample entry:
{'label': tensor(1), 'comment': 'How dare they try to make a profit, for shame!', 'author': 'Thenuclearwalrus', 'subreddit': 'wow', 'score': tensor(1), 'ups': tensor(-1), 'downs': tensor(-1), 'date': '2016-11', 'created_utc': '2016-11-15 12:53:43', 'parent_comment': 'The restriction is pointless and only serves to milk extra game time from you', 'input_ids': tensor([  464, 17504,   318, 27158,   290,   691,  9179,   284,  7545,  3131,
          983,   640,   422,   345]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'query': 'The restriction is pointless and only serves to milk extra game time from you'}


In [9]:
# use lambda collator to ensure 'input_ids' are stacked correctly
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

### Models

In [10]:
# Model loaded twice, the first will be updated on policy and the second is used to calculate KL divergence

model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/874 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]



In [11]:
class SarcasmRMPipeline(Pipeline):
    def __init__(self, model, tokenizer):
        super().__init__(model=model, tokenizer=tokenizer)

    def _sanitize_parameters(self, **kwargs):
        return {}, {}, {}

    # def preprocess(self, inputs):
    #     # Tokenize the input text(s)
    #     return self.tokenizer(inputs, return_tensors="pt")


    def preprocess(self, inputs):
      # Expect inputs as a tuple (parent_comment, comment)
      if isinstance(inputs, tuple) and len(inputs) == 2:
          parent, reply = inputs
          return self.tokenizer(
              parent,
              reply,
              return_tensors="pt",
              truncation=True,
              padding=True,
              max_length=128
          )
      else:
        raise ValueError("Inputs must be a tuple of two strings: (parent_comment, comment)")



    def _forward(self, model_inputs):
        # Move inputs to the same device as the model
        model_inputs = {k: v.to(self.model.device) for k, v in model_inputs.items()}
        return self.model(**model_inputs)

    def postprocess(self, model_outputs):
        # Convert logits to probabilities
        probs = model_outputs.logits.softmax(dim=-1).detach().cpu().numpy()[0]
        # 0 = non-sarcasm, index 1 = sarcasm
        labels = ["not_sarcastic", "sarcastic"]
        return {
            "label": labels[probs.argmax()],
            "score": float(probs.max()),
            "probabilities": {labels[i]: float(probs[i]) for i in range(len(labels))}
        }

In [12]:
# sarcasm reward model
rm_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
reward_model = AutoModelForSequenceClassification.from_pretrained(config.reward_model)
reward_model_pipe = SarcasmRMPipeline(model=reward_model, tokenizer=rm_tokenizer)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Device set to use cuda:0


In [13]:
# Smoke test the reward model

text1 = dataset[15]["parent_comment"]
text2 = dataset[15]['comment']
rm_output = reward_model_pipe((text1, text2))
print(f"Feeding:  into reward model and getting back:\n{rm_output}\nTrue label is {dataset[15]['label']}")

Feeding:  into reward model and getting back:
{'label': 'sarcastic', 'score': 0.9892857670783997, 'probabilities': {'not_sarcastic': 0.010714288800954819, 'sarcastic': 0.9892857670783997}}
True label is 1


### Trainer config

In [14]:
# initialize PPOTrainer
ppo_trainer = PPOTrainer(
    model=model,
    ref_model=ref_model,
    config=config,
    dataset=dataset,
    tokenizer=tokenizer,
    data_collator=collator
)



In [15]:
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug
print(f"Training on device: {device}")

# see https://huggingface.co/docs/trl/v0.8.6/ppo_trainer
generation_kwargs = {
    'min_length': -1, # don't ignore the EOS token
    'top_k': 0.0, # no top-k sampling
    'top_p': 1.0, # no nucleus sampling
    'do_sample': True, # yes, we want to sample
    'pad_token_id': tokenizer.eos_token_id, # most decoder models don't have a padding token - use EOS token instead
    'max_new_tokens': 32, # specify how many tokens you want to generate at most
}

# define how often to print
LOG_INTERVAL = 1

#Number of epochs
epochs = 2

DEBUG = False


Training on device: 0


## Training

In [16]:
# Clear GPU RAM
if torch.cuda.is_available():
  torch.cuda.empty_cache()

# Garbage collection
gc.collect()

584

### Sanity check manual training run

This runs through the steps of the training loop one at a time for a sanity check. Only intended for debugging

In [17]:
#Sanity check that PPO dataloader has all the items of our dataset

first_batch = next(iter(ppo_trainer.dataloader))
print("Items per batch:", len(first_batch["input_ids"]))
print(f"Number of batches: {len(ppo_trainer.dataloader)}")
print("First input_ids:", first_batch["input_ids"][0])

#Confirm the dataloader contains as many items as dataset
# assert len(dataset) == len(ppo_trainer.dataloader.dataset)

Items per batch: 128
Number of batches: 3
First input_ids: tensor([   33,  1436, 14662,  2921,   502,   257,  3555,   286,  6640, 17655,
         1231, 47105,    78,    11, 18523,   351, 47105,    78,    13],
       device='cuda:0')


In [18]:
i, batch = next(enumerate(ppo_trainer.dataloader))

In [19]:
query_tensors = batch['input_ids']
attention_masks = batch['attention_mask']

In [20]:
print(query_tensors[0])
print(attention_masks[0])

tensor([ 1532,   616, 12809,  1422,   470,   588,   257,  3124,    11,   326,
          561,   307,   262, 22224,  3124,   314,   561, 20498,  5806,    13,
         1374,   360, 12203,   339, 16614,   314,  1577,   257,  7510,  1771,
          393,   407,   339,  7832,   262,  3124,   286,   616,  8242,    13],
       device='cuda:0')
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')


In [21]:
response_tensors = []
response_tensors_slice = []
for query, mask in tqdm(zip(query_tensors, attention_masks)):
      query_response = ppo_trainer.generate(
          query,
          attention_mask=mask.unsqueeze(0),
          **generation_kwargs
      ).squeeze()
      response_len = len(query_response) - len(query)
      new_tokens = query_response[query.shape[0]:]
      response_tensors.append(query_response[-response_len:])

128it [00:40,  3.20it/s]


In [22]:
print(response_tensors[0])

tensor([ 7359, 27082,  3525,    29,   198,    27, 19535,    47,  1340,  5188,
           29,  4380,   836,   470,   588,  3124,   780,   340,   338, 13400,
          290, 36135,   290,   314,  1549,  2138,  2198,   326,  2124,  6087,
          503, 28796], device='cuda:0')


In [23]:
batch["response"] = [tokenizer.decode(r.squeeze(), skip_special_tokens=True) for r in response_tensors]

In [24]:
batch["response"][1]

'. Bill for drawing, insurance, spelling errors; let me know what you think. </PARENT>\n<RESPONSE> See, professional dent'

In [25]:
batch_inputs = [(q, r) for q, r in zip(batch['query'], batch['response'])]

In [26]:
with torch.no_grad():
      rm_pipe_outputs = reward_model_pipe(batch_inputs, batch_size=min(len(batch_inputs), 8))

In [27]:
rm_pipe_outputs[0]

{'label': 'sarcastic',
 'score': 0.7174550294876099,
 'probabilities': {'not_sarcastic': 0.2825450003147125,
  'sarcastic': 0.7174550294876099}}

In [28]:
rewards = [
      torch.tensor(output["probabilities"]["sarcastic"])
      for output in rm_pipe_outputs
    ]

In [29]:
rewards[0]

tensor(0.7175)

In [30]:
stats = ppo_trainer.step(query_tensors, response_tensors, rewards)

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.81 GiB. GPU 0 has a total capacity of 39.56 GiB of which 1.08 GiB is free. Process 3700 has 38.47 GiB memory in use. Of the allocated memory 35.00 GiB is allocated by PyTorch, and 2.97 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### Training loop

In [100]:
# revised PPO training loop

print("Starting training...")
print(f"Number of batches per epoch: {len(ppo_trainer.dataloader)}")

# for epoch in tqdm(range(epochs), 'epoch: '): # Let's just 1 epoch first
for i, batch in enumerate(tqdm(ppo_trainer.dataloader, desc=f'Batch: {i}')):

    # get query_tensors as tensors
    query_tensors = batch['input_ids']

    #### Get response from Policy model
    response_tensors = []

    # Passing attention mask
    attention_masks = batch['attention_mask']
    for query, mask in zip(query_tensors, attention_masks):
      query_response = ppo_trainer.generate(
          query,
          attention_mask=mask.unsqueeze(0),
          **generation_kwargs
      ).squeeze()
      response_len = len(query_response) - len(query)
      new_tokens = query_response[query.shape[0]:]
      response_tensors.append(query_response[-response_len:])


    # Without attention mask
    # for query in query_tensors:
    #     query_response = ppo_trainer.generate(query, **generation_kwargs).squeeze()
    #     response_len = len(query_response) - len(query)
    #     response_tensors.append(query_response[-response_len:])

    batch["response"] = [tokenizer.decode(r.squeeze(), skip_special_tokens=True) for r in response_tensors]

    if DEBUG:
      print(f"Sample query from batch: {tokenizer.decode(query_tensors[0], skip_special_tokens=True)}")
      print(f"Sample response from batch: {batch['response'][0]}")


    # process the batch through reward model pipe
    batch_inputs = [(q, r) for q, r in zip(batch['query'], batch['response'])]

    with torch.no_grad():
      rm_pipe_outputs = reward_model_pipe(batch_inputs, batch_size=min(len(batch_inputs), 8))

    if DEBUG:
      print(f"Sample output from reward model: {rm_pipe_outputs[0]}")

    # TODO: We need to add more reward signals
    rewards = [
      torch.tensor(output["probabilities"]["sarcastic"])
      for output in rm_pipe_outputs
    ]


    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)

    ## This request wandb login! Don't do it if trying on Colab TPU which can't support wandb
    ppo_trainer.log_stats(stats, batch, rewards)

    # logging code
    if i % LOG_INTERVAL == 0:
        # clculate mean reward for this batch
        print(f"Step {i}: Mean Reward from PPO stats: {stats['ppo/mean_scores']:.4f}")
        print(f"        PPO Loss:    {stats['ppo/loss/total']:.4f}")

print('Training complete DON\'T FORGET TO SAVE THE MODEL')

Starting training...
Number of batches per epoch: 3


Batch: 0:   0%|          | 0/3 [00:00<?, ?it/s]


IndexError: too many indices for tensor of dimension 1

In [None]:
#### Save model
ppo_trainer.save_pretrained(drive_path)

print('Model saved')

# Scratchpad

In [None]:
from dataclasses import fields
print([f.name for f in fields(PPOConfig)])
print("eval_steps" in [f.name for f in fields(PPOConfig)])

['exp_name', 'seed', 'log_with', 'task_name', 'model_name', 'query_dataset', 'reward_model', 'remove_unused_columns', 'tracker_kwargs', 'accelerator_kwargs', 'project_kwargs', 'tracker_project_name', 'push_to_hub_if_best_kwargs', 'steps', 'learning_rate', 'adap_kl_ctrl', 'init_kl_coef', 'kl_penalty', 'target', 'horizon', 'gamma', 'lam', 'cliprange', 'cliprange_value', 'vf_coef', 'batch_size', 'forward_batch_size', 'mini_batch_size', 'gradient_accumulation_steps', 'world_size', 'ppo_epochs', 'max_grad_norm', 'optimize_cuda_cache', 'optimize_device_cache', 'early_stopping', 'target_kl', 'compare_steps', 'ratio_threshold', 'use_score_scaling', 'use_score_norm', 'score_clip', 'whiten_rewards', 'gradient_checkpointing', 'is_encoder_decoder', 'is_peft_model', 'backward_batch_size', 'global_backward_batch_size', 'global_batch_size', 'dataset_num_proc']
False


In [None]:
# model set up
# (PPO requires a model with a value head)
# PPO also requires a reference model, but this model is generated by the PPOTrainer automatically
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained('gpt2', padding_side='left')
tokenizer.pad_token = tokenizer.eos_token


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/874 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# load training data

# load the IMDb dataset
# TODO: replace this with our own training data
# imdb_dataset = load_dataset('imdb')
sarcasm_train_dataset = load_dataset("marcbishara/sarcasm-on-reddit")['ppo_train']

# use a subset of dataset for the POC so it doesn't run for hours
# taking the first 200 examples for demonstration
dataset = sarcasm_train_dataset.select(range(200))


README.md: 0.00B [00:00, ?B/s]

data/holdout-00000-of-00001.parquet:   0%|          | 0.00/18.2M [00:00<?, ?B/s]

data/sft_train-00000-of-00001.parquet:   0%|          | 0.00/49.1M [00:00<?, ?B/s]

data/sft_validation-00000-of-00001.parqu(…):   0%|          | 0.00/5.44M [00:00<?, ?B/s]

data/reward_train-00000-of-00001.parquet:   0%|          | 0.00/49.3M [00:00<?, ?B/s]

data/reward_validation-00000-of-00001.pa(…):   0%|          | 0.00/5.53M [00:00<?, ?B/s]

data/ppo_train-00000-of-00001.parquet:   0%|          | 0.00/49.4M [00:00<?, ?B/s]

data/ppo_validation-00000-of-00001.parqu(…):   0%|          | 0.00/5.51M [00:00<?, ?B/s]

Generating holdout split:   0%|          | 0/101083 [00:00<?, ? examples/s]

Generating sft_train split:   0%|          | 0/272922 [00:00<?, ? examples/s]

Generating sft_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Generating reward_train split:   0%|          | 0/272922 [00:00<?, ? examples/s]

Generating reward_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Generating ppo_train split:   0%|          | 0/272924 [00:00<?, ? examples/s]

Generating ppo_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

In [None]:
# tokenize the dataset
dataset = dataset.map(tokenize, batched=False)

# cast input_ids as torch tensors
dataset.set_format(type='torch', columns=['input_ids'])

In [None]:

# def tokenize(sample):
#     tokenized_output = tokenizer(
#         sample['text'],
#         truncation=True,
#         max_length=128,
#         padding='max_length')

#     ids = tokenized_output['input_ids']
#     sample['input_ids'] = ids

#     # decode back to string for use in the reward score function
#     sample['query'] = tokenizer.decode(ids, skip_special_tokens=True)

#     return sample

def tokenize(sample):
    sample['input_ids'] = tokenizer.encode(sample['text'], max_length=128, truncation=True)
    # sample['query'] = tokenizer.decode(sample['input_ids'], skip_special_tokens=True) # let's just do this later in the training loop -- seems to get dropped by the trainer?
    return sample


In [None]:
# custom reward function
# CURRENTLY REPLACED BY DIRECT CALL WITHIN THE TRAINING LOOP

def get_reward_score(query_text, response_text):
    # TODO: replace this with our weighted sum reward score from multiple reward signals
    # based on the query_text and response_text parameters

    # print query and respone
    # print(f"Query: {query_text} | Response: {response_text}")

    # currently, just randomly 0 or 1
    score = float(random.randint(0, 1))

    return score



In [None]:
# revised PPO training loop

print("Starting training...")
print(f"Number of batches per epoch: {len(ppo_trainer.dataloader)}")

for epoch in tqdm(range(epochs), 'epoch: '):
    for i, batch in tqdm(enumerate(ppo_trainer.dataloader)):

        # get query_tensors as tensors
        query_tensors = batch['input_ids']

        # reconstruct 'query' from input_ids, since might have been removed???
        batch['query'] = [tokenizer.decode(q_t, skip_special_tokens=True) for q_t in query_tensors]

        # print('batch["query"]: ', batch["query"])

        #### Get response from SFTModel
        response_tensors = ppo_trainer.generate(query_tensors, **generation_kwargs)
        batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

        # calculate rewards - replaced with code below to call sarcasm model
        # rewards = []
        # for q, r in zip(batch['query'], batch['response']):
        #     score = get_reward_score(q, r)
        #     rewards.append(torch.tensor(score))

        # calculate rewards with the sarcasm reward model
        queries = batch['query']
        responses = batch['response']

        # generate separator token
        sep_token = sarcasm_model.tokenizer.sep_token

        # combine queries and responses seprated by token into a single list of "query [SEP] response"
        batch_inputs = [f"{q} {sep_token} {r}" for q, r in zip(queries, responses)]

        # process the batch
        pipe_outputs = sarcasm_model(batch_inputs, batch_size=len(batch_inputs), truncation=True)

        # process the results
        rewards = []

        for output in pipe_outputs:

          # extract the score
          sarcasm_score = output['score']

          # TODO: add other reward signals -- just placeholder here
          # other_score = float(random.randint(0, 1))
          other_score = 0

          # combine score -- TODO: weighted sum? NORMALIZE the score!
          score = sarcasm_score + other_score

          # append
          rewards.append(torch.tensor(score))

        #### Run PPO step
        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
        ppo_trainer.log_stats(stats, batch, rewards)

        # logging code
        if i % LOG_INTERVAL == 0:
            # clculate mean reward for this batch
            print(f"Step {i}: Mean Reward from PPO stats: {stats['ppo/mean_scores']:.4f}")
            print(f"        PPO Loss:    {stats['ppo/loss/total']:.4f}")

print('Training complete')

#### Save model
ppo_trainer.save_pretrained(drive_path)

print('Model saved')

Training on device: cuda
Starting training...
Number of batches per epoch: 1


epoch:   0%|          | 0/10 [00:00<?, ?it/s]
0it [00:00, ?it/s][AYou're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.

1it [00:22, 22.56s/it]
epoch:  10%|█         | 1/10 [00:22<03:23, 22.56s/it]

Step 0: Mean Reward from PPO stats: 0.4453
        PPO Loss:    0.3085




1it [00:22, 22.37s/it]
epoch:  20%|██        | 2/10 [00:44<02:59, 22.45s/it]

Step 0: Mean Reward from PPO stats: 0.5000
        PPO Loss:    0.2331




1it [00:22, 22.51s/it]
epoch:  30%|███       | 3/10 [01:07<02:37, 22.48s/it]

Step 0: Mean Reward from PPO stats: 0.4922
        PPO Loss:    0.1910




1it [00:22, 22.23s/it]
epoch:  40%|████      | 4/10 [01:29<02:14, 22.38s/it]

Step 0: Mean Reward from PPO stats: 0.4844
        PPO Loss:    0.1685




1it [00:22, 22.59s/it]
epoch:  50%|█████     | 5/10 [01:52<01:52, 22.46s/it]

Step 0: Mean Reward from PPO stats: 0.5391
        PPO Loss:    0.1638




1it [00:22, 22.30s/it]
epoch:  60%|██████    | 6/10 [02:14<01:29, 22.40s/it]

Step 0: Mean Reward from PPO stats: 0.5156
        PPO Loss:    0.1771




1it [00:22, 22.53s/it]
epoch:  70%|███████   | 7/10 [02:37<01:07, 22.45s/it]

Step 0: Mean Reward from PPO stats: 0.5000
        PPO Loss:    0.2029




1it [00:22, 22.74s/it]
epoch:  80%|████████  | 8/10 [02:59<00:45, 22.54s/it]

Step 0: Mean Reward from PPO stats: 0.5078
        PPO Loss:    0.1939




1it [00:22, 22.81s/it]
epoch:  90%|█████████ | 9/10 [03:22<00:22, 22.63s/it]

Step 0: Mean Reward from PPO stats: 0.5156
        PPO Loss:    0.1894




1it [00:22, 22.26s/it]
epoch: 100%|██████████| 10/10 [03:44<00:00, 22.49s/it]


Step 0: Mean Reward from PPO stats: 0.4375
        PPO Loss:    0.2046
Training complete
Model saved


