In [None]:
!pip install transformers datasets trl peft

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting trl
  Downloading trl-0.9.6-py3-none-any.whl.metadata (12 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting tyro>=0.5.11 (from trl)
  Downloading tyro-0.8.5-py3-none-any.whl.metadata (8.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==1

Reward Modeling

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from trl import ModelConfig, RewardConfig, RewardTrainer, get_kbit_device_map, get_peft_config, get_quantization_config
from peft import LoraConfig, TaskType, PeftModel

In [None]:
model_name_or_path = "distilgpt2"

In [None]:
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

In [None]:
import pandas as pd
filename = "rlhf_train_data.csv"

chosen = []
rejected = []

df = pd.read_csv(filename)
df = df.iloc[:, 1:]

for i in range(len(df)):
  prompt = df.iat[i, 0]

  pair = df.iat[i, 1].split(',')
  pair[0] = pair[0][2:len(pair[0]) - 1]
  pair[1] = pair[1][2:len(pair[1]) - 2]

  sp_choice = df.iat[i, 4]
  chosen.append(f"{prompt} [SEP] {pair[sp_choice]}")
  rejected.append(f"{prompt} [SEP] {pair[1 - sp_choice]}")

print(chosen[0])
print(rejected[0])

Members of the procession walk down the street holding small horn brass instruments. A drum line [SEP] passes by walking down the street playing their instruments.
Members of the procession walk down the street holding small horn brass instruments. A drum line [SEP] has heard approaching them.


In [None]:
import torch
from torch.utils.data import Dataset

# Assuming 'chosen' and 'rejected' lists are already filled with the data
class CustomRewardDataset(Dataset):
    def __init__(self, chosen_texts, rejected_texts, tokenizer):
        self.tokenizer = tokenizer
        self.chosen_texts = chosen_texts
        self.rejected_texts = rejected_texts

    def __len__(self):
        return len(self.chosen_texts)

    def __getitem__(self, idx):
        chosen_text = self.chosen_texts[idx]
        rejected_text = self.rejected_texts[idx]

        # Tokenize both chosen and rejected texts
        chosen_tokens = self.tokenizer(chosen_text, truncation=True, padding='max_length', return_tensors="pt")
        rejected_tokens = self.tokenizer(rejected_text, truncation=True, padding='max_length', return_tensors="pt")

        return {
            "input_ids_chosen": chosen_tokens["input_ids"].squeeze(),
            "attention_mask_chosen": chosen_tokens["attention_mask"].squeeze(),
            "input_ids_rejected": rejected_tokens["input_ids"].squeeze(),
            "attention_mask_rejected": rejected_tokens["attention_mask"].squeeze()
        }

# Initialize the dataset
reward_dataset = CustomRewardDataset(chosen, rejected, tokenizer)

In [None]:
training_args = TrainingArguments(
    output_dir='./sp_results',
    num_train_epochs=10,
    per_device_train_batch_size=32,  # Reduced batch size
    gradient_accumulation_steps=2,  # Use gradient accumulation
    fp16=True,  # Enable mixed precision training
)

In [None]:
#model.gradient_checkpointing_enable()
trainer = RewardTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=reward_dataset,
    peft_config=peft_config,
)

# Clear GPU memory cache
torch.cuda.empty_cache()
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
500,0.7289
1000,0.6937
1500,0.6908




TrainOutput(global_step=1890, training_loss=0.7014395416098297, metrics={'train_runtime': 1791.4686, 'train_samples_per_second': 67.487, 'train_steps_per_second': 1.055, 'total_flos': 0.0, 'train_loss': 0.7014395416098297, 'epoch': 10.0})

In [None]:
model_save_path = "sp_RewardTrainer_model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

('sp_RewardTrainer_model/tokenizer_config.json',
 'sp_RewardTrainer_model/special_tokens_map.json',
 'sp_RewardTrainer_model/vocab.json',
 'sp_RewardTrainer_model/merges.txt',
 'sp_RewardTrainer_model/added_tokens.json',
 'sp_RewardTrainer_model/tokenizer.json')

RLHF

In [None]:
import torch
from tqdm import tqdm
import pandas as pd

tqdm.pandas()

from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler

In [None]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
config = PPOConfig(
    model_name="distilgpt2",
    learning_rate=1.41e-5)
sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 8}

In [None]:
model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name,
                                                         device_map="auto"
                                                         )
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/353M [00:00<?, ?B/s]

In [None]:
from datasets import load_dataset

swag = load_dataset("swag", "regular", split="train[:4030]")

Downloading readme:   0%|          | 0.00/9.20k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.81M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.78M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/73546 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/20006 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/20005 [00:00<?, ? examples/s]

In [None]:
swag

Dataset({
    features: ['video-id', 'fold-ind', 'startphrase', 'sent1', 'sent2', 'gold-source', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
    num_rows: 4030
})

In [None]:
dataset = swag.rename_column("startphrase", "query")
dataset = dataset.remove_columns(["sent1", "sent2","ending0","ending1","ending2","ending3", 'video-id', 'fold-ind', 'gold-source', 'label'])

In [None]:
def tokenize(sample):
    prompt = sample["query"]
    sample["input_ids"] = tokenizer.encode(prompt)
    return sample

dataset = dataset.map(tokenize, batched=False)
dataset.set_format(type="torch")

Map:   0%|          | 0/4030 [00:00<?, ? examples/s]

In [None]:
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

In [None]:
ppo_trainer = PPOTrainer(config=config, model=model, tokenizer=tokenizer, dataset=dataset, data_collator=collator)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the trained model
reward_model_path = "./sp_RewardTrainer_model"

# For a general causal language model
reward_model = AutoModelForCausalLM.from_pretrained(reward_model_path, ignore_mismatched_sizes=True)

# Load the tokenizer
reward_tokenizer = AutoTokenizer.from_pretrained(reward_model_path)

Loading adapter weights from ./sp_RewardTrainer_model led to unexpected keys not found in the model:  ['score.weight']. 


In [None]:
output_min_length = 4
output_max_length = 100
output_length_sampler = LengthSampler(output_min_length, output_max_length)

In [None]:
from torch.nn.functional import pad

for epoch in tqdm(range(ppo_trainer.config.ppo_epochs), "Epoch:"):
    for batch in tqdm(ppo_trainer.dataloader):
        query_tensors = batch["input_ids"]  # Ensure each query is 2D
        gen_kwargs = {
            "min_length": output_min_length,
            "max_length": output_max_length + len(tokenizer.encode(batch['query'][0])),
            "top_k": 0.0,  # or you might want to set this to a higher value for more focused sampling
            "top_p": 0.95,  # a value less than 1 can help focus the generation
            "do_sample": True,
            "pad_token_id": tokenizer.eos_token_id,
            "eos_token_id": tokenizer.encode('.')[0],  # assuming '.' is the desired stopping point
        }
        response_tensors = ppo_trainer.generate(query_tensors, **gen_kwargs)
        batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

        #### Compute reward score
        texts = [r for r in batch["response"]]
        # Tokenize the texts for the reward model
        tokenized_texts = reward_tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

        pipe_outputs = reward_model(**tokenized_texts)

        # Assume logits are obtained from the model's output for each example in the batch
        logits = pipe_outputs.logits  # shape: (batch_size, sequence_length, num_classes)

        # Aggregate the logits across the sequence
        # Here we take the mean of the logits across the sequence length dimension
        aggregated_logits = torch.mean(logits, dim=1)

        # Convert the aggregated logits to probabilities
        probabilities = torch.softmax(aggregated_logits, dim=-1)

        # Assume the 'correct' class is the first column in the logits
        # This gives you a single reward score per example in the batch
        reward_scores = probabilities[:, 0]
        # Assuming reward_scores is a 1D tensor of scores
        scores_list = [torch.tensor([score.item()]) for score in reward_scores]
        #### Run PPO step
        stats = ppo_trainer.step(query_tensors, response_tensors, scores_list)
        ppo_trainer.log_stats(stats, batch, reward_scores)

Epoch::   0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/31 [00:00<?, ?it/s][AYou're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.

  3%|▎         | 1/31 [00:15<07:56, 15.89s/it][A





























100%|██████████| 31/31 [07:49<00:00, 15.13s/it]
Epoch::  25%|██▌       | 1/4 [07:49<23:27, 469.17s/it]
  0%|          | 0/31 [00:00<?, ?it/s][A


 10%|▉         | 3/31 [00:36<05:30, 11.80s/it][A
 13%|█▎        | 4/31 [00:48<05:18, 11.79s/it][A








 42%|████▏     | 13/31 [02:43<04:06, 13.68s/it][A













 87%|████████▋ | 27/31 [05:49<00:49, 12.32s/it][A



100%|██████████| 31/31 [06:33<00:00, 12.69s/it]
Epoch::  50%|█████     | 2/4 [14:22<14:09, 424.51s/it]




 13%|█▎        | 4/31 [00:42<04:44, 10.54s/it][A
 16%|█▌        | 5/31 [00:52<04:29, 10.37s/it][A







 42%|████▏     | 13/

AttributeError: 'PPOTrainer' object has no attribute 'save_model'

In [None]:
# Save the updated policy model after training
model_save_path = "updated_gpt2_model"
ppo_trainer.save_pretrained(model_save_path)



In [None]:
def predict_weak_option(prompt, options, tokenizer, model):
    scores = []
    for option in options:
        # Combine prompt and option into a single sequence
        input_sequence = prompt + " " + option
        # Tokenize the input sequence
        tokenized_input = tokenizer.encode(input_sequence, return_tensors="pt").to("cuda")
        # Generate output with GPT-2
        with torch.no_grad():
            outputs = model(tokenized_input, labels=tokenized_input)
            loss = outputs.loss
            scores.append(loss.item())
    # Assuming lower loss (higher likelihood) indicates a better option
    most_likely_option_index = scores.index(min(scores))
    return most_likely_option_index

In [None]:
swag = load_dataset("swag", "regular")

In [None]:
swag

DatasetDict({
    train: Dataset({
        features: ['video-id', 'fold-ind', 'startphrase', 'sent1', 'sent2', 'gold-source', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 73546
    })
    validation: Dataset({
        features: ['video-id', 'fold-ind', 'startphrase', 'sent1', 'sent2', 'gold-source', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 20006
    })
    test: Dataset({
        features: ['video-id', 'fold-ind', 'startphrase', 'sent1', 'sent2', 'gold-source', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 20005
    })
})

In [None]:
eval_dataset=swag["test"]

In [None]:
gpt2_model_sp = AutoModelForCausalLM.from_pretrained("updated_gpt2_model", ignore_mismatched_sizes=True).to("cuda")
gpt2_model_base = AutoModelForCausalLM.from_pretrained("distilgpt2", ignore_mismatched_sizes=True).to("cuda")

Some weights of the model checkpoint at updated_gpt2_model were not used when initializing GPT2LMHeadModel: ['v_head.summary.bias', 'v_head.summary.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from collections import Counter
from random import randrange
from google.colab import files
from itertools import combinations

correct_pairs_distilgpt2_sp = 0
correct_pairs_distilgpt2_base = 0

# Iterate through the evaluation dataset
for i, example in enumerate(eval_dataset):
    if (i >= 4030):
      break

    if (i % 100 == 0):
      print('row {}'.format(i))

    prompt = example["startphrase"]
    label=example["label"]  # The correct option index
    options = [example[f"ending{i}"] for i in range(4)]  # Adjust based on your dataset structure
    # Generate all possible pairs of options
    option_pairs = [pair for pair in combinations(options, 2) if options[label] in pair]
    pair_list=[]

    for pair in option_pairs:

      most_likely_option_index_list=[]

      label_index = pair.index(options[label])

      most_likely_option_index_sp = predict_weak_option(prompt, pair, tokenizer, gpt2_model_sp)
      most_likely_option_index_base = predict_weak_option(prompt, pair, tokenizer, gpt2_model_base)
      pair_list.append(pair)

      most_likely_option_index_list.append(most_likely_option_index_sp)
      most_likely_option_index_list.append(most_likely_option_index_base)
          #print(options[label])
          #print(options[most_likely_option_index])
      if (options[label] == pair[most_likely_option_index_sp]):
          correct_pairs_distilgpt2_sp += 1
      if (options[label] == pair[most_likely_option_index_base]):
          correct_pairs_distilgpt2_base += 1

row 0
row 100
row 200
row 300
row 400
row 500
row 600
row 700
row 800
row 900
row 1000
row 1100
row 1200
row 1300
row 1400
row 1500
row 1600
row 1700
row 1800
row 1900
row 2000
row 2100
row 2200
row 2300
row 2400
row 2500
row 2600
row 2700
row 2800
row 2900
row 3000
row 3100
row 3200
row 3300
row 3400
row 3500
row 3600
row 3700
row 3800
row 3900
row 4000


In [None]:
print(correct_pairs_distilgpt2_base)
print(correct_pairs_distilgpt2_sp)

6115
6039


In [None]:
!zip -r updated_gpt2_model.zip updated_gpt2_model

  adding: updated_gpt2_model/ (stored 0%)
  adding: updated_gpt2_model/merges.txt (deflated 53%)
  adding: updated_gpt2_model/special_tokens_map.json (deflated 60%)
  adding: updated_gpt2_model/tokenizer.json (deflated 72%)
  adding: updated_gpt2_model/config.json (deflated 52%)
  adding: updated_gpt2_model/README.md (deflated 42%)
  adding: updated_gpt2_model/model.safetensors (deflated 7%)
  adding: updated_gpt2_model/vocab.json (deflated 59%)
  adding: updated_gpt2_model/generation_config.json (deflated 24%)
  adding: updated_gpt2_model/tokenizer_config.json (deflated 54%)
