In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import transformers
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig, DefaultDataCollator
import pytorch_lightning as pl
from torchmetrics.classification import BinaryAUROC, BinaryAccuracy
import datasets
import random
from functools import partial
from typing import NamedTuple
import itertools
from collections import defaultdict
from tqdm import tqdm
import json
from google.colab import drive

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
drive.mount('/content/drive')

Mounted at /content/drive


# To check that my method works, I want to fine-tune GPT2.

In [4]:
main_tokenizer = AutoTokenizer.from_pretrained('lvwerra/gpt2-imdb', padding_side='left')
main_tokenizer.pad_token = main_tokenizer.eos_token
main_model = transformers.AutoModelForCausalLM.from_pretrained('lvwerra/gpt2-imdb', device_map=device)

Loading weights:   0%|          | 0/149 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: lvwerra/gpt2-imdb
Key                              | Status     |  | 
---------------------------------+------------+--+-
transformer.h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [5]:
inputs = main_tokenizer('The movie', return_tensors='pt').to(device)
with torch.no_grad():
    generated_ids = main_model.generate(**inputs, max_new_tokens=50, do_sample=True)
generated_text = main_tokenizer.decode(generated_ids[0].tolist())
print('Generated text:', generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated text: The movie that had me hooked was 'Moses' which was a pretty good horror film, the director, Paul Williams wanted to make a new classic with a strong psychological story, but all he brought to it was a very low budget budget and a very tight


# Load emb model

In [6]:
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0
)
emb_model = AutoModel.from_pretrained(
    'Qwen/Qwen3-Embedding-8B',
    quantization_config=quantization_config,
    device_map=device,
    trust_remote_code=True
).requires_grad_(False)
emb_tokenizer = AutoTokenizer.from_pretrained(
    'Qwen/Qwen3-Embedding-8B',
    trust_remote_code=True
)

config.json:   0%|          | 0.00/729 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/398 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

### Check that quantization is `int8` for weights and `bfloat16` for rest

In [7]:
for name, param in itertools.islice(emb_model.named_parameters(), 15):
    print(f'{name}: {param.dtype}, shape: {param.shape}, requires_grad: {param.requires_grad}')

embed_tokens.weight: torch.bfloat16, shape: torch.Size([151665, 4096]), requires_grad: False
layers.0.self_attn.q_proj.weight: torch.int8, shape: torch.Size([4096, 4096]), requires_grad: False
layers.0.self_attn.k_proj.weight: torch.int8, shape: torch.Size([1024, 4096]), requires_grad: False
layers.0.self_attn.v_proj.weight: torch.int8, shape: torch.Size([1024, 4096]), requires_grad: False
layers.0.self_attn.o_proj.weight: torch.int8, shape: torch.Size([4096, 4096]), requires_grad: False
layers.0.self_attn.q_norm.weight: torch.bfloat16, shape: torch.Size([128]), requires_grad: False
layers.0.self_attn.k_norm.weight: torch.bfloat16, shape: torch.Size([128]), requires_grad: False
layers.0.mlp.gate_proj.weight: torch.int8, shape: torch.Size([12288, 4096]), requires_grad: False
layers.0.mlp.up_proj.weight: torch.int8, shape: torch.Size([12288, 4096]), requires_grad: False
layers.0.mlp.down_proj.weight: torch.int8, shape: torch.Size([4096, 12288]), requires_grad: False
layers.0.input_layern

### Output shape is (batch_size, token_len, emb_size)

In [7]:
text = 'Special abracadabra'
inputs = emb_tokenizer(text, return_tensors='pt').to(device)
with torch.no_grad():
    outputs = emb_model(**inputs).last_hidden_state.cpu()



In [8]:
inputs.input_ids.shape, outputs.shape

(torch.Size([1, 6]), torch.Size([1, 6, 4096]))

In [9]:
[emb_tokenizer.decode(token) for token in inputs.input_ids[0]]

['Special', ' ab', 'rac', 'ad', 'abra', '<|endoftext|>']

In [10]:
outputs.amax(dim=1).shape

torch.Size([1, 4096])

# Precompute embeddings

In [8]:
imdb = datasets.load_dataset('imdb')

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(‚Ä¶):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [9]:
def get_embeddings(loader: DataLoader) -> dict[str, torch.Tensor]:
    embeddings = defaultdict(list)

    for batch in tqdm(loader):
        inputs = emb_tokenizer(
            batch['text'], padding=True, truncation=True, max_length=512, return_tensors='pt'
        ).to(device)
        with torch.no_grad():
            outputs = emb_model(**inputs).last_hidden_state.cpu()

        attn_mask = inputs['attention_mask'].cpu()
        token_len = attn_mask.sum(axis=1)

        mask_expanded = attn_mask.unsqueeze(-1).expand_as(outputs)

        outputs_masked_min = outputs.masked_fill(mask_expanded == 0, float('inf'))
        outputs_masked_max = outputs.masked_fill(mask_expanded == 0, float('-inf'))

        embeddings['min'].append(outputs_masked_min.min(dim=1)[0])
        embeddings['max'].append(outputs_masked_max.max(dim=1)[0])

        outputs_masked_sum = (outputs * mask_expanded).sum(dim=1)
        embeddings['mean'].append(outputs_masked_sum / token_len.unsqueeze(-1))

        embeddings['first'].append(outputs[:, 0, :])

        row_indices = torch.arange(len(outputs))
        embeddings['last'].append(outputs[row_indices, token_len - 1])

        embeddings['label'].append(batch['label'])

    embeddings = {key: torch.cat(val) for key, val in embeddings.items()}
    return embeddings

In [122]:
dummy_embeddings = get_embeddings(DataLoader(imdb['train'].shuffle(seed=0).take(10), batch_size=128))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00,  1.10it/s]


In [88]:
!mkdir -m 777 /content/drive/MyDrive/data/imdb_review_embeddings

In [10]:
def get_and_save_embeddings(name: str, loader: DataLoader):
    embeddings = get_embeddings(loader)

    data = {key: value.tolist() for key, value in embeddings.items()}
    full_path = f'/content/drive/MyDrive/data/imdb_review_embeddings/{name}.json'
    with open(full_path, 'w') as out:
        json.dump(data, out)
        print(
            f'Written {len(data)} tables of shape {next(iter(embeddings.values())).shape} '
            f'into {full_path}'
        )

In [9]:
def random_crop(item: dict, rng: random.Random, min_len=10, max_len=100):
    text = item['text']
    words = text.split()
    if len(words) > min_len:
        segment_length = rng.randint(min_len, min(max_len, len(words)))
        max_start = len(words) - segment_length
        start_idx = rng.randint(0, max_start) if max_start > 0 else 0
        text = ' '.join(words[start_idx:start_idx + segment_length])
    return {'text': text}

In [12]:
train_loader = DataLoader(imdb['train'], batch_size=192)
test_loader = DataLoader(imdb['test'], batch_size=192)

# rng = random.Random(42)
# train_random_crop = imdb['train'].map(random_crop, fn_kwargs={'rng': rng, 'max_len': 50})
# test_random_crop = imdb['test'].map(random_crop, fn_kwargs={'rng': rng, 'max_len': 50})

# train_random_crop_loader = DataLoader(train_random_crop, batch_size=256)
# test_random_crop_loader = DataLoader(test_random_crop, batch_size=256)

In [13]:
# get_and_save_embeddings('train_random_crop_10_50', train_random_crop_loader)
# get_and_save_embeddings('test_random_crop_10_50', test_random_crop_loader)
get_and_save_embeddings('train', train_loader)
get_and_save_embeddings('test', test_loader)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 131/131 [30:40<00:00, 14.05s/it]


Written 6 tables of shape torch.Size([25000, 4096]) into /content/drive/MyDrive/data/imdb_review_embeddings/train.json


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 131/131 [30:38<00:00, 14.03s/it]


Written 6 tables of shape torch.Size([25000, 4096]) into /content/drive/MyDrive/data/imdb_review_embeddings/test.json


# Build classifier

In [50]:
class ClassifierOutput(NamedTuple):
    logits: torch.Tensor
    loss: torch.Tensor | None = None

class ClassifierModel(nn.Module):
    def __init__(self, emb_model: transformers.modeling_utils.PreTrainedModel, hid_size: int = 100):
        super().__init__()
        self.emb_model = emb_model
        self.head = nn.Sequential(
            nn.Linear(emb_model.config.hidden_size, hid_size),
            nn.SiLU(),
            nn.Linear(hid_size, 1),
        )

    def forward(self, *args, labels: torch.Tensor | None = None, **kwargs):
        emb_by_token = self.emb_model(*args, **kwargs).last_hidden_state
        emb = emb_by_token.mean(axis=1)
        logits = self.head(emb).flatten()

        loss = None
        if labels is not None:
            loss = nn.BCEWithLogitsLoss()(logits, labels.float())

        return ClassifierOutput(loss=loss, logits=logits)

In [51]:
pl.seed_everything(42)
classifier_model = ClassifierModel(emb_model, hid_size=10).to(device)
classifier_model.head.requires_grad_(True);

INFO:lightning_fabric.utilities.seed:Seed set to 42


### Check outputs

In [52]:
with torch.no_grad():
    with torch.amp.autocast(device.type):
        logits = classifier_model(**inputs).logits.cpu()
logits



tensor([-0.2029, -0.5317, -0.3787,  0.0866, -0.2971, -0.2979, -0.3452, -0.4333,
        -0.5322, -0.0090, -0.0764, -0.1807, -0.1927, -0.2549, -0.2057, -0.0656],
       dtype=torch.float16)

# Load dataset

In [53]:
def collate_with_augmentation(batch: list[dict], tokenizer: AutoTokenizer, rng: random.Random, augment_probability: float = 0.0):
    augmented_texts = []
    labels = []

    for item in batch:
        text = item['text']

        if augment_probability > 0 and rng.random() < augment_probability:
            words = text.split()
            if len(words) > 10:
                segment_length = rng.randint(10, min(100, len(words)))
                max_start = len(words) - segment_length
                start_idx = rng.randint(0, max_start) if max_start > 0 else 0
                text = ' '.join(words[start_idx:start_idx + segment_length])

        augmented_texts.append(text)
        labels.append(item['label'])

    tokenized = tokenizer(augmented_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
    tokenized['labels'] = torch.tensor(labels)

    return tokenized

In [60]:
rng = random.Random(42)

train_augmentation_collate_fn = partial(collate_with_augmentation, tokenizer=emb_tokenizer, rng=rng, augment_probability=0.9)
test_augmentation_collate_fn = partial(collate_with_augmentation, tokenizer=emb_tokenizer, rng=rng)

train_loader = DataLoader(imdb['train'], batch_size=16, shuffle=True, collate_fn=train_augmentation_collate_fn, num_workers=4)

val_subset = imdb['test'].shuffle(seed=42).take(1000)
val_loader = DataLoader(val_subset, batch_size=16, shuffle=False, collate_fn=test_augmentation_collate_fn, num_workers=4)

In [65]:
class ClassifierModule(pl.LightningModule):
    def __init__(self, model, lr=2e-5):
        super().__init__()
        self.model = model
        self.lr = lr
        self.val_auroc = BinaryAUROC()
        self.val_accuracy = BinaryAccuracy()

    def forward(self, **kwargs):
        return self.model(**kwargs)

    def training_step(self, batch, batch_idx):
        output = self(**batch)

        predictions = output.logits > 0
        acc = (predictions == batch['label']).float().mean()

        self.log('train_loss', output.loss.item(), prog_bar=True)
        self.log('train_acc', acc, prog_bar=True)
        return output.loss

    def validation_step(self, batch, batch_idx):
        output = self(**batch)

        self.val_auroc.update(output.logits, batch['label'])
        self.val_accuracy.update(output.logits, batch['label'])

        self.log('val_loss', output.loss.item(), prog_bar=True, on_step=False, on_epoch=True)
        return output.loss

    def on_validation_epoch_end(self):
        self.log('val_auc', self.val_auroc.compute(), prog_bar=True)
        self.log('val_acc', self.val_accuracy.compute(), prog_bar=True)
        self.val_auroc.reset()
        self.val_accuracy.reset()

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr)

In [66]:
pl_model = ClassifierModule(classifier_model, lr=2e-5)
trainer = pl.Trainer(max_epochs=1, accelerator='auto', precision='bf16-mixed')
trainer.fit(pl_model, train_loader, val_loader)

INFO:pytorch_lightning.utilities.rank_zero:Using bfloat16 Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:üí° Tip: For seamless cloud logging and experiment tracking, try installing [litlogger](https://pypi.org/project/litlogger/) to enable LitLogger, which logs metrics and artifacts automatically to the Lightning Experiments platform.
INFO:pytorch_lightning.utilities.rank_zero:üí° Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [73]:
classifier_model.eval()

for sample_index in 45, 16000:
    text = imdb['train'][sample_index]['text']
    print('Review:', text)
    inputs = emb_tokenizer(text, truncation=True, return_tensors='pt').to(device)
    with torch.no_grad():
        with torch.amp.autocast(device.type):
            reward = classifier_model(**inputs).logits[0].item()
    print('Reward:', reward)
    print('Label:', imdb['train'][sample_index]['label'])
    print()

Review: This movie sucked. It really was a waste of my life. The acting was atrocious, the plot completely implausible. Long, long story short, these people get "terrorized" by this pathetic "crazed killer", but completely fail to fight back in any manner. And this is after they take a raft on a camping trip, with no gear, and show up at a campsite that is already assembled and completely stocked with food and clothes and the daughters headphones. Additionally, after their boat goes missing, they panic that they're stuck in the woods, but then the daughters boyfriend just shows up and they apparently never consider that they could just hike out of the woods like he did to get to them. Like I said, this movie sucks. A complete joke. Don't let your girlfriend talk you into watching it.




Reward: -2.291015625
Label: 0

Review: Good: Engaging cinematic firefights, great presentation, vehicles are actually fun to drive, fairly appealing multiplayer, faithful to the movie, and the list goes on.<br /><br />Bad: Main missions are a bit short.<br /><br />This game defines what a "good" third person shooter(not necessarily a spy-game) is. Great firefights carry on the story and make you want to complete EVERY single mission through, and unlock all the genuine bonuses the game has to offer. The hype this game had, was lived up to, and I personally think you should buy it, and hook up with a couple of friends and play this one. Loads of fun. <br /><br />The sound in this game, is a rip-roaring achievement from a few previous bond games, and firing a weapon, really feels like you're firing a weapon. It ties in with the aspect that you are a deadly and ruthless spy.<br /><br />All in all, this game makes you excited and satisfied after you make it through, and some multiplayer tha

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
roc_auc_score(y_true, y_pred_logits)

np.float64(0.9732160832000001)

### Reward-guided generation (1 point)

If you did everything right, by now you should have a decent reward model. Before we use it for reinforcement learning, let's see if we can align model samples without any training.

To do so, you can use reward-guided inference: __generate N=16 samples, then select the one with the highest reward__ (according to your reward model).

For this problem, it's on you to demonstrate whether or not your code works. Find at least 5 neutral prompts such as 'This movie is' (...), generate samples, rank them based on reward and show which samples get the highest reward.

Note: it is faster to generate samples in parallel, rather than sequentially, as follows:




In [None]:
inputs = main_tokenizer(['It was'] * 16, return_tensors='pt').to(device)
with torch.no_grad():
    outputs = main_model.generate(**inputs, max_new_tokens=30, do_sample=True).cpu()
generated = [main_tokenizer.decode(candidate.tolist()) + '...' for candidate in outputs]
print('\n'.join(generated))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


It was so awful the movie seemed to be put on paper and had little merit to be entertaining.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>...
It was a fun watch all the time. Don't miss, you'll be making it again, and we'll always have your back.<br /><br...
It was all very well done and entertaining.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>...
It was actually quite amusing.<br /><br />The other problem with the film was that it was almost totally amateurish. If that wasn't enough,...
It was a real honor to be involved in the project, as the "first lady" in that film and I'll never forget the final speech he gave to...
It wa

In [None]:
inputs = reward_tokenizer(generated, truncation=True, padding=True, return_tensors='pt').to(device)
with torch.no_grad():
    outputs = reward_model(**inputs)
    logits = outputs.logits[:, 0].tolist()

In [None]:
import pandas as pd
import textwrap

In [None]:
data = pd.DataFrame({'logit': logits, 'text': generated}).sort_values('logit')
data

Unnamed: 0,logit,text
6,-5.625,It was an awful effort by the filmmakers. One ...
0,-5.253906,It was so awful the movie seemed to be put on ...
13,-4.808594,It was a really awful film to view. I guess it...
3,-3.591797,It was actually quite amusing.<br /><br />The ...
8,-3.447266,It was a really slow film and I had to watch a...
10,0.023529,It was so very difficult to write these songs ...
7,1.905273,It was great for the movie. Its too bad they d...
4,3.03125,It was a real honor to be involved in the proj...
12,3.857422,It was the first movie in which I actually saw...
9,4.152344,It was good with a nice twist. It wasn't the t...


In [None]:
print(textwrap.fill(data.text.iloc[3], 50))

It was actually quite amusing.<br /><br />The
other problem with the film was that it was almost
totally amateurish. If that wasn't enough,...


In [None]:
print(textwrap.fill(data.text.iloc[-1], 50))

It was an OK movie, but you can tell by the acting
of some (or all) who watched the film. The acting
and the dialogue were great and...


# Stage 2: fine-tune the main model with RL (2 points)


For this tutorial, we will optimize GPT2 to produce positive IMDB movie reviews using the reward model you trained above.

Unlike supervised fine-tuning, RL allows model to generate it's own sentences on each training step. Then, it calculates the reward of those specific sentences, and finally, updates the model to increase the probability of sentences with high reward.

Thus, each RLHF consists of three stages: __Rollout__, __Evaluation__ and __Update__

<div style='text-align: center'>
<img src='https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/gpt2_bert_training.png' width='600'>

The update stage depends on the specific RL algorithm. We'll be using Proximal Policy Optimization, or [PPO](https://arxiv.org/abs/1707.06347), similarly to what was used for InstructGPT.

Before we run those 3 stages, however, we need to create a dataset of 'queries' - partial reviews in our case.

In [None]:
import random

In [None]:
imdb_for_rlhf = imdb.filter(lambda row: len(row['text']) > 200, batched=False)
imdb_for_rlhf = imdb_for_rlhf.remove_columns(['label'])

rng = random.Random(42)

def format_prompt(sample):
    text = sample['text']
    words = text.split()[:rng.randint(2, 6)]
    sample['prompt'] = ' '.join(words)
    return sample

imdb_for_rlhf = imdb_for_rlhf.map(format_prompt, batched=False, remove_columns=imdb_for_rlhf.column_names)

Map:   0%|          | 0/24895 [00:00<?, ? examples/s]

In [None]:
'Whoever wrote...'  # That might have funny continuations!)

Next, let's prepare your reward model to predict rewards on whatever reviews were generated. Note that we use plaintext reviews because main model uses a different tokenizer from the reward model.

In [None]:
from typing import List
def compute_reward(texts: List[str]) -> torch.Tensor:
  inputs = reward_tokenizer(texts, truncation=True, padding=True, return_tensors='pt').to(device)
  with torch.no_grad():
    return reward_model(**inputs).logits[:, 0]

In [None]:
compute_reward([imdb[45]['text'], imdb[16000]['text']])  # test on human-written reviews

tensor([-5.9648,  6.3984], device='cuda:0')

Finally, we move to RL training. In this tutorial, we'll train LoRA adapters and not the full model.

In [None]:
from trl import GRPOConfig, GRPOTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer
import peft
import torch

Same as before, trl has a special type of trainer that minimize PPO-specific pseudo-loss. You can read more on this trainer [here](https://huggingface.co/docs/trl/main/en/ppo_trainer).

In [None]:
history = []

def compute_reward(prompts: list[str], completions: list[str], **kwargs):
    full_reviews = [prompt + completion for prompt, completion in zip(prompts, completions)]
    inputs = reward_tokenizer(
        full_reviews,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt'
    ).to(reward_model.device)

    with torch.no_grad():
        outputs = reward_model(**inputs)

    scores = outputs.logits[:, 0].tolist()

    global history
    history.append({'reviews': full_reviews, 'scores': scores})

    return scores

In [None]:
main_tokenizer = AutoTokenizer.from_pretrained('lvwerra/gpt2-imdb')
main_tokenizer.pad_token = main_tokenizer.eos_token
main_tokenizer.padding_side = 'left'

policy_model = AutoModelForCausalLM.from_pretrained('lvwerra/gpt2-imdb', device_map=device)

peft_config = peft.LoraConfig(
    task_type=peft.TaskType.CAUSAL_LM,
    r=32,
    lora_alpha=32,
    lora_dropout=0.0,
    inference_mode=False,
)

training_args = GRPOConfig(
    output_dir='./grpo_output',
    learning_rate=1.41e-5,
    per_device_train_batch_size=64,
    gradient_accumulation_steps=1,
    num_generations=8,
    max_completion_length=45,
    max_steps=400,
    logging_steps=10,
)

grpo_trainer = GRPOTrainer(
    model=policy_model,
    reward_funcs=compute_reward,
    args=training_args,
    train_dataset=imdb_for_rlhf,
    processing_class=main_tokenizer,
    peft_config=peft_config,
)

grpo_trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Step,Training Loss
10,0.0006
20,0.0005
30,0.015
40,0.0034
50,0.0074
60,0.0054
70,0.0074
80,-0.0025
90,0.0092
100,0.0154




TrainOutput(global_step=400, training_loss=0.006350756771862507, metrics={'train_runtime': 387.8387, 'train_samples_per_second': 66.007, 'train_steps_per_second': 1.031, 'total_flos': 0.0, 'train_loss': 0.006350756771862507})

In [None]:
pd.DataFrame(history[0])

Unnamed: 0,reviews,scores
0,This movie is excellent and is just interestin...,6.281250
1,This movie is excellent and should convey emot...,6.156250
2,"This movie is excellent and always has been, i...",6.234375
3,This movie is excellent and a great look at wh...,6.375000
4,This movie is excellent and at times hard to w...,6.214844
...,...,...
59,It's hard for me to criticize the film. It's s...,-1.138672
60,It's hard for me to criticize them for a bad p...,-0.179565
61,It's hard for me to criticize French imaginati...,4.718750
62,It's hard for me to criticize somebody for onl...,-2.839844


In [None]:
len(history[0]['completions'])

64

In [None]:
history[0]['completions'][4]

"' has to start with the crime scene, well, obviously right on the safe side. Die To Kill (My slowest Rebecca, written at a youthful production out of an ADFC!), stuck just as the other James Garner starlet...yes, we need a disco song for when you have to hear the superficial Georgia governor telling him to be ever faithful.<br /><br />The Quatch: But, would you believe that ....some of this is credible. It does not edge the"

In [None]:
history[0]['completions'][4]

"' has to start with the crime scene, well, obviously right on the safe side. Die To Kill (My slowest Rebecca, written at a youthful production out of an ADFC!), stuck just as the other James Garner starlet...yes, we need a disco song for when you have to hear the superficial Georgia governor telling him to be ever faithful.<br /><br />The Quatch: But, would you believe that ....some of this is credible. It does not edge the"

In [None]:
grpo_trainer.save_model('./adapters')

In [None]:
beginnings = [
    'Whoever wrote',
    'At first I though that the movie was good. But',
] + ['It was'] * 5

In [None]:
main_tokenizer = AutoTokenizer.from_pretrained('lvwerra/gpt2-imdb')
main_tokenizer.pad_token = main_tokenizer.eos_token
main_tokenizer.padding_side = 'left'

In [None]:
main_model.eval()

inputs = main_tokenizer(beginnings, return_tensors='pt', padding=True).to(device)
with torch.no_grad():
    outputs = main_model.generate(**inputs, max_new_tokens=100, do_sample=True).cpu()
generated = [main_tokenizer.decode(candidate.tolist(), skip_special_tokens=True) for candidate in outputs]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
print('\n'.join(generated))

Whoever wrote this movie was a great movie to watch because of its character and story, but unfortunately I did not like that one. It has quite many flaws - not a bad start, not a bad ending, but a very bad, very boring sequel with no character development. Perhaps if I am honest there are some scenes I do not like at all, but the film does not even make up any of the flaws. <br /><br />One word of advice: don't bother renting the DVD unless
At first I though that the movie was good. But then the movie started getting really bad too.<br /><br />I think that this is a typical movie. The acting and the writing is very high level. After seeing it I don't know about the movie but it seems to be from another time period. The movie starts out good so the film doesn't feel really bad. But it becomes very bad very fast. The acting wasn't good. It just seemed like the people who make this movie (and who the production company) made lots
It was also my first experience with the horror genre. Hav

In [None]:
print('\n'.join(generated))

Whoever wrote this, I have nothing more to add....
At first I though that the movie was good. But I felt like it was the worst piece of acting in a movie, except for an old-timer like me that didn't think his characters were funny. "Pitch Black" was just another waste of my time.<br /><br />A great cast including Kevin Spacey, Michael Sheen, Michael Kiley, John Malkovich, the latter of whom looks like Jadzia Dovargue with the bright pink tint on his cheeks and a cute voice (I don't remember anything...
It was funny that this film did get canceled after "Lifetime" after being advertised for 1,000 million. If you're into the good old days, see "The Good, The Bad."...
It was the most memorable moment of the film. It was as if I'd run to the bathroom and hit the bathroom wall. This was the first movie I had ever witnessed since the first movie they filmed in 1955 "American Beauty". The other actors looked like they had taken their day off watching "American Beauty". When they came back to 

In [None]:
print('\n'.join(generated))

Whoever wrote about it?...
At first I though that the movie was good. But the quality of the acting was poor. The movie was slow moving, there was some kind of emotional and physical feeling with little thought given to any subject. The plot could have been much more thought provoking. In the end most of the work would have been directed by the actors themselves and perhaps at least the director could have used their talents well. But that's not true. No other movie I have seen has an emotional response because of any topic, and that is why I gave this flick 7 out...
It was not a "serious" documentary and didn't even qualify that category as a propaganda film. In fact, we found the movie really "resembling" the facts about war (including the events of the Vietnam War and the "war on terror"). Even though it didn't do very well in its first three weeks, it has a lot of action scene segments, and a great deal of plot twists that were made good by showing it off, so a decent film would ne

In [None]:
print('\n'.join(generated))

Whoever wrote that one would have never seen the entire movie with their own eyes or ears, the whole movie is only a joke.<br /><br />Well, the only reason i have it on video is out of curiosity. <br /><br />I'm just not sure if anyone else would've seen the entire movie. <br /><br />So, it's not clear where exactly to begin if you have an extra dollar to make. <br /><br />I didn't expect...
At first I though that the movie was good. But the plot was good too.<br /><br />The whole show is made of it's own nonsense. Each character has one or more motivations which are explained by another character. It's supposed to make me want to kill it all... but I didn't think that at first.<br /><br />The characters have to talk or listen through speeches to get anything out of it. That's all wrong, that's how it ends.<br /><br />And if it's about money...
It was amazing to think that any actor got their own scene so they could direct and make the movie. I didn't understand that because everyone w

In [None]:
inputs = reward_tokenizer(generated, truncation=True, padding=True, return_tensors='pt').to(device)
with torch.no_grad():
    outputs = reward_model(**inputs)
    logits = outputs.logits[:, 0].tolist()

In [None]:
data_4 = pd.DataFrame({'logit': logits, 'text': generated}).sort_values('logit')
data_4

Unnamed: 0,logit,text
3,-5.890625,It was an excellent movie. It seemed like a go...
1,-5.582031,At first I though that the movie was good. But...
6,-2.425781,It was funny because I got off to a lot of goo...
5,-1.801758,"It was made in 1947; in 1973, it was used by t..."
2,0.20813,It was also my first experience with the horro...
4,5.964844,It was absolutely incredible. I watched the fi...
0,6.03125,Whoever wrote this movie was a great movie to ...


# [Optional] high-effort bonus assignment: RL fine-tuning in the wild


Use the RLHF pipeline to train a model for a reward of your choice. Here's what you can choose from:

__A. Toxicity fine-tuning:__ train the model to be less (or more!) toxic. For this task, you may use the data from [jigsaw toxic comments](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge) and [lmsys/toxic-chat](https://huggingface.co/datasets/lmsys/toxic-chat),  or any other source. Alternatively, you may use toxicity scores from [oasst1](https://huggingface.co/datasets/OpenAssistant/oasst1).


__B. Actual human feedback:__ use one of the existing datasets with pairwise human feedback to align your langauge model. You may use [anthropic's hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf), [OpenAssistant dataset](https://huggingface.co/datasets/OpenAssistant/oasst1) or any other data you see fit. You may also turn the tables and train the model to [minimize](https://habrastorage.org/getpro/geektimes/post_images/ac7/2ad/827/ac72ad82767d4132164a4b6b76196c42.jpg) human preferences, as long as your model does not degrade to gibberish.

__C. Controlled generation:__ Instead of training a reward model from human feedback, you may define the reward function as the text length (longer or shorter) or number of times the model uses specific words (e.g. 'sorry', 'apologize'). If you choose specific words, make sure the model generates them at least sometimes.

__Alternatively,__ you may choose a different task. However, unless your task is very similar to one of the above, there is a chance that it will be **significantly** harder to solve, requiring orders of magnitude more compute and tuning. If you are in doubt, please ask the course staff. If they are AFK (again >.<), please prefer one of the recommended tasks.


#### General tips & tricks


Things to look out for:
- during PPO stage, the reward model should be in eval mode (dropout disabled)
- make sure max_length and max_new_tokens are enough for your chosen dataset - at least most of the time
- when in doubt, view the data manually or inspect how the model performs on a few samples


We highly recommend that you manually check the performance after each sub-stage:
1. when you assembled the pairwise dataset, inspect a couple of from of *your* dataset class and detokenize them. Make sure that you-the-human understand why one sample was accepted and the other - rejected. At least most of the time. This also lets you spot tokenization/truncation errors.
2. after you trained a reward model, measure how accurate this model is in isolation. If your reward model is poor, any subsequent RLHF will also fail.
3. once you've trained the main model with RL, ask it to generate examples and explore how well it does. If it produces an obviously bad output, check if the reward model assigns high reward to that output. If yes, reward model is the culprit; if no, it's a question of better/longer PPO training.

__It is also a good idea to periodically print samples during training.__

__When stuck, simplify the problem.__ If you've spent a several hours enchanting the reward model but it still won't budge, try switching to a simple subtask. For instance, if you're training on hh-rlhf, try limiting it the dataset to 10% of the shortest sequences - they are typically easier to learn.


## Bonus Assignment Stages

Regardless of the specific task you chose, your solution needs to contain several parts that will be graded separately (for bonus points).


#### Stage 1: reward model

Construct a dataset for training the reward model on your problem. Then, train a reward model on that dataset and evaluate how well can your model predict preferences on a hold-out (test) subset of your data.

Please make sure that the part of your notebook where you evaluate reward model is clearly visible and reasonably easy to read. And for all that is holy, do not call it IMDB unless it actually **is** data of imdb movie reviews :)

__Not all tasks require a reward model for later PPO fine-tuning.__ For instance, there's no reason to train a reward model if your reward equals sentence length. Likewise, toxicity reward can be estimated with a pre-trained toxicity classifier. __If your task does not require training a reward model, please train an unrelated model on [hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf) as though you were solving assignment version B.__ This is for grading purposes only, you won't use this model for stage 2.


#### Stage 2: RL fine-tuning

Once the reward model is ready - or you can compute rewards without a model - it is time to maximize that reward with PPO. Optionally, you may replace PPO with another RL algorithm (or unlikelihood learning scheme), but only if you're feeling adventurous.


First, you need to choose a language model to be fine-tuned. You may choose any model, but make sure that your model **can** generate the data in your format. For instance, [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) is a general purpose LM and may (or may not) need prompt engineering to generate chat assistant responses. For that reason, it is best if you **do not use `'lvwerra/gpt2-imdb'` unless you're generating only movie reviews**.



There are two 'difficulty modes' for this task:
For the **easy mode**, use [gpt2-large](https://huggingface.co/gpt2-large) or [opt-1.3b](https://huggingface.co/facebook/opt-1.3b) with minimal code changes.
If you want the **Hard mode:** use a larger (e.g. 7B) model in combination with `load_in_4bit` and LoRA, the same way we did last week.
Some reasonable model choices are [LLaMA-7B](https://huggingface.co/Enoch/llama-7b-hf), [Falcon-7b](https://huggingface.co/tiiuae/falcon-7b), [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) for general-purpose LM or [guanaco-7b](https://huggingface.co/timdettmers/guanaco-7b), [vicuna-7b](https://huggingface.co/lmsys/vicuna-7b-v1.5) for chat-based tasks, though there are many more (see [leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)). In the hard mode, you will need to modify the training arguments to enable 4-bit fine-tuning. Furthermore, your experiments will take somewhat longer to complete. On the plus side, your model will produce significantly better results.

__High reward is not enough!__ RL algorithms are famous for [cheating their reward functions](https://openai.com/research/faulty-reward-functions). To ensure that your model is actually doing what you want it to do, you will need some additional evaluation. To get the full grade, provide at least 20 side-by-side examples of your fine-tuned model vs original model predictions and a short summary.

Alternatively, you may provide 5 examples and some extrinsic evaluation metric over many examples. For instance, you may use a different pre-trained toxicity score for option A. When dealing with human preferences, you may choose to [enlist actual humans](https://toloka.ai/) or [ask GPT/Claude](https://arxiv.org/pdf/2304.03277.pdf) to compare your model's predictions. For task C, when optimizing for simple rewards like sentence lengths, it is enough to compare histograms of rewards (e.g. average lengths).










