In [1]:
import torch

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer
from transformers import AutoTokenizer
from tqdm import tqdm
from torch import Tensor
from datasets import Dataset

from train import evaluate_out, BASE_EXO, CodeGen7

NUM_ITERATIONS: int = 10
# cg25_7 = CodeGen7()
MODEL_NAME = "meta-llama/Meta-Llama-3-8B" # "Salesforce/codegen25-7b-mono" # 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
config = PPOConfig(
    model_name=MODEL_NAME,
    learning_rate=1.41e-5,
    batch_size=1,
    ppo_epochs=NUM_ITERATIONS,
    mini_batch_size=1,
)

model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.18it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
dataset = Dataset.from_dict({ "query": [BASE_EXO] })
def tokenize(sample):
    sample["input_ids"] = tokenizer.encode(sample["query"])
    return sample

dataset = dataset.map(tokenize)

Map: 100%|██████████| 1/1 [00:00<00:00, 68.87 examples/s]


In [4]:
ppo_trainer = PPOTrainer(
    model=model,
    config=config,
    # optimizer=torch.optim.SGD(model.parameters(), lr=config.learning_rate),
    dataset=dataset,
    tokenizer=tokenizer,
)

In [5]:
def get_rewards(responses: list[str]) -> list[Tensor]:
    rews = [ ]
    measured = { }
    for resp in responses:
        if resp in measured.keys():
            rews.append(measured[resp])
            continue
        valid, cost = evaluate_out(resp)
        rew = torch.tensor(-cost if valid else -1e10)
        rews.append(rew)
        measured[resp] = rew
    return rews

In [6]:
for a in ppo_trainer.dataloader:
    print(a)
    break

{'query': ['from __future__ import annotations\nfrom exo import proc, DRAM\nfrom exo.stdlib.scheduling import *\n\n# Algorithm definition\n@proc\ndef generated_operation(\n    In: i8[16, 16] @ DRAM,\n    Weights: i8[16, 16] @ DRAM,\n    Out: i8[16, 16] @ DRAM,\n):\n    for i in seq(0, 16):\n        for j in seq(0, 16):\n            for k in seq(0, 16):\n                Out[i, j] += In[i, k] * Weights[k, j]\n\n# Now we create a schedule to improve performance!\n## first switch to "output stationary"\ngemmini = rename(generated_operation, "generated_operation_scheduled")\ngemmini = reorder_loops(gemmini, "j k")\ngemmini = reorder_loops(gemmini, "i k")\n\n## then '], 'input_ids': [tensor([1527], device='cuda:0'), tensor([1328], device='cuda:0'), tensor([21733], device='cuda:0'), tensor([565], device='cuda:0'), tensor([1179], device='cuda:0'), tensor([33307], device='cuda:0'), tensor([198], device='cuda:0'), tensor([1527], device='cuda:0'), tensor([506], device='cuda:0'), tensor([78], devi

In [7]:
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": False,
    "max_new_tokens" : 100,
    "pad_token_id": tokenizer.eos_token_id,
    "num_beams": 4,
    "num_beam_groups": 2,
    "diversity_penalty": 0.5
}

In [8]:
it = iter(ppo_trainer.dataloader)
batch = next(it)
query_tensors = batch["input_ids"]
query_tensors_formatted = [torch.cat(query_tensors)]

In [9]:
response_tensors = ppo_trainer.generate(query_tensors, batch_size=1, **generation_kwargs)
batch["response"] = [tokenizer.decode(r.squeeze(), skip_special_tokens=True) for r in response_tensors]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [10]:
texts: list[str] = [batch['query'][0] + r for r in batch["response"]]
rewards: list[Tensor] = get_rewards(texts)

In [11]:
idx = rewards.index(max(rewards))
response_tensors_formatted = [response_tensors[idx]]
batch["selected_idx"] = idx
batch["selected_response"] = batch["response"][idx]

In [12]:
print(batch["selected_response"])

future> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral>. <http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral> <http://www.w3.org/1999/02/22-rdf-syntax-ns#datatype> <http://www.w3.org/


In [13]:
(query_tensors_formatted, response_tensors_formatted, [max(rewards)])[1]

[tensor([21733,    29,   366,  1277,  1129,  2185,  1444,    18,  2726,    14,
          2550,    24,    14,  2437,    14,  1313,  3880,  3013, 83879,    12,
          4511,     2,  1337,    29,   366,  1277,  1129,  2185,  1444,    18,
          2726,    14,  2550,    24,    14,  2437,    14,  1313,  3880,  3013,
         83879,    12,  4511,     2, 10833, 17802,    29,   662,   366,  1277,
          1129,  2185,  1444,    18,  2726,    14,  2550,    24,    14,  2437,
            14,  1313,  3880,  3013, 83879,    12,  4511,     2, 10833, 17802,
            29,   366,  1277,  1129,  2185,  1444,    18,  2726,    14,  2550,
            24,    14,  2437,    14,  1313,  3880,  3013, 83879,    12,  4511,
             2, 63509,    29,   366,  1277,  1129,  2185,  1444,    18,  2726,
            14], device='cuda:0')]

In [14]:
stats = ppo_trainer.step(query_tensors_formatted, response_tensors_formatted, [max(rewards)])
ppo_trainer.log_stats(stats, batch, rewards)

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 79.11 GiB of which 164.50 MiB is free. Process 64090 has 78.94 GiB memory in use. Of the allocated memory 77.96 GiB is allocated by PyTorch, and 342.23 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:

epochs = NUM_ITERATIONS
for epoch in tqdm(range(epochs), "epoch: "):
    for batch in tqdm(ppo_trainer.dataloader): 
        query_tensors = batch["input_ids"]
        query_tensors_formatted = [torch.stack(query_tensors)]

        #### Get response from SFTModel
        response_tensors = ppo_trainer.generate(query_tensors, batch_size=1, **generation_kwargs)
        batch["response"] = [tokenizer.decode(r.squeeze(), skip_special_tokens=True) for r in response_tensors]

        #### Compute reward score
        texts: list[str] = [batch['query'][0] + r for r in batch["response"]]
        rewards: list[Tensor] = get_rewards(texts)

        idx = rewards.index(max(rewards))
        response_tensors_formatted = [response_tensors[idx]]
        batch["selected_idx"] = idx
        batch["selected_response"] = batch["response"][idx]

        #### Run PPO step
        stats = ppo_trainer.step(query_tensors_formatted, response_tensors_formatted, [max(rewards)])
        ppo_trainer.log_stats(stats, batch, rewards)

In [None]:
#### Save model
ppo_trainer.save_pretrained("my_ppo_model")