In [1]:
import torch

from tqdm import tqdm
from torch.utils.data import DataLoader
from typing import Tuple, Literal
from datasets import load_dataset
from peft import LoraConfig, PeftModel, PeftConfig
from transformers import (
    AutoModelForCausalLM,
    AutoConfig,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import (
    PPOTrainer,
    PPOConfig,
    AutoModelForCausalLMWithValueHead
)

from reward import get_reward

SFT_ADAPTER_DIRECTORY = "./open_llama_3b_v2_sft/"

2023-12-14 02:52:12.632176: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-14 02:52:12.683426: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Model Prep

In [2]:
# Model from Hugging Face hub
base_model = "openlm-research/open_llama_3b_v2"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
# Create model and quantization configs
config = AutoConfig.from_pretrained(base_model, trust_remote_code=True)
config.init_device = 'cuda:0' # For fast initialization directly on GPU!

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    # torch_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
)

In [5]:
# Load SFT model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    trust_remote_code=True,
)
model.config.use_cache = False

model_with_value_head = AutoModelForCausalLMWithValueHead(model)

peft_model = PeftModel.from_pretrained(model_with_value_head, SFT_ADAPTER_DIRECTORY)
peft_model = peft_model.merge_and_unload(safe_merge=True)



## Dataset Prep

In [6]:
dataset_name = "samlhuillier/sql-create-context-spider-intersect"

In [7]:
response_template = "\n-- Answer:\n"

In [8]:
def format_prompt(example) -> dict:
    # Combine 'context' and 'question' into a single string called 'query'
    query = f"{example['context']} \n-- Question: {example['question']}{response_template}"
    # Return a dictionary with 'query' and 'answer'
    return {"query": query}

In [9]:
def tokenize_function(examples):
    # 450 is a little over 400, which is the max length query (147) + max length answer (253)
    return tokenizer(examples['query'], padding="max_length", truncation=True, max_length=150) # the longest query is 147 tokens long

In [10]:
dataset = load_dataset(dataset_name)

In [11]:
train_dataset = dataset['train'].map(format_prompt)
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3961 [00:00<?, ? examples/s]

In [12]:
valid_dataset = dataset['validation'].map(format_prompt)
tokenized_valid_dataset = valid_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/568 [00:00<?, ? examples/s]

In [13]:
tokenized_train_dataset

Dataset({
    features: ['answer', 'question', 'context', 'db_id', 'query', 'input_ids', 'attention_mask'],
    num_rows: 3961
})

## Trainer Prep

In [14]:
batch_size = 2
train_dataloader = DataLoader(tokenized_train_dataset, batch_size=batch_size, shuffle=True)

trainer_config = PPOConfig(
    model_name=base_model,
    # query_dataset=tokenized_train_dataset,
    log_with="tensorboard",  # Specify that you are using TensorBoard for logging
    project_kwargs={"logging_dir": "./ppo_results"},
    
    ppo_epochs=1,
    learning_rate=2e-4,
    gradient_accumulation_steps=1,
    batch_size=batch_size,
    # mini_batch_size=script_args.mini_batch_size, # Default: 1
    # steps=script_args.steps, # default: 20_000
    
    # optimize_cuda_cache=True,
    is_peft_model=True
    # seed=script_args.seed, # Default: 0
)

In [15]:
trainer = PPOTrainer(
    model=model_with_value_head,
    dataset=tokenized_train_dataset,
    tokenizer=tokenizer,
    config=trainer_config,
    optimizer="paged_adamw_32bit"
)

## Run Training

In [16]:
def get_num_tokens(text):
    return len(tokenizer(text)["input_ids"])

max(map(get_num_tokens, tokenized_train_dataset["answer"])), max(map(get_num_tokens, tokenized_train_dataset["query"]))

(253, 147)

In [17]:
def fix_tokens_for_step(toks):
    return list(map(torch.squeeze, torch.split(torch.vstack(toks).T, split_size_or_sections=1)))

In [27]:
generation_kwargs = {
    "max_new_tokens": 1,  # Limit the maximum length to ensure faster generation
    "top_k": 0.,  # Don't use Top-K sampling for performance (and reduce randomness) 
    "top_p": 1.0,  # Don't use nucleus sampling with a reasonable threshold
    "do_sample": True,
    "pad_token_id": tokenizer.pad_token_id,
}

model.is_peft_model = True
for epoch, batch in tqdm(enumerate(train_dataloader)):
    query_tensors = batch["input_ids"]
    # print(query_tensors)  # Debug print

    ## Generate response (action)
    print(f"Generating response for batch {epoch}")  # Debug 3   
    response_tensors = trainer.generate(query_tensors, **generation_kwargs)
    print(f"Response generated for batch {epoch}")  # Debug 4

    resps = tokenizer.batch_decode(map(lambda resp: resp.squeeze(), response_tensors))
    batch["response"] = resps
    tokenizer.padding_side = "right"
    response_tensors = tokenizer(resps, padding="max_length", max_length=256)
    tokenizer.padding_side = "left"
    print(resps)

    ## Compute reward (see paper)
    rewards = [ 
        torch.tensor(get_reward(db, response, ans)) 
        for response, db, ans in zip(
            batch["response"], batch["db_id"], batch["answer"]
        )
    ]
    print(f"achieved rewards:\n{rewards}")
    print(f"{len(query_tensors)}")
#     # rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

    qt_for_logs = fix_tokens_for_step(query_tensors)
    print("Query Lens:", list(map(len, qt_for_logs)))
    print(list(map(
        len, response_tensors
    )))
    rt_for_logs = fix_tokens_for_step(response_tensors)
    print("Resp Lens:", list(map(len, rt_for_logs)))
    ## Perform PPO step
    stats = trainer.step(qt_for_logs, rt_for_logs, rewards)
    trainer.log_stats(stats, batch, rewards)
    break


0it [00:00, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Generating response for batch 0


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Response generated for batch 0
['</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '</s>', '<s>id>', 'CRE VMC', 'ATEARCH,', 'TABLEARPT', 'Sports)\n', 'info 3', '(\nD', 's--Sc', 'port Question?', 




TypeError: vstack(): argument 'tensors' (position 1) must be tuple of Tensors, not BatchEncoding

In [None]:
## Save model
trainer.save_model("open_llama_3b_v2_sft_plus_ppo")

In [None]:
from tensorboard import notebook
logging_dir = "ppo_results/runs"
notebook.start("--logdir {} --port 4000".format(logging_dir))