In [1]:
import torch
import pickle

from typing import Tuple
from tqdm.notebook import tqdm
from peft import AutoPeftModelForCausalLM
from transformers import (
    BitsAndBytesConfig,
    pipeline,
    AutoTokenizer
)
from datasets import load_dataset


from reward import get_reward

# Local adapter checkpoint directories, meant to be passed to load_model() as
# `adapter_dir`. The DPO directory is the one evaluated further down; the SFT
# directory is presumably the supervised-fine-tuning baseline (TODO confirm).
SFT_ADAPTER_DIRECTORY = "./open_llama_3b_v2_sft/"
DPO_ADAPTER_DIRECTORY = "./open_llama_3b_v2_sft_plus_dpo/"

  from pandas.core import (


In [2]:
# Hub id of the evaluation dataset and the marker that separates the prompt
# from the expected SQL answer.
dataset_name = "samlhuillier/sql-create-context-spider-intersect"
response_template = "\n-- Answer:\n"


def format_prompt(example) -> Tuple[str, str]:
    """Render one dataset record as a (prompt, reference answer) pair.

    The prompt is the record's ``context``, followed by a ``-- Question:``
    line holding its ``question``, and ends with ``response_template``.
    The second tuple element is the record's ``answer`` field, unchanged.
    """
    schema_text = example['context']
    question_text = example['question']
    prompt = "{} \n-- Question: {}{}".format(schema_text, question_text, response_template)
    reference = example['answer']
    return prompt, reference

In [3]:
# Load the validation split and add a "query" column holding the rendered
# prompt (first element of format_prompt's return value) for every record.
dataset = load_dataset(dataset_name, split="validation").map(
    lambda record: {"query": format_prompt(record)[0]}
)

In [4]:
# Display the first record to sanity-check the added "query" column.
dataset[0]

{'answer': 'SELECT count(*) FROM singer',
 'context': 'CREATE TABLE singer (Id VARCHAR)',
 'question': 'How many singers do we have?',
 'db_id': 'concert_singer',
 'query': 'CREATE TABLE singer (Id VARCHAR) \n-- Question: How many singers do we have?\n-- Answer:\n'}

In [5]:
# Hugging Face Hub id of the base model. In the cells shown here it is only
# used to load the tokenizer; load_model() reads weights from an adapter directory.
base_model = "openlm-research/open_llama_3b_v2"

In [6]:
# Load the LLaMA tokenizer that matches the base model.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the left -- presumably so that, for a decoder-only model, generated
# tokens directly follow the prompt in batched generation (TODO confirm intent).
tokenizer.padding_side = "left"

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
def load_model(adapter_dir):
    """Load a saved PEFT adapter in 4-bit and return it merged, in eval mode.

    Args:
        adapter_dir: Path to a saved PEFT adapter checkpoint, e.g.
            ``SFT_ADAPTER_DIRECTORY`` or ``DPO_ADAPTER_DIRECTORY``.

    Returns:
        The model produced by ``merge_and_unload()`` (adapter weights folded
        into the base model), switched to evaluation mode.
    """
    # 4-bit NF4 weights with fp16 compute; double quantization is left off.
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False,
    )

    peft_model = AutoPeftModelForCausalLM.from_pretrained(
        adapter_dir,
        quantization_config=quant_config,
        trust_remote_code=True,
        # This loader is only used for evaluation, so load the adapter frozen
        # (inference mode) rather than trainable.
        is_trainable=False,
    )

    # Keep the KV cache enabled: turning it off is a training-time setting and
    # makes autoregressive generation recompute past tokens at every step.
    peft_model.config.use_cache = True
    peft_model.config.pretraining_tp = 1

    return peft_model.merge_and_unload().eval()

In [8]:
def log(txt, quiet=True):
    """Append ``txt`` as one line to ``eval_out.txt`` and optionally print it.

    Args:
        txt: The message to record.
        quiet: When False, the message is also printed to stdout.

    The file is opened in append mode so that successive calls accumulate.
    (Opening with "w+" truncated the file on every call, leaving only the
    last message.) Delete ``eval_out.txt`` before a run to start a fresh log.
    """
    with open("eval_out.txt", "a", encoding="utf-8") as f:
        # Terminate each message so consecutive entries do not run together,
        # mirroring the newline that print() adds below.
        f.write(txt + "\n")
    if not quiet:
        print(txt)

In [9]:
def get_eval_rewards(model, quiet=True, sample_size=-1):
    """Generate an answer for each evaluation prompt and score it with get_reward.

    Args:
        model: Causal LM handed to the text-generation pipeline (together with
            the notebook-level ``tokenizer``).
        quiet: Forwarded to ``log``; when False each log message is also printed.
        sample_size: Number of rows to draw at random (via ``torch.randperm``)
            from the notebook-level ``dataset``. Any negative value evaluates
            the whole dataset.

    Returns:
        A list with one entry per evaluated row: the value returned by
        ``get_reward(db_id, model_submission, reference_answer)``.
    """
    generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=120)

    if sample_size >= 0:
        # Random subset without replacement; .tolist() hands `select` plain
        # Python ints instead of 0-d tensors.
        inds = torch.randperm(len(dataset))[:sample_size].tolist()
        ds = dataset.select(inds)
    else:
        ds = dataset

    rewards = []
    for row in tqdm(ds):
        out = generator(row["query"])
        response = out[0]["generated_text"]

        # The pipeline output contains the prompt; remove it and keep only the
        # first line of what remains as the model's submission.
        response_lines = response.replace(row["query"], "").split("\n")
        model_submission = response_lines[0]

        # Score the submission against the reference answer. (This used to be
        # stubbed out with a constant 1.1, which made every result meaningless.)
        rew = get_reward(row["db_id"], model_submission, row["answer"])
        rewards.append(rew)

        log(row["query"], quiet=quiet)
        log(model_submission, quiet=quiet)
        log("-------------------------------------------------", quiet=quiet)
        log(f"-- Got reward of {rew} against solution:", quiet=quiet)
        log(row['answer'], quiet=quiet)
        log("-------------------------------------------------", quiet=quiet)

    return rewards

In [None]:
dpo_model = load_model(DPO_ADAPTER_DIRECTORY)

# Seed right before sampling so torch.randperm inside get_eval_rewards draws
# the same 50 validation rows on every run, keeping repeated evaluations comparable.
torch.manual_seed(0)
rewards = get_eval_rewards(dpo_model, quiet=True, sample_size=50)

# Persist the per-example rewards for later analysis.
with open("dpo_eval_returns.pkl", "wb") as f:
    pickle.dump(rewards, f)



  0%|          | 0/50 [00:00<?, ?it/s]



In [None]:
# `rewards` is a plain Python list, which torch.mean() rejects (it needs a
# Tensor). Build a float tensor first so the mean is defined even if the
# individual rewards are integers.
torch.tensor(rewards, dtype=torch.float32).mean()