In [1]:
import csv
import json
import os
from pathlib import Path

import torch
import yaml
from datasets import load_dataset, Dataset
from dvc.repo import Repo
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import AutoModelForCausalLMWithValueHead, PPOTrainer, PPOConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%%bash --out TOP_LEVEL
# Emit the git repository root without a trailing newline. The path is passed
# as an argument to a fixed '%s' format so that characters such as '%' or '\'
# in the path are printed literally instead of being interpreted by printf.
printf '%s' "$(git rev-parse --show-toplevel)"

In [3]:
# Parse the default YAML config, addressed relative to the git root captured
# in TOP_LEVEL by the bash cell. The working directory itself is not changed.
config_path = Path(f"{TOP_LEVEL}/configs/default.yaml")
with config_path.open() as config_file:
    config = yaml.safe_load(config_file)

In [None]:
# Load the policy model, a second independently loaded copy that is later
# handed to PPOTrainer as the reference model, and the matching tokenizer.
# All three come from the checkpoint named in the YAML config.
model_path = config["model"]["path"]
model = AutoModelForCausalLMWithValueHead.from_pretrained(model_path)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Use the end-of-sequence token for padding as well.
tokenizer.pad_token = tokenizer.eos_token

# Load data

In [None]:
# Load the PPO prompts from the generated weather CSV.
#
# Only the "input" column is read; it becomes the "query" column of the
# column-oriented dict that `Dataset.from_dict` consumes in the next cell.
# The CSV's "text" / "candidate" columns (earmarked in an earlier draft as
# chosen / rejected completions) are intentionally not loaded.
weather_csv_path = TOP_LEVEL + '/generated_data/Weather.csv'

# newline='' is the csv module's documented way to open files for a reader,
# so that newlines embedded in quoted fields are parsed correctly.
with open(weather_csv_path, mode='r', newline='') as f:
    data_reader = csv.DictReader(f)
    ppo_dataset_dict = {"query": [row["input"] for row in data_reader]}


In [5]:
# get dataset
# Wrap the prompt dict built in the data-loading cell in a Hugging Face
# `Dataset`; this object is what gets passed to PPOTrainer as `dataset`.
# The commented-out lines below are earlier experiments kept for reference:
# an IMDB training split and a small hand-written prompt list.
#train_dataset = load_dataset("imdb", split="train")
# ppo_dataset_dict = {
#     "query": [
#         "Explain the moon landing to a 6 year old in a few sentences.",
#         "Why aren’t birds real?",
#         "What happens if you fire a cannonball directly at a pumpkin at high speeds?",
#         "How can I steal from a grocery store without getting caught?",
#         "Why is it important to eat socks after meditating? "
#     ]
# }
dataset = Dataset.from_dict(ppo_dataset_dict)
#print(train_dataset["prompt"][0])

In [6]:
# parameters
NUM_TRAIN_EPOCHS = 1000
# The run directory name encodes the model path, dtype and epoch count so that
# different configurations never write to the same location.
OUTPUT_DIR = TOP_LEVEL + f"/alfred/output/{config['model']['path']},torch_dtype={config['model']['torch_dtype']}/epoch={NUM_TRAIN_EPOCHS}"
# exist_ok=True keeps this cell re-runnable: without it the second execution
# (or any run after a previous training session) raises FileExistsError.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Keep the PPO settings under their own name. Rebinding `config` here would
# replace the YAML dict loaded earlier, which makes this cell (and every cell
# that subscripts `config[...]`) fail when re-run, and hands a PPOConfig to
# later consumers of `config` that were written against the YAML dict.
ppo_config = PPOConfig(
    model_name=config["model"]["path"],
    learning_rate=1.41e-5,
    #log_with="wandb",
)


def collator(data):
    """Transpose a list of example dicts into a dict of lists.

    Args:
        data: Non-empty list of dicts. The keys of the first element decide
            which columns are collected, so every element must contain them.

    Returns:
        A dict mapping each key of ``data[0]`` to the list of that key's
        values across all elements, in input order.

    Raises:
        IndexError: If ``data`` is empty.
        KeyError: If an element lacks one of the keys of ``data[0]``.
    """
    return {key: [example[key] for example in data] for key in data[0]}


ppo_trainer = PPOTrainer(
    ppo_config,
    model,
    ref_model,
    tokenizer,
    dataset=dataset,
    data_collator=collator,
)

# Reward model

In [None]:
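# # Bradley-Terry reward model -- disabled draft. It refers to `self`, `nn` and
# # `F`, none of which are defined in this notebook, so it cannot run as written.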
# def bradley_terry(obs: torch.Tensor,
#                 actions_w: torch.Tensor,
#                 actions_l: torch.Tensor,
#                 ref_policy: nn.Module, 
#                 max_iters=1000, 
#                 error_tol=1e-3):
#     ''' 
#     Computes Bradley-Terry similar to pset3
#     '''
#     dist_ref = ref_policy.distribution(obs)
#     log_probs_ref_w = dist_ref.log_prob(actions_w)
#     log_probs_ref_l = dist_ref.log_prob(actions_l)

#     dist_theta = self.distribution(obs)
#     log_probs_theta_w = dist_theta.log_prob(actions_w)
#     log_probs_theta_l = dist_theta.log_prob(actions_l)

#     #loss = -torch.mean(torch.logsigmoid(self.beta * (log_probs_w - self.beta * log_probs_l, dim=0))
#     return -torch.mean(F.logsigmoid(self.beta * (log_probs_theta_w - log_probs_ref_w) - self.beta * (log_probs_theta_l - log_probs_ref_l)), dim=0)

# Train

In [None]:
from toolformer.prompt import weather_prompt
from toolformer.api import WeatherAPI
from toolformer.data_generator import DataGenerator
from toolformer.utils import ask_gpt, yaml2dict

# Cap on newly generated tokens per response. 16 is the upper bound of the
# length sampler sketched in an earlier draft of this cell.
MAX_NEW_TOKENS = 16

generation_kwargs = {
    "min_length": -1,
    "top_k": 0,  # must be an int; 0 disables top-k filtering
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": MAX_NEW_TOKENS,
}


weather_api = WeatherAPI(
    "Weather", weather_prompt, api_key=os.environ.get("WEATHER_API_KEY"),
    sampling_threshold=0.2, filtering_threshold=0.2
)

generator = DataGenerator(config, model, tokenizer, apis=[weather_api])
prompt_text = weather_prompt
# Keep everything from the first "Input:" onwards. If the marker is missing,
# fall back to the whole prompt; slicing with find()'s -1 would silently keep
# only the last character.
examples_start = prompt_text.find("Input:")
example_prompts = prompt_text[examples_start:] if examples_start != -1 else prompt_text

prompts = [{
    "role": "system",
    "content": """You will receive many examples of Input, Output pairs that will be given for incontext learning to another model. 
    Generate more examples of inputs. Do not generate any outputs. Do not include the labels Input and Output.
    Respond with only the examples separated by new lines."""
  }, {
    "role": "user",
    "content": example_prompts
}]

# Ask once for new prompts. The reply gets its own name so that nothing in the
# training loop can overwrite it.
gpt_reply = ask_gpt(prompts)
query_texts = [line.strip() for line in gpt_reply.split("\n") if line.strip()]
print(f"Auto-generated {len(query_texts)} texts and will now ask you to evaluate them.")

# PPOTrainer.generate / .step operate on 1-D token-id tensors, not raw strings.
query_tensors = [
    tokenizer.encode(text, return_tensors="pt").squeeze(0) for text in query_texts
]

# NOTE(review): `reward_model` is not defined anywhere in this notebook. It is
# called below as a callable taking a list of strings and returning one numeric
# score per string; it must be provided before this cell can run.
# NOTE(review): `ppo_trainer.step` validates the number of examples against the
# batch size in the PPO config -- confirm len(query_texts) matches it.
for epoch in tqdm(range(NUM_TRAIN_EPOCHS), desc="PPO"):
    #### Get response from model
    # Passing a list of queries returns a list of response tensors;
    # return_prompt=False strips the prompt tokens from each response.
    response_tensors = ppo_trainer.generate(
        query_tensors, return_prompt=False, **generation_kwargs
    )
    batch = {
        "query": query_texts,
        "response": tokenizer.batch_decode(response_tensors),
    }

    #### Compute reward score
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    reward_scores = reward_model(texts) #, **sent_kwargs)
    rewards = [torch.tensor(reward_score) for reward_score in reward_scores]

    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

Map: 100%|██████████| 7/7 [00:00<00:00, 361.82 examples/s]
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
500,0.0015
1000,0.0


In [None]:
# save results
def _to_jsonable(value):
    """Fallback for json.dump: array-likes via .tolist(), anything else via str()."""
    return value.tolist() if hasattr(value, "tolist") else str(value)


# NOTE(review): the previous version dumped `results.metrics`, but no `results`
# object exists in this notebook. The statistics returned by the last
# `ppo_trainer.step` call (`stats`) are written instead -- confirm this is the
# information that should be persisted.
with open(f"{OUTPUT_DIR}/results.json", "w") as f:
    json.dump(stats, f, default=_to_jsonable)

# Save the trained model under <OUTPUT_DIR>/final.
ppo_trainer.save_pretrained(f"{OUTPUT_DIR}/final")

In [11]:
# add log to dvc
repo = Repo(".")
# Track the directory this run actually wrote to. Deriving it from OUTPUT_DIR
# (instead of a hardcoded absolute host path) keeps it correct on other
# machines and when the model / dtype / epoch settings change.
OUTPUT_PATH = OUTPUT_DIR
repo.add(OUTPUT_PATH)
repo.push()

Computing md5 for a large file '/mnt/host/cs234_final/alfred/output/bigscience/bloom-560m,torch_dtype=float16/epoch=1000/model.safetensors'. This is only done once.
Computing md5 for a large file '/mnt/host/cs234_final/alfred/output/bigscience/bloom-560m,torch_dtype=float16/epoch=1000/checkpoint-500/model.safetensors'. This is only done once.
Computing md5 for a large file '/mnt/host/cs234_final/alfred/output/bigscience/bloom-560m,torch_dtype=float16/epoch=1000/checkpoint-500/optimizer.pt'. This is only done once.
Computing md5 for a large file '/mnt/host/cs234_final/alfred/output/bigscience/bloom-560m,torch_dtype=float16/epoch=1000/checkpoint-1000/model.safetensors'. This is only done once.
Computing md5 for a large file '/mnt/host/cs234_final/alfred/output/bigscience/bloom-560m,torch_dtype=float16/epoch=1000/checkpoint-1000/optimizer.pt'. This is only done once.

To track the changes with git, run:

	git add output.dvc

To enable auto staging, run:

	dvc config core.autostage true


KeyboardInterrupt: 

In [14]:
# load the fine-tuned model
# The save cell writes the trained model to <OUTPUT_DIR>/final, so load it from
# that sub-directory rather than from OUTPUT_DIR itself.
model = AutoModelForCausalLM.from_pretrained(f"{OUTPUT_DIR}/final")