In [1]:
import csv
import json
import os
import shutil
from pathlib import Path

import torch
import yaml
from datasets import load_dataset, Dataset
from dvc.repo import Repo
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOTrainer, DPOConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Confirm a CUDA device is visible; later cells move the model and inputs to "cuda".
torch.cuda.is_available()

True

In [3]:
%%bash --out TOP_LEVEL
# Emit the git repository root without a trailing newline. The path is passed
# as an argument to a fixed '%s' format so that any '%' or backslash in the
# path is printed literally instead of being interpreted by printf.
printf '%s' "$(git rev-parse --show-toplevel)"

In [4]:
# Read the run configuration from <repo root>/configs/default.yaml and turn
# the configured dtype name into the value handed to `from_pretrained`.
config_path = Path(TOP_LEVEL + '/configs/default.yaml')
config = yaml.safe_load(config_path.read_text())

dtype_name = config['model']['torch_dtype']
if dtype_name in ('float16', 'float32', 'float64', 'bfloat16'):
    # These config names are spelled exactly like the torch attributes.
    torch_dtype = getattr(torch, dtype_name)
elif dtype_name == 'auto':
    # "auto" is forwarded as a plain string rather than a torch dtype.
    torch_dtype = "auto"
else:
    raise ValueError('torch_dtype is invalid')

In [5]:
# Load the base causal LM onto the GPU, plus the tokenizer named in the config.
print(torch_dtype)
model_path = config["model"]["path"]
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch_dtype)
model = model.to("cuda")

tokenizer_path = config["tokenizer"]["path"]
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

torch.float16


Downloading shards: 100%|██████████| 2/2 [04:13<00:00, 126.91s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.63s/it]


In [6]:
NUM_TRAIN_EPOCHS = 20

# One output directory per (model path, dtype, epoch count) combination,
# rooted under <repo root>/alfred/output.
run_name = f"{config['model']['path']},torch_dtype={torch_dtype},epoch={NUM_TRAIN_EPOCHS}"
OUTPUT_DIR = f"{TOP_LEVEL}/alfred/output/{run_name}"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [7]:
# #from collections import Dict

# def return_prompt_and_responses(samples): # -> Dict[str, str, str]:
#     return {
#         "prompt": [
#             "Question: " + question + "\n\nAnswer: "
#             for question in samples["question"]
#         ],
#         "chosen": samples["response_j"],   # rated better than k
#         "rejected": samples["response_k"], # rated worse than j
#     }

# dataset = load_dataset(
#     "lvwerra/stack-exchange-paired",
#     split="train",
#     data_dir="data/rl"
# )
# original_columns = dataset.column_names

# train_dataset = dataset.map(
#     return_prompt_and_responses,
#     batched=True,
#     remove_columns=original_columns
# )
# train_dataset = train_dataset.select(range(100))
# #print(train_dataset)

In [8]:
# Build the DPO preference dataset (prompt / chosen / rejected) from the
# generated Weather CSV.

# Few-shot template for the WeatherAPI annotation task. `{input}` is the
# placeholder for the text to annotate. The training prompts built below are
# the raw CSV inputs; the template is NOT prepended to them.
weather_prompt = """
Your task is to add calls to a Weather API to a piece of text. The API call should help you get information required to complete the text.
You can call the API by writing "[WeatherAPI(city_name)]" where "city_name" is the name of the city you want to get the weather for. Here are some examples of API calls:

Input: The weather in New York is clear with a few clouds.
Output: The weather in New York is [WeatherAPI("New York")].

Input: Dubai is experiencing sunny weather with no clouds in sight, so wear sunscreen.
Output: Dubai is experiencing [WeatherAPI("Dubai")], so wear sunscreen.

Input: San Francisco is foggy this morning, so remember to turn on your headlights.
Output: San Francisco is [WeatherAPI("San Francisco")] this morning, so remember to turn on your headlights.

Input: Since the weather in Paris is rainy, I will bring a rain jacket. 
Output: Since the weather in Paris is [WeatherAPI("Paris")], I will bring a rain jacket. 

Input: {input}
Output:
"""

WEATHER_CSV = TOP_LEVEL + '/generated_data/Weather.csv'

dpo_dataset_dict = {"prompt": [], "chosen": [], "rejected": []}
# newline='' is what the csv module asks for, so that quoted fields containing
# line breaks are parsed correctly.
with open(WEATHER_CSV, mode='r', newline='') as f:
    for row in csv.DictReader(f):
        dpo_dataset_dict["prompt"].append(row["input"])
        dpo_dataset_dict["chosen"].append(row["text"])
        dpo_dataset_dict["rejected"].append(row["candidate"])

# Copy the dataset used into the output directory. shutil is used instead of
# a shell `cp` so paths containing spaces or shell metacharacters are safe.
shutil.copy(WEATHER_CSV, OUTPUT_DIR + '/Weather.csv')

train_dataset = Dataset.from_dict(dpo_dataset_dict)
print(train_dataset["prompt"][0])


Could you provide the current weather conditions in London and the forecast for the next few hours?


In [9]:
# Configure DPO training and build the trainer.
DPO_BETA = 0.1
EVAL_EVERY_N_STEPS = 100

training_args = DPOConfig(
    # Original author's note: the model output is not saved automatically;
    # a later cell saves it explicitly.
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    beta=DPO_BETA,
    evaluation_strategy="steps",
    eval_steps=EVAL_EVERY_N_STEPS,
)

# No explicit ref_model is passed, and evaluation reuses the training split.
trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
    tokenizer=tokenizer,
)

Map: 100%|██████████| 289/289 [00:00<00:00, 2262.37 examples/s]
Map: 100%|██████████| 289/289 [00:00<00:00, 2424.03 examples/s]


In [10]:
# Run DPO fine-tuning; `results` keeps whatever trainer.train() returns.
results = trainer.train()

Could not estimate the number of tokens of the input, floating-point operations will not be computed


OutOfMemoryError: CUDA out of memory. Tried to allocate 128.00 MiB. GPU 

In [None]:
# Show where this run's artifacts are written.
#model.save_pretrained(OUTPUT_DIR)
print(OUTPUT_DIR)

In [None]:
# Persist the trained model under OUTPUT_DIR, once through the model's own
# save_pretrained and once through the trainer's save_model.
# (Writing results.metrics to results.json is currently disabled.)
print(OUTPUT_DIR)

model_save_dir = OUTPUT_DIR + '/final-model-save_small7'
trainer_save_dir = OUTPUT_DIR + '/final-trainer-save_small7'

model.save_pretrained(model_save_dir)
trainer.save_model(trainer_save_dir)

In [None]:
# Track this run's output directory with DVC, push it to the DVC remote, and
# stage the DVC pointer file in git. The git push itself is left manual.
# NOTE(review): Repo(".") is given the current working directory, not
# TOP_LEVEL — confirm the notebook is run from inside the repository.
repo = Repo(".")
#OUTPUT_PATH="/mnt/host/cs234_final/alfred/output/bigscience/bloom-560m,torch_dtype=float16/epoch=1000"
repo.add(OUTPUT_DIR)
print('starting to push to remote')
repo.push()
# NOTE(review): assumes DVC records OUTPUT_DIR in alfred/output.dvc — confirm
# that this is the pointer file repo.add() actually updates.
!git add {TOP_LEVEL + '/alfred/output.dvc'}
#!git push

In [None]:
# Reload a previously saved fine-tuned model from OUTPUT_DIR onto the GPU.
# The tokenizer loaded earlier in the notebook is reused.
# NOTE(review): this loads 'final-model-save_small2', while the save cell
# writes 'final-model-save_small7' — confirm which checkpoint is intended.
print(torch_dtype)
finetuned_model_dir = OUTPUT_DIR + '/final-model-save_small2'
model = AutoModelForCausalLM.from_pretrained(finetuned_model_dir, torch_dtype=torch_dtype)
model = model.to("cuda")

In [None]:
# Build the inference prompts: fill every dataset query into the few-shot
# template, then tokenize the completed prompt in one pass.
queries = dpo_dataset_dict["prompt"]
prompt_ids_list = []
decoded = []
for query in queries:
    # Substitute the query for the template's `{input}` placeholder so it sits
    # on the final "Input:" line, ahead of the trailing "Output:" cue.
    # Appending the query after the unformatted template would leave a literal
    # "{input}" in the prompt and put the query after "Output:".
    full_prompt = weather_prompt.format(input=query)
    # Tokenizing the whole prompt once avoids any per-call special tokens
    # ending up in the middle of the sequence.
    prompt_ids = torch.tensor(tokenizer(full_prompt)["input_ids"])
    prompt_ids_list.append(prompt_ids)
    decoded.append(tokenizer.decode(prompt_ids, skip_special_tokens=True))

# Sanity check: show the first fully assembled prompt.
print(decoded[0])

In [None]:
# Generate a completion for the first few prompts and write them to a text
# file in the output directory, one generation per line.
NUM_TEST_PROMPTS = 10

decoded_result = []
for prompt_ids in prompt_ids_list[:NUM_TEST_PROMPTS]:
    # prompt_ids is already an integer token-id tensor, so it is passed on
    # directly. The legacy `torch.Tensor(...)` constructor is documented as an
    # alias for the default float tensor type and should not wrap id tensors.
    response_ids = model.generate(
        prompt_ids.unsqueeze(0).to("cuda"),
        num_beams=1,
        max_new_tokens=100,
        repetition_penalty=1.2,
    )
    # batch_decode returns a list; with a batch of one the text is element 0.
    decoded_result.append(tokenizer.batch_decode(response_ids))

# Explicit UTF-8 so generated non-ASCII text does not depend on the platform's
# default encoding.
with open(OUTPUT_DIR + '/testing_original.txt', 'w', encoding='utf-8') as f:
    for line in decoded_result:
        f.write(line[0] + '\n')

In [None]:
# Greedy next-token prediction for a single assembled prompt.
# `prompt_and_query_ids` was previously only defined in commented-out code, so
# this cell raised NameError on a fresh kernel; use the first prompt built in
# the prompt-construction cell instead.
prompt_and_query_ids = prompt_ids_list[0]

# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    logits = model(input_ids=prompt_and_query_ids.unsqueeze(0).to("cuda")).logits

# Scores for the position after the last prompt token, for the single batch item.
last_logit = logits[0, -1, :]
probs = torch.softmax(last_logit, dim=-1)

next_token = torch.argmax(probs, dim=-1)
next_token = next_token.unsqueeze(0)
tokenizer.decode(next_token)

In [None]:
model.to("cuda")

def generate(text):
    """Sample a continuation of `text` from the global model and print it.

    Args:
        text: Prompt string, or list of prompt strings, handed to the
            global `tokenizer`.

    Returns:
        None. Each decoded sequence (special tokens stripped) is printed.
    """
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    # do_sample=True makes the output stochastic; at most 100 new tokens.
    outputs = model.generate(**inputs, do_sample=True, num_beams=1, max_new_tokens=100)
    generated_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    for sequence in generated_output:
        print(sequence)

generate("I am a")

In [None]:
# Beam-search generation that also returns the per-step scores.
# The earlier non-beam generate() call was removed: its result was overwritten
# immediately without being read, so it only cost GPU time.
inputs = tokenizer(["Today is"], return_tensors="pt").to("cuda")
outputs = model.generate(
    **inputs,
    max_new_tokens=5,
    num_beams=4,
    num_return_sequences=4,
    return_dict_in_generate=True,
    output_scores=True,
)
print(outputs)