In [9]:
import csv
import json
import os
import shutil
from pathlib import Path

import torch
import yaml
from datasets import load_dataset, Dataset
from dvc.repo import Repo
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOTrainer, DPOConfig

In [10]:
%%bash --out TOP_LEVEL
# Capture the absolute path of the git repository root into the Python variable TOP_LEVEL.
# The path is passed as an argument to a fixed '%s' format (never as the format string
# itself) so that '%' or backslash characters in the path are emitted verbatim, and no
# trailing newline is added.
printf '%s' "$(git rev-parse --show-toplevel)"

In [11]:
# Read the run configuration from <repo root>/configs/default.yaml and translate the
# configured dtype name into the value handed to from_pretrained().
config = yaml.safe_load(Path(TOP_LEVEL + '/configs/default.yaml').read_text())

# Accepted config values -> torch dtype ("auto" is passed through as the literal string).
_TORCH_DTYPE_BY_NAME = {
    'float16': torch.float16,
    'float32': torch.float32,
    'float64': torch.float64,
    'bfloat16': torch.bfloat16,
    'auto': "auto",
}

_dtype_name = config['model']['torch_dtype']
try:
    torch_dtype = _TORCH_DTYPE_BY_NAME[_dtype_name]
except (KeyError, TypeError):
    # Unknown name (or a non-hashable YAML value): reject it explicitly.
    raise ValueError('torch_dtype is invalid') from None

In [12]:
# Instantiate the causal LM and its tokenizer from the model path given in the config,
# using the dtype resolved from the config.
model_path = config["model"]["path"]
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch_dtype)
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [19]:
# Number of training epochs; also encoded in the run directory name below.
NUM_TRAIN_EPOCHS = 100
# Per-run output directory:
#   <repo root>/alfred/output/<model path>,torch_dtype=<dtype>/epoch=<epochs>
OUTPUT_DIR = f"{TOP_LEVEL}/alfred/output/{config['model']['path']},torch_dtype={torch_dtype}/epoch={NUM_TRAIN_EPOCHS}"
# Create the directory (and any missing parents); a no-op if it already exists.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

In [14]:
# Build the DPO preference dataset from the generated CSV.
# CSV column -> dataset field: "input" -> "prompt", "text" -> "chosen",
# "candidate" -> "rejected".
DATASET_CSV = TOP_LEVEL + '/generated_data/Weather.csv'

# newline='' is what the csv module expects from its file object; without it,
# newlines embedded inside quoted fields may not be interpreted correctly.
with open(DATASET_CSV, mode='r', newline='') as f:
    rows = list(csv.DictReader(f))

dpo_dataset_dict = {
    "prompt": [row["input"] for row in rows],
    "chosen": [row["text"] for row in rows],
    "rejected": [row["candidate"] for row in rows],
}

# Keep a copy of the exact training data next to this run's outputs.
# Done in Python rather than via `!cp` so the paths are never re-parsed by a shell
# (spaces or shell metacharacters in either path would otherwise break the copy).
shutil.copy(DATASET_CSV, OUTPUT_DIR + '/Weather.csv')

train_dataset = Dataset.from_dict(dpo_dataset_dict)

In [15]:
# Configure DPO training and run it.
#
# output_dir only tells the trainer where to write; the final model is saved
# explicitly in a later cell.
# NOTE(review): the recorded run logs a training loss of 0.0 from step 1000 onward.
# Confirm this is expected and not a sign of overfitting or a numerical-precision
# problem with the chosen dtype.
training_args = DPOConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    beta=0.1,
)

trainer = DPOTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    args=training_args,
)

# Kick off training; the returned object's .metrics are written to disk in the next cell.
results = trainer.train()

Map: 100%|██████████| 281/281 [00:00<00:00, 2531.40 examples/s]
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
500,0.3474
1000,0.0
1500,0.0
2000,0.0
2500,0.0
3000,0.0
3500,0.0


In [16]:
# Persist the training metrics as JSON, then write the final model under the run directory.
metrics_path = f"{OUTPUT_DIR}/results.json"
final_model_dir = f"{OUTPUT_DIR}/final"

with open(metrics_path, "w") as metrics_file:
    json.dump(results.metrics, metrics_file)

trainer.save_model(final_model_dir)

In [21]:
# add log to dvc
repo = Repo(".")
#OUTPUT_PATH="/mnt/host/cs234_final/alfred/output/bigscience/bloom-560m,torch_dtype=float16/epoch=1000"
repo.add(OUTPUT_DIR)
print('starting to push to remote')
repo.push()
!git add {TOP_LEVEL + '/alfred/output.dvc'}

Computing md5 for a large file '/mnt/host/cs234_final/alfred/output/bigscience/bloom-1b1,torch_dtype=torch.float16/epoch=100/checkpoint-500/model.safetensors'. This is only done once.
Computing md5 for a large file '/mnt/host/cs234_final/alfred/output/bigscience/bloom-1b1,torch_dtype=torch.float16/epoch=100/checkpoint-500/optimizer.pt'. This is only done once.
Computing md5 for a large file '/mnt/host/cs234_final/alfred/output/bigscience/bloom-1b1,torch_dtype=torch.float16/epoch=100/checkpoint-1000/model.safetensors'. This is only done once.
Computing md5 for a large file '/mnt/host/cs234_final/alfred/output/bigscience/bloom-1b1,torch_dtype=torch.float16/epoch=100/checkpoint-1000/optimizer.pt'. This is only done once.
Computing md5 for a large file '/mnt/host/cs234_final/alfred/output/bigscience/bloom-1b1,torch_dtype=torch.float16/epoch=100/checkpoint-1500/model.safetensors'. This is only done once.
Computing md5 for a large file '/mnt/host/cs234_final/alfred/output/bigscience/bloom-1b

KeyboardInterrupt: 

In [None]:
# Reload the fine-tuned model from disk.
# The save cell writes the final model with trainer.save_model(OUTPUT_DIR + '/final'),
# not into OUTPUT_DIR itself, so that 'final' subdirectory is the path to load from.
model = AutoModelForCausalLM.from_pretrained(OUTPUT_DIR + '/final')