In [6]:
import csv
import json
import os
import shutil
from pathlib import Path

import torch
import yaml
from datasets import load_dataset, Dataset
from dvc.repo import Repo
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOTrainer, DPOConfig

In [7]:
# Check that PyTorch can see a CUDA device; later cells move the model and inputs to "cuda".
torch.cuda.is_available()

True

In [8]:
%%bash --out TOP_LEVEL
# Emit the git repo root with no trailing newline (captured into TOP_LEVEL by --out).
# The path is passed as an argument to '%s' so that any '%' or backslash in it is
# printed literally instead of being interpreted as a printf format directive.
printf '%s' "$(git rev-parse --show-toplevel)"

In [9]:
# Read the run configuration from <repo root>/configs/default.yaml
# (TOP_LEVEL is the git top-level path captured by the %%bash cell).
config = yaml.safe_load(Path(TOP_LEVEL + '/configs/default.yaml').read_text())

# Translate the configured dtype name into the value handed to `from_pretrained`:
# a torch dtype object, or the literal string "auto".
dtype_name = config['model']['torch_dtype']
if dtype_name == 'float16':
    torch_dtype = torch.float16
elif dtype_name == 'float32':
    torch_dtype = torch.float32
elif dtype_name == 'float64':
    torch_dtype = torch.float64
elif dtype_name == 'bfloat16':
    torch_dtype = torch.bfloat16
elif dtype_name == 'auto':
    torch_dtype = "auto"
else:
    # Anything else is a configuration error; fail before loading the model.
    raise ValueError('torch_dtype is invalid')

In [10]:
# Load the base causal LM in the configured dtype, then place it on the GPU.
model = AutoModelForCausalLM.from_pretrained(
    config["model"]["path"],
    torch_dtype=torch_dtype,
)
model = model.to("cuda")

# The tokenizer path is configured separately from the model path.
tokenizer = AutoTokenizer.from_pretrained(config["tokenizer"]["path"])
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [11]:
# Number of training epochs; also encoded in the output path below.
NUM_TRAIN_EPOCHS = 20

# Artifacts for this run go under
#   <repo root>/alfred/output/<model path>,torch_dtype=<dtype>/epoch=<epochs>
OUTPUT_DIR = TOP_LEVEL + f"/alfred/output/{config['model']['path']},torch_dtype={torch_dtype}/epoch={NUM_TRAIN_EPOCHS}"

# Create the directory (and any missing parents); an existing directory is fine.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

In [12]:
#from collections import Dict

def return_prompt_and_responses(samples):
    """Convert a batch of paired-answer samples into DPO columns.

    Args:
        samples: Mapping with the keys ``"question"``, ``"response_j"`` and
            ``"response_k"``; ``"question"`` must be an iterable of strings.

    Returns:
        A dict with three keys:
        ``"prompt"``   - each question wrapped as ``"Question: ...\\n\\nAnswer: "``,
        ``"chosen"``   - ``samples["response_j"]`` (rated better than k),
        ``"rejected"`` - ``samples["response_k"]`` (rated worse than j).
    """
    prompts = []
    for question in samples["question"]:
        prompts.append("Question: " + question + "\n\nAnswer: ")

    preferred = samples["response_j"]      # rated better than k
    dispreferred = samples["response_k"]   # rated worse than j
    return {"prompt": prompts, "chosen": preferred, "rejected": dispreferred}

# Fetch the train split of the paired StackExchange data, restricted to data_dir "data/rl".
dataset = load_dataset("lvwerra/stack-exchange-paired", data_dir="data/rl", split="train")
original_columns = dataset.column_names

# Rewrite each batch into prompt/chosen/rejected columns and drop every source column.
train_dataset = dataset.map(
    return_prompt_and_responses,
    remove_columns=original_columns,
    batched=True,
)
# For a quick experiment, a subset can be taken here, e.g. train_dataset.select(range(100)).
print(train_dataset)

Downloading readme: 100%|██████████| 737/737 [00:00<00:00, 4.41MB/s]
Downloading data: 100%|██████████| 20/20 [01:56<00:00,  5.81s/files]
Generating train split: 7435908 examples [00:34, 214998.45 examples/s]
Map: 100%|██████████| 7435908/7435908 [01:00<00:00, 123196.71 examples/s]


Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 7435908
})


In [31]:
# Build the DPO preference dataset from the generated Weather CSV.
# CSV columns are mapped as: "input" -> prompt, "text" -> chosen, "candidate" -> rejected.
#
# NOTE(review): this rebinds `train_dataset`, replacing the stack-exchange dataset built
# in the earlier cell. The recorded trainer output maps 7,435,908 examples, which suggests
# training was last run on that earlier dataset — confirm which dataset is intended.
weather_csv = TOP_LEVEL + '/generated_data/Weather.csv'

dpo_dataset_dict = {"prompt": [], "chosen": [], "rejected": []}
# newline='' lets the csv module do its own newline handling, so quoted fields that
# contain line breaks are parsed correctly.
with open(weather_csv, mode='r', newline='') as f:
    data_reader = csv.DictReader(f)
    for row in data_reader:
        dpo_dataset_dict["prompt"].append(row["input"])
        dpo_dataset_dict["chosen"].append(row["text"])
        dpo_dataset_dict["rejected"].append(row["candidate"])

# Copy the dataset used to the output directory. Done in-process rather than via `!cp`:
# no shell word-splitting of the interpolated paths, and no fork of the kernel process
# (the fork is what triggers the huggingface/tokenizers parallelism warning).
shutil.copy(weather_csv, OUTPUT_DIR + '/Weather.csv')

train_dataset = Dataset.from_dict(dpo_dataset_dict)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Configure DPO training: outputs go to OUTPUT_DIR and training runs for NUM_TRAIN_EPOCHS epochs.
# The final model is written explicitly by the save cell after training.
training_args = DPOConfig(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    beta=0.1,
)

# Build the trainer around the already-loaded model and tokenizer.
# No separate reference model is supplied (ref_model=None).
trainer = DPOTrainer(
    model=model,
    ref_model=None,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    args=training_args,
)

# Run training; the returned object is kept in `results`.
results = trainer.train()

Map:  10%|█         | 775293/7435908 [58:44<13:51:38, 133.48 examples/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Map:  33%|███▎      | 2467229/7435908 [3:18:19<6:44:25, 204.76 examples/s] IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Map:  56%|█████▌    | 4161173/7435908 [5:23:20<5:07:27, 177.52 examples/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change thi

In [None]:
# Show the directory this run writes to.
# Saving directly into OUTPUT_DIR is left disabled; the save cell writes into subdirectories instead.
#model.save_pretrained(OUTPUT_DIR)
print(OUTPUT_DIR)

In [None]:
# Persist the trained model under OUTPUT_DIR in two forms: once through the model's own
# save_pretrained, once through the trainer's save_model.
# (Dumping results.metrics to f"{OUTPUT_DIR}/results.json" is currently disabled.)
print(OUTPUT_DIR)
model.save_pretrained(f"{OUTPUT_DIR}/final-model-save")
trainer.save_model(f"{OUTPUT_DIR}/final-trainer-save")

In [None]:
# add log to dvc
# Steps: register OUTPUT_DIR with DVC, push via DVC, then stage the .dvc file in git and push git.
# NOTE(review): Repo(".") is resolved against the kernel's current directory, whereas the other
# cells build paths from TOP_LEVEL — confirm the notebook is run from inside the repo.
repo = Repo(".")
#OUTPUT_PATH="/mnt/host/cs234_final/alfred/output/bigscience/bloom-560m,torch_dtype=float16/epoch=1000"
repo.add(OUTPUT_DIR)
print('starting to push to remote')
repo.push()
# NOTE(review): there is no `git commit` between `git add` and `git push`, so the staged .dvc
# file is presumably not part of what gets pushed — confirm, and add a commit step if intended.
# NOTE(review): confirm that adding OUTPUT_DIR actually produces/updates alfred/output.dvc.
!git add {TOP_LEVEL + '/alfred/output.dvc'}
!git push

In [28]:
# Reload the fine-tuned model and its tokenizer from the trainer's save directory,
# replacing the in-memory `model` and `tokenizer`.
finetuned_dir = OUTPUT_DIR + '/final-trainer-save'
print(torch_dtype)
model = AutoModelForCausalLM.from_pretrained(finetuned_dir, torch_dtype=torch_dtype)
model = model.to("cuda")
# The tokenizer comes from the same directory rather than config["tokenizer"]["path"].
tokenizer = AutoTokenizer.from_pretrained(finetuned_dir)

torch.float16


In [29]:
# Tokenize the instruction and the query separately, then join the id sequences end to end.
prompt_text = "Echo all the statements that are provided.\n"
query_text = "Why is the sky blue?"

prompt_ids = torch.tensor(tokenizer(prompt_text)["input_ids"])
query_ids = torch.tensor(tokenizer(query_text)["input_ids"])
prompt_and_query_ids = torch.cat((prompt_ids, query_ids), dim=0)
print(prompt_and_query_ids)

# Round-trip check: decode the combined ids back to text (shown as the cell output).
tokenizer.decode(prompt_and_query_ids, skip_special_tokens=True)

tensor([224619,   1728,    368,  43163,    861,   1306,  15984,    336,  23857,
           632,    368,  60614,  29853,     34])


'Echo all the statements that are provided.\nWhy is the sky blue?'

In [30]:
# Generate up to 100 new tokens for the combined prompt using a single beam and a
# repetition penalty of 1.2.
# prompt_and_query_ids is already an integer id tensor, so it is batched (unsqueeze) and
# moved to the GPU directly — the same way the next-token cell feeds the model — rather
# than being re-wrapped in the legacy torch.Tensor constructor.
response_ids = model.generate(
    prompt_and_query_ids.unsqueeze(0).to("cuda"),
    num_beams=1,
    max_new_tokens=100,
    repetition_penalty=1.2,
)
print(response_ids[0])
# Decode the whole batch, special tokens included (shown as the cell output).
tokenizer.batch_decode(response_ids)

tensor([224619,   1728,    368,  43163,    861,   1306,  15984,    336,  23857,
           632,    368,  60614,  29853,     34,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0, 

['Echo all the statements that are provided.\nWhy is the sky blue?<unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk>']

In [None]:
# Single forward pass over the combined prompt, as a batch of one on the GPU.
input_batch = prompt_and_query_ids.unsqueeze(0).to("cuda")
logits = model(input_ids=input_batch).logits

# Take the scores at the last position of the first (only) sequence and normalise them.
last_logit = logits[0, -1]
probs = torch.softmax(last_logit, dim=-1)

# Pick the most probable id and give it a leading dimension before decoding.
next_token = torch.argmax(probs, dim=-1).unsqueeze(0)
tokenizer.decode(next_token)

In [64]:
# Make sure the model is on the GPU; generate() below moves its inputs to "cuda" as well.
model.to("cuda")

def generate(text, max_new_tokens=100):
    """Sample a continuation of ``text`` and print each decoded sequence.

    Relies on the notebook-level ``tokenizer`` and ``model`` globals. The tokenized
    inputs are moved to ``"cuda"`` before generation.

    Args:
        text: Prompt passed to the tokenizer.
        max_new_tokens: Upper bound on newly generated tokens (defaults to 100).

    Returns:
        None. Each decoded sequence (special tokens skipped) is printed on its own line.
    """
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    # Sampling is enabled (do_sample=True) with a single beam, so repeated calls
    # may print different text.
    outputs = model.generate(
        **inputs,
        do_sample=True,
        num_beams=1,
        max_new_tokens=max_new_tokens,
    )
    for decoded in tokenizer.batch_decode(outputs, skip_special_tokens=True):
        print(decoded)

# Try the sampling helper on a short open-ended prompt.
generate("I am a")

I am a doctoral student after graduation, but I have been interested in the topic for a few years. As always it is a bit difficult to know exactly how to study if i am interested, but I want to get some idea about what would be a good introduction.
My university is in Canada and I am currently studying philosophy and is a big fan of science fiction and fantasy fiction. I am not interested in science fiction and fantasy anymore, but I'm still interested in the idea of a big city like in


In [None]:
# Generate 5 new tokens for "Today is" with 4 beams, returning all 4 sequences.
# return_dict_in_generate / output_scores make generate() return a structured result
# that includes scores, instead of only the token ids.
inputs = tokenizer(["Today is"], return_tensors="pt").to("cuda")
outputs = model.generate(
    **inputs,
    max_new_tokens=5,
    num_beams=4,
    num_return_sequences=4,
    return_dict_in_generate=True,
    output_scores=True,
)
print(outputs)