# Finetuning LLaMA 2

Following guide from https://medium.com/@ud.chandra/instruction-fine-tuning-llama-2-with-pefts-qlora-method-d6a801ebb19

In [1]:
from datasets import load_dataset
import pickle
import re
from itertools import chain
import random
import json
import huggingface_hub
import os
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer, TrainingArguments
from peft import LoraConfig
from trl import SFTTrainer
from transformers import LlamaTokenizer
# huggingface_hub.login(os.getenv("HUGGINGFACE_TOKEN"), add_to_git_credential=True)

os.environ["WANDB_NOTEBOOK_NAME"] = "prep_dataset.ipynb"


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda118.so
CUDA SETUP: CUDA runtime path found: /home/msaad/miniconda3/envs/thesis/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/msaad/miniconda3/envs/thesis/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...


Either way, this might cause trouble in the future:
If you get `CUDA error: invalid device function` errors, the above might be the cause and the solution is to make sure only one ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] in the paths that we search based on your env.
  warn(msg)


# First things first...

Gotta cleanup and format the dataset. For first run (to get it working), just using a small portion of dataset. This specific portion contains only the URLs included in version 1 of semantic search. These include broad information about the school, and tend to be the most information rich webpages. (6% ish?)

Taking QA dataset generated by gpt3.5 and filtered to those URLs only.

In [2]:
# Contains URL and raw GPT output as key value pairs
gpt_output   = pickle.load(open('/home/msaad/workspace/honors-thesis/data-collection/data/gpt_output.p', 'rb'))

# Using cleaned_dict keys (the urls I eventually selected as the best) to filter this down
cleaned_dict = pickle.load(open('/home/msaad/workspace/honors-thesis/data-collection/data/cleaned_url_response_dict.p', 'rb'))

small_dataset = {k: v for k, v in gpt_output.items() if k in cleaned_dict.keys()}

# NOTE there are 5 webpages in cleaned_dict that did not carry over. Might be worth investigating more -- for now ignoring and moving on.

In [3]:
# NOTE THIS FUNCTION IS TAKEN DIRECTLY FROM 'data-collection/data/gpt_data_generation.ipynb'

def generate_json(gpt_output: dict[str, str], filename: str) -> None:
    """
    Parses GPT output into a JSON file. This function uses regex to parse for all the instructions (questions), and 
    outputs (answers) from the GPT output which is a dictionary. The parsed data is returned into a Python list of 
    dictionaries, which is appended for each webpage. This list is shuffled to mix up the questions and then dumped 
    to a JSON file.

    Args:
        gpt_output (dict): The dictionary containing the GPT output.
        filename (str): The JSON filename to write the parsed and shuffled list of dictionaries to.
    """

    # The regular expression pattern for a JSON object with "instruction" and "output"
    pattern = r'"instruction":\s*"(.*?)",.*?"output":\s*"(.*?)"'

    def extract_data(s):
        matches = re.findall(pattern, s, flags=re.DOTALL)
        # Add a conditional filter in the list comprehension
        data = [{"instruction": m[0], "output": m[1]} for m in matches if m[0] and m[1] and '"' not in m[0] and '"' not in m[1]]
        return data

    jsonqa = []

    for value in gpt_output.values():
        clean_value = extract_data(value)
        jsonqa.append(clean_value)

    jsonqa = list(chain(*jsonqa))

    random.shuffle(jsonqa)

    # Write to a JSON file
    with open('/home/msaad/workspace/honors-thesis/data-collection/data/' + filename + '.json', 'w') as f:
        json.dump(jsonqa, f, indent=4)  # Dump the entire list at once

In [4]:
print("Filtering to", len(small_dataset), "webpages (" + str(round(len(small_dataset)/ len(gpt_output) * 100, 2)) + "% of full dataset)")

generate_json(small_dataset, "cleaned_ss_dataset")

Filtering to 334 webpages (6.31% of full dataset)


# Formatting the responses

Will also upload both the before and after datasets to huggingface, because why not!

In [5]:
dataset = load_dataset("json", data_files="../data-collection/data/cleaned_ss_dataset.json")

# dataset.push_to_hub("msaad02/preformat-ss-cleaned-brockport-qa")

def format_prompt(example):
    prompt = f"""Below is an inquiery related to SUNY Brockport - from academics, admissions, and faculty support to student life. Prioritize accuracy and brevity."

### Instruction:
{example['instruction']}

### Response:
{example['output']}"""

    return {'text': prompt}

dataset = dataset.map(format_prompt, remove_columns=['instruction', 'output'])

# dataset.push_to_hub("msaad02/formatted-ss-cleaned-brockport-qa")

Downloading and preparing dataset json/default to /home/msaad/.cache/huggingface/datasets/json/default-3fafff93c0199302/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/msaad/.cache/huggingface/datasets/json/default-3fafff93c0199302/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/7098 [00:00<?, ? examples/s]

In [6]:
print(dataset['train']['text'][0])

Below is an inquiery related to SUNY Brockport - from academics, admissions, and faculty support to student life. Prioritize accuracy and brevity."

### Instruction:
What is the emergency information resource at SUNY Brockport?

### Response:
The emergency information resource at SUNY Brockport provides important information and resources in case of emergency situations.


# Now for actually finetuning!

In [2]:
dataset_name = "msaad02/formatted-ss-cleaned-brockport-qa"
dataset = load_dataset(dataset_name, split="train")

base_model_name = "meta-llama/Llama-2-7b-hf" # PENDING ACCESS. Requested, need to wait 1-2 days
# base_model_name = "decapoda-research/llama-7b-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

device_map = {"": 0}

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
    use_auth_token=True
)
base_model.config.use_cache = False

# More info: https://github.com/huggingface/transformers/pull/24906
base_model.config.pretraining_tp = 1 

peft_config = LoraConfig(
    lora_alpha=64,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

output_dir = "./results"

# See docs for explanations: https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
training_args = TrainingArguments(
    output_dir=output_dir,                  # Directory for predictions and checkpoints
    per_device_train_batch_size=16,         # Batch size per device - default 8. Need to find right setting for my hardware
    gradient_accumulation_steps=2,          # Number of updates steps to accumulate the gradients for... Need to learn more. Default 1
    learning_rate=2e-4,                     # Learning rate, default = 5e-5
    logging_steps=10,                       # How often to log or print updates. User preference
    num_train_epochs=2                      # Num epochs. Default 3
    # max_steps=500                         # OVERRIDES num_train_epochs if set. 
)

max_seq_length = 512

trainer = SFTTrainer(
    model=base_model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_args,
)

Found cached dataset parquet (/home/msaad/.cache/huggingface/datasets/msaad02___parquet/msaad02--formatted-ss-cleaned-brockport-qa-2e4da28dc7eed695/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading cached processed dataset at /home/msaad/.cache/huggingface/datasets/msaad02___parquet/msaad02--formatted-ss-cleaned-brockport-qa-2e4da28dc7eed695/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-e0d9c3a5ca40e680.arrow


In [8]:
trainer.train()

output_dir = os.path.join(output_dir, "final_checkpoint")
trainer.model.save_pretrained(output_dir)

[34m[1mwandb[0m: Currently logged in as: [33mmsaad02[0m ([33mhon-thesis[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,0.0
20,0.0
30,0.0
40,0.0
50,0.0
60,0.0
70,0.0
80,0.0
90,0.0
100,0.0


In [5]:
base_model = AutoModelForCausalLM.from_pretrained(
    "./results",
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True
)

OSError: ./results does not appear to have a file named config.json. Checkout 'https://huggingface.co/./results/None' for available files.