# Finetuning LLaMA 2 without a guide

I keep running into issues when following the guides and tutorials for finetuning LLaMA. Most people seem to use other repos and tools to fine-tune rather than writing their own scripts, and understandably so, but I would like to understand the process better and be able to do it myself.

In [1]:
from datasets import load_dataset
import huggingface_hub
import torch
from transformers import (
    AutoModelForCausalLM, 
    BitsAndBytesConfig, 
    AutoTokenizer, 
    TrainingArguments
)
from peft import LoraConfig
from transformers import LlamaTokenizer

In [4]:
# Fetch the training split of the formatted Brockport Q&A dataset, then echo
# the Dataset object so the notebook displays its summary.
dataset = load_dataset(
    path="msaad02/formatted-ss-cleaned-brockport-qa",
    split="train",
)
dataset

Dataset({
    features: ['text'],
    num_rows: 7098
})

## Load in base model & tokenizer

In [None]:
# Hub id of the checkpoint that will be finetuned.
base_model_name = "meta-llama/Llama-2-7b-hf"

# Device placement handed to from_pretrained below
# (presumably pins the whole model to GPU 0 -- confirm).
device_map = {"": 0}

# Load the base model with 4-bit NF4 quantization and float16 compute,
# authenticating against the Hub with the locally stored token.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map=device_map,
    use_auth_token=True,
    trust_remote_code=True,
    quantization_config=BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        load_in_4bit=True,
    ),
)

# Config tweaks left disabled in this cell:
# base_model.config.use_cache = False
# More info: https://github.com/huggingface/transformers/pull/24906
# base_model.config.pretraining_tp = 1

In [None]:

# Full finetuning setup in one cell: 4-bit quantized base model, LoRA adapter
# config, tokenizer, training arguments, and the TRL SFTTrainer tying them
# together. Expects `dataset` (with a "text" column) from the earlier cell.

# Imported here so this cell does not depend on a later cell having been run.
from trl import SFTTrainer

base_model_name = "meta-llama/Llama-2-7b-hf"

# Load weights in 4-bit NF4 and run computation in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Device placement passed to from_pretrained
# (presumably pins the whole model to GPU 0 -- confirm).
device_map = {"": 0}

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
    use_auth_token=True,
)
# Turn off the generation cache on the model config for training.
base_model.config.use_cache = False

# More info: https://github.com/huggingface/transformers/pull/24906
base_model.config.pretraining_tp = 1

# LoRA adapter hyperparameters handed to SFTTrainer via `peft_config`.
peft_config = LoraConfig(
    lora_alpha=64,
    lora_dropout=0.1,  # increase dropout/test. Try 0.5?
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

# Same auth flag as the model load above: both come from the same Hub repo.
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True,
    use_auth_token=True,
)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

output_dir = "./results"

# See docs for explanations: https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
training_args = TrainingArguments(
    output_dir=output_dir,                  # Directory for predictions and checkpoints
    per_device_train_batch_size=16,         # Batch size per device - default 8. Tune for the available hardware
    gradient_accumulation_steps=2,          # Batches accumulated per optimizer step (16 * 2 = 32 samples per device per update). Default 1
    learning_rate=2e-4,                     # Learning rate, default = 5e-5
    logging_steps=10,                       # How often to log or print updates. User preference
    num_train_epochs=2,                     # Num epochs. Default 3
    # max_steps=500,                        # OVERRIDES num_train_epochs if set.
)

# Sequence length limit passed to SFTTrainer.
max_seq_length = 512

trainer = SFTTrainer(
    model=base_model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_args,
)
# NOTE: this cell only builds the trainer; call trainer.train() to start training.

# Following another example from the Hugging Face TRL documentation

https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama_2/scripts/sft_llama2.py

In [5]:
from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset

In [None]:
# NOTE(review): `args`, `train_data`, `prepare_sample_text` and
# `chars_per_token` are not defined in any cell visible here. This cell looks
# adapted from the linked sft_llama2.py script and presumably needs those names
# defined first -- confirm before running.

# Reload the Q&A dataset, taking the data directory, split, worker count and
# streaming flag from `args`. `num_proc` is only set when streaming is off.
dataset = load_dataset(
        "msaad02/formatted-ss-cleaned-brockport-qa",
        data_dir=args.subset,
        split=args.split,
        use_auth_token=True,
        num_proc=args.num_workers if not args.streaming else None,
        streaming=args.streaming,
    )

# Wrap the training data in TRL's ConstantLengthDataset using the tokenizer,
# a per-sample formatting function and the sequence length from `args`.
# NOTE(review): this passes `train_data`, not the `dataset` assigned above --
# presumably a train split derived from it; verify.
train_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        formatting_func=prepare_sample_text,
        infinite=True,
        seq_length=args.seq_length,
        chars_per_token=chars_per_token,
    )