In [None]:
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install bitsandbytes
!pip install datasets
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/transformers.git


In [None]:
!pip install huggingface-hub

!git config --global credential.helper store
!huggingface-cli login

In [None]:
import argparse
import bitsandbytes as bnb
import os
import torch
from datasets import load_dataset
from functools import partial
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    AutoPeftModelForCausalLM,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    set_seed,
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
)


In [None]:
def load_model(model_name, bnb_config, max_memory_mb=40960, pad_token_id=18610):
    """Load a quantized causal LM and its tokenizer from the Hugging Face Hub.

    :param model_name: Hub repository id (or local path) of the base model.
    :param bnb_config: Quantization config forwarded as ``quantization_config``.
    :param max_memory_mb: Memory budget, in MB, granted to each visible GPU.
    :param pad_token_id: Vocabulary id assigned as the tokenizer's padding token.
    :return: ``(model, tokenizer)`` tuple.
    """
    n_gpus = torch.cuda.device_count()
    per_gpu_budget = f'{max_memory_mb}MB'

    # With no visible GPU the mapping would be empty; pass None instead so the
    # library picks its own defaults (presumably an empty mapping would
    # advertise no usable device at all — TODO confirm).
    max_memory = {i: per_gpu_budget for i in range(n_gpus)} or None

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",  # dispatch efficiently the model on the available resources
        max_memory=max_memory,
        token=True,  # use the stored Hub credentials, same as for the tokenizer
    )
    # `token` replaces the deprecated `use_auth_token` argument.
    # `add_eos_token=True` asks the tokenizer to append EOS to every encoded sample.
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=True, add_eos_token=True)

    # Show the special tokens the checkpoint ships with, for a quick sanity check.
    print(repr(tokenizer.pad_token))
    print(repr(tokenizer.bos_token))
    print(repr(tokenizer.eos_token))

    # NOTE(review): a dedicated vocabulary id is used as the padding token instead
    # of reusing EOS (`tokenizer.pad_token = tokenizer.eos_token`) — presumably so
    # that EOS labels are not masked out as padding by the LM data collator.
    # Confirm that the default id is an acceptable stand-in for this vocabulary.
    tokenizer.pad_token_id = pad_token_id

    return model, tokenizer


In [None]:
from datasets import load_dataset
import re

# Load the 'lrtherond/running-qa' dataset with `datasets.load_dataset`.
# The result is indexed by split name in the next cell.
dataset = load_dataset('lrtherond/running-qa')

In [None]:
# Keep only the 'train' split and shuffle it with a fixed seed.
# NOTE(review): `dataset` is rebound to the selected split, so re-running this cell
# on its own would index that split with 'train' again and presumably fail —
# confirm, and re-run the loading cell first when iterating.
dataset = dataset['train'].shuffle(seed=42)

import re

# Sequence begin/end markers. They are not used by `format_prompt` below;
# presumably the tokenizer adds BOS/EOS itself — TODO confirm.
B_S = "<s>"
E_S = "</s>"
# Delimiters wrapped around the user instruction in the prompt template.
B_INST, E_INST = "[INST]", "[/INST]"
# Delimiters wrapped around the system message, including their newlines.
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

# System message embedded in every training prompt.
SYSTEM_MESSAGE = "You are a helpful, respectful, and honest coach. You help runners of all levels, from beginners to Olympians. You always respond in the style of Hal Higdon. Your answers are self-sufficient and do not reference any other resources. If a question does not make sense or is not factually coherent, explain why instead of answering incorrectly. If you don't know the answer to a question, please don't share false information."

def format_prompt(sample):
    """Attach the training prompt for one (question, answer) sample.

    The prompt is laid out as
    ``[INST] <<SYS>>\\n{system message}\\n<</SYS>>\\n\\n{question} [/INST] {answer}``
    with surrounding whitespace stripped from the question and the answer.
    No sequence begin/end markers are inserted here.

    :param sample: Mapping with ``'question'`` and ``'answer'`` string entries.
    :return: The same ``sample`` object, with the prompt stored under ``"text"``.
    """
    user_turn = sample['question'].strip()
    reply = sample['answer'].strip()

    # System block first, then the user turn, all inside the instruction markers.
    system_block = f"{B_SYS}{SYSTEM_MESSAGE}{E_SYS}"
    sample["text"] = f"{B_INST} {system_block}{user_turn} {E_INST} {reply}"

    return sample


In [None]:
# Sanity check: report how many samples are in `dataset` and which columns it has.
print(f'Number of prompts: {len(dataset)}')
print(f'Column names are: {dataset.column_names}')


In [None]:
def get_max_length(model):
    """Return the maximum sequence length advertised by ``model.config``.

    The config attributes ``n_positions``, ``max_position_embeddings`` and
    ``seq_length`` are probed in that order; the first truthy value wins.
    When none is set, 1024 is used. The chosen value is also printed.

    :param model: Object exposing a ``config`` attribute.
    :return: The detected (or default) maximum length.
    """
    cfg = model.config
    for attr_name in ("n_positions", "max_position_embeddings", "seq_length"):
        found = getattr(cfg, attr_name, None)
        if found:
            print(f"Found max length: {found}")
            return found

    # None of the known attributes was set: fall back to a fixed default.
    fallback = 1024
    print(f"Using default max length: {fallback}")
    return fallback


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenize the ``"text"`` entries of a batch.

    :param batch: Mapping whose ``"text"`` entry holds the prompts to encode.
    :param tokenizer: Callable tokenizer; invoked with truncation enabled.
    :param max_length: Maximum length forwarded to the tokenizer.
    :return: Whatever the tokenizer returns for the batch.
    """
    prompts = batch["text"]
    encoded = tokenizer(prompts, max_length=max_length, truncation=True)
    return encoded

def preprocess_dataset(tokenizer: "AutoTokenizer", max_length: int, seed: int, dataset):
    """Format & tokenize the dataset so it is ready for training.

    :param tokenizer (AutoTokenizer): Model tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    :param seed (int): Seed used for the final shuffle
    :param dataset: Dataset object (not a name/string) providing ``map``,
        ``filter`` and ``shuffle``, with "question" and "answer" columns
    :return: The formatted, tokenized, length-filtered and shuffled dataset
    """

    # Add prompt to each sample (creates the "text" column)
    print("Preprocessing dataset...")
    dataset = dataset.map(format_prompt)

    # Tokenize in batches and drop the raw text columns, keeping the tokenizer output
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=["question", "answer", "text"],
    )

    # Tokenization truncates at max_length, so a sample that was cut ends up with
    # exactly max_length tokens. The strict "<" drops those samples, along with
    # any sample that is exactly max_length tokens long without being cut.
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed=seed)

    return dataset


In [None]:
def create_bnb_config():
    """Build the bitsandbytes quantization settings used to load the base model.

    :return: A ``BitsAndBytesConfig`` requesting 4-bit loading with the "nf4"
        quantization type, double quantization and bfloat16 as compute dtype.
    """
    quantization_settings = dict(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    return BitsAndBytesConfig(**quantization_settings)


In [None]:
def create_peft_config(modules):
    """
    Build the LoRA configuration used for parameter-efficient fine-tuning.

    :param modules: Names of the modules to apply Lora to
    :return: A ``LoraConfig`` for causal language modelling
    """
    lora_settings = {
        "r": 16,                    # dimension of the updated matrices
        "lora_alpha": 64,           # parameter for scaling
        "lora_dropout": 0.1,        # dropout probability for layers
        "target_modules": modules,
        "bias": "none",
        "task_type": "CAUSAL_LM",
    }
    return LoraConfig(**lora_settings)


In [None]:
def find_all_linear_names(model):
    """Collect the distinct leaf names of every 4-bit linear layer in ``model``.

    A layer qualifies when it is an instance of ``bnb.nn.Linear4bit``; only the
    last component of its dotted module name is kept. ``'lm_head'`` is never
    reported (original note: needed for 16-bit).

    :param model: Object exposing ``named_modules()``.
    :return: List of unique leaf module names, in no particular order.
    """
    linear_4bit_cls = bnb.nn.Linear4bit

    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, layer in model.named_modules()
        if isinstance(layer, linear_4bit_cls)
    }

    # Keep the output head out of the LoRA targets.
    leaf_names.discard('lm_head')

    return list(leaf_names)


In [None]:
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.

    :param model: Object exposing ``named_parameters()``; each parameter must
        provide ``numel()`` and ``requires_grad``.
    :param use_4bit: When true, the trainable count is halved before printing.
        NOTE(review): the halving is kept as-is; confirm it is the intended
        correction for 4-bit models.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()

        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        # Floor division keeps the count an integer: true division produced a
        # float, which the ``:,d`` format spec below rejects with ValueError.
        trainable_params //= 2

    # A model without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0

    print(
        f"all params: {all_param:,d} || "
        f"trainable params: {trainable_params:,d} || "
        f"trainable%: {trainable_pct}"
    )


In [None]:
# Load the base model and its tokenizer from the Hugging Face Hub using the
# bitsandbytes quantization config. `load_model` requests the tokenizer with the
# stored Hub credentials, so the login cell has to be run first.

model_name = "meta-llama/Llama-2-7b-chat-hf"

bnb_config = create_bnb_config()
model, tokenizer = load_model(model_name, bnb_config)


In [None]:
# Preprocess dataset: build the prompt for every sample, tokenize it, drop the
# samples that reach the model's maximum length and shuffle the remainder.
# NOTE(review): `dataset` is overwritten with the tokenized result, whose
# "question"/"answer" columns have been removed, so re-running this cell without
# re-running the dataset cells would presumably fail in `format_prompt` — confirm.

max_length = get_max_length(model)
seed = 42

dataset = preprocess_dataset(tokenizer, max_length, seed, dataset)

In [None]:
# Show the preprocessed dataset as the cell output.
dataset


In [None]:
def train(model, tokenizer, dataset, output_dir):
    """Fine-tune ``model`` with LoRA on ``dataset`` and save the result.

    :param model: Base model as returned by ``load_model``; it is prepared for
        k-bit training and wrapped with the PEFT configuration inside this function.
    :param tokenizer: Tokenizer handed to the language-modelling data collator.
    :param dataset: Tokenized training dataset passed to the ``Trainer``.
    :param output_dir: Directory that receives the final ``save_pretrained`` output.
    :return: None. Metrics are printed, logged and saved through the trainer.
    """
    # Enable gradient checkpointing to reduce memory usage
    model.gradient_checkpointing_enable()

    # Prepare the model for k-bit training
    model = prepare_model_for_kbit_training(model)

    # Get the names of Lora modules and create PEFT config
    modules = find_all_linear_names(model)
    peft_config = create_peft_config(modules)
    model = get_peft_model(model, peft_config)

    # Print information about trainable parameters
    print_trainable_parameters(model)

    # Define training parameters
    # NOTE(review): the trainer writes to the hard-coded "outputs" directory, not to
    # the `output_dir` argument (which only receives the final model below) — confirm
    # this split is intended.
    training_args = TrainingArguments(
        num_train_epochs=1,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        # max_steps=20, ** Can be used instead of num_train_epochs
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
    )

    # Create the trainer; the collator is configured for causal LM (mlm=False)
    trainer = Trainer(
        model=model,
        train_dataset=dataset,
        args=training_args,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
    )

    # Disable caching during training
    model.config.use_cache = False

    # Verify data types
    # ...

    # Train the model
    print("Training...")

    train_result = trainer.train()

    # Log, persist and print the training metrics, and save the trainer state
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)

    # Save the trained model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)

    # Free GPU memory
    # NOTE(review): `del` only removes this function's local names; the caller's own
    # reference to the model object it passed in is unaffected, so the memory held
    # through that reference is presumably not released here — confirm.
    del model
    del trainer
    torch.cuda.empty_cache()

# Define output directory and call the training function.
# `train` writes its final `save_pretrained` output into `output_dir`; the cells
# below read the fine-tuned model back from the same directory.
output_dir = "results/Llama-2-7b-chat-hf-running-qa"

train(model, tokenizer, dataset, output_dir)


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
#!cp -r outputs drive/MyDrive/Llama-2-7b-chat-hf-running-qa
#!cp -r results drive/MyDrive/Llama-2-7b-chat-hf-running-qa

In [None]:
#!cp -r drive/MyDrive/Llama-2-7b-chat-hf-running-qa/outputs .
#!cp -r drive/MyDrive/Llama-2-7b-chat-hf-running-qa/results .

In [None]:
# Reload the fine-tuned PEFT model saved in `output_dir`, in bfloat16, with
# `device_map="auto"` choosing the device placement.
# Note that this rebinds the global `model` used by the earlier cells.
model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

# Merge the adapter into the base model and unload the PEFT wrapper, so that
# `model` can be saved and pushed as a standalone checkpoint below.
model = model.merge_and_unload()


In [None]:
# Define the output directory for the merged checkpoint (a sub-directory of the
# training `output_dir` path)
output_merged_dir = "results/Llama-2-7b-chat-hf-running-qa/final_merged_checkpoint"

# Create the directory if it doesn't exist
os.makedirs(output_merged_dir, exist_ok=True)

# Save the merged model in the defined directory, requesting safe serialization
model.save_pretrained(output_merged_dir, safe_serialization=True)



In [None]:
# Load a fresh tokenizer for the base model name.
# Unlike the one created in `load_model`, it is loaded with default arguments:
# no `add_eos_token=True` and no padding-token override. This rebinds `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Save the tokenizer in the same directory as the merged model
tokenizer.save_pretrained(output_merged_dir)

In [None]:
# Upload the merged model to the Hub repository (requires the earlier Hub login).
model.push_to_hub("lrtherond/Llama-2-7b-chat-hf-running-qa")

In [None]:
# Upload the tokenizer to the same Hub repository as the merged model.
tokenizer.push_to_hub("lrtherond/Llama-2-7b-chat-hf-running-qa")