# Fine-tuning Implementation for Q4 LLAMA2-7B or MISTRAL-7B Chat Model for a Hiring Decision Support System

---



## Dependency Installation

In [None]:
# Install dependencies
# accelerate, peft, bitsandbytes and trl are pinned to exact versions;
# huggingface_hub and ctransformers are installed unpinned.

!pip -q install huggingface_hub accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 trl==0.4.7 ctransformers
# transformers is installed from the GitHub default branch instead of a PyPI release.
# NOTE(review): presumably required for Mistral support - confirm, and consider
# pinning a commit/tag so the notebook stays reproducible.
!pip install git+https://github.com/huggingface/transformers

In [None]:
# Mount Google Drive
# The Drive is mounted at /content/drive (Colab-only helper from google.colab).
from google.colab import drive
drive.mount('/content/drive')


## Main Imports

In [None]:
#Imports

from huggingface_hub import hf_hub_download

#Training Imports
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel, AutoPeftModelForCausalLM
from trl import SFTTrainer

# Additional Utility Imports
import numpy as np
import json


## Implementation

### Variable, Model and Data loading and Initialization

In [None]:

# Root working folder (a path on the mounted Google Drive).
app_base_dir = "/content/drive/MyDrive/LLM_Training"

# Registry of supported base models: Hugging Face model id plus the
# per-engine folder used for outputs.
supported_base_models = {
    'llama': {
        'name': 'NousResearch/Llama-2-7b-chat-hf',
        'base_dir': f"{app_base_dir}/llama"
    },
    'mistral': {
        'name': 'mistralai/Mistral-7B-v0.1',
        'base_dir': f"{app_base_dir}/mistral"
    }
}

# Normalise the answer so that stray whitespace or capital letters
# (e.g. "Llama ", "MISTRAL") are still accepted.
model_engine = input("Enter preferred training Engine/Base Model (7B pre-trained) (\'llama\'/\'mistral\'):").strip().lower()

# Validate against the registry itself, so adding a new engine to
# supported_base_models never leaves this check out of sync.
if model_engine not in supported_base_models:
    raise ValueError("Invalid value entered: Enter either 'llama' or 'mistral' ")


model_name = supported_base_models[model_engine]['name']
base_dir = supported_base_models[model_engine]['base_dir']


## Utilities

In [None]:
# Benchmark Util Definition:

import time


class Benchmarker:
    """Minimal stopwatch based on ``time.perf_counter``.

    Either call ``start()`` / ``end()`` around the code to measure and then
    ``compute_execution_time()``, or hand a callable to
    ``benchmark_function``. The measured duration (in seconds, as returned
    by ``perf_counter`` differences) is read with ``get_execution_time()``.
    """

    def __init__(self):
        # All three stay None until the corresponding step has run.
        self.start_time = None
        self.end_time = None
        self.execution_time = None

    def start(self):
        """Record the start timestamp."""
        self.start_time = time.perf_counter()

    def end(self):
        """Record the end timestamp."""
        self.end_time = time.perf_counter()

    def compute_execution_time(self):
        """
        Computes the execution time as ``end_time - start_time``.

        :raises TypeError: if ``start()`` or ``end()`` has not been called yet
            (one of the timestamps is still ``None``)
        """
        self.execution_time = self.end_time - self.start_time

    def get_execution_time(self):
        """
        :return: The execution time taken for the process, or ``None`` if
            nothing has been measured yet
        """
        return self.execution_time

    def benchmark_function(self, function, *args, **kwargs):
        """
        Run ``function`` and record how long the call took.

        :param function: The function to be executed
        :param args: The function variable number of arguments
        :param kwargs: The function keyword arguments
        :return: Whatever ``function`` returned (previously discarded)
        :raises Exception: any exception raised by ``function`` propagates
            unchanged; the elapsed time is still recorded
        """
        self.start()
        try:
            return function(*args, **kwargs)
        finally:
            # Runs on success and on failure, so a crashing call still
            # leaves a usable measurement behind.
            self.end()
            self.compute_execution_time()


In [None]:
# Getting Hardware Information

import subprocess

# GPU information (currently disabled)
# gpu_info = subprocess.check_output(['nvidia-smi']).decode('utf-8')

# CPU and RAM details are obtained by running `cat` on the matching /proc
# file and decoding the captured bytes as UTF-8.
cpu_info, ram_info = (
    subprocess.check_output(['cat', proc_file]).decode('utf-8')
    for proc_file in ('/proc/cpuinfo', '/proc/meminfo')
)

print(cpu_info, ram_info)

In [None]:
# Utility Function 1: Prepare training dataset

#Load and prepare dataset

# System message placed in front of every training example by both prompt builders.
system_instruction = "Assuming you are working as Hiring decision support system to provide insights and inferences on potential fits for job applications for a company. Can you please provide a suitable answer for the below question in the most objective and rational manner. If you don't know the answer to a question, please don't share false information"


def llama_prompt_generator(prompt, response):
    """Build one Llama-style training example.

    The result wraps the system instruction in ``<<SYS>>`` tags and the
    question in ``[INST] ... [/INST]``, followed by an ``### Answer:``
    section holding ``response``, all enclosed in ``<s>`` / ``</s>``.

    :param prompt: the question text
    :param response: the expected answer text
    :return: the fully formatted training string
    """
    segments = (
        "<s> [INST]",
        f"<<SYS>>{system_instruction}<</SYS>>",
        "\n\n### Input:",
        f"\n{prompt}[/INST]",
        "\n\n### Answer:",
        f"\n{response}",
        "</s>",
    )
    return "".join(segments)

def mistral_prompt_generator(prompt, response):
    """Build one Mistral-style training example.

    Layout: ``<s>``, an ``### Instruction:`` section with the module-level
    ``system_instruction``, an ``### Input:`` section with ``prompt``, a
    ``### Response:`` section with ``response``, then ``</s>``.

    :param prompt: the question text
    :param response: the expected answer text
    :return: the fully formatted training string
    """
    return (
        "<s>"
        "### Instruction:"
        f"\n{system_instruction}"
        "\n\n### Input:"
        f"\n{prompt}"
        "\n\n### Response:"
        f"\n{response}"
        "</s>"
    )



def prompt_generator(batchData):
    """Add a ``text`` column of formatted training prompts to a batch.

    The prompt builder is chosen from the module-level ``model_engine``
    ("llama" or "mistral"). ``batchData`` is modified in place and also
    returned.

    :param batchData: mapping with parallel ``'prompt'`` and ``'response'``
        sequences
    :return: the same ``batchData`` with the extra ``'text'`` entry
    :raises ValueError: if ``model_engine`` is neither "llama" nor "mistral"
    """
    if model_engine == "llama":
        build_prompt = llama_prompt_generator
    elif model_engine == "mistral":
        build_prompt = mistral_prompt_generator
    else:
        raise ValueError("Engine or Base Model not specified")

    batchData['text'] = [
        build_prompt(question, answer)
        for question, answer in zip(batchData['prompt'], batchData['response'])
    ]
    return batchData



# Deprecated
def convert_prompts_and_responses(prompt_data, output_dir):
    """Write prompt/response pairs to ``<output_dir>/train.txt``.

    Each record becomes a numbered ``### Instruction N`` block (numbering
    starts at 1) followed by a ``### Response:`` block and a blank line.
    ``output_dir`` is created if it does not exist and an existing
    ``train.txt`` is overwritten.

    :param prompt_data: iterable of mappings with ``'prompt'`` and
        ``'response'`` keys
    :param output_dir: directory that receives ``train.txt``
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Explicit UTF-8 so non-ASCII text is written the same way regardless of
    # the platform's default locale encoding.
    with open(os.path.join(output_dir, "train.txt"), "w", encoding="utf-8") as f:
        f.writelines(
            f"### Instruction {index}\n"
            f"{data['prompt']}\n"
            "### Response:\n"
            f"{data['response']}\n\n"
            for index, data in enumerate(prompt_data, start=1)
        )


# Utility Function 2: Read a JSON file
def read_json(json_file_path: str):
    """Parse the JSON file at ``json_file_path`` and return the decoded object.

    The file is read as UTF-8 (the standard JSON interchange encoding)
    instead of the platform's locale default, so non-ASCII content loads
    identically on every machine.

    :param json_file_path: path of the JSON file to read
    :return: the Python object produced by ``json.load``
    """
    with open(json_file_path, encoding="utf-8") as json_file:
        return json.load(json_file)

# Utility Function 3: Read a text file
def load_text_file(filename):
    """Return the entire content of ``filename`` as a single string.

    :param filename: path of the file, opened in text read mode
    :return: the file content
    """
    with open(filename, 'r') as handle:
        return handle.read()

## Dataset Preparation

In [None]:
# Prepare training dataset

# The prompt/response records are read from a JSON file under app_base_dir.
training_dataset_source = f"{app_base_dir}/resume_prompt_template.json"
dataset = load_dataset("json", data_files=training_dataset_source)

# prompt_generator adds the formatted 'text' column batch by batch; the raw
# columns are dropped afterwards so only 'text' is left for training.
instructed_datasets = (
    dataset
    .map(prompt_generator, batched=True)
    .remove_columns(['context','prompt', 'response'])
)
print(instructed_datasets['train'])


## Training Parameter Definitions and Instantiations

*Optimized Parameter Reference:*

*1. https://gist.github.com/younesbelkada/9f7f75c94bdc1981c8ca5cc937d4a4da#comments*

*2. https://github.com/facebookresearch/llama-recipes/blob/main/src/llama_recipes/finetuning.py*

*Research Reference:*

In [None]:
# Sanity check: print the selected base model id and the first formatted 'text' entry of the train split.
print(model_name, instructed_datasets['train']['text'][0])

mistralai/Mistral-7B-v0.1 <s>### Instruction:
Assuming you are working as Hiring decision support system to provide insights and inferences on potential fits for job applications for a company. Can you please provide a suitable answer for the below question in the most objective and rational manner. If you don't know the answer to a question, please don't share false information

### Input:
What certifications does the candidate have ?

### Response:
The candidate has a total of 1 different certifications. With the details below: 
 IC3 Internet and Computing Core Certification</s>


In [None]:
# Load tokenizer and model with QLoRA configuration (Helper Reference: https://gist.github.com/younesbelkada/9f7f75c94bdc1981c8ca5cc937d4a4da#comments)

# Compute dtype used by the 4-bit layers; defined once and reused below so
# the quantization config and the GPU check cannot drift apart.
compute_dtype = torch.float16

# Activate nested (double) quantization for 4-bit base models
use_nested_quant = False

# 4 bit Quantization Configuration for LORA
bits_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
# (guarded with is_available() so the capability probe itself cannot crash
# on a runtime without CUDA)
if compute_dtype == torch.float16 and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Hardware resource configuration:
# Load the entire model on the GPU 0 (Based on available hardware resource)
device_map = {"": 0}


# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bits_config,
    device_map=device_map,
)
# The generation cache is switched off for training.
model.config.use_cache = False
model.config.temperature = 0.6
# NOTE(review): pretraining_tp is a Llama-2 config field; presumably 1 selects
# the plain (non tensor-parallel emulating) code path - confirm.
model.config.pretraining_tp = 1

# Load model tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# No dedicated pad token is configured here: the EOS token is reused and
# sequences are padded on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

if model_engine == 'mistral':
  model.gradient_checkpointing_enable()
  tokenizer.add_eos_token = True
  # The bare expression that used to sit here was a no-op inside the `if`
  # (notebooks only echo a trailing top-level expression); print explicitly.
  print("add_bos_token / add_eos_token:", tokenizer.add_bos_token, tokenizer.add_eos_token)


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/966 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [None]:
# LORA Parameters (chosen per engine)
attention_dimension = 64 if model_engine == "llama" else 16  # LoRA rank r
dropout_prop = 0.1 if model_engine == "llama" else 0.05  # dropout on the LoRA layers
# Mistral gets an explicit list of projection layers to adapt; for Llama the
# value None leaves the choice to the library default.
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"] if model_engine == "mistral" else None

# Load PEFT: LoRA configuration
# The per-engine values computed above are now actually passed in; before,
# r=64 and lora_dropout=0.1 were hard-coded here, so the Mistral settings
# (r=16, dropout=0.05) were silently ignored.
peft_config = LoraConfig(
    lora_alpha=16, # Alpha parameter for LoRA scaling
    lora_dropout=dropout_prop, # Dropout probability for LoRA layers
    r=attention_dimension, # LoRA attention dimension
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules
)

## Model Training

In [None]:
# Output directory where the model predictions and checkpoints will be stored
output_dir = f"{base_dir}/results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True


# Set training parameters
# per_device_eval_batch_size and gradient_checkpointing were declared above
# but never handed to TrainingArguments, so they had no effect; they are
# passed explicitly now.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=25,  # write a checkpoint every 25 update steps
    logging_steps=1,  # log every step
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters
# The trainer reads the formatted examples from the dataset's "text" column;
# packing is disabled and max_seq_length is left at the trainer's default.
trainer = SFTTrainer(
    model=model,
    train_dataset=instructed_datasets['train'],
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

# Train model
trainer.train()

# Save trained model
new_model_path = f"{base_dir}/fine_tuned_adapter_model"
trainer.model.save_pretrained(new_model_path)