In [1]:
%%capture

!pip install bitsandbytes accelerate peft trl

In [2]:
import time
from random import randrange, sample, seed

import torch
import os
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
print(torch.cuda.is_available())

True


In [5]:
use_flash_attention2 = False

# Replace attention with flash attention 
if torch.cuda.get_device_capability()[0] >= 8:
    use_flash_attention2 = True

print(f"Using flash attention 2: {use_flash_attention2}")

if use_flash_attention2:
    !pip install flash-attn --no-build-isolation --upgrade

Using flash attention 2: True




In [6]:
from datasets import load_dataset
from utils import Variables
dataset = load_dataset("json", data_files=Variables.INSTRUCTION_DATASET_JSON_PATH, split="train")

Generating train split: 2429 examples [00:00, 53169.76 examples/s]


In [7]:
def format_instruction(sample):
	return f"""    
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{sample['Instruction']}

### Input:
{sample['Input']}

### Response:
{sample['Output']}
"""

In [20]:
from datasets import load_dataset, DatasetDict

# Load the entire dataset
dataset = load_dataset("json", data_files=Variables.INSTRUCTION_DATASET_JSON_PATH)

# Split the dataset into training and testing sets
train_test_split = dataset['train'].train_test_split(test_size=0.2)

# Create a DatasetDict to hold the splits
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

# Now you have separate training and testing sets
train_dataset = dataset_dict['train']
test_dataset = dataset_dict['test']

In [23]:
print(format_instruction(train_dataset[randrange(len(train_dataset))]))

    
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Describe the KV cache mechanism in the context of large language models.

### Input:
Figure 2 illustrates the KV cache mechanism in both static and real-time editing settings for large language models.

### Response:
The KV cache stores precomputed Key/Value pairs that the model leverages to generate predictions in the static setting, and updates in real-time editing settings to maintain accurate information.





In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Hugging Face model id
model_id = "meta-llama/Meta-Llama-3-8B"
# model_id = "mistralai/Mistral-7B-v0.1"

# BitsAndBytesConfig int-4 config 
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if use_flash_attention2 else torch.float16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    quantization_config=bnb_config, 
    use_cache=False, 
    device_map="auto",
    token=os.environ["HF_TOKEN"],  # if model is gated like llama or mistral
    attn_implementation="flash_attention_2" if use_flash_attention2 else "sdpa"
)
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=os.environ["HF_TOKEN"],  # if model is gated like llama or mistral
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Specify the directory where you want to save the model and tokenizer
save_directory = Variables.ORIGINAL_MODEL_PATH

# Save the model
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

Downloading shards: 100%|██████████| 4/4 [03:04<00:00, 46.13s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.71s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


('/root/Projects/llama3_8b_finetuning/models/original_llama3_model/tokenizer_config.json',
 '/root/Projects/llama3_8b_finetuning/models/original_llama3_model/special_tokens_map.json',
 '/root/Projects/llama3_8b_finetuning/models/original_llama3_model/tokenizer.json')

In [10]:
# LoRA config based on QLoRA paper
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj", 
            "up_proj", 
            "down_proj",
        ]
)

# Prepare model for training
model = prepare_model_for_kbit_training(model)

In [11]:
args = TrainingArguments(
    output_dir= Variables.FINE_TUNED_MODEL_PATH,
    num_train_epochs=1,
    per_device_train_batch_size=4,#6 if use_flash_attention2 else 2, # you can play with the batch size depending on your hardware
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=use_flash_attention2,
    fp16=not use_flash_attention2,
    tf32=use_flash_attention2,
    max_grad_norm=0.3,
    warmup_steps=5,
    lr_scheduler_type="linear",
    disable_tqdm=False,
    report_to="none"
)

model = get_peft_model(model, peft_config)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    max_seq_length=2048,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=format_instruction, 
    args=args,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Generating train split: 143 examples [00:00, 862.48 examples/s]


In [12]:
# train
trainer.train()

# save model
trainer.save_model()

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss


In [34]:
# Path to finetuned model
finetuned_model_dir=Variables.FINE_TUNED_MODEL_PATH

# Load finetuned LLM model and tokenizer
model = AutoPeftModelForCausalLM.from_pretrained(
    finetuned_model_dir,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    load_in_4bit=True,
)
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_dir)

instruction = {
    "Instruction": "Answer the following question",
    "Input": "Explain the significance of LoRA-Guard's performance in cross-domain evaluation.",
    "Output": ""
}

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.90s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [76]:
def ask_question(instruction, temperature=0.5):
    
    prompt = format_instruction(instruction)

    input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()

    start_time = time.time()
    with torch.inference_mode():
        outputs = model.generate(input_ids=input_ids, pad_token_id=tokenizer.eos_token_id, max_new_tokens=100, do_sample=True, top_p=0.5,temperature=temperature)
    end_time = time.time()
    total_time = end_time - start_time
    output_length = len(outputs[0])-len(input_ids[0])

    output = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]

    #print(f"\nInstruction generated from finetuned model | Inference time - {total_time:.2f}s:\n")

    return output

In [56]:
def parse_output(text):    

    # Split the text at the word "Response"
    parts = text.split("Response:", 1)

    # Check if "Response" is in the text and get the part after it
    if len(parts) > 1:
        response_text = parts[1].strip()
    else:
        response_text = ""

    return response_text

parse_output(ask_question(test_dataset[2]))

# Evaluation

In [30]:
import pandas as pd
import re
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from bert_score import BERTScorer

def calculate_bleu_score(machine_results, reference_texts):
    bleu_score = corpus_bleu([[ref.split()] for ref in reference_texts], [gen.split() for gen in machine_results])
    print(f'BLEU Score: {bleu_score}')

def calculate_rouge_scores(generated_answers, ground_truth):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    total_rouge1, total_rouge2, total_rougeL = 0, 0, 0
    for gen, ref in zip(generated_answers, ground_truth):
        scores = scorer.score(gen, ref)
        total_rouge1 += scores['rouge1'].fmeasure
        total_rouge2 += scores['rouge2'].fmeasure
        total_rougeL += scores['rougeL'].fmeasure
    average_rouge1 = total_rouge1 / len(generated_answers)
    average_rouge2 = total_rouge2 / len(generated_answers)
    average_rougeL = total_rougeL / len(generated_answers)
    print(f'Average ROUGE-1: {average_rouge1}')
    print(f'Average ROUGE-2: {average_rouge2}')
    print(f'Average ROUGE-L: {average_rougeL}')

In [69]:
new_test_dataset = []
for dict_ in test_dataset:
    dict_['Output'] = ''
    new_test_dataset.append(dict_)

In [81]:
generated_answer = parse_output(ask_question(new_test_dataset[0]))
ground_truth = test_dataset[0]['Output']

In [86]:
calculate_rouge_scores(generated_answer,ground_truth)

Average ROUGE-1: 0.020654044750430294
Average ROUGE-2: 0.0
Average ROUGE-L: 0.020654044750430294
