In [1]:
%%capture

!pip install bitsandbytes accelerate peft trl

use_flash_attention2 = False

import torch
# Replace attention with flash attention 
if torch.cuda.get_device_capability()[0] >= 8:
    use_flash_attention2 = True

print(f"Using flash attention 2: {use_flash_attention2}")

if use_flash_attention2:
    !pip install flash-attn --no-build-isolation --upgrade

import torch
print(torch.cuda.is_available())

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
import torch
from datasets import load_dataset, DatasetDict
from dotenv import load_dotenv
from random import seed
import os
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from trl import SFTTrainer
import utils

# Set seed for reproducibility
seed(42)

class EnvironmentLoader:
    @staticmethod
    def load_env():
        load_dotenv()

class DatasetHandler:
    def __init__(self, data_path):
        self.data_path = data_path

    def load_and_split_dataset(self):
        dataset = load_dataset("json", data_files=self.data_path)
        train_test_split = dataset['train'].train_test_split(test_size=0.2)
        dataset_dict = DatasetDict({
            'train': train_test_split['train'],
            'test': train_test_split['test']
        })
        return dataset_dict['train'], dataset_dict['test']

    @staticmethod
    def format_instruction(sample):
        return f"""
        Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

        ### Instruction:
        {sample['Instruction']}

        ### Input:
        {sample['Input']}

        ### Response:
        {sample['Output']}
        """

class ModelManager:
    def __init__(self, model_id, use_flash_attention2, hf_token):
        self.model_id = model_id
        self.use_flash_attention2 = use_flash_attention2
        self.hf_token = hf_token
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16 if use_flash_attention2 else torch.float16
        )
    
    def load_model_and_tokenizer(self):
        model = AutoModelForCausalLM.from_pretrained(
            self.model_id, 
            quantization_config=self.bnb_config, 
            use_cache=False, 
            device_map="auto",
            token=self.hf_token,  
            attn_implementation="flash_attention_2" if self.use_flash_attention2 else "sdpa"
        )
        model.config.pretraining_tp = 1

        tokenizer = AutoTokenizer.from_pretrained(
            self.model_id,
            token=self.hf_token
        )
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"
        
        return model, tokenizer

    def save_model_and_tokenizer(self, model, tokenizer, save_directory):
        model.save_pretrained(save_directory)
        tokenizer.save_pretrained(save_directory)
    
    @staticmethod
    def prepare_for_training(model):
        return prepare_model_for_kbit_training(model)

class Trainer:
    def __init__(self, model, tokenizer, train_dataset, peft_config, use_flash_attention2, output_dir):
        self.model = model
        self.tokenizer = tokenizer
        self.train_dataset = train_dataset
        self.peft_config = peft_config
        self.args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=1,
            per_device_train_batch_size=4,
            gradient_accumulation_steps=4,
            gradient_checkpointing=True,
            optim="paged_adamw_8bit",
            logging_steps=10,
            save_strategy="epoch",
            learning_rate=2e-4,
            bf16=use_flash_attention2,
            fp16=not use_flash_attention2,
            tf32=use_flash_attention2,
            max_grad_norm=0.3,
            warmup_steps=5,
            lr_scheduler_type="linear",
            disable_tqdm=False,
            report_to="none"
        )
        self.model = get_peft_model(self.model, self.peft_config)

    def train_model(self, format_instruction_func):
        trainer = SFTTrainer(
            model=self.model,
            train_dataset=self.train_dataset,
            peft_config=self.peft_config,
            max_seq_length=2048,
            tokenizer=self.tokenizer,
            packing=True,
            formatting_func=format_instruction_func, 
            args=self.args,
        )
        trainer.train()
        return trainer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
EnvironmentLoader.load_env()
dataset_handler = DatasetHandler(data_path=utils.Variables.INSTRUCTION_DATASET_JSON_PATH)
train_dataset, test_dataset = dataset_handler.load_and_split_dataset()

new_test_dataset = []
for dict_ in test_dataset:
    dict_['Output'] = ''
    new_test_dataset.append(dict_)

model_manager = ModelManager(
    model_id="meta-llama/Meta-Llama-3-8B",
    use_flash_attention2=True,
    hf_token=os.environ["HF_TOKEN"]
)
model, tokenizer = model_manager.load_model_and_tokenizer()
model_manager.save_model_and_tokenizer(model, tokenizer, save_directory=utils.Variables.BASE_MODEL_PATH)
model = model_manager.prepare_for_training(model)

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",
    ]
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    peft_config=peft_config,
    use_flash_attention2=True,
    output_dir=utils.Variables.FINE_TUNED_MODEL_PATH
)
trained_model = trainer.train_model(format_instruction_func=dataset_handler.format_instruction)
trained_model.save_model()

Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.78s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Generating train split: 122 examples [00:00, 687.44 examples/s]
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import os
from dotenv import load_dotenv
import time
from random import randrange, sample, seed

import torch
import os
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

seed(42)

from datasets import load_dataset
import utils
dataset = load_dataset("json", data_files=utils.Variables.INSTRUCTION_DATASET_JSON_PATH, split="train")

def format_instruction(sample):
	return f"""    
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{sample['Instruction']}

### Input:
{sample['Input']}

### Response:
{sample['Output']}
"""

from datasets import load_dataset, DatasetDict
import utils
# Load the entire dataset
dataset = load_dataset("json", data_files=utils.Variables.INSTRUCTION_DATASET_JSON_PATH)

# Split the dataset into training and testing sets
train_test_split = dataset['train'].train_test_split(test_size=0.2)

# Create a DatasetDict to hold the splits
dataset_dict = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

# Now you have separate training and testing sets
train_dataset = dataset_dict['train']
test_dataset = dataset_dict['test']


new_test_dataset = []
for dict_ in test_dataset:
    dict_['Output'] = ''
    new_test_dataset.append(dict_)

# Load environment variables from .env file
load_dotenv()

# Hugging Face model id
model_id = "meta-llama/Meta-Llama-3-8B"
# model_id = "mistralai/Mistral-7B-v0.1"

# BitsAndBytesConfig int-4 config 
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if use_flash_attention2 else torch.float16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    quantization_config=bnb_config, 
    use_cache=False, 
    device_map="auto",
    token=os.environ["HF_TOKEN"],  # if model is gated like llama or mistral
    attn_implementation="flash_attention_2" if use_flash_attention2 else "sdpa"
)
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=os.environ["HF_TOKEN"],  # if model is gated like llama or mistral
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Specify the directory where you want to save the model and tokenizer
save_directory = utils.Variables.BASE_MODEL_PATH

# Save the model
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)


# LoRA config based on QLoRA paper
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj", 
            "up_proj", 
            "down_proj",
        ]
)

# Prepare model for training
model = prepare_model_for_kbit_training(model)


args = TrainingArguments(
    output_dir=utils.Variables.FINE_TUNED_MODEL_PATH,
    num_train_epochs=1,
    per_device_train_batch_size=4,#6 if use_flash_attention2 else 2, # you can play with the batch size depending on your hardware
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=use_flash_attention2,
    fp16=not use_flash_attention2,
    tf32=use_flash_attention2,
    max_grad_norm=0.3,
    warmup_steps=5,
    lr_scheduler_type="linear",
    disable_tqdm=False,
    report_to="none"
)

model = get_peft_model(model, peft_config)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    max_seq_length=2048,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=format_instruction, 
    args=args,
)

# train
trainer.train()

# save model
trainer.save_model()

Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.92s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Generating train split: 114 examples [00:00, 679.80 examples/s]
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss


# Evaluation

In [25]:
import pandas as pd
import re
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from bert_score import BERTScorer
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM
import time

def calculate_bleu_score(machine_results, reference_texts):
    bleu_score = corpus_bleu([[ref.split()] for ref in reference_texts], [gen.split() for gen in machine_results])
    print(f'BLEU Score: {bleu_score}')

def calculate_rouge_scores(generated_answers, ground_truth):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    total_rouge1, total_rouge2, total_rougeL = 0, 0, 0
    for gen, ref in zip(generated_answers, ground_truth):
        scores = scorer.score(gen, ref)
        total_rouge1 += scores['rouge1'].fmeasure
        total_rouge2 += scores['rouge2'].fmeasure
        total_rougeL += scores['rougeL'].fmeasure
    average_rouge1 = total_rouge1 / len(generated_answers)
    average_rouge2 = total_rouge2 / len(generated_answers)
    average_rougeL = total_rougeL / len(generated_answers)
    return {'average_rouge1':average_rouge1,
            'average_rouge2':average_rouge2,
            'average_rougeL':average_rougeL}
    print(f'Average ROUGE-1: {average_rouge1}')
    print(f'Average ROUGE-2: {average_rouge2}')
    print(f'Average ROUGE-L: {average_rougeL}')

class ModelHandler:

    def __init__(self):
        pass

    def loading_model(self, model_chosen='fine_tuned_model'):

        if model_chosen == 'fine_tuned_model':
            model_dir=utils.Variables.FINE_TUNED_MODEL_PATH
            self.model = AutoPeftModelForCausalLM.from_pretrained(
                model_dir,
                low_cpu_mem_usage=True,
                torch_dtype=torch.float16,
                load_in_4bit=True,
                )

        elif model_chosen == 'base_model':
            model_dir=utils.Variables.BASE_MODEL_PATH
            self.model = AutoModelForCausalLM.from_pretrained(
                model_dir,
                low_cpu_mem_usage=True,
                torch_dtype=torch.float16,
                load_in_4bit=True,
                )

        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)

    def ask_question(self, instruction, temperature=0.5, max_new_tokens = 1000):

        prompt = format_instruction(instruction)

        input_ids = self.tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()

        start_time = time.time()
        with torch.inference_mode():
            outputs = self.model.generate(input_ids=input_ids, pad_token_id=self.tokenizer.eos_token_id, max_new_tokens=max_new_tokens, do_sample=True, top_p=0.5,temperature=temperature)
        end_time = time.time()

        total_time = end_time - start_time
        output_length = len(outputs[0])-len(input_ids[0])

        self.output = self.tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]

        return self.output

    def parse_output(self):    

        # Split the text at the word "Response"
        parts = self.output.split("Response:", 1)

        # Check if "Response" is in the text and get the part after it
        if len(parts) > 1:
            response_text = parts[1].strip()
        else:
            response_text = ""

        return response_text

In [26]:
modelhandler_finetuned = ModelHandler()
modelhandler_finetuned.loading_model(model_chosen = 'fine_tuned_model')
ground_truths_list = []
finetuned_model_generated_answers = []

for triple_with_output, triple_without_output in zip(test_dataset, new_test_dataset):

    modelhandler_finetuned.ask_question(triple_without_output)
    finetuned_model_generated_answer = modelhandler_finetuned.parse_output()
    ground_truth = triple_with_output['Output']

    finetuned_model_generated_answers.append(finetuned_model_generated_answer)
    ground_truths_list.append(ground_truth)

    
rouge_score_fine_tuned_model = calculate_rouge_scores(finetuned_model_generated_answers,ground_truths_list)
rouge_score_fine_tuned_model

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 4/4 [00:12<00:00,  3.10s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'average_rouge1': 0.3830173835854431,
 'average_rouge2': 0.21161541912653067,
 'average_rougeL': 0.3213611920078565}

In [30]:
modelhandler_base = ModelHandler()
modelhandler_base.loading_model(model_chosen = 'base_model')
base_model_generated_answers = []
ground_truths_list = []

for triple_with_output, triple_without_output in zip(test_dataset, new_test_dataset):

    print(rf'Running triple {triple_with_output}')

    modelhandler_base.ask_question(triple_without_output)
    base_model_generated_answer = modelhandler_base.parse_output()
    ground_truth = triple_with_output['Output']

    base_model_generated_answers.append(base_model_generated_answer)
    ground_truths_list.append(ground_truth)
    
rouge_score_base_model = calculate_rouge_scores(base_model_generated_answers,ground_truths_list)
rouge_score_base_model

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Running triple {'Instruction': 'Identify the dataset used in the LLM-Seq template for sequential feature selection.', 'Input': '"E.1 LLM-Seq Template for Credit-G"', 'Output': '"Credit-G dataset"'}
Running triple {'Instruction': 'Summarize the performance of GPTQT on the PTB dataset.', 'Input': '"OPT perplexity results on the PTB dataset are also provided in Tab.III, demonstrating that GPTQT’s effectiveness is not limited to specific datasets."', 'Output': '"GPTQT performs well on the PTB dataset, with minimal increase in perplexity compared to the original fp16 model."\n\n'}
Running triple {'Instruction': 'Explain how the LANE framework generates explanation information for recommendations.', 'Input': 'The text describing the explanation generation process in the LANE framework.', 'Output': 'The LANE framework generates explanation information by inputting user interaction sequences, attention weights, and multiple preferences into predefined prompt templates, which are then processed

{'average_rouge1': 0.2524191394349585,
 'average_rouge2': 0.13402054342344535,
 'average_rougeL': 0.2115590931984475}

In [19]:
modelhandler_base = ModelHandler()
modelhandler_base.loading_model(model_chosen = 'base_model')
rouge_scores_base_model = []

for triple_with_output, triple_without_output in zip(test_dataset, new_test_dataset):

    modelhandler_base.ask_question(triple_without_output)
    base_model_generated_answer = modelhandler_base.parse_output()
    ground_truth = triple_with_output['Output']
    rouge_score = calculate_rouge_scores(base_model_generated_answer,ground_truth)
    rouge_scores_base_model.append(rouge_score)

rouge_score

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.01it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Average ROUGE-1: 0.005063291139240506
Average ROUGE-2: 0.0
Average ROUGE-L: 0.005063291139240506


In [16]:
model_dir=utils.Variables.BASE_MODEL_PATH

model = AutoModelForCausalLM.from_pretrained(
model_dir,
low_cpu_mem_usage=True,
torch_dtype=torch.float16,
load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_dir)


prompt = format_instruction(test_dataset[15])

input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()

start_time = time.time()
with torch.inference_mode():
    outputs = model.generate(input_ids=input_ids, pad_token_id=tokenizer.eos_token_id, max_new_tokens=100, do_sample=True, top_p=0.5,temperature=0.5)
end_time = time.time()

total_time = end_time - start_time
output_length = len(outputs[0])-len(input_ids[0])

output = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.12s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [17]:
output

'    \nBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nSummarize the purpose of the KGYM platform and KBENCH dataset.\n\n### Input:\n"We introduce KGYM (a platform) and KBENCH (a dataset) to evaluate if ML models are useful while developing large-scale systems-level software like the Linux kernel."\n\n### Response:\n"KGYM and KBENCH are introduced to assess the usefulness of ML models in developing large-scale system software like the Linux kernel."\n\n\n### Instruction:\nSummarize the purpose of the KGYM platform and KBENCH dataset.\n\n### Input:\n"We introduce KGYM (a platform) and KBENCH (a dataset) to evaluate if ML models are useful while developing large-scale systems-level software like the Linux kernel."\n\n### Response:\n"KGYM and KBENCH are introduced to assess the usefulness of ML models in developing large-scale system software like the Linux ke

ValueError: Can't find 'adapter_config.json' at '/root/Projects/llama3_8b_finetuning/models/original_llama3_model'

In [None]:
modelhandler_base = ModelHandler()
modelhandler_base.loading_model(model_chosen = 'fine_tuned_model')

for triple_with_output, triple_without_output in zip(test_dataset, new_test_dataset):

    modelhandler_base.ask_question(triple_without_output)
    base_model_generated_answer = modelhandler_base.parse_output()
    ground_truth = triple_with_output['Output']
    rouge_score = calculate_rouge_scores(base_model_generated_answer,ground_truth)
    break

rouge_score



triple = test_dataset[2]


modelhandler_base.ask_question(triple)
base_model_generated_answer = modelhandler_base.parse_output()
ground_truth = triple['Output']

calculate_rouge_scores(base_model_generated_answer,ground_truth)

In [29]:
test_dataset[2]['Output']

'The Positional Integrity Encoding approach performs similarly to full re-computation in different scenarios, with a maximum relative difference of up to 2.24% in code edition.'

In [None]:
modelhandler_fine_tuned = ModelHandler()
modelhandler_fine_tuned.loading_model(model_chosen = 'base_model')

modelhandler_fine_tuned.ask_question(test_dataset[2])
fine_tuned_model_generated_answer = modelhandler_fine_tuned.parse_output()

In [28]:
# Rouge score for the Fine Tuned Model:


modelhandler.ask_question(test_dataset[2])
generated_answer = modelhandler.parse_output()

generated_answer



calculate_rouge_scores(generated_answer,ground_truth)


'The Positional Integrity Encoding approach performs similarly to full re-computation in different scenarios, with a maximum relative difference of up to 2.24% in code edition.'

In [None]:
generated_answer = parse_output(ask_question(new_test_dataset[0]))
ground_truth = test_dataset[0]['Output']
calculate_rouge_scores(generated_answer,ground_truth)

In [21]:
test_dataset[2]

{'Instruction': 'Compare the performance of the Positional Integrity Encoding approach with full re-computation in different scenarios.',
 'Input': 'The maximum relative difference between our PIE and Full-recomputation across different model sizes and code languages is 0.3%/0.15%, 0.66%/0.79%, 1.33%/2.24% for code insertion, deletion, and edition.',
 'Output': 'The Positional Integrity Encoding approach performs similarly to full re-computation in different scenarios, with a maximum relative difference of up to 2.24% in code edition.'}

In [None]:


    # Load finetuned LLM model and tokenizer
    

    # instruction = {
    #     "Instruction": "Answer the following question",
    #     "Input": "Explain the significance of LoRA-Guard's performance in cross-domain evaluation.",
    #     "Output": ""
    # }


In [16]:
new_test_dataset = []
for dict_ in test_dataset:
    dict_['Output'] = ''
    new_test_dataset.append(dict_)

In [23]:
parse_output(ask_question(test_dataset[2]))

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.90s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


'The Positional Integrity Encoding approach performs similarly to full re-computation in different scenarios, with a maximum relative difference of up to 2.24% in code edition.'

In [17]:
generated_answer = parse_output(ask_question(new_test_dataset[0]))
ground_truth = test_dataset[0]['Output']
calculate_rouge_scores(generated_answer,ground_truth)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.78s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [86]:
calculate_rouge_scores(generated_answer,ground_truth)

Average ROUGE-1: 0.020654044750430294
Average ROUGE-2: 0.0
Average ROUGE-L: 0.020654044750430294


In [14]:
ask_question(new_test_dataset[0])

NameError: name 'new_test_dataset' is not defined

In [22]:
ask_question(new_test_dataset[0],model_dir =  utils.Variables.ORIGINAL_MODEL_PATH)

ValueError: Can't find 'adapter_config.json' at '/root/Projects/llama3_8b_finetuning/models/original_llama3_model'