In [None]:
# install the required libraries

! pip install accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.30.2 trl==0.4.7 --quiet

In [None]:
# Log in to Weights & Biases for model performance tracking.
# The API key is fetched through kaggle_secrets' UserSecretsClient under the
# secret name "WANDB_API_KEY", so it is never hard-coded in the notebook.

from kaggle_secrets import UserSecretsClient
import wandb

user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("WANDB_API_KEY") 

wandb.login(key=wandb_api)

In [None]:
# Silence every Python warning for the rest of the session to keep cell output
# readable. NOTE: this also hides deprecation/compatibility warnings.

import warnings
warnings.filterwarnings("ignore")

### Dataset preparation

In [None]:
# Load the DialogSum dataset from the Hugging Face Hub:
# https://huggingface.co/datasets/knkarthick/dialogsum

from datasets import load_dataset

dataset_name = 'knkarthick/dialogsum' 

# With no `split` argument, every available split is loaded.
ds = load_dataset(dataset_name)

# Display the loaded object to inspect the available splits.
ds

In [None]:
# Load the full train split and only the first 200 rows of the test split.
# Passing a list to `split` yields one dataset per entry, in the same order.

train_ds , test_ds = load_dataset(dataset_name,split =['train', 'test[0:200]'])

In [None]:
# Convert the datasets to pandas DataFrames so the instruction-finetuning
# prompt column can be built from the existing columns.

import pandas as pd

train_df = pd.DataFrame(train_ds)
test_df = pd.DataFrame(test_ds)

In [None]:
# Preview the first rows of the training DataFrame.

train_df.head()

Write a concise summary of the following text which starts with ### Input: \n
Return your response in bullet points which covers the key points of the text.

In [None]:
# instruction finetuning data preparation function

def prepare_dataset(df, split='train', instruction=None):
    """Add a ``text`` column holding the instruction-tuning prompt for each row.

    Each prompt has the layout::

        ### Instruction: 
        <instruction>
        ### Input: 
        <dialogue>
        ### Response :
        <summary>          (only when split == 'train')

    Args:
        df: DataFrame with a ``dialogue`` column and, for the train split,
            a ``summary`` column. Both are expected to hold strings.
        split: ``'train'`` appends the reference summary after the response
            header; any other value leaves the response empty so the model
            has to generate it.
        instruction: Task instruction placed at the top of every prompt.
            ``None`` (the default) uses the summarisation instruction this
            notebook was written with; pass another string for other tasks.

    Returns:
        The same ``df`` object, modified in place with the new ``text`` column.
    """
    if instruction is None:
        # Default kept byte-for-byte so existing prompts do not change.
        instruction = """Write a concise summary of the below input text.Return your response in bullet points which covers the key points of the text. """

    # Everything before the dialogue and the response header are identical for
    # every row, so build them once instead of once per row.
    prompt_prefix = "### Instruction: \n" + instruction + "\n### Input: \n"
    response_header = "\n### Response :\n"

    if split == 'train':
        # Training prompts include the target summary.
        text_col = [
            prompt_prefix + dialogue + response_header + summary
            for dialogue, summary in zip(df["dialogue"], df["summary"])
        ]
    else:
        # Inference prompts stop at the response header.
        text_col = [
            prompt_prefix + dialogue + response_header
            for dialogue in df["dialogue"]
        ]

    df.loc[:, 'text'] = text_col
    return df

In [None]:
# Build the prompt column: train prompts include the reference summary,
# test prompts end at the response header.
train_df = prepare_dataset(train_df,'train')
test_df = prepare_dataset(test_df,'test')

In [None]:
# Preview the training DataFrame; it now carries the new `text` column.
train_df.head()

In [None]:
# Print one full training prompt (instruction + input + response) to check its layout.
print(train_df['text'][0])

In [None]:
# Preview the test DataFrame; it now carries the new `text` column.
test_df.head()

In [None]:
# Print one test prompt; it ends at the response header, with no summary appended.
print(test_df['text'][0])

In [None]:
# Convert the training DataFrame back to a Hugging Face Dataset, which is the
# form passed to the trainer below.
from datasets import Dataset
dataset = Dataset.from_pandas(train_df)

In [None]:
# Display the Dataset summary (features and row count).
dataset

### Loading the sharded Llama-2 model in Quantized format

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Sharded Llama-2 7B checkpoint on the Hugging Face Hub.
model_name = "TinyPixel/Llama-2-7B-bf16-sharded"

#model_name = 'NousResearch/Llama-2-7b-hf'

# Quantization config: load the weights in 4-bit NF4 and run the matmuls in
# float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
)

# Load the model with the quantization config; device_map='auto' lets
# accelerate decide where each layer is placed.
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to be executed locally — confirm it is actually required for
# this checkpoint before keeping it.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map='auto'
)
# The key/value cache is only useful for generation; turn it off for training.
model.config.use_cache = False

In [None]:
# Create the tokenizer that matches the Llama-2 checkpoint.
# NOTE(review): return_token_type_ids=False is forwarded to the tokenizer
# constructor, presumably so encodings omit token_type_ids — confirm.

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True , return_token_type_ids=False)
# Reuse the end-of-sequence token for padding so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

### QLoRA Configuration

In [None]:
# Print the module tree; the layer names shown here are what `target_modules`
# in the LoRA config below refers to.
print(model)

In [None]:
from peft import LoraConfig, get_peft_model

lora_alpha = 16  # LoRA scaling factor
lora_dropout = 0.05 
lora_r = 8 # rank

# Parameter-efficient finetuning: LoRA configuration

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules= ["q_proj","v_proj"], # adapters are created only for the q and v matrices of the attention module
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

### Finetuning Process

In [None]:
# Define the fine-tuning arguments.
# - Effective batch size per device is 4 * 4 = 16 sequences per optimizer step
#   (per_device_train_batch_size * gradient_accumulation_steps).
# - max_steps is positive, so per the TrainingArguments documentation it
#   overrides num_train_epochs: training stops after 100 optimizer steps.
# - fp16=True enables float16 mixed-precision training.

import transformers

training_arguments = transformers.TrainingArguments(
        output_dir="llama2_qlora_finetuned",
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        learning_rate=2e-4,
        lr_scheduler_type="linear",
        save_strategy="epoch",
        logging_steps=10,
        num_train_epochs=3,
        max_steps=100,
        fp16=True,
        push_to_hub=False
    )

In [None]:
# Create the supervised fine-tuning trainer from the training arguments.
# Passing peft_config makes the trainer wrap `model` with LoRA adapters.

from trl import SFTTrainer
trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=peft_config, # LoRA configuration defined above
        dataset_text_field="text", # column holding the full prompt
        args=training_arguments, # training arguments
        tokenizer=tokenizer, # tokenizer
        packing=False,
        max_seq_length=512
    )

In [None]:
# Upcast every normalization layer to float32 for more stable training.
# nn.Module.to() converts the module in place, so its return value is not needed.

for name, module in trainer.model.named_modules():
    if "norm" not in name:
        continue
    module.to(torch.float32)

In [None]:
# Start the fine-tuning run with the trainer configured above.

trainer.train()

### Save the LoRA adapters (you can also push these adapters to the Hugging Face model hub for future inference)

In [None]:
# Under distributed/parallel training the real model sits behind a `.module`
# attribute; fall back to the model itself when there is no such wrapper.
model_to_save = getattr(trainer.model, 'module', trainer.model)
model_to_save.save_pretrained("outputs")

In [None]:
# Attach the *trained* LoRA adapters saved in 'outputs' to the base Llama-2 model.
#
# get_peft_model() only builds freshly initialised adapters from a config; it
# never reads the saved adapter weights, so using it here would run inference
# without the fine-tuning. PeftModel.from_pretrained() reads both the adapter
# config and the adapter weights from the directory.

from peft import PeftModel

model = PeftModel.from_pretrained(model, 'outputs')
# Evaluation mode disables LoRA dropout during generation.
model.eval()

### Inference using Llama-2 + QLoRA adapters

In [None]:
# Pick the first test prompt (it ends at the response header) and display it;
# the generation cell below uses `text` as its input.
text = test_df['text'][0]
print(text)

### <b>Update:</b>

Added `repetition_penalty=1.2` to avoid repetition of the input task in the generated output.

In [None]:
# Tokenize the prompt and move the tensors to the model's device: the model
# was placed by device_map='auto', while the tokenizer returns CPU tensors.
inputs = tokenizer(text, return_tensors="pt").to(model.device)
# repetition_penalty=1.2 discourages the model from echoing the prompt.
outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_new_tokens=100 ,repetition_penalty=1.2)
# The decoded sequence contains the prompt followed by the generated summary.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# Another example: run inference on the test prompt at index 100.
text = test_df['text'][100]

# Tokenize the prompt and move the tensors to the model's device: the model
# was placed by device_map='auto', while the tokenizer returns CPU tensors.
inputs = tokenizer(text, return_tensors="pt").to(model.device)
# repetition_penalty=1.2 discourages the model from echoing the prompt.
outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_new_tokens=100,repetition_penalty=1.2)
# The decoded sequence contains the prompt followed by the generated summary.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))