## Parameter-Efficient Fine-Tuning (PEFT) of Large Language Models

In [None]:
!pip install transformers trl datasets peft accelerate bitsandbytes sentencepiece

In [None]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    logging,
    pipeline
)
from peft import LoraConfig, PeftModel, get_peft_config
from trl import SFTTrainer
import gc
from datasets import Dataset
import pandas as pd

## Add your data

In [None]:
# Load the question/answer table from the local medquad CSV file.
data = pd.read_csv("data/medquad.csv")

In [None]:
# Peek at three randomly selected rows.
data.sample(n=3)

Unnamed: 0,question,answer,source,focus_area
7176,What are the symptoms of Congenital adrenal hy...,What are the signs and symptoms of Congenital ...,GARD,Congenital adrenal hyperplasia due to cytochro...
1263,Who is at risk for Prostate Cancer? ?,Prostate cancer is the most common nonskin can...,CancerGov,Prostate Cancer
5443,What are the symptoms of Loeys-Dietz syndrome ...,What are the signs and symptoms of Loeys-Dietz...,GARD,Loeys-Dietz syndrome type 2


In [None]:
# Number of missing values in each column.
data.isna().sum()

question       0
answer         5
source         0
focus_area    14
dtype: int64

In [None]:
# (rows, columns) before rows with missing values are removed.
data.shape

(16412, 4)

In [None]:
# Discard every row that has a missing value in any column.
data = data.dropna()

In [None]:
# (rows, columns) after rows with missing values were removed.
data.shape

(16393, 4)

In [None]:
# Remove the "source" column, which no later cell in this notebook reads.
# errors="ignore" keeps the cell re-runnable (a second execution is a no-op
# instead of a KeyError), and rebinding avoids in-place mutation.
data = data.drop(columns="source", errors="ignore")

## Convert into HuggingFace datasets

In [None]:
from datasets import Dataset

In [None]:
# Convert the cleaned DataFrame into a Hugging Face Dataset.
# preserve_index=False stops the pandas index (non-contiguous after dropna)
# from being stored as a stray "__index_level_0__" feature.
data = Dataset.from_pandas(data, preserve_index=False)

In [None]:
# Show the Dataset summary (feature names and number of rows).
data

Dataset({
    features: ['question', 'answer', 'focus_area', '__index_level_0__'],
    num_rows: 16393
})

In [None]:
# Hyperparameters

# LoRA adapter settings.
LORA_ALPHA = 32
LORA_DROPOUT = 0.2
LORA_R = 4

# Optimisation / trainer settings.
LEARNING_RATE = 1e-4
NUM_EPOCHS = 2
BATCH_SIZE = 4
WEIGHT_DECAY = 0.001
MAX_GRAD_NORM = 0.3
gradient_accumulation_steps = 16
STEPS = 1  # logging interval, in optimizer steps
# "adam" is not an optimizer name that TrainingArguments(optim=...) accepts;
# "adamw_torch" is the valid identifier for PyTorch's AdamW.
OPTIM = "adamw_torch"
MAX_STEPS = 200
OUTPUT_DIR = "./results"

## Quantization configuration using Bitsandbytes

In [None]:
# 4-bit NF4 quantization with nested (double) quantization.
# bfloat16 compute is only native on GPUs with compute capability >= 8.0, so
# the capability is actually consulted here (its result used to be discarded)
# and float16 is used on older GPUs or when no CUDA device is present.
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
    )
# Let the loader decide where to place the model weights.
device_map = "auto"

## Define model and tokenizer

In [None]:
# Hub identifier of the base checkpoint that will be fine-tuned.
model_name = "pankajmathur/orca_mini_3b"

In [None]:
# Load the base causal language model using the quantization settings and
# device placement configured in the previous section.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)
# NOTE(review): presumably switches off the tensor-parallel code path used
# during pretraining - confirm against the model's configuration docs.
model.config.pretraining_tp = 1

Downloading (…)lve/main/config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/21.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00003.bin:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00003.bin:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00003.bin:   0%|          | 0.00/3.72G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [None]:
# Slow (non-fast) tokenizer for the same checkpoint. The end-of-sequence token
# is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)okenizer_config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/534k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/208 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Release cached GPU memory that PyTorch is no longer using.
torch.cuda.empty_cache()

## Define LoRA adapter

In [None]:
# LoRA adapter definition for causal language modelling, driven by the
# hyperparameter constants; no bias parameters are trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
)

## Setup training parameters

In [None]:
# Trainer settings.
# WEIGHT_DECAY and MAX_GRAD_NORM are declared with the other hyperparameters,
# so they are passed explicitly here rather than leaving the library defaults
# in effect.
# NOTE: a positive max_steps takes precedence over num_train_epochs, so the run
# stops after MAX_STEPS optimizer steps regardless of NUM_EPOCHS.
training_args = TrainingArguments(
      output_dir=OUTPUT_DIR,
      per_device_train_batch_size=BATCH_SIZE,
      gradient_accumulation_steps=gradient_accumulation_steps,
      learning_rate=LEARNING_RATE,
      weight_decay=WEIGHT_DECAY,
      max_grad_norm=MAX_GRAD_NORM,
      logging_steps=STEPS,
      num_train_epochs=NUM_EPOCHS,
      max_steps=MAX_STEPS,
)

In [None]:
# Release cached GPU memory again before the trainer is built.
torch.cuda.empty_cache()

In [None]:
def _to_training_text(example):
    """Join one question/answer record into a single training string.

    The text uses the same ``[INST] ... [/INST]`` layout as the inference
    prompt in the testing section of this notebook and ends with the
    tokenizer's EOS token so that the end of an answer is marked.
    """
    return {
        "text": (
            f"[INST] {example['question']} [/INST] "
            f"{example['answer']}{tokenizer.eos_token}"
        )
    }


# Train on a combined question+answer field: fine-tuning on the "question"
# column alone would never show the model a single answer.
train_dataset = data.map(_to_training_text)

trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        peft_config=peft_config,
        dataset_text_field="text",
        max_seq_length=512,
        tokenizer=tokenizer,
        args=training_args,
)

Map:   0%|          | 0/16393 [00:00<?, ? examples/s]

In [None]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

Step,Training Loss
1,4.5608
2,4.4345
3,4.229
4,4.3057
5,4.162
6,3.9152
7,4.0483
8,3.922
9,3.7654
10,4.1636


TrainOutput(global_step=200, training_loss=1.7866380763053895, metrics={'train_runtime': 2485.1949, 'train_samples_per_second': 5.151, 'train_steps_per_second': 0.08, 'total_flos': 4781878895923200.0, 'train_loss': 1.7866380763053895, 'epoch': 0.78})

In [None]:
# Only let CRITICAL messages from transformers through, then release cached
# GPU memory.
logging.set_verbosity(logging.CRITICAL)
torch.cuda.empty_cache()

In [None]:
# Under distributed/parallel training the real model sits behind ``.module``;
# unwrap it when that attribute exists, then save into the "outputs" directory.
model_to_save = getattr(trainer.model, "module", trainer.model)
model_to_save.save_pretrained("outputs")

## Test the model

### Using Inference pipeline

In [None]:
# Text-generation pipeline built on the model and tokenizer from above;
# generation is configured with max_length=500.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=500,
)

In [None]:
# Question that will be inserted into the prompt template.
prompt = "Who is at risk for Prostate Cancer?"

In [None]:
# Full prompt: a system message wrapped in <<SYS>> tags followed by the user
# question, all inside a single [INST] ... [/INST] turn.
# NOTE(review): the literal <s> may duplicate the BOS token that the tokenizer
# adds on its own - confirm.
template = f"""<s>[INST] <<SYS>>
You are a honest Medical assistant bot.
Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content.
Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct.
If you don't know the answer to a question, please don't share false information.
<</SYS>>

{prompt} [/INST]
"""

In [None]:
# Generate a completion for the full prompt.
result = pipe(template)

In [None]:
# The generated text contains the prompt followed by the completion; compute
# the offset of the first character after the closing [/INST] tag.
response = result[0]['generated_text']
index = response.find("[/INST]")
# str.find returns -1 when the tag is absent. Adding len("[/INST]") to that
# would give 6 and silently cut off the start of the text, so fall back to 0.
index = index + len("[/INST]") if index != -1 else 0

In [None]:
# Print only the text after the prompt, with surrounding whitespace removed.
print(response[index:].strip())

Prostate cancer can affect both men and women, but the risk of developing the disease is higher in men. Men who have a family history of prostate cancer, men who have a history of chronic inflammation, men who have a high level of testosterone, and men who have a strong genetic predisposition to the disease are at a higher risk of developing prostate cancer.


### Without using pipeline

In [None]:
from peft import get_peft_model

In [None]:
# Load the adapter saved in "outputs" together with its trained weights.
# get_peft_model() only attaches a freshly initialised adapter built from the
# config, so the fine-tuned weights on disk would never be read back.
lora_config = LoraConfig.from_pretrained('outputs')
model = PeftModel.from_pretrained(model, 'outputs')

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [None]:
# Tokenize the prompt and move the tensors to the selected device.
# The model itself is not moved: it was loaded as 4-bit bitsandbytes weights
# with device_map="auto", so it is already placed and a .to(device) call on it
# is redundant.
inputs = tokenizer(template, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=1024)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

[INST] <<SYS>>
You are a helpful, respectful and honest Medical and Legal assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>

Who is at risk for Prostate Cancer? [/INST]

Prostate cancer can occur in both men and women, but it is more common in men. The risk of developing prostate cancer increases with age. In fact, the lifetime risk of developing prostate cancer is about 1 in 6 men. African American men have a higher risk of developing prostate cancer than other men. Additionally, men with a family history of prostate cancer are also at a higher risk. It is important 

In [None]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; presumably needed so the upload in the
# next cell is authenticated.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the model to the Hub repository "devfest_bbsr2023_demo".
model.push_to_hub("devfest_bbsr2023_demo")

adapter_model.bin:   0%|          | 0.00/5.36M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lucifertrj/devfest_bbsr2023_demo/commit/fe03fa1560c573f94050372b7a7654b75465d550', commit_message='Upload model', commit_description='', oid='fe03fa1560c573f94050372b7a7654b75465d550', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Show the current GPU status (memory usage, utilisation, driver version).
!nvidia-smi

Tue Oct 31 20:53:38 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0    35W /  70W |   4031MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# List the GPUs visible to the driver.
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-9ecb8e28-fc76-7481-7af2-4d3de2c801bf)
