<a href="https://colab.research.google.com/github/neeleshbatham/LLM_Fine_Tuning/blob/main/Lamma2_Fine_Tune_LORA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m37.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [3]:
import os
import torch
from datasets import load_dataset
from transformers import(
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer


In [9]:
model_name = "NousResearch/Llama-2-7b-chat-hf"

dataset_name = "mlabonne/guanaco-llama2-1k"

new_model = "Llama-2-7b-chat-finetune-n"


# QLora Prameters

lora_r = 64

lora_alpha = 16

lora_dropout = 0.1

# BitAndByte Parameters for Quantization

use_4bit = True

bnb_4bit_compute_dtype = "float16"

# Quantization type(nf4 or fp4)
bnb_4bit_quant_type = "nf4"

use_nested_quant = False


# TrainingArgs Params

output_dir = "./results"

#Training Epochs
num_train_epochs = 1

#Enable fp16/bf16 training
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumlate gradients
gradient_accumulation_steps =1

#Enable gradient checkpointing
gradient_checkpointing = True

#Maximum gradient normal
max_grad_norm = 0.3

#intial learning rate
learning_rate = 2e-4

#Weight decay to apply to all layer except bias/LayerNorm Weights
weight_decay = 0.001

#Optimizer
optim = "paged_adamw_32bit"

#Learning rate schedule
lr_scheduler_type = "cosine"

#Number of training steps
max_steps =1

#Ratio of steps for training steps
warmup_ratio = 0.03

#Groups sequences into batch with same length
# Good for saving memory and speed.
group_by_length = True

#Save checkpoint every X updates steps
save_steps = 0

#Log Every X update steps
logging_steps =25



# SupervisedFT PARAMETERS

# Maximum sequence length to use.
max_seq_length = None

# Pack multiple short examples in same input sequence; Efficiency Boost
packing =False

# Load Model on GPU
device_map = {"": 0 }

In [10]:
# Load Dataset
dataset = load_dataset(dataset_name, split="train")

# Load tokenizer and model with QLoRA configs
compute_dtype= getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_nested_quant,

)


#Check GPU compatibulity with float16

if compute_dtype == torch.float16 and use_4bit:
  major, _ = torch.cuda.get_device_capability()
  if major >=8:
    print("=", *50)
    print("GPU Supports bfloat16: accelerate training with bf16=True")
    print("=", *50)


# Load Base Model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map=device_map
)

model.config.use_cache=True
model.config.pretraining_tp=1

# Load LLama Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load Lora Config
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"

)
# Set Training Parameters
training_arguments = TrainingArguments(
    output_dir = output_dir,
    num_train_epochs = num_train_epochs,
    per_device_train_batch_size =per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

#SuperVised FINE TUNING Trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer = tokenizer,
    args=training_arguments,
    packing=packing,
)

trainer.train()




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss


TrainOutput(global_step=1, training_loss=1.3449612855911255, metrics={'train_runtime': 15.1628, 'train_samples_per_second': 0.264, 'train_steps_per_second': 0.066, 'total_flos': 83629556367360.0, 'train_loss': 1.3449612855911255, 'epoch': 0.0})

In [11]:
#Save Trained Model

trainer.model.save_pretrained(new_model)

In [13]:
#Ignore Warnings
logging.set_verbosity(logging.CRITICAL)


# Run Text Generation pipeline with new MODEL

prompt = "What is Large Language Model?"

pipe = pipeline(task='text-generation', model=model, tokenizer=tokenizer, max_length=200)

result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] What is Large Language Model? [/INST]  A large language model (LLM) is a type of artificial intelligence (AI) model that is trained on a large dataset of text to generate language outputs that are coherent and natural-sounding. everybody. Large language models are trained on vast amounts of text data, such as books, articles, and websites, and use a combination of machine learning algorithms and neural networks to learn the patterns and structures of language.

The goal of a large language model is to be able to generate text that is similar to the training data, but not necessarily identical to it. This means that the model can produce novel and creative text, such as writing a story or generating a new sentence that has never been seen before.

Some examples of large language models include:

1. Generative Adversarial Networks (GANs): GANs are a type of neural network that consists
