<a href="https://colab.research.google.com/github/myrondza10/llama2-llm-genai/blob/main/llama2_finetuning_peft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install Required Libraries & Packages

In [None]:
pip install transformers==4.31.0 accelerate==0.21.0 einops==0.6.1 langchain==0.0.240 xformers==0.0.20 bitsandbytes==0.41.0 peft safetensors sentencepiece streamlit langchain sentence-transformers gradio pypdf chromadb==0.4.15 pypdfium2

Collecting accelerate==0.21.0
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops==0.6.1
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain==0.0.240
  Downloading langchain-0.0.240-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xformers==0.0.20
  Downloading xformers-0.0.20-cp310-cp310-manylinux2014_x86_64.whl (109.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m109.1/109.1 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes==0.41.0
  Downloading bitsandbytes-0.41.0-py3-none-any.whl (92.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Pretrained LLaMa2 Model 🤗

<img src="https://img.freepik.com/premium-photo/llama-art-wallpaper-showcasing-arriving-new-revolutionary-ai-model_843415-12331.jpg" height=600 width=600>

In [None]:
!pip install peft==0.4.0 trl==0.4.7

Collecting trl==0.4.7
  Downloading trl-0.4.7-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.4/77.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from trl==0.4.7)
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets->trl==0.4.7)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets->trl==0.4.7)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets->trl==0.4.7)
  Downloading multiprocess-0.70.16-py3

### Import Required Libraries & Packages

In [None]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
from datasets import load_dataset

In [None]:
# ---------------------------------------------------------------------------
# QLoRA adapter settings -- forwarded to LoraConfig further down.
# ---------------------------------------------------------------------------
lora_r = 64          # passed as `r`
lora_alpha = 16      # passed as `lora_alpha`
lora_dropout = 0.1   # passed as `lora_dropout`

# ---------------------------------------------------------------------------
# bitsandbytes settings -- forwarded to BitsAndBytesConfig.
# ---------------------------------------------------------------------------
use_4bit = True                       # passed as `load_in_4bit`
bnb_4bit_quant_type = "nf4"
bnb_4bit_compute_dtype = "float16"    # attribute name looked up on `torch` later
use_nested_quant = False              # passed as `bnb_4bit_use_double_quant`

# ---------------------------------------------------------------------------
# TrainingArguments settings.
# NOTE(review): `per_device_eval_batch_size` and `gradient_checkpointing` are
# declared here but are not forwarded by the TrainingArguments call visible in
# this notebook -- confirm whether that is intentional.
# ---------------------------------------------------------------------------
output_dir = "./results"
num_train_epochs = 1
max_steps = -1

# Mixed-precision switches (both disabled).
fp16 = False
bf16 = False

# Batching.
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
group_by_length = True

# Optimizer and learning-rate schedule.
optim = "paged_adamw_32bit"
learning_rate = 2e-4
weight_decay = 0.001
max_grad_norm = 0.3
lr_scheduler_type = "cosine"
warmup_ratio = 0.03

# Checkpointing / logging cadence, in steps.
save_steps = 0
logging_steps = 25

# ---------------------------------------------------------------------------
# SFTTrainer settings and device placement.
# ---------------------------------------------------------------------------
max_seq_length = None
packing = False
device_map = {"": 0}

In [None]:
# Turn the dtype name from the configuration cell into the matching torch dtype.
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# Quantization settings handed to `from_pretrained` when the model is loaded.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    load_in_4bit=use_4bit,
)

In [None]:
# Print a hint when 4-bit loading is configured with a float16 compute dtype and
# the GPU reports a major compute capability of 8 or higher.
# The `torch.cuda.is_available()` guard keeps this cell from raising when no
# CUDA device is visible, since `get_device_capability()` needs one.
if compute_dtype == torch.float16 and use_4bit and torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

### Import Meta's Llama2 Model from Hugging Face 🤗

In [None]:
# Hugging Face Hub identifier of the checkpoint to fine-tune.
model_id = 'meta-llama/Llama-2-7b-chat-hf'

# Use the current CUDA device when one is available, otherwise fall back to the CPU.
device = f'cuda:{torch.cuda.current_device()}' if torch.cuda.is_available() else 'cpu'

# Begin initializing HF items, you need an access token.
# The token is read from the HF_TOKEN environment variable so that it is never
# pasted into (and saved with) the notebook. The placeholder is only a fallback
# and is not a real token. (The original line was an unterminated string literal.)
hf_auth = os.environ.get('HF_TOKEN', '<hf_access_token>')

In [None]:
# Show which device string was selected in the previous cell.
device

'cuda:0'

In [None]:
import transformers

In [None]:
# Fetch the model configuration for `model_id`.
# NOTE(review): `model_config` is not referenced by any other cell visible in
# this notebook -- confirm whether it is still needed.
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

# Load the causal-LM weights with the quantization settings from `bnb_config`.
# `device_map` comes from the configuration cell; it was declared there but was
# previously never passed to the loader.
# NOTE(review): `trust_remote_code=True` permits execution of code shipped with
# the model repository -- confirm it is required for this checkpoint.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
    use_auth_token=hf_auth
)

# Tokenizer that belongs to the same checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



### Llama2's Model Architecture

In [None]:
# Display the module structure of the loaded model.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )


In [None]:
# Tokenizer: reuse the end-of-sequence token as the padding token and pad on
# the right-hand side.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

# Model config overrides applied before training.
# NOTE(review): presumably the cache is switched off for training and
# `pretraining_tp = 1` selects the non-tensor-parallel code path -- confirm
# against the transformers Llama documentation.
model.config.pretraining_tp = 1
model.config.use_cache = False

In [None]:
# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

In [None]:
from datasets import load_dataset
import pandas as pd

# Parquet file holding the training examples, relative to the working directory.
parquet_file_path = "train.parquet"

# Read the file and keep only its final 10 rows as a small training sample.
dataset = pd.read_parquet(parquet_file_path).iloc[-10:]

In [None]:
# Display the sampled rows; at this point `dataset` is a pandas DataFrame.
dataset

Unnamed: 0,text
990,<s>[INST] Voglio gioccare un gioco di ruolo. I...
991,<s>[INST] What is the hardest instrument to pl...
992,<s>[INST] Can you list the 5 most popular open...
993,<s>[INST] What is the airspeed velocity of an ...
994,<s>[INST] Cuéntame un chiste sobre un tomate y...
995,<s>[INST] I want you to act as a Linux termina...
996,<s>[INST] quiero un tutorial de como acceder a...
997,"<s>[INST] Auf Mani reimt sich Banani, daraus w..."
998,<s>[INST] Buenos días! [/INST] ¡Hola! ¿Cómo es...
999,<s>[INST] Est-tu mieux que Chat-GPT? [/INST] P...


In [None]:
from datasets import Dataset

# Convert the pandas frame into a `datasets.Dataset`.
# The name `dataset` is rebound in place, so on a second run of this cell it is
# already a Dataset; the isinstance guard makes the cell safe to re-run instead
# of passing a Dataset to `from_pandas`.
if not isinstance(dataset, Dataset):
    dataset = Dataset.from_pandas(dataset)

In [None]:
# Display the converted object; `dataset` is now a `datasets.Dataset`.
dataset

Dataset({
    features: ['text'],
    num_rows: 10
})

In [None]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)



Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=3, training_loss=2.6065750122070312, metrics={'train_runtime': 50.1436, 'train_samples_per_second': 0.199, 'train_steps_per_second': 0.06, 'total_flos': 79137734492160.0, 'train_loss': 2.6065750122070312, 'epoch': 1.0})

### Update LLM Model Weights (Merge Finetuned Weights)

In [None]:
# Directory name under which the trained model is written.
# (Despite its name, `finetuned_model` holds a path string, not a model object.)
finetuned_model = "llama-2-7b-finetuned"

# Persist the trainer's model to that directory.
trainer.model.save_pretrained(save_directory=finetuned_model)

In [None]:
# Show the directory name the model was saved under.
finetuned_model

'llama-2-7b-finetuned'

In [None]:
# Wrap `model` with the adapter stored in the `finetuned_model` directory.
# NOTE(review): `model` was also handed to SFTTrainer together with a
# peft_config; confirm it is still the plain base model here, otherwise the
# base-vs-fine-tuned comparison in the following cells is not a clean one.
# No merge call appears in the cells shown, so the adapter is attached rather
# than merged.
model_final = PeftModel.from_pretrained(
    model=model,
    model_id=finetuned_model,
)

In [None]:
# Display the module structure of the adapter-wrapped model.
model_final

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear4bit(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (

In [None]:
# Generate a reply from `model` for a single instruction-style prompt.
prompt = "Can you list the 5 most popular open source software licenses?"

# Text-generation pipeline with a total length limit of 200.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
)

# The prompt is wrapped in the same `<s>[INST] ... [/INST]` template that the
# training rows use.
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Can you list the 5 most popular open source software licenses? [/INST]  Sure! Here are five of the most popular open source software licenses:
 nobody, the GNU General Public License (GPL), the MIT License, the Apache License, and the Eclipse Public License.

1. GNU General Public License (GPL): The GPL is one of the most widely used open source licenses, created by the Free Software Foundation (FSF) in 1989. It requires that any derivative works of a GPL-licensed program also be distributed under the GPL, ensuring that the software remains free and open source.
2. MIT License: The MIT License is a permissive open source license, which means it allows users to modify and distribute the software without having to release their changes under the same license. It is commonly used for software that is intended to be widely used and distributed


In [None]:
# Generate a reply from the adapter-wrapped `model_final` for the same prompt
# that was used with `model`, so the two outputs can be compared.
prompt = "Can you list the 5 most popular open source software licenses?"

# Text-generation pipeline with a total length limit of 200.
pipe = pipeline(
    "text-generation",
    model=model_final,
    tokenizer=tokenizer,
    max_length=200,
)

# The prompt is wrapped in the same `<s>[INST] ... [/INST]` template that the
# training rows use.
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausal

<s>[INST] Can you list the 5 most popular open source software licenses? [/INST]  Sure! Here are the 5 most popular open source software licenses, based on the number of projects using them:
 sierp 2022 data:

1. MIT License: The MIT License is one of the most widely used open source licenses, with over 100,000 projects using it. It is a permissive license that allows for free use, modification, and distribution of the software.
2. Apache License 2.0: The Apache License 2.0 is the second most popular open source license, with over 60,000 projects using it. It is a permissive license that allows for free use, modification, and distribution of the software, with some restrictions on patent rights.
3. GNU General Public License (GPL): The GPL is a widely
