# Fine-tuning of Llama using 4-bit Quantization

This notebook allows you to load Llama-7B-chat (or 13B-chat) in 4bit and train it using Google Colab and PEFT library from Hugging Face 🤗.





# Install

In [1]:
!pip install -q -U bitsandbytes
#!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install transformers==4.31 #temporary fix required owing to breaking changes on Aug 9th 2023
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.31
  Downloading transformers-4.31.0-py3-none-any.whl.metadata (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.9/116.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m73.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting un

In [2]:
# Required when training models/data that are gated on HuggingFace, and required for pushing models to HuggingFace
!pip install huggingface_hub
from huggingface_hub import notebook_login

notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Load the model to use: Llama-7B!

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "meta-llama/Llama-2-7b-chat-hf"
# model_id = "meta-llama/Llama-2-13b-chat-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

# Training Setup

Then we have to apply some preprocessing to the model to prepare it for training. For that use the `prepare_model_for_kbit_training` method from PEFT.

In [4]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [5]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [6]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8,
    lora_alpha=32,
    # target_modules=["query_key_value"],
    target_modules=["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj"], #specific to Llama models.
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 8388608 || all params: 3508801536 || trainable%: 0.23907331075678143


# Data Setup

In [12]:
from datasets import load_dataset
dataset = "task2.csv"
data = load_dataset("text", data_files=dataset)

Generating train split: 0 examples [00:00, ? examples/s]

In [13]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf')
data = data.map(lambda samples: tokenizer(samples["text"]), batched=True)



Map:   0%|          | 0/711433 [00:00<?, ? examples/s]

In [14]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Load a common dataset, english quotes, to fine tune our model on famous quotes.

# Training

Run the cell below to run the training! For the sake of the demo, we just ran it for few steps just to showcase how to use this integration with existing tools on the HF ecosystem.

In [15]:
import transformers

# needed for Llama tokenizer
tokenizer.pad_token = tokenizer.eos_token # </s>

trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1,1.9326
2,1.8488
3,1.861
4,1.8471
5,1.7835
6,1.6765
7,1.6969
8,1.6131
9,1.6507
10,1.6143


TrainOutput(global_step=10, training_loss=1.752457356452942, metrics={'train_runtime': 72.0199, 'train_samples_per_second': 0.555, 'train_steps_per_second': 0.139, 'total_flos': 145350457393152.0, 'train_loss': 1.752457356452942, 'epoch': 0.0})

# Inference

In [16]:
from transformers import TextStreamer
model.config.use_cache = True
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (

In [17]:
# Define a stream *without* function calling capabilities
def stream(user_prompt):
    runtimeFlag = "cuda:0"
    system_prompt = 'You are an AI designed to convert natural language queries into pandas commands for data analysis. Given the query, generate the appropriate pandas command.'

    B_INST, E_INST = "[INST]", "[/INST]"
    B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

    prompt = f"{B_INST} {B_SYS}{system_prompt.strip()}{E_SYS}{user_prompt.strip()} {E_INST}\n\n"

    inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)

    streamer = TextStreamer(tokenizer)

    # Despite returning the usual output, the streamer will also print the generated text to stdout.
    _ = model.generate(**inputs, streamer=streamer, max_new_tokens=500)

In [18]:
stream('What is the average value of counter_2 for cells with a value greater than 50?')

<s> [INST] <<SYS>>
You are an AI designed to convert natural language queries into pandas commands for data analysis. Given the query, generate the appropriate pandas command.
<</SYS>>

What is the average value of counter_2 for cells with a value greater than 50? [/INST]

Great! You've got me trained to generate pandas commands based on natural language queries. For the query "What is the average value of counter_2 for cells with a value greater than 50?", I would generate the following pandas command:

```
df['counter_2'][df['counter_2'] > 50].mean()
```

Explanation:

* `df` is the name of the pandas DataFrame that contains the data.
* `counter_2` is the column name that you want to perform the operation on.
* `[df['counter_2'] > 50]` selects the rows where the value in column `counter_2` is greater than 50.
* `mean()` calculates the average value of the selected rows.

Note that this command assumes that the data is already loaded in a pandas DataFrame. If the data is not already i

In [21]:
stream('What is the maximum value of counter_1 for each cell_no?')

<s> [INST] <<SYS>>
You are an AI designed to convert natural language queries into pandas commands for data analysis. Given the query, generate the appropriate pandas command.
<</SYS>>

What is the maximum value of counter_1 for each cell_no? [/INST]

Great! You've got me trained to generate pandas commands from natural language queries. Here's the command to answer your question:

`cell_no.value['counter_1'].max()`

This command uses the `value` attribute of the `cell_no` series to access the values of the `counter_1` column, and then calls the `max()` method to get the maximum value for each cell.

I hope this helps! Let me know if you have any other questions.</s>


In [22]:
stream('Find the average Bandwidth for all data points ?')

<s> [INST] <<SYS>>
You are an AI designed to convert natural language queries into pandas commands for data analysis. Given the query, generate the appropriate pandas command.
<</SYS>>

Find the average Bandwidth for all data points recorded in June [/INST]

To find the average bandwidth for all data points recorded in June using pandas, you can use the following command:
```
df['Bandwidth'].mean()
```
Explanation:

* `df`: This is the pandas DataFrame that contains the data points you want to analyze.
* `'Bandwidth'`: This is the column name of the column that contains the bandwidth values.
* `.mean()`: This is the pandas method that calculates the mean of a column.

By calling `.mean()` on the `Bandwidth` column, you are calculating the average bandwidth for all data points in the `df` DataFrame that have a `June` label.</s>


In [23]:
stream('What is the sum of counter_2 for the cell_no 1?')

<s> [INST] <<SYS>>
You are an AI designed to convert natural language queries into pandas commands for data analysis. Given the query, generate the appropriate pandas command.
<</SYS>>

What is the sum of counter_2 for the cell_no 1? [/INST]

Great! You've provided me with a query, and I'm happy to help you generate the appropriate pandas command.

Based on your query, I understand that you want to sum the value of `counter_2` for cell `1`. Here's the pandas command you can use to achieve this:
```
df['counter_2'].loc[1].sum()
```
In this command, `df` is the name of the pandas DataFrame that contains the data you want to work with, and `loc` is a method that allows you to select a specific cell or range of cells in the DataFrame.

By calling `loc[1]`, we are telling pandas to select the cell at index `1` in the DataFrame. Then, by calling `sum()`, we are summing the value of `counter_2` for that cell.

So, the final result of this command will be the sum of `counter_2` for cell `1` in

# Push Model to Hub (Hugging Face)

In [24]:
base_model_name = model_id.split("/")[-1]

adapter_model = f"lltutor/{base_model_name}-text2pandas-command-engine-fine-tuned-adapters"
new_model = f"lltutor/{base_model_name}-text2pandas-command-engine-fine-tuned"

In [25]:
# Save the model
model.save_pretrained(adapter_model, push_to_hub=True, use_auth_token=True)
# Push the model to the hub
model.push_to_hub(adapter_model, use_auth_token=True)



adapter_model.bin:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mahmoud-hussein16/Llama-2-7b-chat-hf-text2pandas-command-engine-fine-tuned-adapters/commit/47cba671335cc60ffefbdbd1019430c1222deda3', commit_message='Upload model', commit_description='', oid='47cba671335cc60ffefbdbd1019430c1222deda3', pr_url=None, pr_revision=None, pr_num=None)

In [26]:
from transformers import AutoModelForCausalLM
cache_dir = 'outputs/'
# reload the base model (you might need a pro subscription for this because you may need a high RAM environment for the 13B model since this is loading the full original model, not quantized)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='cpu', trust_remote_code=True, torch_dtype=torch.float16, cache_dir=cache_dir)

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [27]:
from peft import PeftModel

# load perf model with new adapters
model = PeftModel.from_pretrained(
    model,
    adapter_model,
)

In [28]:
model = model.merge_and_unload()   # merge adapters with the base model.

In [29]:
model.push_to_hub(new_model, use_auth_token=True, max_shard_size="5GB")

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/3.59G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/lltutor/Llama-2-7b-chat-hf-text2pandas-command-engine-fine-tuned/commit/87f99fbdc5828381d7b4d0ce32814054680f0c6e', commit_message='Upload LlamaForCausalLM', commit_description='', oid='87f99fbdc5828381d7b4d0ce32814054680f0c6e', pr_url=None, pr_revision=None, pr_num=None)

In [30]:
#Push the tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.push_to_hub(new_model, use_auth_token=True)

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lltutor/Llama-2-7b-chat-hf-text2pandas-command-engine-fine-tuned/commit/ae338bca6888922d46cd173811b2e07fa93d1f63', commit_message='Upload tokenizer', commit_description='', oid='ae338bca6888922d46cd173811b2e07fa93d1f63', pr_url=None, pr_revision=None, pr_num=None)