<a href="https://colab.research.google.com/github/mightyoctopus/lora-model-inspection/blob/main/w7_d1_LoRA_Intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Predict Product Prices

An introduction to LoRA and QLoRA.

Take a close look at the memory footprint of each model variant and at its architecture.

In [None]:
!pip install -q --upgrade torch==2.5.1+cu124 torchvision==0.20.1+cu124 torchaudio==2.5.1+cu124 --index-url https://download.pytorch.org/whl/cu124
!pip install -q requests bitsandbytes==0.46.0 transformers==4.48.3 accelerate==1.3.0
!pip install -q datasets requests peft

In [None]:
import os
import re
import math
from tqdm import tqdm
from google.colab import userdata
from huggingface_hub import login
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, set_seed
from peft import LoraConfig, PeftModel
from datetime import datetime

In [None]:
### Constants

# Hugging Face Hub ids: the base checkpoint and the fine-tuned adapter loaded on top of it.
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
FINETUNED_MODEL = "ed-donner/pricer-2024-09-13_13.04.39"


### HyperParameters for QLoRA

LORA_R = 32  # adapter rank; used when counting the adapter weights further down
LORA_ALPHA = 64  # LoRA alpha setting; not read by any cell visible in this notebook
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"]  # projections that carry adapters

In [None]:
# Authenticate with the Hugging Face Hub using the HF_TOKEN secret stored in Colab.
hf_token = userdata.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Baseline: load the base model with no quantization config at all.
# device_map="auto" leaves device placement to the library.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
)

In [None]:
# Convert the reported footprint to decimal gigabytes (value / 1e9) before printing.
footprint_gb = base_model.get_memory_footprint() / 1e9
print(f"Memory Footprint: {footprint_gb:,.1f} GB")

In [None]:
# Bare expression: the notebook shows the model's repr as the cell output so its layers can be inspected.
base_model

## Restart the session

The model will now be loaded with 8-bit quantization (a simplified setup, not the full quantization configuration).

In [None]:
# 8-bit variant: the only quantization option set is load_in_8bit.
quant_config = BitsAndBytesConfig(load_in_8bit=True)

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, device_map="auto", quantization_config=quant_config
)

In [None]:
# Footprint of the 8-bit model, scaled to decimal gigabytes (value / 1e9).
footprint_gb = base_model.get_memory_footprint() / 1e9
print(f"Footprint Memory: {footprint_gb:,.1f}GB")

In [None]:
# Bare expression: display the 8-bit model's repr as the cell output.
base_model

## Restart the session

The model will now be loaded with 4-bit quantization (using the full quantization configuration).

In [None]:
# 4-bit variant: nf4 quant type, double quantization enabled, bfloat16 compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, device_map="auto", quantization_config=quant_config
)



In [None]:
# Footprint of the 4-bit model, scaled to decimal gigabytes (value / 1e9).
footprint_gb = base_model.get_memory_footprint() / 1e9
print(f"Memory Footprint: {footprint_gb:,.2f}GB")

In [None]:
# Bare expression: display the 4-bit model's repr as the cell output.
base_model

## Restart the session

The base model is loaded with 4-bit quantization again, and the fine-tuned adapter is then applied on top so that the resulting model architecture can be inspected.

In [None]:
# Same 4-bit settings as the previous section, rebuilt here because the session was restarted.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# This quantized base model is what the fine-tuned adapter is attached to below.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, device_map="auto", quantization_config=quant_config
)

In [None]:
# Footprint of the quantized base model before the adapter is attached, in decimal gigabytes.
footprint_gb = base_model.get_memory_footprint() / 1e9
print(f"Memory Footprint: {footprint_gb:,.2f} GB")

In [None]:
# Bare expression: display the quantized base model's repr before the adapter is attached.
base_model

In [None]:
# Attach the fine-tuned adapter identified by FINETUNED_MODEL to the quantized base model.
fine_tuned_model = PeftModel.from_pretrained(
    base_model,
    FINETUNED_MODEL,
)

In [None]:
# Footprint with the adapter attached, in decimal gigabytes; compare with the base-only figure.
footprint_gb = fine_tuned_model.get_memory_footprint() / 1e9
print(f"Memory Footprint: {footprint_gb:,.2f} GB")

In [None]:
# Bare expression: display the adapter-wrapped model's repr so the added LoRA modules can be inspected.
fine_tuned_model

In [None]:
### Each of the Target Modules has 2 LoRA adapter matrices, called lora_A and lora_B.
### lora_A is (r x in_features) and lora_B is (out_features x r), so one module adds
### r * in_features + r * out_features weights. The weight update applied on top of the
### frozen layer is (alpha / r) * lora_B @ lora_A.
### Let's count the number of weights using their dimensions:


def lora_param_count(rank, in_features, out_features):
    """Return the number of weights in one module's lora_A / lora_B pair.

    Args:
        rank: LoRA rank r shared by both matrices.
        in_features: input width of the adapted linear layer.
        out_features: output width of the adapted linear layer.

    Returns:
        rank * in_features + rank * out_features
    """
    return rank * in_features + rank * out_features


in_f = 4096  # input width of every adapted projection
ATTN_OUT_F = 4096  # output width of q_proj and o_proj
KV_OUT_F = 1024  # output width of k_proj and v_proj
NUM_LAYERS = 32  # number of layers that carry adapters
BYTES_PER_PARAM = 4  # assumes adapter weights are stored as 32-bit floats

lora_q_proj = lora_param_count(LORA_R, in_f, ATTN_OUT_F)
lora_k_proj = lora_param_count(LORA_R, in_f, KV_OUT_F)
lora_v_proj = lora_param_count(LORA_R, in_f, KV_OUT_F)
lora_o_proj = lora_param_count(LORA_R, in_f, ATTN_OUT_F)

lora_layer = lora_q_proj + lora_k_proj + lora_v_proj + lora_o_proj

print(f"{lora_layer:,} parameters per layer")


# Every layer carries the same four adapters.
params = lora_layer * NUM_LAYERS

print(f"{params:,} parameters")


### Total size in MB (decimal megabytes: bytes / 1,000,000):
size = (params * BYTES_PER_PARAM) / 1_000_000
print(f"{size:.2f} MB")