# LoRA Parameter Counting Analysis

## Goal
Demonstrate the parameter efficiency of LoRA by:
1. Loading a model and inspecting its linear layer dimensions
2. Counting total parameters in each module type (q_proj, k_proj, v_proj, etc.)
3. Calculating trainable parameters for LoRA at different ranks (r=8, r=16)
4. Comparing different targeting strategies (attention-only vs. all linear layers)

This analysis shows why LoRA is so memory-efficient: instead of fine-tuning millions or billions of parameters, we train only a small fraction (typically <1%) while maintaining strong performance.

## Key Insight
For a linear layer with shape `(m, n)`, full fine-tuning requires `m*n` parameters.

LoRA with rank `r` requires only `r*(m+n)` parameters - **a massive reduction!**

**Model used**: Llama 3.2 1B Instruct (smaller model for demonstration)

![lora-diagram](lora-diagram.webp)


In [1]:
# Installation (uncomment if needed)
!pip install -q torch transformers peft accelerate bitsandbytes pandas



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
import os
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoConfig
from huggingface_hub import login


# Authenticate with Hugging Face (required for gated models like Llama)


# uncomment if you are using google colab
# from google.colab import userdata
#login(token=userdata.get("HF_TOKEN"))

from dotenv import load_dotenv

load_dotenv()
login(token=os.getenv("HF_TOKEN"))


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [6]:
# Define the order for displaying module types (attention first, then MLP)
ORDERED_TAGS = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Define the linear layer types we care about (attention + MLP projections)
TARGET_TAGS = {
    "q_proj": "q_proj",  # Query projection (attention)
    "k_proj": "k_proj",  # Key projection (attention)
    "v_proj": "v_proj",  # Value projection (attention)
    "o_proj": "o_proj",  # Output projection (attention)
    "gate_proj": "gate_proj",  # Gate projection (MLP)
    "up_proj": "up_proj",  # Up projection (MLP)
    "down_proj": "down_proj",  # Down projection (MLP)
}


In [7]:
MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"

# Determine device (GPU if available, otherwise CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}\n")

# Load model configuration first (lightweight, just for inspection)
cfg = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)

# Load the full model
print(f"Loading {MODEL_ID}...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,  # Use FP16 for memory efficiency
    low_cpu_mem_usage=True,  # Reduce RAM usage during loading
    device_map="auto",  # Automatically use available GPUs
    trust_remote_code=True,  # Allow custom model code if needed
)

# Print total model size
total_params = sum(p.numel() for p in model.parameters())
print(f"Total model parameters: {total_params/1e9:.2f}B\n")


Using device: cpu

Loading meta-llama/Llama-3.2-1B-Instruct...


`torch_dtype` is deprecated! Use `dtype` instead!


Total model parameters: 1.24B



In [8]:
print("=" * 70)
print("MODEL ARCHITECTURE")
print("=" * 70)
print(model)
print("\n")


MODEL ARCHITECTURE
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,),

In [9]:
def layer_tag(name):
    """Extract the module type from a parameter name."""
    for k, v in TARGET_TAGS.items():
        if name.endswith(k + ".weight"):
            return v
    return None


def sort_by_ordered_tags(dataframe, column_name="tag"):
    """Sort a dataframe by the predefined module order."""
    df_copy = dataframe.copy()
    df_copy["_sort_order"] = df_copy[column_name].apply(
        lambda x: ORDERED_TAGS.index(x) if x in ORDERED_TAGS else len(ORDERED_TAGS)
    )
    df_sorted = df_copy.sort_values("_sort_order").drop(columns=["_sort_order"])
    return df_sorted


def lora_trainables(layer_dims, rank):
    """
    Calculate trainable parameters for LoRA.
    
    For a linear layer with shape (m, n):
    - Full fine-tuning: m * n parameters
    - LoRA with rank r: m*r + r*n = r*(m+n) parameters
    
    Args:
        layer_dims: List of (m, n) tuples for each layer
        rank: LoRA rank (r)
    
    Returns:
        Total trainable parameters across all layers
    """
    return sum(rank * (m + n) for (m, n) in layer_dims)


def fmt_millions(x):
    """Format large numbers as millions with 2 decimal places."""
    return f"{x/1e6:.2f}M"


In [10]:
# Collect information about all linear layers
rows = []
for name, param in model.named_parameters():
    # Focus on 2D weight matrices (linear layers)
    if param.ndim == 2 and name.endswith(".weight"):
        tag = layer_tag(name)
        if tag:
            m, n = param.shape  # Weight matrix is [out_features, in_features]
            rows.append(
                {
                    "name": name,
                    "tag": tag,
                    "m": m,  # Output dimension
                    "n": n,  # Input dimension
                    "params": m * n,  # Total parameters in this layer
                }
            )

# Create DataFrame for analysis
df = pd.DataFrame(rows)

# Aggregate by module type
total_by_tag = (
    df.groupby("tag")
    .agg(
        layers=("name", "count"),  # Number of layers of this type
        total_params=("params", "sum"),  # Total parameters across all layers
    )
    .reset_index()
)

# Sort by predefined order
total_by_tag = sort_by_ordered_tags(total_by_tag, column_name="tag")

# Format for readability
total_by_tag["total_params_in_millions"] = total_by_tag["total_params"].apply(
    fmt_millions
)

print("=" * 70)
print("PARAMETER COUNT BY MODULE TYPE")
print("=" * 70)
display(total_by_tag)
print("\n")


PARAMETER COUNT BY MODULE TYPE


Unnamed: 0,tag,layers,total_params,total_params_in_millions
4,q_proj,16,67108864,67.11M
2,k_proj,16,16777216,16.78M
6,v_proj,16,16777216,16.78M
3,o_proj,16,67108864,67.11M
1,gate_proj,16,268435456,268.44M
5,up_proj,16,268435456,268.44M
0,down_proj,16,268435456,268.44M






In [11]:
print("=" * 70)
print("MANUAL DIMENSION VERIFICATION")
print("=" * 70)

# Attention projections
q_proj_num_params = 2048 * 2048 * 16  # [hidden_size, hidden_size] × num_layers
k_proj_num_params = 2048 * 512 * 16  # [hidden_size, kv_channels] × num_layers
v_proj_num_params = 2048 * 512 * 16  # [hidden_size, kv_channels] × num_layers
o_proj_num_params = 2048 * 2048 * 16  # [hidden_size, hidden_size] × num_layers

# MLP projections
gate_proj_num_params = 2048 * 8192 * 16  # [hidden_size, intermediate_size] × num_layers
up_proj_num_params = 2048 * 8192 * 16  # [hidden_size, intermediate_size] × num_layers
down_proj_num_params = 8192 * 2048 * 16  # [intermediate_size, hidden_size] × num_layers

print(f"q_proj total:    {q_proj_num_params/1e6:>8.2f}M")
print(f"k_proj total:    {k_proj_num_params/1e6:>8.2f}M")
print(f"v_proj total:    {v_proj_num_params/1e6:>8.2f}M")
print(f"o_proj total:    {o_proj_num_params/1e6:>8.2f}M")
print(f"gate_proj total: {gate_proj_num_params/1e6:>8.2f}M")
print(f"up_proj total:   {up_proj_num_params/1e6:>8.2f}M")
print(f"down_proj total: {down_proj_num_params/1e6:>8.2f}M")

total_linear_params = (
    q_proj_num_params
    + k_proj_num_params
    + v_proj_num_params
    + o_proj_num_params
    + gate_proj_num_params
    + up_proj_num_params
    + down_proj_num_params
)
print(f"\nTotal linear layer params: {total_linear_params/1e6:.2f}M")
print("(This should match the sum from the table above)\n")


MANUAL DIMENSION VERIFICATION
q_proj total:       67.11M
k_proj total:       16.78M
v_proj total:       16.78M
o_proj total:       67.11M
gate_proj total:   268.44M
up_proj total:     268.44M
down_proj total:   268.44M

Total linear layer params: 973.08M
(This should match the sum from the table above)



In [12]:
print("=" * 70)
print("LORA EFFICIENCY: PER-MODULE ANALYSIS")
print("=" * 70)

# Calculate LoRA parameters for each module type
records = []
for tag, group in df.groupby("tag"):
    # Get dimensions for all layers of this type
    mn_list = list(zip(group["m"], group["n"]))
    
    # Full fine-tuning parameter count
    full_params = group["params"].sum()
    
    # LoRA trainable parameters for r=8 and r=16
    lora_r8 = lora_trainables(mn_list, rank=8)
    lora_r16 = lora_trainables(mn_list, rank=16)
    
    records.append(
        {
            "target": tag,
            "layers": len(mn_list),
            "full_params": full_params,
            "LoRA_r8_trainables": lora_r8,
            "LoRA_r16_trainables": lora_r16,
            "LoRA_r8_percent": 100 * lora_r8 / full_params if full_params > 0 else 0,
            "LoRA_r16_percent": 100 * lora_r16 / full_params if full_params > 0 else 0,
        }
    )

per_tag = pd.DataFrame(records)

# Sort by predefined order
per_tag = sort_by_ordered_tags(per_tag, column_name="target")

# Format for display
for col in ["full_params", "LoRA_r8_trainables", "LoRA_r16_trainables"]:
    per_tag[col + "_millions"] = per_tag[col].apply(fmt_millions)

# Round percentages
per_tag["LoRA_r8_percent"] = per_tag["LoRA_r8_percent"].round(2)
per_tag["LoRA_r16_percent"] = per_tag["LoRA_r16_percent"].round(2)

print("\nIf you target ONLY one module type:")
display(
    per_tag[
        [
            "target",
            "layers",
            "full_params_millions",
            "LoRA_r8_trainables_millions",
            "LoRA_r8_percent",
            "LoRA_r16_trainables_millions",
            "LoRA_r16_percent",
        ]
    ]
)
print("\n")


LORA EFFICIENCY: PER-MODULE ANALYSIS

If you target ONLY one module type:


Unnamed: 0,target,layers,full_params_millions,LoRA_r8_trainables_millions,LoRA_r8_percent,LoRA_r16_trainables_millions,LoRA_r16_percent
4,q_proj,16,67.11M,0.52M,0.78,1.05M,1.56
2,k_proj,16,16.78M,0.33M,1.95,0.66M,3.91
6,v_proj,16,16.78M,0.33M,1.95,0.66M,3.91
3,o_proj,16,67.11M,0.52M,0.78,1.05M,1.56
1,gate_proj,16,268.44M,1.31M,0.49,2.62M,0.98
5,up_proj,16,268.44M,1.31M,0.49,2.62M,0.98
0,down_proj,16,268.44M,1.31M,0.49,2.62M,0.98






In [15]:
print("=" * 70)
print("LORA EFFICIENCY: COMMON TARGETING SCENARIOS")
print("=" * 70)

# Define common targeting strategies (in display order)
SCENARIOS = [
    ("qv_only (style/format tasks)", ["q_proj", "v_proj"]),
    ("qkvo (instruction following)", ["q_proj", "k_proj", "v_proj", "o_proj"]),
    (
        "all_linear (domain adaptation)",
        ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    ),
]


def calculate_scenario(tags):
    """Calculate LoRA parameters for a given set of target modules."""
    filtered = df[df["tag"].isin(tags)]
    mn_list = list(zip(filtered["m"], filtered["n"]))
    
    full_params = int(filtered["params"].sum())
    lora_r8 = int(lora_trainables(mn_list, rank=8))
    lora_r16 = int(lora_trainables(mn_list, rank=16))
    
    return {
        "matrices": len(mn_list),
        "full_params": full_params,
        "LoRA_r8_trainables": lora_r8,
        "LoRA_r16_trainables": lora_r16,
        "LoRA_r8_percent": 100 * lora_r8 / full_params if full_params > 0 else 0,
        "LoRA_r16_percent": 100 * lora_r16 / full_params if full_params > 0 else 0,
    }


# Build scenario table (preserves insertion order)
scenario_rows = []
for name, tags in SCENARIOS:
    scenario_rows.append({"scenario": name, **calculate_scenario(tags)})

sc_df = pd.DataFrame(scenario_rows)

# Format for display
for col in ["full_params", "LoRA_r8_trainables", "LoRA_r16_trainables"]:
    sc_df[col + "_millions"] = sc_df[col].apply(fmt_millions)

sc_df["LoRA_r8_percent"] = sc_df["LoRA_r8_percent"].round(2)
sc_df["LoRA_r16_percent"] = sc_df["LoRA_r16_percent"].round(2)

print("\nCommon targeting strategies:")
display(
    sc_df[
        [
            "scenario",
            "matrices",
            "full_params_millions",
            "LoRA_r8_trainables_millions",
            "LoRA_r8_percent",
            "LoRA_r16_trainables_millions",
            "LoRA_r16_percent",
        ]
    ]
)


LORA EFFICIENCY: COMMON TARGETING SCENARIOS

Common targeting strategies:


Unnamed: 0,scenario,matrices,full_params_millions,LoRA_r8_trainables_millions,LoRA_r8_percent,LoRA_r16_trainables_millions,LoRA_r16_percent
0,qv_only (style/format tasks),32,83.89M,0.85M,1.02,1.70M,2.03
1,qkvo (instruction following),64,167.77M,1.70M,1.02,3.41M,2.03
2,all_linear (domain adaptation),112,973.08M,5.64M,0.58,11.27M,1.16


# Applying LoRA with PEFT

Let's apply LoRA to our model and verify the trainable parameter count:


In [24]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2048, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=512, bias=False)
            (lora_dropout): ModuleDict(
              (default)

In [21]:
from peft import LoraConfig, get_peft_model

# Create LoRA configuration
lora_config = LoraConfig(
    r=16,  # Rank
    lora_alpha=32,  # Scaling factor
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Target attention layers
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Apply LoRA to model
peft_model = get_peft_model(model, lora_config)




In [25]:
peft_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [22]:
# Print trainable parameters before and after
print("BEFORE LoRA:")
total = sum(p.numel() for p in model.parameters())
print(f"  Total parameters: {total:,}")
print(f"  All parameters trainable: {total:,} (100%)")
print()

print("AFTER LoRA:")
peft_model.print_trainable_parameters()


BEFORE LoRA:
  Total parameters: 1,239,222,272
  All parameters trainable: 1,239,222,272 (100%)

AFTER LoRA:
trainable params: 3,407,872 || all params: 1,239,222,272 || trainable%: 0.2750


In [23]:
peft_model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear

In [19]:
merged_model = peft_model.merge_and_unload()
merged_model


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (ro