In [1]:
!pip install transformers torch



In [None]:
from huggingface_hub import login
HF_TOKEN = ""  # replace with your token
login(HF_TOKEN)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [5]:
def analyze_model_kv_cache(model_name, model_label):
    """Loads a models config, extracts params, and calculates its KV cache size."""
    print(f"\n--- Analyzing {model_label}: {model_name} ---")
    
    temp_model = AutoModelForCausalLM.from_pretrained(model_name)
    model_dtype = temp_model.dtype
    config = temp_model.config
    del temp_model
    
    
    L = config.num_hidden_layers
    N_q = config.num_attention_heads
    N_kv = getattr(config, "num_key_value_heads", N_q)  # Some models have separate KV heads
    D_model = config.hidden_size
    D_head = D_model // N_q
    dtype_size_bytes = torch.finfo(model_dtype).bits // 8 if model_dtype.is_floating_point else 2
    
    print(f" Number of Layers (L): {L}")
    print(f" Number of Query Heads (N_q): {N_q}")
    print(f" Number of KV Heads (N_kv): {N_kv}")
    print (f" Head Dimension (D_head): {D_head}")
    print(f" Data Type size (bytes): {dtype_size_bytes} ({model_dtype})")
    
    total_cache_per_token_bytes = 2 * N_kv * D_head * dtype_size_bytes
    cache_layer_per_token_bytes = 2 * N_kv * D_head * dtype_size_bytes
    
    if N_kv == N_q:
        attention_type = "Multi-Head Attention (MHA)"
    elif 1 < N_kv < N_q:
        attention_type = "Grouped Query Attention (GQA)"
    else:
        attention_type = "Multi Query Attention"
        
    print(f" Attention Type: {attention_type}")
    
    resulrs = {
        "label": model_label, "L": L, "N_q": N_q, "N_kv": N_kv, "D_model": D_model, "D_head": D_head,
        "dtype_size_bytes": dtype_size_bytes, "attention_type": attention_type,
        "total_cache_per_token_bytes": total_cache_per_token_bytes,
        "cache_layer_per_token_bytes": cache_layer_per_token_bytes
    }
    return resulrs
    

In [8]:
model_name_gqa = "microsoft/Phi-3.5-mini-instruct"
model_name_mha = "openai-community/gpt2-xl"

results_gpa = analyze_model_kv_cache(model_name_gqa, "LLaMA 3.2 1B (GQA)")
results_mha = analyze_model_kv_cache(model_name_mha, "GPT-2 XL (MHA)")

This model config has set a `rope_parameters['original_max_position_embeddings']` field, to be used together with `max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_parameters`with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, as it is compatible with most model architectures.



--- Analyzing LLaMA 3.2 1B (GQA): microsoft/Phi-3.5-mini-instruct ---


Loading weights: 100%|██████████| 195/195 [00:00<00:00, 2403.26it/s, Materializing param=model.norm.weight]                              


 Number of Layers (L): 32
 Number of Query Heads (N_q): 32
 Number of KV Heads (N_kv): 32
 Head Dimension (D_head): 96
 Data Type size (bytes): 2 (torch.bfloat16)
 Attention Type: Multi-Head Attention (MHA)

--- Analyzing GPT-2 XL (MHA): openai-community/gpt2-xl ---


Loading weights: 100%|██████████| 580/580 [00:00<00:00, 1555.90it/s, Materializing param=transformer.wte.weight]             
[1mGPT2LMHeadModel LOAD REPORT[0m from: openai-community/gpt2-xl
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...47}.attn.bias | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


 Number of Layers (L): 48
 Number of Query Heads (N_q): 25
 Number of KV Heads (N_kv): 25
 Head Dimension (D_head): 64
 Data Type size (bytes): 4 (torch.float32)
 Attention Type: Multi-Head Attention (MHA)


In [10]:
if results_gpa and results_mha:
    print("\nSummary:")
    
    # Per Layer Comaprison
    gpa_chache_per_layer_kb = results_gpa["cache_layer_per_token_bytes"] / 1024
    mha_chache_per_layer_kb = results_mha["cache_layer_per_token_bytes"] / 1024
    print (f" phi mini: {gpa_chache_per_layer_kb:.2f} KB/token/layer")
    print (f" gpt2-xl: {mha_chache_per_layer_kb:.2f} KB/token/layer")
    
    # GQA Internal Saving Factor
    internal_saving_factor = results_gpa['N_q'] / results_gpa['N_kv']
    print(f" GQA Internal Saving Factor: {internal_saving_factor:.2f}")
    print (f"  phi mini uses {results_gpa['N_kv']} KV heads instead of {results_gpa['N_q']} query heads, saving {internal_saving_factor:.2f}x memory within each layer's cache.")
    
    
    
    


Summary:
 phi mini: 12.00 KB/token/layer
 gpt2-xl: 12.50 KB/token/layer
 GQA Internal Saving Factor: 1.00
  phi mini uses 32 KV heads instead of 32 query heads, saving 1.00x memory within each layer's cache.


In [12]:
if results_gpa and results_mha:
    
    vram_budget_mb = 6 * 1024  # 6 GB in MB
    
    gpa_total_mb_per_token = results_gpa["total_cache_per_token_bytes"] / (1024 * 1024)
    mha_total_mb_per_token = results_mha["total_cache_per_token_bytes"] / (1024 * 1024)
    
    max_tokens_gpa = vram_budget_mb / gpa_total_mb_per_token
    max_tokens_mha = vram_budget_mb / mha_total_mb_per_token
    
    print (f"\n Max sequence length estimation (with {vram_budget_mb / 1024:.1f} GB VRAM budget):")
    print (f" phi mini (GQA): {max_tokens_gpa:.0f} tokens")
    print (f" gpt2-xl (MHA): {max_tokens_mha:.0f} tokens") 


 Max sequence length estimation (with 6.0 GB VRAM budget):
 phi mini (GQA): 524288 tokens
 gpt2-xl (MHA): 503316 tokens


In [14]:
L = 16
N_q = 16
N_kv = 4
D_model = 1024
D_head = D_model // N_q
dtype_size_bytes = 4  # float32

per_layer_per_token = 2* N_kv * D_head * dtype_size_bytes
total_per_token = L * per_layer_per_token

print(f"Per layer per token: {per_layer_per_token / 1024:.2f} KB")
print(f"Total per token: {total_per_token / (1024 * 1024):.2f} MB")

Per layer per token: 2.00 KB
Total per token: 0.03 MB
