# 基本设置

In [4]:
import os
from dataclasses import dataclass
from enum import Enum, IntEnum
from pprint import pprint
from typing import Literal, Dict

In [39]:
@dataclass
class TinyLLaMAConfig:
    """Default architecture hyperparameters for the TinyLlama 1.1B model.

    References:
        https://github.com/jzhang38/TinyLlama?tab=readme-ov-file#training-details
        https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0/blob/main/config.json
    """

    # Number of Transformer decoder blocks.
    num_decoder_blocks: int = 22

    # Maximum context length the model can process.
    context_length: int = 2048

    # Dimensionality of a token vector.
    n_embd: int = 2048

    # Width of the feed-forward (MLP) intermediate layer.
    ffw_size: int = 5632

    # Number of heads in multi-head attention.
    n_head: int = 32

    # Number of tokens the model supports (vocabulary size).
    vocab_size: int = 32000

    # Whether Linear layers get a bias term; only False is accepted.
    bias: Literal[False] = False

    def __post_init__(self) -> None:
        """Check field consistency right after dataclass initialisation.

        Raises
        ------
        AssertionError
            If ``ffw_size`` is smaller than ``n_embd`` or ``bias`` is not
            exactly ``False``.
        """
        mlp_is_wide_enough = self.ffw_size >= self.n_embd
        assert mlp_is_wide_enough, "ffw_size should be ≥ n_embd"

        bias_is_disabled = self.bias is False
        assert bias_is_disabled, "bias must be False in this experiment."


class TinyLLaMAModelType(Enum):
    """Checkpoint locations for the TinyLlama models used in this notebook.

    Each member's value is the string handed to
    ``LlamaForCausalLM.from_pretrained``. The location can be overridden
    with the ``TINYLLAMA_1B_PATH`` environment variable so the notebook is
    not tied to one machine's filesystem; when the variable is unset the
    original local path is used.
    """

    # TinyLlama/TinyLlama-1.1B-Chat-v1.0
    TINY_LLAMA_1B = os.environ.get(
        "TINYLLAMA_1B_PATH",
        "/nfs/home/xiaoxiao/models/hf_models/TinyLlama-1.1B",
    )


class ByteUnits(IntEnum):
    """Decimal (power-of-1000) byte units; each value is the unit's size in bytes."""

    B = 1          # byte
    KB = 10 ** 3   # kilobyte
    MB = 10 ** 6   # megabyte
    GB = 10 ** 9   # gigabyte


class FloatingPointPrecision(IntEnum):
    """Bytes needed to store one value at each floating-point precision.

    FP16 and BFLOAT16 share the value 2, so BFLOAT16 is an enum alias of
    FP16: ``FloatingPointPrecision.BFLOAT16 is FloatingPointPrecision.FP16``
    and iterating the enum yields only FP32 and FP16.
    """

    FP32 = 32 // 8      # 32 bits -> 4 bytes
    FP16 = 16 // 8      # 16 bits -> 2 bytes
    BFLOAT16 = 16 // 8  # bfloat16, 16 bits -> 2 bytes (alias of FP16)


class GPUMemory(Enum):
    """Memory capacity of several NVIDIA GPUs, in bytes (decimal GB, stored as floats).

    Members with equal values are enum aliases of the first one defined, so
    T4_16GB and P100_16GB both resolve to V100_16GB.
    """

    A100_40GB = 40 * 1e9     # NVIDIA A100, 40 GB
    V100_16GB = 16 * 1e9     # NVIDIA V100, 16 GB
    V100_32GB = 32 * 1e9     # NVIDIA V100, 32 GB
    T4_16GB = 16 * 1e9       # NVIDIA T4, 16 GB
    P100_16GB = 16 * 1e9     # NVIDIA P100, 16 GB
    RTX4090_24GB = 24 * 1e9  # NVIDIA RTX 4090, 24 GB


class GPU:
    """A GPU described by its name and a per-precision throughput table."""

    def __init__(self, name: str, flops: Dict[FloatingPointPrecision, float]) -> None:
        """Store the GPU's name and throughput table.

        Parameters
        ----------
        name : str
            Name of the GPU.
        flops : Dict[FloatingPointPrecision, float]
            Throughput figure for each floating-point precision. The
            mapping is stored as given, not copied.
        """
        self.name, self.flops = name, flops


class RTX4090(GPU):
    """GPU preset named "RTX 4090" with a fixed throughput table."""

    def __init__(self) -> None:
        """Initialise the base GPU with the RTX 4090 name and throughput figures."""
        half_precision_flops = 165.2e12  # 165.2 TFLOPs
        throughput = {
            FloatingPointPrecision.FP32: 82.6e12,  # 82.6 TFLOPs
            FloatingPointPrecision.FP16: half_precision_flops,
            # BFLOAT16 has the same enum value as FP16, so this entry
            # lands on the FP16 key with the same figure.
            FloatingPointPrecision.BFLOAT16: half_precision_flops,
        }
        super().__init__("RTX 4090", throughput)

In [3]:
# Instantiate the config with its defaults (this also runs the __post_init__
# checks) and pretty-print it for reference.
tinyllama_config = TinyLLaMAConfig()
pprint(tinyllama_config)

TinyLLaMAConfig(num_decoder_blocks=22,
                context_length=2048,
                n_embd=2048,
                ffw_size=5632,
                n_head=32,
                vocab_size=32000,
                bias=False)


# 可训练参数总数

In [21]:
import torch
from transformers import LlamaForCausalLM
from collections import OrderedDict
import pandas as pd
from tabulate import tabulate

In [30]:
def total_trainable_parameters(model: torch.nn.Module) -> int:
    """Return the number of trainable parameters in the model.

    Only parameters with ``requires_grad=True`` are counted, so frozen
    weights do not inflate the total.

    Parameters
    ----------
    model : torch.nn.Module
        The module whose parameters are counted.

    Returns
    -------
    int
        Total element count over all trainable parameters.
    """
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [37]:
# Load the checkpoint from the location held by the model-type enum.
tinyllama = LlamaForCausalLM.from_pretrained(TinyLLaMAModelType.TINY_LLAMA_1B.value)

# Parameter count reported by PyTorch; later cells compare against this.
tinyllama_params = total_trainable_parameters(tinyllama)

# Report the variable defined just above. The previous name
# `tinyllama_params_no_bias` was never assigned in this notebook and raised
# NameError on a fresh kernel.
print(
    f"Number of trainable parameters in TinyLlama model: {tinyllama_params}.\n"
)

print(tinyllama)

Number of trainable parameters in TinyLlama model: 1100048384.

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layerno

In [35]:
def params(
        num_decoder_blocks: int,
        context_length: int,
        n_embd: int,
        ffw_size: int,
        vocab_size: int,
        n_head: int,
        n_kv_head: int,
) -> OrderedDict[str, int]:
    """Compute an itemised parameter count for a LLaMA-style decoder.

    Parameters
    ----------
    num_decoder_blocks : int
        Number of decoder blocks.
    context_length : int
        Accepted for signature completeness; not used in the computation.
    n_embd : int
        Embedding (hidden) dimension.
    ffw_size : int
        Width of the MLP intermediate layer.
    vocab_size : int
        Vocabulary size.
    n_head : int
        Number of query heads.
    n_kv_head : int
        Number of key/value heads (grouped-query attention).

    Returns
    -------
    OrderedDict[str, int]
        Counts keyed by component name, in a fixed order. Entries from
        "attention/q_proj" through "block" are per-block figures;
        "transformer" covers all blocks and "total" is the whole model.
    """
    # Grouped-query attention: K and V are projected to n_kv_head heads only.
    per_head_dim = n_embd // n_head
    kv_width = n_kv_head * per_head_dim

    q_count = n_embd * n_embd
    k_count = n_embd * kv_width
    v_count = n_embd * kv_width
    o_count = n_embd * n_embd
    attention_count = q_count + k_count + v_count + o_count

    gate_count = n_embd * ffw_size
    up_count = n_embd * ffw_size
    down_count = ffw_size * n_embd
    mlp_count = gate_count + up_count + down_count

    # Each RMSNorm contributes one weight vector of size n_embd.
    input_norm_count = n_embd
    post_attention_norm_count = n_embd
    norm_count = input_norm_count + post_attention_norm_count

    block_count = attention_count + mlp_count + norm_count
    transformer_count = num_decoder_blocks * block_count

    embedding_count = vocab_size * n_embd
    final_norm_count = n_embd
    lm_head_count = vocab_size * n_embd

    total_count = embedding_count + transformer_count + lm_head_count + final_norm_count

    return OrderedDict([
        ("embedding", embedding_count),
        ("attention/q_proj", q_count),
        ("attention/k_proj", k_count),
        ("attention/v_proj", v_count),
        ("attention/o_proj", o_count),
        ("attention", attention_count),
        ("mlp/gate_proj", gate_count),
        ("mlp/up_proj", up_count),
        ("mlp/down_proj", down_count),
        ("mlp", mlp_count),
        ("rms/input_layernorm", input_norm_count),
        ("rms/post_attention_layernorm", post_attention_norm_count),
        ("rms", norm_count),
        ("block", block_count),
        ("transformer", transformer_count),
        ("final_norm", final_norm_count),
        ("lm_head", lm_head_count),
        ("total", total_count),
    ])

In [36]:
# Analytic parameter count, driven by the config instance created earlier so
# any overridden hyperparameter is honoured.
# n_kv_head is not a TinyLLaMAConfig field; 4 matches the k_proj/v_proj
# out_features of 256 (4 heads x 64 dims) in the printed model.
params_dict = params(
    num_decoder_blocks=tinyllama_config.num_decoder_blocks,
    context_length=tinyllama_config.context_length,
    n_embd=tinyllama_config.n_embd,
    ffw_size=tinyllama_config.ffw_size,
    vocab_size=tinyllama_config.vocab_size,
    n_head=tinyllama_config.n_head,
    n_kv_head=4,
)
tinyllama_params_no_bias_manual = params_dict["total"]

# Compare to expected PyTorch model parameter count.
# `tinyllama_params` is the count computed from the loaded model; the
# previously referenced `tinyllama_params_no_bias` was never assigned.
expected_params = tinyllama_params
comparison_result = tinyllama_params_no_bias_manual == expected_params
comparison_msg = f"We see: {tinyllama_params_no_bias_manual}, Expected: {expected_params}, Match: {comparison_result}"

# Ratios are relative to the whole-model total; per-block entries therefore
# show the share of a single block, and the column does not sum to 100.
data = {
    "Name": params_dict.keys(),
    "Parameters": params_dict.values(),
    "Ratio (%)": [value / tinyllama_params_no_bias_manual * 100 for value in params_dict.values()],
}
df = pd.DataFrame(data)

# Printing comparison result and parameter distribution table
# NOTE(review): the recorded output shows unrounded ratios, so floatfmt does
# not appear to take effect with tablefmt="pretty" - confirm.
print(comparison_msg + "\n")
print(tabulate(df, headers="keys", tablefmt="pretty", showindex=False, numalign="right", floatfmt=".4f"))

We see: 1100048384, Expected: 1100048384, Match: True

+------------------------------+------------+------------------------+
|             Name             | Parameters |       Ratio (%)        |
+------------------------------+------------+------------------------+
|          embedding           |  65536000  |    5.95755613600356    |
|       attention/q_proj       |  4194304   |   0.3812835927042278   |
|       attention/k_proj       |   524288   |  0.047660449088028474  |
|       attention/v_proj       |   524288   |  0.047660449088028474  |
|       attention/o_proj       |  4194304   |   0.3812835927042278   |
|          attention           |  9437184   |   0.8578880835845126   |
|        mlp/gate_proj         |  11534336  |   1.0485298799366265   |
|         mlp/up_proj          |  11534336  |   1.0485298799366265   |
|        mlp/down_proj         |  11534336  |   1.0485298799366265   |
|             mlp              |  34603008  |   3.1455896398098795   |
|     rms/input_layern

# 计算 Checkpoint 大小和 Fluff Ratio

In [None]:
from safetensors import safe_open

In [49]:
def calculate_checkpoint_size(params_count: int, precision: FloatingPointPrecision, units: ByteUnits) -> float:
    """
    Estimate the size of a weights-only checkpoint in the specified units.

    The estimate is ``params_count * precision.value`` bytes, divided by
    ``units.value``. Only the parameters themselves are counted: no
    optimizer state (e.g. AdamW moment vectors) and no file-format overhead
    is included.

    Parameters
    ----------
    params_count : int
        The number of parameters in the model.
    precision : FloatingPointPrecision
        The floating point precision of the parameters; its value is the
        number of bytes per parameter.
    units : ByteUnits
        The units for the resulting checkpoint size.

    Returns
    -------
    float
        The estimated checkpoint size in the specified units.
    """
    params_bytes = params_count * precision.value
    return params_bytes / units.value


def calculate_fluff_ratio(measured_bytes: int, estimated_bytes: float, units: ByteUnits) -> float:
    """
    Express a measured checkpoint size as a percentage of its estimate.

    The estimate may be given in any unit; it is first scaled back to bytes
    with ``units.value`` so that both sizes are compared in bytes. A result
    of 100 means the file is exactly as large as estimated; anything above
    100 is overhead ("fluff") beyond the estimate.

    Parameters
    ----------
    measured_bytes : int
        Actual size of the checkpoint file, in bytes.
    estimated_bytes : float
        Estimated size of the checkpoint file, expressed in ``units``.
    units : ByteUnits
        The unit in which ``estimated_bytes`` is expressed.

    Returns
    -------
    float
        Measured size as a percentage of the estimated size.
    """
    bytes_per_unit = units.value
    expected_size_in_bytes = estimated_bytes * bytes_per_unit
    measured_over_expected = measured_bytes / expected_size_in_bytes
    return measured_over_expected * 100

In [52]:
# Measured size of the weights file on disk.
tinyllama_checkpoint_size_measured_in_bytes = 2200119864  # from 'wc -c model.safetensors'

# Estimated size from the parameter count at 2 bytes per parameter.
tinyllama_checkpoint_size_estimated_in_bytes = calculate_checkpoint_size(
    units=ByteUnits.B,
    precision=FloatingPointPrecision.BFLOAT16,
    params_count=tinyllama_params,
)

# The same two sizes expressed in decimal gigabytes.
tinyllama_checkpoint_size_measured_in_gb = tinyllama_checkpoint_size_measured_in_bytes / ByteUnits.GB
tinyllama_checkpoint_size_estimated_in_gb = tinyllama_checkpoint_size_estimated_in_bytes / ByteUnits.GB

# Measured size as a percentage of the estimate (both given in bytes).
fluff_ratio = calculate_fluff_ratio(
    tinyllama_checkpoint_size_measured_in_bytes,
    tinyllama_checkpoint_size_estimated_in_bytes,
    ByteUnits.B,
)

# One [metric, value] row per line of the summary table.
data = [
    ["Measured Checkpoint Size (bytes)", tinyllama_checkpoint_size_measured_in_bytes],
    ["Measured Checkpoint Size (GB)", tinyllama_checkpoint_size_measured_in_gb],
    ["Estimated Checkpoint Size (bytes)", tinyllama_checkpoint_size_estimated_in_bytes],
    ["Estimated Checkpoint Size (GB)", tinyllama_checkpoint_size_estimated_in_gb],
    ["Fluff Ratio", fluff_ratio],
]

print(tabulate(data, tablefmt="pretty", headers=["Metric", "Value"]))

+-----------------------------------+--------------------+
|              Metric               |       Value        |
+-----------------------------------+--------------------+
| Measured Checkpoint Size (bytes)  |     2200119864     |
|   Measured Checkpoint Size (GB)   |    2.200119864     |
| Estimated Checkpoint Size (bytes) |    2200096768.0    |
|  Estimated Checkpoint Size (GB)   |    2.200096768     |
|            Fluff Ratio            | 100.00104977200712 |
+-----------------------------------+--------------------+


# GPU Memory Footprint

In [51]:
def calculate_memory_ratio(checkpoint_size: float, gpu_memory: GPUMemory) -> str:
    """Describe how much of a GPU's memory a checkpoint occupies.

    Parameters
    ----------
    checkpoint_size : float
        Size of the checkpoint, in the same unit as ``gpu_memory.value``.
    gpu_memory : GPUMemory
        The GPU whose memory capacity is the denominator.

    Returns
    -------
    str
        A sentence giving the percentage with two decimal places.
    """
    capacity = gpu_memory.value
    percent_used = checkpoint_size / capacity * 100
    return f"Memory ratio taken up just for parameters: {percent_used:.2f}%"


# Share of the RTX 4090's 24 GB taken up by the estimated checkpoint bytes.
print(
    calculate_memory_ratio(
        gpu_memory=GPUMemory.RTX4090_24GB,
        checkpoint_size=tinyllama_checkpoint_size_estimated_in_bytes,
    )
)

Memory ratio taken up just for parameters: 9.17%
