In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Step 1: Load the quantized model and tokenizer.
# Exactly one MODEL_ID assignment is active; the commented-out lines are
# alternative checkpoints that can be swapped in.
# MODEL_ID = "TinyLlama-1.1B-Chat-v1.0-Smooth-GPTQ-SYM-W8A8-Dynamic-Per-Token"
MODEL_ID = "TinyLlama-1.1B-Chat-v1.0-Smooth-GPTQ-ASYM-W8A8-Dynamic-Per-Token"
# MODEL_ID = "TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token"
# MODEL_ID = "TinyLlama-1.1B-Chat-v1.0-W8A8-Static-Per-Token"

# Both objects are loaded from the same identifier; torch_dtype="auto" lets
# the checkpoint decide the parameter dtype.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype="auto")

# Alternative Step 1 (disabled): load the quantized model through vLLM instead.
# import os
# from vllm import LLM
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# model = LLM("./TinyLlama-1.1B-Chat-v1.0-W8A8-Dynamic-Per-Token")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import load_dataset

# Step 2: Prepare calibration data.
NUM_CALIBRATION_SAMPLES = 4
MAX_SEQUENCE_LENGTH = 2048

# Take the first NUM_CALIBRATION_SAMPLES rows of the SFT training split and
# shuffle them with a fixed seed so the sample order is reproducible.
ds = load_dataset(
    "HuggingFaceH4/ultrachat_200k",
    split=f"train_sft[:{NUM_CALIBRATION_SAMPLES}]",
).shuffle(seed=42)

# Render each conversation with the tokenizer's chat template so the text
# matches the format the model is trained with.
def preprocess(example):
    """Return {"text": ...} holding the chat-templated string for one example.

    `example["messages"]` is passed to `tokenizer.apply_chat_template` with
    `tokenize=False`, so the result is a string rather than token ids.
    """
    rendered = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return {"text": rendered}


ds = ds.map(preprocess)

# Tokenize the rendered text. Be careful with BOS tokens: the chat template
# has already inserted them, hence add_special_tokens=False below.
def tokenize(sample):
    """Tokenize `sample["text"]`, truncated to MAX_SEQUENCE_LENGTH, without padding."""
    return tokenizer(
        sample["text"],
        add_special_tokens=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        padding=False,
    )


# Drop every pre-existing column so only the tokenizer's outputs remain.
ds = ds.map(tokenize, remove_columns=ds.column_names)

Map: 100%|██████████| 4/4 [00:00<00:00, 186.04 examples/s]
Map: 100%|██████████| 4/4 [00:00<00:00, 193.19 examples/s]


In [3]:
import torch
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# Push every calibration sample through the model once, one sample per batch.
# NOTE(review): the original comment says this is done "to trigger the hook";
# the hook registration itself is not visible in this notebook — confirm where
# the per-module recording is installed.
with torch.no_grad():  # inference only: no autograd graph is needed
    for i, sample in enumerate(ds):
        # Add a leading batch dimension of 1 and move the tensors to the model's device.
        input_ids = torch.tensor(sample["input_ids"]).unsqueeze(0).to(device)
        attention_mask = torch.tensor(sample["attention_mask"]).unsqueeze(0).to(device)
        print(f"Processing sample: {i}, length: {input_ids.shape}")
        # Pass the mask that was built above (it was previously created but
        # never used). The output is deliberately discarded rather than bound
        # to a name, so the last result is not kept alive after the loop.
        model(input_ids=input_ids, attention_mask=attention_mask)

Processing sample: 0, length: torch.Size([1, 499])
Processing sample: 1, length: torch.Size([1, 2048])
Processing sample: 2, length: torch.Size([1, 1799])
Processing sample: 3, length: torch.Size([1, 787])


In [4]:
import numpy as np
# Per-layer record of integer weight codes and recorded inputs, keyed by the
# module's qualified name from `model.named_modules()`.
layer_distributions = {}

for name, module in model.named_modules():
    # Only modules that carry both a weight and a weight scale are analysed.
    if not (hasattr(module, "weight") and hasattr(module, "weight_scale")):
        continue

    # Work in float32 on the CPU so the division below is not done in a
    # reduced-precision dtype.
    weight = module.weight.data.detach().float().cpu()
    weight_scale = module.weight_scale.detach().float().cpu()

    # Recover the integer code by rounding to the nearest integer. A plain
    # integer cast truncates toward zero, so a quotient such as 2.9999 would be
    # recorded as 2 and bias the distribution toward zero.
    # NOTE(review): no zero point is applied here; for an asymmetric weight
    # scheme the stored code may include an additional offset — confirm
    # against the quantizer that produced this checkpoint.
    weight_int8 = (weight / weight_scale).round().int().numpy().flatten()

    # `inputs`, `quantized_inputs` and `input_scales` are read from attributes
    # on the module; they are presumably populated by hooks during the forward
    # passes — TODO confirm where they are attached.
    inputs = np.concatenate([
        inp.flatten().cpu().numpy()
        for inp in module.inputs
    ])
    inputs_int8 = np.concatenate([
        inp.flatten().cpu().numpy()
        for inp in module.quantized_inputs
    ])

    layer_distributions[name] = {
        "weight_scale": weight_scale,
        "weight_int8": weight_int8,
        "inputs": inputs,
        "inputs_scales": module.input_scales,
        "inputs_int8": inputs_int8,
    }

In [5]:
import os
import csv


os.makedirs("output", exist_ok=True)

# First row is the CSV header; one data row per layer is appended below.
csv_rows = [[
    "layer",
    "weight_zero_pct", "weight_neg1_pct", "weight_pos1_pct",
    "input_zero_pct", "input_neg1_pct", "input_pos1_pct",
    "input_scale_min", "input_scale_max", "input_scale_mean",
]]

# Codes whose share is reported, in header order: zero, -1, +1.
tracked_codes = (0, -1, 1)

for layer_key, stats in layer_distributions.items():
    weight_codes = stats["weight_int8"]
    input_codes = stats["inputs_int8"]

    # Share of elements equal to each tracked code. These are fractions in
    # [0, 1], even though the column names end in "_pct".
    weight_shares = [
        (weight_codes == code).sum().item() / weight_codes.size
        for code in tracked_codes
    ]
    input_shares = [
        (input_codes == code).sum().item() / input_codes.size
        for code in tracked_codes
    ]

    # Entries without a `.numpy` method are skipped; the rest are flattened
    # and joined into one array.
    scale_chunks = [
        s.numpy().flatten()
        for s in stats["inputs_scales"]
        if hasattr(s, "numpy")
    ]
    # NOTE(review): the reported min/max/mean are taken over the reciprocal of
    # the stored values — confirm this matches the "input_scale_*" column names.
    inverse_scales = 1.0 / np.concatenate(scale_chunks)

    csv_rows.append([
        layer_key,
        *weight_shares,
        *input_shares,
        inverse_scales.min(), inverse_scales.max(), inverse_scales.mean(),
    ])

with open(f"output/{MODEL_ID}_layer_distribution.csv", "w", newline="") as f:
    csv.writer(f).writerows(csv_rows)

In [6]:
import os
import matplotlib.pyplot as plt

os.makedirs(f"figures/{MODEL_ID}", exist_ok=True)


def _plot_code_histogram(ax, values, title, color):
    """Draw a per-code histogram of `values` on `ax` with a percentage y-axis.

    The 256 bins span (-128.5, 127.5), so each bin has width exactly 1 and is
    centred on one integer from -128 to 127. With unit-width bins the
    `density=True` bar height equals the fraction of in-range values in that
    bin, which is what the percentage formatter assumes.
    """
    ax.hist(values, bins=256, range=(-128.5, 127.5), color=color, alpha=0.7, density=True)
    ax.set_title(title)
    ax.set_xlabel("Value")
    ax.set_ylabel("Percentage")
    # Bar heights are fractions; show them as percentages with two decimals.
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: f'{y*100:.2f}%'))


# One PNG per layer: weight codes on the left, input codes on the right.
for layer_name, v in layer_distributions.items():
    fig, (weight_ax, input_ax) = plt.subplots(1, 2, figsize=(12, 5))

    _plot_code_histogram(weight_ax, v["weight_int8"], f"{layer_name} Weight Distribution", 'blue')
    _plot_code_histogram(input_ax, v["inputs_int8"], f"{layer_name} Input Distribution", 'green')

    fig.tight_layout()
    fig.savefig(f"figures/{MODEL_ID}/{layer_name}_distributions.png", dpi=500)
    # Close explicitly so figures do not accumulate in memory across layers.
    plt.close(fig)