## QuantizationConfig Application ##

`QuantizationConfig` allows for compressing the model on disk by reducing the precision of its weights, e.g. from float16 to int8.

In order to save a compressed (quantized) model, follow these steps:

1. Create a "vanilla" model. For that purpose we are using a small Llama model (`neuralmagic/llama2.c-stories110M-pruned50`).
2. Define the arguments of the `QuantizationConfig`
3. Use the function `apply_quantization_config` to modify the model: for the relevant weight matrices, it adds parameters (scale and zero point) that simulate the quantize and dequantize operations.
4. Calibrate the scale and zero point through a few forward passes of the calibration data.
5. Use the obtained scales and zero points to quantize the weight matrices to an `int8` representation.

The example below shows how to quantize the model. It also demonstrates the benefits of quantization over the "dense" representation.

In [1]:
from compressed_tensors.quantization import QuantizationConfig, apply_quantization_config, freeze_module_quantization
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Model that will be quantized and calibrated
model_name = "neuralmagic/llama2.c-stories110M-pruned50"
# Dataset that provides the calibration samples
dataset_name = "garage-bAInd/Open-Platypus"
# Number of samples fed through the model during calibration
num_calibration_samples = 1

In [11]:
# Step 1: Load the model
# Fetch the pretrained causal-LM identified by `model_name`.
model = AutoModelForCausalLM.from_pretrained(model_name)
# Switch to evaluation mode. As the last expression of the cell, the returned
# module is also what the notebook echoes as the cell output.
model.eval()


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 768)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=768, out_features=768, bias=False)
          (k_proj): Linear(in_features=768, out_features=768, bias=False)
          (v_proj): Linear(in_features=768, out_features=768, bias=False)
          (o_proj): Linear(in_features=768, out_features=768, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=768, out_features=2048, bias=False)
          (up_proj): Linear(in_features=768, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=768, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(in_feat

In [12]:
# Report the quantization state of the first Linear layer only; one layer is
# enough to show whether a quantization scheme has been attached yet.
for layer_name, layer in model.model.named_modules():
    if layer.__class__.__name__ != "Linear":
        continue
    has_scheme = hasattr(layer, "quantization_scheme")
    print(f"Module: {layer_name} has been quantized: {has_scheme}")
    break

Module: layers.0.self_attn.q_proj has been quantized: False


In [13]:
# Step 2: Define the quantization configuration
quantization_config = QuantizationConfig(
    # "fakequant": the weights stay in their original format (e.g. float16)
    # and quantization is emulated by adding scales/zeropoints to the model
    # state_dict
    format="fakequant",
    # Start in the "calibration" status: the scales/zeropoints get their
    # values from the forward passes run in Step 4.
    # NOTE(review): confirm the exact status semantics against the
    # QuantizationStatus documentation.
    quantization_status="calibration",
    config_groups={
        # "group_1" acts on all the nn.Linear layers of the model:
        #   - weights are quantized to 8 bits (symmetric, so zero_point = 0)
        #   - input activations are quantized to 8 bits
        #     (asymmetric, so zero_point != 0)
        "group_1": {
            "weights": {
                "num_bits": 8,
                "type": "int",
                "symmetric": True,
                "strategy": "tensor",
            },
            "input_activations": {
                "num_bits": 8,
                "type": "int",
                "symmetric": False,
                "strategy": "tensor",
            },
            "targets": ["Linear"],
        },
    },
)

In [20]:
# Step 3: Apply the quantization configuration
# The call's return value is not used; `model` itself is what the following
# cells inspect, where the Linear layers expose a `quantization_scheme`
# attribute plus scale / zero-point parameters.
apply_quantization_config(model, quantization_config)

In [18]:
# Show the quantization parameters attached to the first Linear layer.
for layer_name, layer in model.model.named_modules():
    if layer.__class__.__name__ != "Linear":
        continue
    has_scheme = hasattr(layer, "quantization_scheme")
    print(f"Module: {layer_name} has been quantized: {has_scheme}")
    print(f"Input activation scale: {layer.input_scale}")
    print(f"Input activation zero point: {layer.input_zero_point}")
    print(f"Weight scale: {layer.weight_scale}")
    print(f"Weight zero point {layer.weight_zero_point}")
    break

Module: layers.0.self_attn.q_proj has been quantized: True
Input activation scale: Parameter containing:
tensor([])
Input activation zero point: Parameter containing:
tensor([], dtype=torch.int64)
Weight scale: Parameter containing:
tensor([])
Weight zero point Parameter containing:
tensor([], dtype=torch.int64)


In [21]:
# Step 4: Calibrate the quantized layers
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
dataset = load_dataset(dataset_name, split='train', streaming=True)

# run calibration
for idx, sample in tqdm(enumerate(dataset),  total = num_calibration_samples):
    sample = tokenizer(sample['output'], return_tensors="pt")
    _ = model(**sample)

    if idx >= num_calibration_samples:
        break

100%|██████████| 1/1 [01:23<00:00, 83.74s/it]


In [22]:
# Show the calibrated quantization parameters of the first Linear layer as
# plain Python scalars.
for layer_name, layer in model.model.named_modules():
    if layer.__class__.__name__ != "Linear":
        continue
    has_scheme = hasattr(layer, "quantization_scheme")
    print(f"Module: {layer_name} has been quantized: {has_scheme}")
    print(f"Input activation scale: {layer.input_scale.item()}")
    print(f"Input activation zero point: {layer.input_zero_point.item()}")
    print(f"Weight scale: {layer.weight_scale.item()}")
    print(f"Weight zero point {layer.weight_zero_point.item()}")
    break

Module: layers.0.self_attn.q_proj has been quantized: True
Input activation scale: 0.02618597261607647
Input activation zero point: 85
Weight scale: 0.008901744149625301
Weight zero point 0


In [23]:
# Step 5: Freeze the quantization parameters and re-apply the configuration
# with the "compressed" format.
# `model.apply` calls `freeze_module_quantization` on every submodule.
model.apply(freeze_module_quantization)
quantization_config.format = "compressed"
# NOTE(review): the recorded output of the next cell shows `input_scale` as an
# empty tensor and the weights still as floats after this call — confirm that
# re-applying the config does not discard the calibrated scales/zero points.
apply_quantization_config(model, quantization_config)

In [24]:
# Dump the input scale and every parameter of the first Linear layer.
for layer_name, layer in model.model.named_modules():
    if layer.__class__.__name__ != "Linear":
        continue
    has_scheme = hasattr(layer, "quantization_scheme")
    print(f"Module: {layer_name} has been quantized: {has_scheme}")
    print(f"Input activation scale: {layer.input_scale}")
    layer_params = list(layer.parameters())
    print(layer_params)
    break

Module: layers.0.self_attn.q_proj has been quantized: True
Input activation scale: Parameter containing:
tensor([])
[Parameter containing:
tensor([[ 0.0000,  0.0000, -0.0712,  ...,  0.0000, -0.0178,  0.0000],
        [ 0.0000,  0.0979,  0.1246,  ...,  0.1869,  0.0801, -0.1602],
        [ 0.0000, -0.0712,  0.0000,  ..., -0.2492, -0.1157,  0.2671],
        ...,
        [-0.0801, -0.0801,  0.0000,  ...,  0.0000,  0.0000,  0.1068],
        [-0.1157, -0.0801, -0.0712,  ...,  0.0890,  0.0712,  0.0890],
        [-0.1068,  0.0445,  0.0000,  ...,  0.0712,  0.1246,  0.0534]],
       requires_grad=True), Parameter containing:
tensor([]), Parameter containing:
tensor([], dtype=torch.int64), Parameter containing:
tensor([]), Parameter containing:
tensor([], dtype=torch.int64)]
