In [4]:
!pip install datasets

Collecting datasets
  Using cached datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl

## Import the Required Libraries

In [106]:
import torch
import transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import time
import numpy as np
from datasets import load_dataset
from tqdm import tqdm

In [107]:
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
sample_data = [text[:128] for text in dataset['text'][:3000] if text.strip()]




In [108]:
# Calculate perplexity (using tqdm for progress bar)
def calculate_perplexity(model, tokenizer, dataset, device):
    model.to(device)
    model.eval()
    total_loss = 0
    num_batches = 0

    # Wrap the loop with tqdm to create a progress bar
    for i in tqdm(range(0, len(dataset), 5), desc="Calculating Perplexity"):
        batch_texts = dataset[i:i + 5]
        inputs = preprocess_data(batch_texts, tokenizer).to(device)

        with torch.no_grad():
            outputs = model(**inputs, labels=inputs['input_ids'])
            loss = outputs.loss
            total_loss += loss.item()
            num_batches += 1

    average_loss = total_loss / num_batches
    perplexity = torch.exp(torch.tensor(average_loss))
    return perplexity.item()

In [109]:
# Calculate memory footprint
def calculate_model_size(model, weight_bit_width=32):
    """Calculates the size of the quantized model on disk (approximate)."""
    total_size_bytes = 0
    for name, param in model.named_parameters():
        if "weight" in name and param.dim() > 1:  # Quantized weights (exclude bias and other params)
            num_elements = param.numel()
            bytes_per_element = weight_bit_width // 8  # e.g., 1 byte for INT8
            total_size_bytes += num_elements * bytes_per_element
        else: #original size if the data is not quantized.
            total_size_bytes += param.numel() * param.element_size()

    total_size_mb = total_size_bytes / (1024 ** 2)
    return total_size_mb

In [110]:
# Measure inference latency
def measure_inference_latency(model, inputs, device):
    inputs = {k: v.to(device) for k, v in inputs.items()}
    start_time = time.time()
    with torch.no_grad():
        model(**inputs)
    end_time = time.time()
    latency = end_time - start_time
    return latency

In [111]:
# Preprocess the data: tokenize and prepare input tensors
def preprocess_data(texts, tokenizer, max_length=512):
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer(texts, return_tensors='pt', truncation=True, padding=True, max_length=max_length)

# Prepare sample input data (first 5 samples for inference testing)
sample_texts = sample_data[:5]
inputs = preprocess_data(sample_texts, tokenizer)

# Move model to appropriate device (use GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Ensure model is in evaluation mode
model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

## BEFORE QUANTISATION

In [113]:
# Memory footprint before quantization
memory_before_quant = calculate_model_size(model, weight_bit_width=32)  # No modules quantized
print(f"Memory Footprint (Pre-Quantization): {memory_before_quant:.2f} MB")

# Inference latency before quantization
latency_before_quant = measure_inference_latency(model, inputs, device)
print(f"Inference Latency (Pre-Quantization): {latency_before_quant:.4f} seconds")

# Perplexity before quantization
perplexity_before_quant = calculate_perplexity(model, tokenizer, sample_data, device)
print("\n")
print(f"Perplexity (Pre-Quantization): {perplexity_before_quant:.2f}")


Memory Footprint (Pre-Quantization): 474.70 MB
Inference Latency (Pre-Quantization): 0.0144 seconds


Calculating Perplexity: 100%|██████████| 383/383 [00:07<00:00, 50.66it/s]




Perplexity (Pre-Quantization): 851.16


## WHOLE QUANTISATION

In [114]:
def quantize_model(model, weight_bit_width=8):
    """
    Quantizes the model's weights to a specified bit width (default: 8-bit).
    The function quantizes all linear layers within the model.
    """
    quantized_param_names = []

    def quantize_tensor(tensor, bit_width):
        """Quantizes a tensor to the specified bit width."""
        min_val = tensor.min()
        max_val = tensor.max()

        # Calculate scale for symmetric quantization
        scale = (max_val - min_val) / (2**bit_width - 1)
        zero_point = 0  # Symmetric quantization

        # Quantize the tensor
        quantized_tensor = (tensor / scale).round().clamp(-2**(bit_width - 1), 2**(bit_width - 1) - 1)
        return quantized_tensor

    # Iterate over named parameters and quantize only certain layers
    for name, param in model.named_parameters():
        # Check if it's a weight parameter of a Linear layer
        if "weight" in name and any(layer_type in name for layer_type in ['transformer', 'linear', 'fc']):
            with torch.no_grad():  # Ensure gradients aren't tracked during this operation
                param.data = quantize_tensor(param.data, weight_bit_width)
                quantized_param_names.append(name)

    return model, quantized_param_names



In [115]:
weight_bit_width = 8
quantized_model, quantized_param_names = quantize_model(model, weight_bit_width=weight_bit_width)


Quantized Model Size (on disk): 119.02 MB


In [116]:
# Measure inference latency after quantization
quantized_model_size = calculate_model_size(quantized_model,weight_bit_width=weight_bit_width)
print(f"Quantized Model Size (on disk): {quantized_model_size:.2f} MB")

latency_after_quant = measure_inference_latency(quantized_model, inputs, device)
print(f"Inference Latency (Post-Quantization): {latency_after_quant:.4f} seconds")

# Compute perplexity after quantization
perplexity_after_quant = calculate_perplexity(quantized_model, tokenizer, sample_data, device)
print(f"Perplexity (Post-Quantization): {perplexity_after_quant:.2f}")

Quantized Model Size (on disk): 119.02 MB
Inference Latency (Post-Quantization): 0.0358 seconds


Calculating Perplexity: 100%|██████████| 383/383 [00:06<00:00, 58.63it/s]

Perplexity (Post-Quantization): inf





## SELECTIVE QUANTISATION

In [97]:
import torch

def select_quantize_model(model, layers_to_quantize=None, weight_bit_width=8):
    """
    Quantizes the model's weights to a specified bit width (default: 8-bit).
    Selectively quantizes only the specified layers in the model (e.g., transformer blocks).

    Args:
        model: The PyTorch model to be quantized (e.g., GPT-2).
        layers_to_quantize: A list of strings or substrings representing layers
                            to be quantized (e.g., ['h', 'attn', 'mlp']).
        weight_bit_width: The bit width for quantized weights (default: 8-bit).

    Returns:
        A tuple containing the quantized model and a list of quantized parameter names.
    """
    quantized_param_names = []

    def quantize_tensor(tensor, bit_width):
        """Quantizes a tensor to the specified bit width."""
        min_val = tensor.min()
        max_val = tensor.max()

        # Calculate scale for symmetric quantization
        scale = (max_val - min_val) / (2**bit_width - 1)
        zero_point = 0  # Symmetric quantization

        # Quantize the tensor
        quantized_tensor = (tensor / scale).round().clamp(-2**(bit_width - 1), 2**(bit_width - 1) - 1)
        return quantized_tensor

    # Iterate over named parameters and selectively quantize
    for name, param in model.named_parameters():
        if "weight" in name:
            # Check if the parameter should be quantized based on the provided list
            if layers_to_quantize and any(layer_type in name for layer_type in layers_to_quantize):
                print(f"Quantizing: {name}")
                with torch.no_grad():  # Ensure gradients aren't tracked during this operation
                    param.data = quantize_tensor(param.data, weight_bit_width)
                    quantized_param_names.append(name)

    return model, quantized_param_names




In [124]:
weight_bit_width = 8
layers_to_quantize = ['mlp']  # 'h' refers to transformer blocks in GPT-2
quantized_model_2, quantized_param_names = select_quantize_model(model, layers_to_quantize=layers_to_quantize, weight_bit_width=weight_bit_width)

Quantizing: transformer.h.0.mlp.c_fc.weight
Quantizing: transformer.h.0.mlp.c_proj.weight
Quantizing: transformer.h.1.mlp.c_fc.weight
Quantizing: transformer.h.1.mlp.c_proj.weight
Quantizing: transformer.h.2.mlp.c_fc.weight
Quantizing: transformer.h.2.mlp.c_proj.weight
Quantizing: transformer.h.3.mlp.c_fc.weight
Quantizing: transformer.h.3.mlp.c_proj.weight
Quantizing: transformer.h.4.mlp.c_fc.weight
Quantizing: transformer.h.4.mlp.c_proj.weight
Quantizing: transformer.h.5.mlp.c_fc.weight
Quantizing: transformer.h.5.mlp.c_proj.weight
Quantizing: transformer.h.6.mlp.c_fc.weight
Quantizing: transformer.h.6.mlp.c_proj.weight
Quantizing: transformer.h.7.mlp.c_fc.weight
Quantizing: transformer.h.7.mlp.c_proj.weight
Quantizing: transformer.h.8.mlp.c_fc.weight
Quantizing: transformer.h.8.mlp.c_proj.weight
Quantizing: transformer.h.9.mlp.c_fc.weight
Quantizing: transformer.h.9.mlp.c_proj.weight
Quantizing: transformer.h.10.mlp.c_fc.weight
Quantizing: transformer.h.10.mlp.c_proj.weight
Quantizi

In [125]:
import torch

def calculate_model_size(model, quantized_param_names, weight_bit_width=8):
    """
    Calculates the size of the model on disk, accounting for quantized modules.

    Args:
        model: The PyTorch model.
        quantized_param_names: A list of strings representing the names of parameters
                               that have been quantized.
        weight_bit_width: The bit width for quantized weights (default: 8-bit).

    Returns:
        The total size of the model in megabytes (MB).
    """
    total_size_bytes = 0
    for name, param in model.named_parameters():
        # Check if the parameter is one of the selectively quantized parameters
        if name in quantized_param_names:
            # Quantized weights
            num_elements = param.numel()
            bytes_per_element = weight_bit_width // 8
            total_size_bytes += num_elements * bytes_per_element
        else:
            # Regular weights
            total_size_bytes += param.numel() * param.element_size()

    total_size_mb = total_size_bytes / (1024 ** 2)
    return total_size_mb


quantized_model_size = calculate_model_size(quantized_model_2, quantized_param_names, weight_bit_width=weight_bit_width)
print(f"Quantized Model Size (on disk): {quantized_model_size:.2f} MB")

Quantized Model Size (on disk): 312.70 MB


In [126]:
# Measure inference latency after quantization
latency_after_quant = measure_inference_latency(quantized_model_2.to(device), inputs, device)
print(f"Inference Latency (Post-Quantization): {latency_after_quant:.4f} seconds")

Inference Latency (Post-Quantization): 0.0142 seconds


In [127]:
# Compute perplexity after quantization
perplexity_after_quant = calculate_perplexity(quantized_model_2.to(device), tokenizer, sample_data, device)
print(f"Perplexity (Post-Quantization): {perplexity_after_quant:.2f}")

Calculating Perplexity: 100%|██████████| 383/383 [00:06<00:00, 63.75it/s]

Perplexity (Post-Quantization): 5744.22



