In [None]:
!pip install torch transformers datasets accelerate bitsandbytes
!pip install -q -U transformers accelerate bitsandbytes safetensors
!pip install evaluate

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m5.1 M

In [None]:
from IPython import get_ipython
from IPython.display import display

import os
import torch.nn as nn
import torch.nn.functional as F
import torch
import numpy as np
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from typing import Dict, Optional, Tuple, List, Any
import time
import json
from dataclasses import dataclass
import matplotlib.pyplot as plt
from scipy.stats import entropy
import pandas as pd
import evaluate

print(f"CUDA: {torch.cuda.is_available()}")
print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA: True
GPU: Tesla T4


In [None]:
os.environ["HF_TOKEN"] = "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"

In [None]:
model_name = "facebook/opt-350m"

tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side='left')
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/733k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [None]:
@dataclass
class QuantConfig:
    min_bits: int = 4
    max_bits: int = 8
    skip_layers: List[str] = None

    def __post_init__(self):
        if self.skip_layers is None:
            self.skip_layers = ['attention_mask', 'norm', 'layernorm']

@dataclass
class EvalConfig:
    batch_size: int = 8
    max_length: int = 512
    num_samples: int = 100
    device: str = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
class LayerProfiler:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = next(model.parameters()).device
        self.stats_cache = None

    def get_layer_stats(self):
        """Get statistics for each layer in the model."""
        stats = {}
        for name, module in self.model.named_modules():
            if hasattr(module, 'weight'):
                with torch.no_grad():
                    weight = module.weight
                    # Convert to float32 for histogram computation
                    weight_float = weight.to(torch.float32)
                    stats[name] = {
                        'mean': weight.mean().item(),
                        'std': weight.std().item(),
                        'sparsity': (weight == 0).float().mean().item(),
                        'shape': list(weight.shape),
                        'bins': torch.histc(weight_float, bins=10).tolist()
                    }
        self.stats_cache = stats
        return stats

    def profile_speed(self, text="Test input for profiling", num_runs=10):
        """Profile model inference speed."""
        inputs = self.tokenizer(text, return_tensors="pt").to(self.device)
        # Warmup run
        with torch.no_grad():
            self.model(**inputs)

        # Actual profiling
        times = []
        for _ in range(num_runs):
            start = time.perf_counter()
            with torch.no_grad():
                self.model(**inputs)
            times.append(time.perf_counter() - start)

        return sum(times) / len(times)

    def get_layer_importances(self):
        """Calculate importance scores for each layer based on statistics."""
        if self.stats_cache is None:
            self.get_layer_stats()

        importances = {}
        max_std = max(layer['std'] for layer in self.stats_cache.values())

        for name, stats in self.stats_cache.items():
            importance = stats['std'] / max_std
            importances[name] = importance

        return importances

    def get_activation_stats(self, sample_input="Test input for profiling"):
        activation_stats = {}

        def hook_fn(name):
            def hook(module, input, output):
                if isinstance(output, tuple):
                    output = output[0]
                if torch.is_tensor(output):
                    with torch.no_grad():
                        activation_stats[name] = {
                            'mean': output.mean().item(),
                            'std': output.std().item(),
                            'max': output.abs().max().item(),
                            'shape': list(output.shape)
                        }
            return hook

        hooks = []
        for name, module in self.model.named_modules():
            if hasattr(module, 'weight'):
                hooks.append(module.register_forward_hook(hook_fn(name)))

        inputs = self.tokenizer(sample_input, return_tensors="pt").to(self.device)
        with torch.no_grad():
            self.model(**inputs)

        for hook in hooks:
            hook.remove()

        return activation_stats

In [None]:
class ActivationQuantizer:
    def __init__(self, model: nn.Module, num_bits: int = 4):
        self.model = model
        self.num_bits = num_bits
        self.hooks = []
        self.activation_stats = {}
        self.eps = 1e-7

    def quantize_activation(self, x: torch.Tensor, num_bits: int = 4) -> torch.Tensor:
        if x is None:
            return x
        if isinstance(x, tuple):
            x = x[0]
        with torch.no_grad():
            if not torch.is_tensor(x) or x.numel() == 0:
                return x

            # Per-channel quantization for activations
            dims = tuple(range(1, x.dim()))
            max_abs = torch.amax(torch.abs(x), dim=dims, keepdim=True)
            scale = (2**(num_bits-1) - 1) / (max_abs + self.eps)

            x_q = torch.round(x * scale)
            x_q = torch.clamp(x_q, -2**(num_bits-1), 2**(num_bits-1)-1)
            return x_q / scale

    def activation_hook(self, name: str):
        def hook(module: nn.Module,
                input: Tuple[torch.Tensor],
                output: torch.Tensor) -> torch.Tensor:
            with torch.no_grad():
                if isinstance(output, tuple):
                    return tuple(self.quantize_activation(o, self.num_bits)
                               if o is not None else None
                               for o in output)
                return self.quantize_activation(output, self.num_bits)
        return hook

    def add_hooks(self):
        for name, module in self.model.named_modules():
            if any(layer in name for layer in ['self_attn', 'fc1', 'fc2']):
                hook = module.register_forward_hook(self.activation_hook(name))
                self.hooks.append(hook)

    def remove_hooks(self):
        for hook in self.hooks:
            hook.remove()
        self.hooks = []

In [None]:
# from https://github.com/OpenGVLab/EfficientQAT/blob/main/quantize/quantizer.py#L23
CLIPMIN = 1e-4
class QuantLinear(nn.Module):
    """
    Quantized Module that can perform quantized convolution or normal convolution.
    To activate quantization, please use set_quant_state function.
    """
    def __init__(
        self,
        org_module: nn.Linear,
        wbits=4,
        group_size=64
    ):
        super().__init__()
        self.fwd_kwargs = dict()
        self.fwd_func = F.linear
        self.register_parameter('weight',org_module.weight) # trainable
        if org_module.bias is not None:
            self.register_buffer('bias',org_module.bias)
        else:
            self.bias = None
        self.in_features = org_module.in_features
        self.out_features = org_module.out_features
        # de-activate the quantized forward default
        self.use_weight_quant = False
        # initialize quantizer
        self.weight_quantizer = UniformAffineQuantizer(wbits, group_size, weight=org_module.weight)
        self.use_temporary_parameter = False



    def forward(self, input: torch.Tensor):
        if self.use_weight_quant:
            weight = self.weight_quantizer(self.weight)
            bias = self.bias
        else:
            weight = self.weight
            bias = self.bias


        out = self.fwd_func(input, weight, bias, **self.fwd_kwargs)


        return out

    def set_quant_state(self, weight_quant: bool = False):
        self.use_weight_quant = weight_quant


def round_ste(x: torch.Tensor):
    """
    Implement Straight-Through Estimator for rounding operation.
    """
    return (x.round() - x).detach() + x

def clamp_ste(x: torch.Tensor, min, max):
    return (x.clamp(min,max) - x).detach() + x

def clamp_ste(x: torch.Tensor, min, max):
    return (x.clamp(min,max) - x).detach() + x


class UniformAffineQuantizer(nn.Module):
    def __init__(
        self,
        n_bits: int = 8,
        group_size=None,
        weight=None,
    ):
        super().__init__()
        assert 2 <= n_bits <= 16, "bitwidth not supported"
        self.n_bits = n_bits
        self.qmin = 0
        self.qmax = 2 ** (n_bits) - 1
        self.group_size = group_size if group_size != -1 else weight.shape[-1]
        assert weight.shape[-1] % group_size == 0
        self.enable = True

        # init scale and zero point through Max-Min quantization
        with torch.no_grad():
            if weight is not None:
                x = weight.reshape(-1,self.group_size)
                xmin = x.amin([-1], keepdim=True)
                xmax =  x.amax([-1], keepdim=True)
                range = xmax - xmin
                scale = range / (2**self.n_bits-1)
                scale = scale.clamp(min=1e-4, max=1e4)
                zero_point = -(xmin/scale).clamp(min=-1e4, max=1e4)
                self.scale = nn.Parameter(scale)
                self.zero_point = nn.Parameter(zero_point.round())


    def change_n_bits(self, n_bits):
        self.n_bits = n_bits
        self.qmin = 0
        self.qmax = int(2 ** (n_bits) - 1)

    def fake_quant(self, x):
        scale = clamp_ste(self.scale,1e-4, 1e4)
        round_zero_point = clamp_ste(round_ste(self.zero_point), self.qmin, self.qmax)

        dim1, dim2 = x.shape
        x = x.reshape(-1, self.group_size)
        x_int = round_ste(x / scale)
        if round_zero_point is not None:
            x_int = x_int.add(round_zero_point)
        x_int = x_int.clamp(self.qmin, self.qmax)
        x_dequant = x_int
        if round_zero_point is not None:
            x_dequant = x_dequant.sub(round_zero_point)
        x_dequant = x_dequant.mul(scale)
        if self.group_size:
            x_dequant = x_dequant.reshape(dim1, dim2)
        return x_dequant


    def forward(self, x: torch.Tensor):
        if self.n_bits >= 16 or not self.enable:
            return x

        x_dequant = self.fake_quant(x)
        return x_dequant

In [None]:
class Int8Linear(nn.Module):
    def __init__(self, weight, bias=None, scales=None, zeros=None):
        super().__init__()
        self.register_buffer('weight_q', weight)  # int8
        self.register_buffer('weight_scales', scales)  # fp16
        self.register_buffer('weight_zeros', zeros)  # fp16
        self.register_buffer('bias', bias)  # fp16

    def forward(self, x):
        # Dequantize only during computation
        weight_deq = (self.weight_q.float() - self.weight_zeros) / self.weight_scales
        weight_deq = weight_deq.to(x.dtype)

        # Dequantize bias as well if it exists
        if self.bias is not None:
            bias_deq = (self.bias.float() - self.weight_zeros[0]) / self.weight_scales[0]
            bias_deq = bias_deq.to(x.dtype)
        else:
            bias_deq = None

        out = F.linear(x, weight_deq, bias_deq)
        return out

class W4A4Quantizer:
    def __init__(self, model):
        self.model = model
        self.original_state = {}
        self.eps = 1e-7
        self.model_dtype = next(model.parameters()).dtype

    def _quantize_per_channel(self, tensor):
        with torch.no_grad():
            orig_shape = tensor.shape
            if len(orig_shape) == 1:
                return self._quantize_tensor(tensor)

            tensor = tensor.reshape(orig_shape[0], -1)
            scales = []
            zeros = []
            tensor_q = torch.zeros_like(tensor, dtype=torch.int8)

            for idx in range(tensor.shape[0]):
                max_val = torch.max(torch.abs(tensor[idx]))
                scale = (2**3 - 1) / (max_val + self.eps)
                scales.append(scale)
                zeros.append(0)
                tensor_q[idx] = torch.clamp(torch.round(tensor[idx] * scale), -8, 7)

            # Reshape scales and zeros to match the original weight tensor
            scales = torch.tensor(scales, dtype=self.model_dtype, device=tensor.device).view(-1, 1)
            zeros = torch.tensor(zeros, dtype=self.model_dtype, device=tensor.device).view(-1, 1)

            return tensor_q.reshape(orig_shape), scales, zeros

    def quantize_model(self):
        print("\nStarting model quantization...")
        total_size_before = sum(p.numel() * p.element_size() for p in self.model.parameters())
        total_size_after = 0

        with torch.no_grad():
            for name, module in self.model.named_modules():
                if isinstance(module, nn.Linear):
                    if 'lm_head' in name or 'embed' in name:
                        size = module.weight.numel() * module.weight.element_size()
                        if module.bias is not None:
                            size += module.bias.numel() * module.bias.element_size()
                        total_size_after += size
                        continue

                    # Store original module
                    self.original_state[name] = module

                    # Quantize weights
                    weight_q, scales, zeros = self._quantize_per_channel(module.weight.data)

                    # Create quantized module
                    quantized_module = Int8Linear(
                        weight_q,
                        module.bias.data if module.bias is not None else None,
                        scales,
                        zeros
                    )

                    # Replace module in model
                    parent_name = '.'.join(name.split('.')[:-1])
                    child_name = name.split('.')[-1]
                    if parent_name:
                        parent = self.model.get_submodule(parent_name)
                        setattr(parent, child_name, quantized_module)
                    else:
                        setattr(self.model, name, quantized_module)

                    # Calculate memory
                    total_size_after += (
                        weight_q.numel() +  # int8 weights
                        scales.numel() * scales.element_size() +  # fp16 scales
                        zeros.numel() * zeros.element_size()  # fp16 zeros
                    )
                    if module.bias is not None:
                        total_size_after += module.bias.numel() * module.bias.element_size()

        print(f"Original size: {total_size_before/1024/1024:.2f}MB")
        print(f"Quantized size: {total_size_after/1024/1024:.2f}MB")
        print(f"Compression ratio: {total_size_before/total_size_after:.2f}x")

    def restore_model(self):
        for name, original_module in self.original_state.items():
            parent_name = '.'.join(name.split('.')[:-1])
            child_name = name.split('.')[-1]
            if parent_name:
                parent = self.model.get_submodule(parent_name)
                setattr(parent, child_name, original_module)
            else:
                setattr(self.model, name, original_module)

In [None]:
class QuantizationEvaluator:
    def __init__(self, model, tokenizer, config):
        self.model = model
        self.tokenizer = tokenizer
        self.config = config
        self.device = next(model.parameters()).device
        self.results = {}

    def _get_model_memory(self):
        """Get total memory used by model parameters."""
        total = 0
        for param in self.model.parameters():
            if param.data.is_cuda:
                total += param.data.element_size() * param.data.nelement()
        return total

    def evaluate_perplexity(self):
            """Evaluate model perplexity on WikiText-2."""
            dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
            nlls = []

            try:
                for i in range(0, min(len(dataset), self.config.num_samples)):

                    encodings = self.tokenizer(dataset[i]["text"],
                                            return_tensors="pt",
                                            truncation=True,
                                            max_length=self.config.max_length)

                    input_ids = encodings.input_ids.to(self.device)
                    if input_ids.size(1) == 0:
                        continue

                    with torch.no_grad():
                        outputs = self.model(input_ids, labels=input_ids.view(-1))

                        neg_log_likelihood = outputs["loss"].float()

                    if not torch.isnan(neg_log_likelihood) and not torch.isinf(neg_log_likelihood):
                        nlls.append(neg_log_likelihood)

                if nlls:
                    avg_nll = torch.stack(nlls).mean()
                    ppl = torch.exp(avg_nll).item()
                else:
                    ppl = float('inf')

            except Exception as e:
                print(f"Error in perplexity calculation: {str(e)}")
                ppl = float('inf')

            self.results["perplexity"] = ppl
            return ppl

    def evaluate_mmlu(self):
        """Evaluate model on MMLU benchmark."""
        subjects = ["abstract_algebra", "astronomy", "business_ethics", "philosophy"]
        results = {}

        for subject in subjects:
            dataset = load_dataset("cais/mmlu", subject, split="test")
            correct = 0
            total = 0

            for i in range(min(len(dataset), self.config.num_samples)):
                prompt = f"Question: {dataset[i]['question']}\nChoices:\n"
                for j, choice in enumerate(dataset[i]['choices']):
                    prompt += f"{chr(65+j)}) {choice}\n"
                prompt += "Answer:"

                inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
                with torch.no_grad():
                    outputs = self.model.generate(
                        **inputs,
                        max_length=len(inputs["input_ids"][0]) + 5,
                        num_return_sequences=1,
                        do_sample=True,
                        temperature=0.1
                    )

                pred = self.tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):])
                if chr(65 + dataset[i]['answer']) in pred:
                    correct += 1
                total += 1

            accuracy = correct / total if total > 0 else 0
            results[subject] = accuracy

        self.results["mmlu"] = results
        return results

    def evaluate_coherence(self, prompts):
        """Evaluate text generation coherence."""
        results = {
            "repetition_score": 0.0,
            "consistency_score": 0.0,
            "fluency_score": 0.0
        }

        for prompt in prompts:
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_length=200,
                    num_return_sequences=1,
                    do_sample=True,
                    temperature=0.7,
                    top_p=0.9
                )
            text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            results["repetition_score"] += self._compute_repetition_score(text)
            results["consistency_score"] += self._compute_consistency_score(text)
            results["fluency_score"] += self._compute_fluency_score(text)

        for key in results:
            results[key] /= len(prompts)

        self.results["coherence"] = results
        return results

    def evaluate_hardware_performance(self, input_lengths):
            """Evaluate model hardware performance metrics."""
            results = {
                "latency": {},
                "throughput": {},
                "memory_usage": {},
                "model_memory": self._get_model_memory(),
                "peak_memory": {}
            }

            # Get base memory usage including model weights
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()
            base_mem = torch.cuda.memory_allocated()
            model_size = sum(p.numel() * p.element_size() for p in self.model.parameters())

            # print(f"\nModel weight size: {model_size/1024/1024:.2f}MB")
            # print(f"Base GPU memory: {base_mem/1024/1024:.2f}MB")

            for length in input_lengths:
                # print(f"\nTesting input length {length}:")
                torch.cuda.empty_cache()
                torch.cuda.reset_peak_memory_stats()

                # Generate input
                input_ids = torch.randint(
                    0, self.tokenizer.vocab_size,
                    (self.config.batch_size, length),
                    device=self.device
                )

                # Warmup
                with torch.no_grad():
                    self.model(input_ids)
                    torch.cuda.synchronize()

                # Measure latency and memory
                latencies = []
                peak_mem = base_mem

                for _ in range(5):
                    torch.cuda.reset_peak_memory_stats()
                    start = time.perf_counter()

                    with torch.no_grad():
                        self.model(input_ids)
                        torch.cuda.synchronize()
                        peak_mem = max(peak_mem, torch.cuda.max_memory_allocated())

                    latencies.append(time.perf_counter() - start)

                total_mem = peak_mem - base_mem + model_size
                # print(f"Total memory (weights + runtime): {total_mem/1024/1024:.2f}MB")

                # Store results
                avg_latency = sum(latencies[1:-1]) / len(latencies[1:-1])
                results["latency"][length] = avg_latency
                results["throughput"][length] = self.config.batch_size / avg_latency
                results["memory_usage"][length] = total_mem
                results["peak_memory"][length] = peak_mem

                # Clean up
                del input_ids
                torch.cuda.empty_cache()

            self.results["hardware"] = results
            return results

    def _compute_repetition_score(self, text):
        """Compute repetition penalty score."""
        words = text.split()
        if not words:
            return 0.0

        bigrams = list(zip(words[:-1], words[1:]))
        unique_bigrams = len(set(bigrams))
        total_bigrams = len(bigrams)

        return 1.0 - (unique_bigrams / total_bigrams if total_bigrams > 0 else 0)

    def _compute_consistency_score(self, text):
        """Compute semantic consistency score."""
        sentences = text.split(".")
        if len(sentences) < 2:
            return 1.0

        # Simple heuristic based on sentence length variance
        lengths = [len(s.split()) for s in sentences if s.strip()]
        return 1.0 - (np.std(lengths) / np.mean(lengths) if lengths else 0)

    def _compute_fluency_score(self, text):
        """Compute language fluency score."""
        words = text.split()
        if not words:
            return 0.0

        # Calculate normalized word length distribution
        lengths = [len(word) for word in words]
        hist = np.histogram(lengths, bins=range(1, max(lengths) + 2))[0]
        hist = hist / hist.sum()

        # Use entropy as a proxy for fluency
        return 1.0 - entropy(hist) / np.log(len(hist))

    def generate_report(self, output_path):
        """Generate evaluation report and visualizations."""
        report = {
            "model_info": {
                "name": self.model.config.name_or_path,
                "parameters": sum(p.numel() for p in self.model.parameters()),
                "device": str(self.device)
            },
            "evaluation_results": self.results,
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
        }

        # Save report
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "w") as f:
            json.dump(report, f, indent=2)

        # Generate visualizations
        self._create_visualizations(output_path.replace(".json", "_viz"))

    def _create_visualizations(self, output_prefix: str):
        """Create visualization plots."""
        os.makedirs(os.path.dirname(output_prefix), exist_ok=True)

        # MMLU Results
        if "mmlu" in self.results:
            plt.figure(figsize=(10, 6))
            subjects = list(self.results["mmlu"].keys())
            scores = list(self.results["mmlu"].values())
            plt.bar(subjects, scores)
            plt.title("MMLU Performance by Subject")
            plt.xticks(rotation=45)
            plt.ylabel("Accuracy")
            plt.tight_layout()
            plt.savefig(f"{output_prefix}_mmlu.png")
            plt.close()

        # Coherence Metrics
        if "coherence" in self.results:
            plt.figure(figsize=(8, 6))
            metrics = list(self.results["coherence"].keys())
            values = list(self.results["coherence"].values())
            plt.bar(metrics, values)
            plt.title("Text Generation Coherence Metrics")
            plt.xticks(rotation=45)
            plt.ylabel("Score")
            plt.tight_layout()
            plt.savefig(f"{output_prefix}_coherence.png")
            plt.close()

        # Hardware Performance
        if "hardware" in self.results:
            # Latency vs Input Length
            plt.figure(figsize=(15, 5))
            plt.subplot(1, 2, 1)
            lengths = list(self.results["hardware"]["latency"].keys())
            latencies = list(self.results["hardware"]["latency"].values())
            plt.plot(lengths, [l*1000 for l in latencies], marker='o')
            plt.title("Latency vs Input Length")
            plt.xlabel("Input Length")
            plt.ylabel("Latency (ms)")

            # Throughput vs Input Length
            plt.subplot(1, 2, 2)
            throughputs = list(self.results["hardware"]["throughput"].values())
            plt.plot(lengths, throughputs, marker='o')
            plt.title("Throughput vs Input Length")
            plt.xlabel("Input Length")
            plt.ylabel("Throughput (samples/s)")

            plt.tight_layout()
            plt.savefig(f"{output_prefix}_performance.png")
            plt.close()

            # Memory Usage
            plt.figure(figsize=(10, 6))
            mem_usage = list(self.results["hardware"]["memory_usage"].values())
            plt.plot(lengths, [m/1024/1024 for m in mem_usage], marker='o')
            plt.title("Memory Usage vs Input Length")
            plt.xlabel("Input Length")
            plt.ylabel("Memory Usage (MB)")
            plt.tight_layout()
            plt.savefig(f"{output_prefix}_memory.png")
            plt.close()

In [None]:
def get_optimal_bits(stats: Dict, config: Optional[QuantConfig] = None) -> Dict[str, int]:
    """Advanced bit allocation strategy"""
    if config is None:
        config = QuantConfig()

    allocations = {}
    max_std = max(layer['std'] for layer in stats.values())

    for name, layer_stats in stats.items():
        # Skip specific layers
        if not any(skip in name for skip in config.skip_layers):
            if 'embed' in name or 'lm_head' in name:
                bits = config.max_bits
            elif 'self_attn' in name:
                bits = 8  # Fixed bits for attention
            else:
                # Dynamic allocation based on statistics
                importance = layer_stats['std'] / max_std
                sparsity = layer_stats['sparsity']

                # Consider distribution shape via histogram
                if 'bins' in layer_stats:
                    hist = np.array(layer_stats['bins'])
                    entropy = -np.sum((hist/hist.sum()) * np.log2(hist/hist.sum() + 1e-10))
                    distribution_factor = entropy / np.log2(len(hist))  # Normalize by max entropy
                else:
                    distribution_factor = 1.0

                base_bits = config.min_bits + (config.max_bits - config.min_bits) * importance
                adjusted_bits = base_bits * (1 - sparsity * 0.5) * distribution_factor
                bits = max(config.min_bits, min(config.max_bits, int(adjusted_bits)))

            allocations[name] = bits

    return allocations

def calculate_model_size(model: nn.Module, bits_config: Optional[Dict[str, int]] = None) -> float:
    """Calculate model size in MB with detailed memory tracking"""
    total_params = 0
    total_size = 0
    size_details = {}

    for name, param in model.named_parameters():
        num_params = param.numel()
        total_params += num_params

        if bits_config and name in bits_config:
            bits = bits_config[name]
            size = (num_params * bits) / 8  # Convert bits to bytes
        else:
            size = num_params * param.element_size()

        total_size += size
        size_details[name] = {
            'params': num_params,
            'bits': bits_config.get(name, 16) if bits_config else 16,
            'size_mb': size / (1024 * 1024)
        }

    return total_size / (1024 * 1024)  # Convert to MB

def generate_text(text, model, tokenizer):
    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True).to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=50,
                num_return_sequences=1,
                pad_token_id=tokenizer.eos_token_id,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                repetition_penalty=1.2  # dd repetition penalty
            )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        return f"Generation error: {str(e)}"

In [None]:
def run_evaluation_pipeline(
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    output_dir: str,
    eval_config: Optional[EvalConfig] = None
) -> Dict:
    """Run complete evaluation pipeline."""
    if eval_config is None:
        eval_config = EvalConfig()

    # initialize evaluator
    evaluator = QuantizationEvaluator(model, tokenizer, eval_config)

    print("Evaluating perplexity...")
    evaluator.evaluate_perplexity()

    print("Evaluating MMLU...")
    evaluator.evaluate_mmlu()

    print("Evaluating coherence...")
    test_prompts = [
        "Explain the theory of relativity:",
        "Write a story about a magical forest:",
        "Describe the process of photosynthesis:"
    ]
    evaluator.evaluate_coherence(test_prompts)

    print("Evaluating hardware performance...")
    input_lengths = [128, 256, 512, 1024]
    evaluator.evaluate_hardware_performance(input_lengths)

    os.makedirs(output_dir, exist_ok=True)
    evaluator.generate_report(f"{output_dir}/evaluation_report.json")

    return evaluator.results

def print_comparison(baseline: dict, quantized: dict):
    def safe_division(n, d):
        try:
            if d == 0:
                return float('inf')
            return ((n / d) - 1) * 100
        except:
            return float('nan')

    metrics = ["perplexity", "mmlu", "coherence", "hardware"]

    for metric in metrics:
        if metric in baseline and metric in quantized:
            print(f"\n{metric.upper()} COMPARISON:")

            if metric == "hardware":
                for metric_type in ["latency", "throughput", "memory_usage"]:
                    if metric_type in baseline[metric] and metric_type in quantized[metric]:
                        print(f"\n{metric_type.upper()}:")

                        # Get all unique lengths from both baseline and quantized
                        lengths = set(baseline[metric][metric_type].keys()) | set(quantized[metric][metric_type].keys())

                        for length in sorted(lengths):
                            try:
                                base_val = baseline[metric][metric_type].get(length, 0)
                                quant_val = quantized[metric][metric_type].get(length, 0)

                                # Skip if both values are 0
                                if base_val == 0 and quant_val == 0:
                                    continue

                                rel_diff = safe_division(quant_val, base_val)

                                print(f"Input length {length}:")
                                if metric_type == "latency":
                                    print(f"  Baseline: {base_val*1000:.2f}ms")
                                    print(f"  Quantized: {quant_val*1000:.2f}ms")
                                elif metric_type == "throughput":
                                    print(f"  Baseline: {base_val:.2f} samples/s")
                                    print(f"  Quantized: {quant_val:.2f} samples/s")
                                else:  # memory_usage
                                    print(f"  Baseline: {base_val/(1024*1024):.2f}MB")
                                    print(f"  Quantized: {quant_val/(1024*1024):.2f}MB")

                                if not np.isnan(rel_diff) and not np.isinf(rel_diff):
                                    print(f"  Relative Difference: {rel_diff:+.2f}%")
                            except Exception as e:
                                print(f"  Error processing length {length}: {str(e)}")

            elif isinstance(baseline[metric], dict):
                for key in baseline[metric]:
                    if key in quantized[metric]:
                        try:
                            base_val = baseline[metric][key]
                            quant_val = quantized[metric][key]
                            if isinstance(base_val, (int, float)) and isinstance(quant_val, (int, float)):
                                rel_diff = safe_division(quant_val, base_val)
                                print(f"{key}:")
                                print(f"  Baseline: {base_val:.4f}")
                                print(f"  Quantized: {quant_val:.4f}")
                                if not np.isnan(rel_diff) and not np.isinf(rel_diff):
                                    print(f"  Relative Difference: {rel_diff:+.2f}%")
                        except Exception as e:
                            print(f"  Error processing {key}: {str(e)}")
            else:
                try:
                    if isinstance(baseline[metric], (int, float)) and isinstance(quantized[metric], (int, float)):
                        rel_diff = safe_division(quantized[metric], baseline[metric])
                        print(f"Baseline: {baseline[metric]:.4f}")
                        print(f"Quantized: {quantized[metric]:.4f}")
                        if not np.isnan(rel_diff) and not np.isinf(rel_diff):
                            print(f"Relative Difference: {rel_diff:+.2f}%")
                except Exception as e:
                    print(f"Error processing metric: {str(e)}")

In [None]:
# model config
model_name = "facebook/opt-350m"
output_dir = "./evaluation_results"
os.makedirs(output_dir, exist_ok=True)

# evaluation config
eval_config = EvalConfig(
    batch_size=4,
    max_length=256,
    num_samples=50,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

In [None]:
# base model for FP16 baseline
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

In [None]:
print("\n=== Running FP16 Baseline Evaluation ===\n")
baseline_results = run_evaluation_pipeline(
    base_model,
    tokenizer,
    os.path.join(output_dir, "baseline")
)


=== Running FP16 Baseline Evaluation ===

Evaluating perplexity...
Evaluating MMLU...


README.md:   0%|          | 0.00/53.2k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/138k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/9.96k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/28.3k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/6.05k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.94k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/152 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/21.6k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/5.09k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.96k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

test-00000-of-00001.parquet:   0%|          | 0.00/48.6k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/9.15k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/311 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/34 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/5 [00:00<?, ? examples/s]

Evaluating coherence...
Evaluating hardware performance...


In [None]:
import torch
import torch.nn as nn
import copy
class ActivationClipping(nn.Module):
    def __init__(self, pretrained_model, clip_min=-1.0, clip_max=1.0, dynamic_clipping = "static", std_dev_mult = 2, lower_percentile = 1.0, upper_percentile = 99.0):
        super().__init__()
        self.base_model = copy.deepcopy(pretrained_model)
        self.clip_min = clip_min
        self.std_multiplier = std_dev_mult
        self.clip_max = clip_max
        self.dynamic = dynamic_clipping
        self.hooks_list = []
        self.lower_percentile = lower_percentile
        self.upper_percentile = upper_percentile
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.clip_weights()


    def clip_activations(self, module, input, output, clip_min=-1.0, clip_max=1.0):
      return torch.clamp(output, min=clip_min, max=clip_max)
    def clip_activations_percentile(self, module, input, output):
        lower_bound = torch.quantile(output.flatten().float(), self.lower_percentile / 100.0)
        upper_bound = torch.quantile(output.flatten().float(), self.upper_percentile / 100.0)
        return torch.clamp(output, min=lower_bound, max=upper_bound)
    def clip_dynamic_activations(self, module, input, output):
        mean = output.mean()
        std = output.std()
        clip_min = mean - self.std_multiplier * std
        clip_max = mean + self.std_multiplier * std
        return torch.clamp(output, min=clip_min, max=clip_max)

    def clip_weights(self):
      for name, module in self.base_model.named_modules():
        if 'activation' in name:
          if self.dynamic == "std":
            hook = module.register_forward_hook(
                lambda m, inp, out: self.clip_dynamic_activations(m, inp, out)
            )
          elif self.dynamic == "percentile":
            hook = module.register_forward_hook(
                lambda m, inp, out: self.clip_activations_percentile(m, inp, out)
            )
          else:
            hook = module.register_forward_hook(
                lambda m, inp, out: self.clip_activations(m, inp, out, self.clip_min, self.clip_max)
            )
          self.hooks_list.append(hook)
    def forward(self, *args, **kwargs):
         return self.base_model(*args, **kwargs)

    def eval(self, tokenizer, eval_config):
      evaluator = QuantizationEvaluator(self.model, tokenizer, eval_config)

      print("Evaluating perplexity...")
      print(evaluator.evaluate_perplexity())


In [None]:

def evaluate_perplexity(model, tokenizer, model_type=""):
    """Evaluate model perplexity using consistent number of samples"""
    print(f"\nEvaluating {model_type} model perplexity...")

    # Use the correct dataset split
    dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='validation')

    # Concatenate all texts with progress bar
    texts = [text for text in tqdm(dataset["text"],
                                 desc="Processing texts",
                                 leave=False) if text.strip()]
    full_text = " ".join(texts)

    # Tokenize without padding/truncation
    print("Tokenizing text...")
    encodings = tokenizer(full_text, return_tensors="pt")
    input_ids = encodings['input_ids']  # Correctly access the input_ids tensor

    # Move input_ids to the correct device
    input_ids = input_ids.to(model.device)

    # Setup sliding window evaluation
    stride = 512
    seq_len = 1024
    nlls = []

    # Calculate number of steps for progress bar
    n_steps = (input_ids.size(1) - 1) // stride + 1

    progress_bar = tqdm(
        range(0, input_ids.size(1), stride),
        desc=f"{model_type} PPL",
        total=n_steps,
        bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, ppl={postfix[0]:.2f}]',
        postfix=[0.0]
    )

    for i in progress_bar:
        begin_loc = max(i + seq_len - stride, 0)
        end_loc = min(i + seq_len, input_ids.size(1))
        target_len = end_loc - i

        # Get the appropriate slice of input_ids
        current_input_ids = input_ids[:, i:end_loc]
        target_ids = current_input_ids.clone()

        try:
            with torch.no_grad():
                outputs = model(current_input_ids)

                # Compute loss
                shift_logits = outputs.logits[..., :-1, :].contiguous()
                shift_labels = target_ids[..., 1:].contiguous()

                loss_fct = nn.CrossEntropyLoss(reduction='none')
                loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                              shift_labels.view(-1))

                # Only use the non-overlapping part for perplexity
                if i != 0:
                    loss = loss[-target_len:]

                nlls.append(loss)

                # Update progress bar with current perplexity
                if len(nlls) > 0:
                    current_nll = torch.cat(nlls).mean()
                    current_ppl = torch.exp(current_nll).item()
                    progress_bar.postfix[0] = current_ppl

        except RuntimeError as e:
            print(f"Error during evaluation: {str(e)}")
            return float('inf')

    # Compute final perplexity
    nll = torch.cat(nlls).mean()
    ppl = torch.exp(nll).item()

    print(f"\n{model_type} Final Results:")
    print(f"Perplexity: {ppl:.4f}")

    return ppl

In [None]:
from tqdm import tqdm

In [None]:
print("Evaluating perplexity...")
print(evaluate_perplexity(base_model, tokenizer))

Evaluating perplexity...

Evaluating  model perplexity...




Tokenizing text...


 PPL: 100%|██████████| 488/488 [00:21<00:00, ppl=25.14]


 Final Results:
Perplexity: 25.1406
25.140625





In [None]:
activation_clipping = ActivationClipping(base_model, clip_min = -1.5, clip_max = 1.5)
activation_clipping.cuda()
print("Evaluating perplexity...")
print(evaluate_perplexity(activation_clipping, tokenizer))

Evaluating perplexity...

Evaluating  model perplexity...




Tokenizing text...


 PPL: 100%|██████████| 488/488 [00:23<00:00, ppl=35.19]



 Final Results:
Perplexity: 35.1875
35.1875


In [None]:
activation_clipping = ActivationClipping(base_model, clip_min = -1, clip_max = 1)

print("Evaluating perplexity...")
print(evaluate_perplexity(activation_clipping, tokenizer))

Evaluating perplexity...

Evaluating  model perplexity...




Tokenizing text...


 PPL: 100%|██████████| 488/488 [00:23<00:00, ppl=70.62]


 Final Results:
Perplexity: 70.6250
70.625





In [None]:
activation_clipping = ActivationClipping(base_model, dynamic_clipping = "std", std_dev_mult = 2)

print("Evaluating perplexity...")
print(evaluate_perplexity(activation_clipping, tokenizer))

Evaluating perplexity...

Evaluating  model perplexity...




Tokenizing text...


 PPL: 100%|██████████| 488/488 [00:27<00:00, ppl=722.00]



 Final Results:
Perplexity: 722.0000
722.0


In [None]:
activation_clipping = ActivationClipping(base_model, dynamic_clipping = "std", std_dev_mult = 3)

print("Evaluating perplexity...")
print(evaluate_perplexity(activation_clipping, tokenizer))

Evaluating perplexity...

Evaluating  model perplexity...




Tokenizing text...


 PPL: 100%|██████████| 488/488 [00:25<00:00, ppl=446.50]


 Final Results:
Perplexity: 446.5000
446.5





In [None]:
activation_clipping = ActivationClipping(base_model, dynamic_clipping = "std", std_dev_mult = 5)

print("Evaluating perplexity...")
print(evaluate_perplexity(activation_clipping, tokenizer))

Evaluating perplexity...

Evaluating  model perplexity...




Tokenizing text...


 PPL: 100%|██████████| 488/488 [00:26<00:00, ppl=165.62]



 Final Results:
Perplexity: 165.6250
165.625


In [None]:
activation_clipping = ActivationClipping(base_model, dynamic_clipping = "std", std_dev_mult = 10)

print("Evaluating perplexity...")
print(evaluate_perplexity(activation_clipping, tokenizer))

Evaluating perplexity...

Evaluating  model perplexity...




Tokenizing text...


 PPL: 100%|██████████| 488/488 [00:27<00:00, ppl=29.86]



 Final Results:
Perplexity: 29.8594
29.859375


In [None]:
activation_clipping = ActivationClipping(base_model, dynamic_clipping = "percentile")

print("Evaluating perplexity...")
print(evaluate_perplexity(activation_clipping, tokenizer))

Evaluating perplexity...

Evaluating  model perplexity...




Tokenizing text...


 PPL: 100%|██████████| 488/488 [01:22<00:00, ppl=189.12]


 Final Results:
Perplexity: 189.1250
189.125





In [None]:
activation_clipping = ActivationClipping(base_model, dynamic_clipping = "percentile", lower_percentile = 0.5, upper_percentile = 99.5)

print("Evaluating perplexity...")
print(evaluate_perplexity(activation_clipping, tokenizer))

Evaluating perplexity...

Evaluating  model perplexity...




Tokenizing text...


 PPL: 100%|██████████| 488/488 [01:21<00:00, ppl=70.94]


 Final Results:
Perplexity: 70.9375
70.9375





In [None]:
activation_clipping = ActivationClipping(base_model, dynamic_clipping = "percentile", lower_percentile = 5.0, upper_percentile = 95.0)

print("Evaluating perplexity...")
print(evaluate_perplexity(activation_clipping, tokenizer))

Evaluating perplexity...

Evaluating  model perplexity...




Tokenizing text...


 PPL: 100%|██████████| 488/488 [01:21<00:00, ppl=4512.00]


 Final Results:
Perplexity: 4512.0000
4512.0





In [None]:
lp = LayerProfiler(activation_clipping, tokenizer)
lp.get_activation_stats()

{'base_model.model.decoder.embed_tokens': {'mean': -0.0017795562744140625,
  'std': 0.0484619140625,
  'max': 0.14404296875,
  'shape': [1, 5, 512]},
 'base_model.model.decoder.embed_positions': {'mean': 0.0004832744598388672,
  'std': 0.0264129638671875,
  'max': 0.137939453125,
  'shape': [1, 5, 1024]},
 'base_model.model.decoder.project_in': {'mean': -0.0008072853088378906,
  'std': 0.042938232421875,
  'max': 0.98583984375,
  'shape': [1, 5, 1024]},
 'base_model.model.decoder.layers.0.self_attn.q_proj': {'mean': -0.0011806488037109375,
  'std': 0.497802734375,
  'max': 1.7509765625,
  'shape': [1, 5, 1024]},
 'base_model.model.decoder.layers.0.self_attn.k_proj': {'mean': -0.00171661376953125,
  'std': 0.67626953125,
  'max': 1.677734375,
  'shape': [1, 5, 1024]},
 'base_model.model.decoder.layers.0.self_attn.v_proj': {'mean': -0.0003352165222167969,
  'std': 0.022216796875,
  'max': 0.15673828125,
  'shape': [1, 5, 1024]},
 'base_model.model.decoder.layers.0.self_attn.out_proj': {'

In [None]:
# quantized version
quantizer = W4A4Quantizer(base_model)
quantizer.quantize_model()


Starting model quantization...
Original size: 631.71MB
Quantized size: 339.37MB
Compression ratio: 1.86x


In [None]:
print("\n=== Running Quantized Model Evaluation ===\n")
quantized_results = run_evaluation_pipeline(
    base_model,
    tokenizer,
    os.path.join(output_dir, "quantized")
)


=== Running Quantized Model Evaluation ===

Evaluating perplexity...
Evaluating MMLU...
Evaluating coherence...
Evaluating hardware performance...


In [None]:
evaluator = QuantizationEvaluator(base_model, tokenizer, eval_config)

print("Evaluating perplexity...")
evaluator.evaluate_perplexity()

Evaluating perplexity...


90.92023468017578

In [None]:
print("\n=== Results Comparison ===")
print_comparison(baseline_results, activation_clip_results)


=== Results Comparison ===

PERPLEXITY COMPARISON:
Baseline: 90.1556
Quantized: 90.1556
Relative Difference: +0.00%

MMLU COMPARISON:
abstract_algebra:
  Baseline: 0.1900
  Quantized: 0.1900
  Relative Difference: +0.00%
astronomy:
  Baseline: 0.2100
  Quantized: 0.2100
  Relative Difference: +0.00%
business_ethics:
  Baseline: 0.2900
  Quantized: 0.3000
  Relative Difference: +3.45%
philosophy:
  Baseline: 0.1900
  Quantized: 0.2000
  Relative Difference: +5.26%

COHERENCE COMPARISON:
repetition_score:
  Baseline: 0.5245
  Quantized: 0.5116
  Relative Difference: -2.44%
consistency_score:
  Baseline: 0.8252
  Quantized: 0.7116
  Relative Difference: -13.76%
fluency_score:
  Baseline: 0.2219
  Quantized: 0.2097
  Relative Difference: -5.50%

HARDWARE COMPARISON:

LATENCY:
Input length 128:
  Baseline: 35.80ms
  Quantized: 35.67ms
  Relative Difference: -0.37%
Input length 256:
  Baseline: 70.48ms
  Quantized: 72.05ms
  Relative Difference: +2.23%
Input length 512:
  Baseline: 149.49ms