<a href="https://colab.research.google.com/github/oluwafemidiakhoa/MLprject/blob/main/Quantization_4_bit_Quantizer_Class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Step 1: Building the MYQ 4-bit Quantizer Class
First, we need to build the QuantizedLinearLayer4Bit class that will replace the linear layers in the base model.

Code for QuantizedLinearLayer4Bit Class

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

class QuantizedLinearLayer4Bit(nn.Module):
    def __init__(self, in_features, out_features, bias=True, dtype=torch.float32):
        super().__init__()
        self.register_buffer("weight", torch.randint(-8, 7, (out_features, in_features)).to(torch.int8))
        self.register_buffer("scale", torch.randn((out_features), dtype=dtype))
        if bias:
            self.register_buffer("bias", torch.randn((1, out_features), dtype=dtype))
        else:
            self.bias = None

    def quantize(self, weight):
        weight_f32 = weight.clone().to(torch.float32)
        Qmin = -8
        Qmax = 7
        scale = weight_f32.abs().max(dim=-1).values / Qmax
        scale = scale.to(weight.dtype)
        quantized_weight = torch.clamp(torch.round(weight / scale.unsqueeze(1)), Qmin, Qmax).to(torch.int8)
        packed_weight = self.pack_weights(quantized_weight)
        self.weight = packed_weight
        self.scale = scale

    def pack_weights(self, weight):
        packed_weight = (weight[:, ::2] << 4) | (weight[:, 1::2] & 0xF)
        return packed_weight

    def unpack_weights(self, packed_weight):
        unpacked_weight = torch.zeros((packed_weight.shape[0], packed_weight.shape[1] * 2), dtype=torch.int8)
        unpacked_weight[:, ::2] = (packed_weight >> 4) & 0xF
        unpacked_weight[:, 1::2] = packed_weight & 0xF
        unpacked_weight[unpacked_weight > 7] -= 16
        return unpacked_weight

    def forward(self, input):
        unpacked_weight = self.unpack_weights(self.weight)
        output = F.linear(input, unpacked_weight.to(input.dtype)) * self.scale
        if self.bias is not None:
            output = output + self.bias
        return output


## Step 2: Fetching the Base Model
We'll fetch the facebook/opt-350m model from Hugging Face.

Code to Fetch the Model

In [10]:
# Authenticate with Hugging Face
!huggingface-cli login --token hf_wqrgFOHJcCYZydYyJrYHJUd##############

# Fetch the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.bfloat16)

print("facebook/opt-350m: base model architecture before quantization")
print("-" * 50)
print(model)

# Check the size of the base model
model_memory_size_before_quantization = model.get_memory_footprint()
print(f"Total memory size before quantization (in GB): {model_memory_size_before_quantization / 1e+9}")


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Traceback (most recent call last):
  File "/usr/local/bin/huggingface-cli", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/commands/huggingface_cli.py", line 51, in main
    service.run()
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/commands/user.py", line 98, in run
    login(token=self.args.token, add_to_git_credential=self.args.add_to_git_credential)
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/_login.py", line 111, in login
    _login(token, add_to_git_credential=add_to_git_credential, write_permission=write_permission)
  File "/usr/local/lib/python3.10/dist-packages/huggingface_hub/_login.py", line 307, in _login
    raise ValueError("Invalid token passed!")
Value

tokenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

facebook/opt-350m: base model architecture before quantization
--------------------------------------------------
OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=Tru

## Step 3: Replace Linear Layers and Perform Quantization
Next, we'll replace the linear layers in the base model with our QuantizedLinearLayer4Bit and perform quantization.

Code to Replace Linear Layers and Quantize

In [11]:
def replace_linearlayer(base_model, quantizer_class, exclude_list, quantized=True):
    for name, child in base_model.named_children():
        if isinstance(child, nn.Linear) and not any([x == name for x in exclude_list]):
            old_bias = child.bias
            old_weight = child.weight
            in_features = child.in_features
            out_features = child.out_features
            quantizer_layer = quantizer_class(in_features, out_features, old_bias is not None, old_weight.dtype)
            setattr(base_model, name, quantizer_layer)
            if quantized:
                getattr(base_model, name).quantize(old_weight)
            if old_bias is not None:
                getattr(base_model, name).bias = old_bias
        else:
            replace_linearlayer(child, quantizer_class, exclude_list, quantized=quantized)

# Replace linear layers in the model
replace_linearlayer(model, QuantizedLinearLayer4Bit, ["lm_head"], quantized=True)

print("facebook/opt-350m: quantized model architecture")
print("-" * 50)
print(model)

# Check the size of the quantized model
model_memory_size_after_quantization = model.get_memory_footprint()
print(f"Total memory size after quantization (in GB): {model_memory_size_after_quantization / 1e+9}")

# Perform inference on the quantized model
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
print(pipe("Malaysia is a beautiful country and ", max_new_tokens=50))


facebook/opt-350m: quantized model architecture
--------------------------------------------------
OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): QuantizedLinearLayer4Bit()
      (project_in): QuantizedLinearLayer4Bit()
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): QuantizedLinearLayer4Bit()
            (v_proj): QuantizedLinearLayer4Bit()
            (q_proj): QuantizedLinearLayer4Bit()
            (out_proj): QuantizedLinearLayer4Bit()
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): QuantizedLinearLayer4Bit()
          (fc2): QuantizedLinearLayer4Bit()
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
       