# Quantization example with Hugging Face

In [None]:
!pip install bitsandbytes



In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
)
import torch
import os

In [None]:
# Hub identifier of the checkpoint to load; feel free to swap in a larger model.
MODEL_NAME = "bert-base-uncased"

In [None]:
# Load the baseline model (FP32); no dtype or quantization options are passed.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Serialize the current weights to disk, then read the file size back.
torch.save(model.state_dict(), "model.pt")
# st_size is in bytes; dividing by 1e6 converts to MB.
fp32_size = os.stat("model.pt").st_size / 1e6

## FP32 -> FP16

In [None]:
# Convert all weights to FP16.
model = model.half()
# Overwrite the same checkpoint file with the half-precision weights and re-measure it.
torch.save(model.state_dict(), "model.pt")
# st_size is in bytes; dividing by 1e6 converts to MB.
fp16_size = os.stat("model.pt").st_size / 1e6

## FP32 -> 4-bit
- 使用 [BitsAndBytesConfig](https://huggingface.co/docs/transformers/v4.51.3/en/main_classes/quantization#transformers.BitsAndBytesConfig)

In [None]:
# Build the 4-bit quantization config.
# NOTE(review): bnb_4bit_quant_type is not set, so the library default applies —
# confirm which 4-bit format that is before labelling results as "INT4".
bnb_config = BitsAndBytesConfig(
    # Store the model's parameter values at 4-bit precision.
    load_in_4bit=True,
    # Weights are first "decompressed" to higher-precision values, and the
    # computation is then carried out at that higher precision.
    bnb_4bit_compute_dtype="float16",
)

In [None]:
# Reload the same checkpoint, this time applying the BitsAndBytesConfig settings.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Record the on-disk size of the quantized model.
torch.save(model.state_dict(), "model.pt")
# st_size is in bytes; dividing by 1e6 converts to MB.
int4_size = os.stat("model.pt").st_size / 1e6

In [None]:
# Show the three measured checkpoint sizes, one per line.
print(
    f"Float32 模型大小: {fp32_size:.2f} MB",
    f"Float16 模型大小: {fp16_size:.2f} MB",
    f"INT4 量化後大小: {int4_size:.2f} MB",
    sep="\n",
)

Float32 模型大小: 438.00 MB
Float16 模型大小: 219.03 MB
INT4 量化後大小: 96.17 MB
