<a href="https://colab.research.google.com/github/nayeon-duck/TinyML_quantizaion_pj/blob/main/PTQprac.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.quantization as quantization
from torch.quantization import QuantStub, DeQuantStub
import torchvision
import torchvision.transforms as transforms

# 1. Model definition
class SimpleModel(nn.Module):
    """Small CNN classifier wrapped with quantization stubs.

    The forward pass runs: ``QuantStub`` -> two Conv/BatchNorm/ReLU stages
    -> adaptive average pooling to 8x8 -> flatten -> linear layer with 10
    outputs -> ``DeQuantStub``.
    """

    def __init__(self):
        super().__init__()

        # Stubs marking the entry and exit of the quantized region.
        self.quant = QuantStub()
        self.dequant = DeQuantStub()

        # Stage 1: 3 -> 32 channels, 3x3 kernel, padding keeps spatial size.
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU()

        # Stage 2: 32 -> 64 channels, same kernel/padding.
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.relu2 = nn.ReLU()

        # Pooling fixes the spatial size at 8x8 so the linear layer's input
        # width (64 * 8 * 8) does not depend on the input resolution.
        self.pool = nn.AdaptiveAvgPool2d((8, 8))
        self.fc = nn.Linear(64 * 8 * 8, 10)

    def forward(self, x):
        """Return the 10-way output of the network for the batch ``x``."""
        hidden = self.quant(x)

        # Stage 1
        hidden = self.conv1(hidden)
        hidden = self.bn1(hidden)
        hidden = self.relu1(hidden)

        # Stage 2
        hidden = self.conv2(hidden)
        hidden = self.bn2(hidden)
        hidden = self.relu2(hidden)

        # Pool, flatten everything except the batch dimension, classify.
        pooled = self.pool(hidden)
        flat = torch.flatten(pooled, 1)
        logits = self.fc(flat)

        return self.dequant(logits)

# 2. FP32 model training (ordinary training loop)
def train_fp32_model(*, epochs=5, batch_size=128, lr=0.001, log_interval=100):
    """Train a fresh ``SimpleModel`` on the CIFAR-10 training split.

    CIFAR-10 is downloaded to ``./data`` if necessary. Training uses Adam
    with cross-entropy loss and runs on CUDA when available, otherwise on
    the CPU. The defaults reproduce the original hard-coded settings.

    Args:
        epochs: Number of passes over the training set.
        batch_size: Mini-batch size of the training loader.
        lr: Learning rate passed to Adam.
        log_interval: Print the mean loss every ``log_interval`` batches.

    Returns:
        The trained model, still on the training device and in train mode.
    """
    # Data preparation
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                            download=True, transform=transform)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                              shuffle=True, num_workers=2)

    # Model, loss function, optimizer
    model = SimpleModel()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # Training
    model.train()
    for epoch in range(1, epochs + 1):
        running_loss = 0.0
        for batch_idx, (inputs, labels) in enumerate(trainloader, start=1):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            # Report the mean loss of the last `log_interval` batches, then
            # reset the accumulator for the next window.
            if batch_idx % log_interval == 0:
                print(f'[Epoch {epoch}, Batch {batch_idx}] loss: {running_loss / log_interval:.3f}')
                running_loss = 0.0

    return model

# 3. PTQ application function
def apply_ptq(fp32_model, calibration_loader, num_calibration_batches=100):
    """Apply post-training static quantization to a trained FP32 model.

    Steps: move the model to CPU and eval mode, attach the default
    ``fbgemm`` qconfig, fuse the Conv-BN-ReLU groups, insert observers,
    run calibration batches through the prepared model, then convert.

    Note that ``fp32_model`` itself is moved to the CPU, switched to eval
    mode and given a ``qconfig`` attribute as a side effect.

    Args:
        fp32_model: Trained model exposing submodules named ``conv1``,
            ``bn1``, ``relu1``, ``conv2``, ``bn2`` and ``relu2``.
        calibration_loader: Iterable yielding ``(data, target)`` batches;
            only ``data`` is used.
        num_calibration_batches: Maximum number of batches fed to the
            observers. ``None`` consumes the whole loader.

    Returns:
        The converted (quantized) model.
    """
    from itertools import islice

    # Move to CPU (quantization is performed on the CPU)
    fp32_model = fp32_model.cpu()
    fp32_model.eval()

    # QConfig setup (for Intel CPUs)
    fp32_model.qconfig = quantization.get_default_qconfig('fbgemm')
    # For ARM use instead: quantization.get_default_qconfig('qnnpack')

    # Module fusion (Conv-BN-ReLU)
    model_fused = quantization.fuse_modules(fp32_model, [
        ['conv1', 'bn1', 'relu1'],
        ['conv2', 'bn2', 'relu2']
    ])

    # Prepare for PTQ (inserts observers)
    model_prepared = quantization.prepare(model_fused, inplace=False)

    # Calibration: collect activation statistics from representative data
    print("Calibration 시작...")
    model_prepared.eval()
    with torch.no_grad():
        # islice stops after exactly `num_calibration_batches` batches; the
        # previous "check after the forward pass" loop ran one batch extra.
        for data, _ in islice(calibration_loader, num_calibration_batches):
            model_prepared(data)
    print("Calibration 완료")

    # Convert to the quantized model
    model_quantized = quantization.convert(model_prepared, inplace=False)

    return model_quantized

# 4. Model evaluation function
def evaluate_model(model, test_loader, device='cpu'):
    """Compute top-1 accuracy of ``model`` over ``test_loader``.

    Args:
        model: Module to evaluate; it is moved to ``device`` and put in
            eval mode.
        test_loader: Iterable yielding ``(data, target)`` batches.
        device: Device on which inference runs.

    Returns:
        Accuracy as a percentage in ``[0, 100]``. Returns ``0.0`` when the
        loader yields no samples instead of dividing by zero.
    """
    model = model.to(device)
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            # The predicted class is the index of the largest output.
            predicted = model(data).argmax(dim=1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    if total == 0:
        # Nothing was evaluated; avoid ZeroDivisionError.
        return 0.0

    return 100 * correct / total

# 5. Model size measurement function
def get_model_size(model, filename):
    """Save ``model``'s state dict to ``filename`` and return its size in MB.

    The file is left on disk. One MB is counted as 1024 * 1024 bytes.
    """
    import os

    bytes_per_mb = 1024 * 1024

    # Serialize first, then measure what was actually written.
    torch.save(model.state_dict(), filename)
    n_bytes = os.stat(filename).st_size
    return n_bytes / bytes_per_mb

# 6. End-to-end run
if __name__ == "__main__":
    # Data loader preparation
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                           download=True, transform=transform)
    testloader = torch.utils.data.DataLoader(testset, batch_size=128,
                                             shuffle=False, num_workers=2)

    # Calibrate on the training split rather than the test split: the test
    # images are used below to score the INT8 model, so collecting observer
    # statistics from them would leak evaluation data into the quantization
    # parameters.
    calibration_set = torchvision.datasets.CIFAR10(root='./data', train=True,
                                                   download=True, transform=transform)
    calibration_loader = torch.utils.data.DataLoader(calibration_set, batch_size=128,
                                                     shuffle=True, num_workers=2)

    # Train the FP32 model
    print("FP32 모델 학습 중...")
    fp32_model = train_fp32_model()

    # Evaluate the FP32 model (on GPU when available)
    fp32_accuracy = evaluate_model(fp32_model, testloader, 'cuda' if torch.cuda.is_available() else 'cpu')
    fp32_size = get_model_size(fp32_model, 'fp32_model.pth')
    print(f"\nFP32 모델 - 정확도: {fp32_accuracy:.2f}%, 크기: {fp32_size:.2f} MB")

    # Apply PTQ
    print("\nPTQ 적용 중...")
    quantized_model = apply_ptq(fp32_model, calibration_loader)

    # Evaluate the quantized model (always on CPU)
    int8_accuracy = evaluate_model(quantized_model, testloader, 'cpu')
    int8_size = get_model_size(quantized_model, 'int8_model.pth')
    print(f"INT8 모델 - 정확도: {int8_accuracy:.2f}%, 크기: {int8_size:.2f} MB")

    # Compare results
    print(f"\n압축률: {fp32_size / int8_size:.2f}x")
    print(f"정확도 손실: {fp32_accuracy - int8_accuracy:.2f}%")


100%|██████████| 170M/170M [00:20<00:00, 8.26MB/s]


FP32 모델 학습 중...
[Epoch 1, Batch 100] loss: 1.636
[Epoch 1, Batch 200] loss: 1.309
[Epoch 1, Batch 300] loss: 1.194
[Epoch 2, Batch 100] loss: 1.010
[Epoch 2, Batch 200] loss: 1.002
[Epoch 2, Batch 300] loss: 0.971
[Epoch 3, Batch 100] loss: 0.883
[Epoch 3, Batch 200] loss: 0.882
[Epoch 3, Batch 300] loss: 0.901
[Epoch 4, Batch 100] loss: 0.836
[Epoch 4, Batch 200] loss: 0.819
[Epoch 4, Batch 300] loss: 0.810
[Epoch 5, Batch 100] loss: 0.785
[Epoch 5, Batch 200] loss: 0.783
[Epoch 5, Batch 300] loss: 0.776

FP32 모델 - 정확도: 71.01%, 크기: 0.24 MB

PTQ 적용 중...
Calibration 시작...


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  model_prepared = quantization.prepare(model_fused, inplace=False)


Calibration 완료


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  model_quantized = quantization.convert(model_prepared, inplace=False)


INT8 모델 - 정확도: 70.82%, 크기: 0.07 MB

압축률: 3.60x
정확도 손실: 0.19%


In [15]:

# Print a summary of the results.
# The derived metrics are computed here so this cell only depends on the
# accuracy and size values produced by the main run.
accuracy_loss = fp32_accuracy - int8_accuracy              # percentage points
accuracy_retention = int8_accuracy / fp32_accuracy * 100   # INT8 accuracy as % of FP32
compression_ratio = fp32_size / int8_size                  # how many times smaller
size_reduction = (1 - int8_size / fp32_size) * 100         # % of FP32 size removed

print("\n" + "="*50)
print("PTQ 결과 요약")
print("="*50)
print(f"FP32 모델 정확도: {fp32_accuracy:.2f}%")
print(f"INT8 모델 정확도: {int8_accuracy:.2f}%")
print(f"정확도 손실: {accuracy_loss:.2f}%")
print(f"정확도 유지율: {accuracy_retention:.2f}%")
print(f"\nFP32 모델 크기: {fp32_size:.2f} MB")
print(f"INT8 모델 크기: {int8_size:.2f} MB")
print(f"압축률: {compression_ratio:.2f}x")
print(f"크기 감소: {size_reduction:.1f}%")
print("="*50)



PTQ 결과 요약
FP32 모델 정확도: 71.01%
INT8 모델 정확도: 70.82%
정확도 손실: 0.19%
정확도 유지율: 99.73%

FP32 모델 크기: 0.24 MB
INT8 모델 크기: 0.07 MB
압축률: 3.60x
크기 감소: 72.2%
