In [None]:
import argparse
import os
import time
import shutil

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
     

import torchvision
import torchvision.transforms as transforms

from models import *
from collections import OrderedDict


global best_prec
use_gpu = torch.cuda.is_available()
print('=> Building model...')
    
    
    
batch_size = 128
model_name = "VGG16_quant_aware_trained"
model = VGG16_quant()

print(model)

normalize = transforms.Normalize(mean=[0.491, 0.482, 0.447], std=[0.247, 0.243, 0.262])

train_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ]))

trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)


test_dataset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ]))

testloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)


print_freq = 100 # every 100 batches, accuracy printed. Here, each batch includes "batch_size" data points
# CIFAR10 has 50,000 training data, and 10,000 validation data.

def train(trainloader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    model.train()

    end = time.time()
    for i, (input, target) in enumerate(trainloader):
        # measure data loading time
        data_time.update(time.time() - end)

        input, target = input.cuda(), target.cuda()

        # compute output
        output = model(input)

        # error = torch.mean(torch.abs(captured['psum_recovered'] - captured['next_input']))

        # After output = model(input), inside train() or validate()
        # if 'psum_recovered' in captured and 'next_input' in captured:
        #     # Apply ReLU to recovered psum to match next layer's input
        #     psum_relu = F.relu(captured['psum_recovered'])
        #     next_input = captured['next_input']
        #     error = torch.mean(torch.abs(psum_relu - next_input)).item()
        
        loss = criterion(output, target)

        # measure accuracy and record loss
        prec = accuracy(output, target)[0]
        losses.update(loss.item(), input.size(0))
        top1.update(prec.item(), input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()


        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec {top1.val:.3f}% ({top1.avg:.3f}%)'.format(
                   epoch, i, len(trainloader), batch_time=batch_time,
                   data_time=data_time, loss=losses, top1=top1))

            

def validate(val_loader, model, criterion ):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    with torch.no_grad():
        for i, (input, target) in enumerate(val_loader):
         
            input, target = input.cuda(), target.cuda()

            # compute output
            output = model(input)

            # if 'psum_recovered' in captured and 'next_input' in captured:
            #     # Apply ReLU to recovered psum to match next layer's input
            #     psum_relu = F.relu(captured['psum_recovered'])
            #     next_input = captured['next_input']
            #     error = torch.mean(torch.abs(psum_relu - next_input)).item()
            
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec = accuracy(output, target)[0]
            losses.update(loss.item(), input.size(0))
            top1.update(prec.item(), input.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % print_freq == 0:  # This line shows how frequently print out the status. e.g., i%5 => every 5 batch, prints out
                print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec {top1.val:.3f}% ({top1.avg:.3f}%)'.format(
                   i, len(val_loader), batch_time=batch_time, loss=losses,
                   top1=top1))

    print(' * Prec {top1.avg:.3f}% '.format(top1=top1))
    return top1.avg


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

        
def save_checkpoint(state, is_best, fdir):
    filepath = os.path.join(fdir, 'checkpoint.pth')
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(fdir, 'model_best.pth.tar'))


def adjust_learning_rate(optimizer, epoch):
    """For resnet, the lr starts from 0.1, and is divided by 10 at 80 and 120 epochs"""
    adjust_list = [150, 225]
    if epoch in adjust_list:
        for param_group in optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * 0.1        

#model = nn.DataParallel(model).cuda()
#all_params = checkpoint['state_dict']
#model.load_state_dict(all_params, strict=False)
#criterion = nn.CrossEntropyLoss().cuda()
#validate(testloader, model, criterion)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def imshow(img):
    img = img / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.show()

dataiter = iter(testloader)
images, labels = next(dataiter) ## If you run this line, the next data batch is called subsequently.

imshow(torchvision.utils.make_grid(images))


In [None]:
# This cell won't be given, but students will complete the training

lr = 0.02
weight_decay = 5e-4
epochs = 170
best_prec = 0

#model = nn.DataParallel(model).cuda()
model.cuda()

criterion = nn.CrossEntropyLoss(label_smoothing=0.1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
#cudnn.benchmark = True

if not os.path.exists('result'):
    os.makedirs('result')
fdir = 'result/'+str(model_name)
if not os.path.exists(fdir):
    os.makedirs(fdir)
        

for epoch in range(0, epochs):
    adjust_learning_rate(optimizer, epoch)

    train(trainloader, model, criterion, optimizer, epoch)
    
    # evaluate on test set
    print("Validation starts")
    prec = validate(testloader, model, criterion)

    # remember best precision and save checkpoint
    is_best = prec > best_prec
    best_prec = max(prec,best_prec)
    print('best acc: {:1f}'.format(best_prec))
    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'best_prec': best_prec,
        'optimizer': optimizer.state_dict(),
    }, is_best, fdir)

In [27]:

fdir = 'result/VGG16_quant/model_best.pth.tar'

checkpoint = torch.load(fdir)
model.load_state_dict(checkpoint['state_dict'])


criterion = nn.CrossEntropyLoss().cuda()

model.eval()
model.cuda()

print('Accuracy for base case')
prec = validate(testloader, model, criterion)
print('First conv layer’s weights’ absolute sum = ', model.features[0].weight.abs().sum())

Accuracy for base case
Test: [0/79]	Time 0.107 (0.107)	Loss 0.2514 (0.2514)	Prec 94.531% (94.531%)
 * Prec 92.020% 
First conv layer’s weights’ absolute sum =  tensor(117.6888, device='cuda:0', grad_fn=<SumBackward0>)


In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from models.quant_layer import QuantConv2d

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---- find the 8×8 squeezed QuantConv2d ----
features = model.features
sq_idx = None
for i, m in enumerate(features):
    if isinstance(m, QuantConv2d) and m.in_channels == 8 and m.out_channels == 8:
        sq_idx = i
        break
assert sq_idx is not None, "No 8×8 squeezed conv found"
sq = features[sq_idx]

# next real layer
nx = sq_idx + 1
while isinstance(features[nx], (nn.BatchNorm2d, nn.ReLU, nn.Identity)):
    nx += 1
next_layer = features[nx]

# ---- hook inputs ----
_cache = {}
captured = {}  
def hook_sq(m, inp): captured["x"] = inp[0].detach().to(DEVICE)
def hook_nx(m, inp): captured["y"] = inp[0].detach().to(DEVICE)

h1 = sq.register_forward_pre_hook(hook_sq)
h2 = next_layer.register_forward_pre_hook(hook_nx)

# ---- run 1 batch ----
dummy = torch.randn(1, 3, 32, 32).to(DEVICE)
with torch.no_grad():
    model(dummy)

h1.remove()
h2.remove()

assert "x" in captured
x_in = captured["x"]

# ---- quantization params ----
def qparams(alpha, nbit, signed):
    if signed:
        qmax = (2**(nbit-1))-1
        qmin = -(2**(nbit-1))
    else:
        qmax = (2**nbit)-1
        qmin = 0
    scale = alpha / qmax  # use scale only
    return scale

# ---- quantize weights ----
w_q = getattr(sq, "weight_q", None)
w = sq.weight.detach()
alpha_w = w.abs().max()
d_w = qparams(alpha_w, 4, True)  # compute scale in all cases

if w_q is None:
    w_int = torch.clamp(torch.round(w / d_w), -(2**3), (2**3)-1)
    w_q = w_int * d_w
else:
    w_q = w_q.detach()

# ---- quantize activation ----
aq = getattr(sq, "act_quant", None)
alpha_x = getattr(aq, "alpha", x_in.abs().max())
signed_x = getattr(aq, "signed", False)

d_x = qparams(alpha_x, 4, signed_x)  # only scale
x_int = torch.round(x_in / d_x)
x_q = x_int * d_x

# ---- integer conv ----
stride, pad, groups = sq.stride, sq.padding, sq.groups
bias = sq.bias

psum_int = F.conv2d(x_int.float(), torch.round(w_q/d_w).float(),
                    bias=None, stride=stride, padding=pad, groups=groups)
psum_fp = psum_int * (d_x * d_w)
if bias is not None:
    psum_fp = psum_fp + bias.view(1,-1,1,1)
psum_relu = torch.clamp(psum_fp, min=0.0)

# ---- reference conv(x_q, w_q) ----
conv_ref = nn.Conv2d(
    in_channels=w_q.size(1),
    out_channels=w_q.size(0),
    kernel_size=w_q.shape[2:],
    stride=stride,
    padding=pad,
    bias=(bias is not None)
).to(DEVICE)
conv_ref.weight = nn.Parameter(w_q.clone())
if bias is not None:
    conv_ref.bias = nn.Parameter(bias.clone())

with torch.no_grad():
    y_ref = torch.clamp(conv_ref(x_q), min=0.0)

# ---- final quantization MSE ----
mse = (psum_relu - y_ref).pow(2).mean().item()
print(f"4A/4W Quantization MSE: {mse:.6e}")


4A/4W Quantization MSE: 2.620530e-07


In [5]:
def compute_sparsity(model):
    total_zeros = 0
    total_params = 0
    layer_wise = {}
    for name, module in model.named_modules():
        if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
            weight = module.weight.data
            zeros = torch.sum(weight == 0).item()
            params = weight.numel()
            sparsity = zeros / params * 100
            total_zeros += zeros
            total_params += params
            layer_wise[name] = sparsity
    total_sparsity = total_zeros / total_params * 100
    return layer_wise, total_sparsity

def compute_macs_vgg_quant(model, input_size=(3,32,32)):
    C_in, H, W = input_size
    total_macs = 0
    for name, module in model.named_modules():
        if isinstance(module, nn.Conv2d):
            C_out, C_in_layer, kH, kW = module.weight.shape
            strideH, strideW = module.stride
            padH, padW = module.padding
            H_out = (H + 2*padH - kH)//strideH + 1
            W_out = (W + 2*padW - kW)//strideW + 1
            total_macs += H_out * W_out * C_out * C_in_layer * kH * kW
            H, W, C_in = H_out, W_out, C_out
        elif isinstance(module, nn.Linear):
            total_macs += module.weight.shape[0] * module.weight.shape[1]
    return total_macs

def compute_model_size(model, bits_per_weight=32):
    total_params = 0
    for module in model.modules():
        if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
            total_params += module.weight.numel()
    return total_params * bits_per_weight / 8 / 1024**2

In [None]:
# HW

#  1. Train with 4 bits for both weight and activation to achieve >90% accuracy
#  2. Find x_int and w_int for the 2nd convolution layer
#  3. Check the recovered psum has similar value to the un-quantized original psum
#     (such as example 1 in W3S2)

In [6]:
import torch
import torch.nn as nn

# ---------------- Helper functions ----------------
def compute_sparsity(model):
    total_zeros = 0
    total_params = 0
    layer_wise = {}
    for name, module in model.named_modules():
        if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
            weight = module.weight.data
            zeros = torch.sum(weight == 0).item()
            params = weight.numel()
            sparsity = zeros / params * 100
            total_zeros += zeros
            total_params += params
            layer_wise[name] = sparsity
    total_sparsity = total_zeros / total_params * 100
    return layer_wise, total_sparsity

def compute_macs_vgg_quant(model, input_size=(3,32,32)):
    C_in, H, W = input_size
    total_macs = 0
    for name, module in model.named_modules():
        if isinstance(module, nn.Conv2d):
            C_out, C_in_layer, kH, kW = module.weight.shape
            strideH, strideW = module.stride
            padH, padW = module.padding
            H_out = (H + 2*padH - kH)//strideH + 1
            W_out = (W + 2*padW - kW)//strideW + 1
            total_macs += H_out * W_out * C_out * C_in_layer * kH * kW
            H, W, C_in = H_out, W_out, C_out
        elif isinstance(module, nn.Linear):
            total_macs += module.weight.shape[0] * module.weight.shape[1]
    return total_macs

def compute_model_size(model, bits_per_weight=32):
    total_params = 0
    for module in model.modules():
        if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
            weight = module.weight.data
            total_params += torch.sum(weight != 0).item()  # count only non-zero weights
    return total_params * bits_per_weight / 8 / 1024**2  # in MB

# ---------------- Load Unpruned Model ----------------
PATH = 'result/VGG16_quant/model_best.pth.tar'
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['state_dict'])
device = torch.device("cuda") 
model.cuda()
model.eval()

# Compute Accuracy for Unpruned
correct = 0
with torch.no_grad():
    for data, target in testloader:
        data, target = data.to(device), target.to(device)
        output = model(data)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()

unpruned_acc = 100. * correct / len(testloader.dataset)
unpruned_macs = compute_macs_vgg_quant(model, input_size=(3,32,32))
unpruned_model_size = compute_model_size(model, bits_per_weight=4)  # quantized int4

print("Metrics for Unpruned Model ---")
print(f"Accuracy: {unpruned_acc:.2f}%")
print(f"MACs: {unpruned_macs}")
print(f"Model Size (int4): {unpruned_model_size:.2f} MB")

# ---------------- Load Pruned Model ----------------
save_path = "result/VGG16_quant/model_pruned_finetuned.pth.tar"
checkpoint = torch.load(save_path)
model.load_state_dict(checkpoint['state_dict'])
model.cuda()
model.eval()

# Compute Accuracy for Pruned
correct = 0
with torch.no_grad():
    for data, target in testloader:
        data, target = data.to(device), target.to(device)
        output = model(data)
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()

pruned_acc = 100. * correct / len(testloader.dataset)
pruned_macs = compute_macs_vgg_quant(model, input_size=(3,32,32))
pruned_model_size = compute_model_size(model, bits_per_weight=4)  # quantized int4

print("\nMetrics for Pruned Model ---")
print(f"Accuracy: {pruned_acc:.2f}%")
print(f"MACs: {pruned_macs}")
print(f"Model Size (int4): {pruned_model_size:.2f} MB")

# ---------------- Memory Bandwidth Reduction ----------------
reduction_percent = (unpruned_model_size - pruned_model_size) / unpruned_model_size * 100
print(f"Estimated Memory Bandwidth Reduction: {reduction_percent:.2f}%")
layer_wise_sparsity, total_sparsity = compute_sparsity(model)
print(f"\nTotal model sparsity: {total_sparsity:.2f}%")

Metrics for Unpruned Model ---
Accuracy: 92.02%
MACs: 12656579584
Model Size (int4): 5.90 MB

Metrics for Pruned Model ---
Accuracy: 88.82%
MACs: 12656579584
Model Size (int4): 2.95 MB
Estimated Memory Bandwidth Reduction: 49.98%

Total model sparsity: 49.98%


In [7]:
#send an input and grap the value by using prehook like HW3


In [14]:
import torch
import torch.nn.functional as F
import numpy as np

conv_layer = model.features[29]  # your squeezed quantized conv layer
w_bit = 4
x_bit = 4

# ---------- Stochastic rounding function ----------
def stochastic_round(tensor):
    floor = torch.floor(tensor)
    prob = tensor - floor
    return floor + torch.bernoulli(prob)

# ---------- Quantized weights ----------
weight_q = conv_layer.weight_q                      # stored during training
w_alpha = conv_layer.weight_quant.wgt_alpha.data    

# Symmetric quantization delta (corrected)
w_delta = w_alpha / (2**(w_bit - 1) - 1)

# Use stochastic rounding instead of torch.round
weight_int = stochastic_round(weight_q / w_delta).to(torch.int32)
# print(weight_int.size())
W_int= torch.reshape(weight_int, (weight_int.size(0),weight_int.size(1),-1))
# print(W_int.size())
Kij = 8;
W = W_int[:,:,Kij]
# print(W.size())

# ---------- Quantized input activations ----------
x = captured['x']                                    # input tensor to layer 29
x_alpha = conv_layer.act_alpha.data

# Symmetric activations quantization (0 to α → ReLU)
x_delta = x_alpha / (2**x_bit - 1)

# Quantize using same function used during training
act_quant_fn = conv_layer.act_alq
x_q = act_quant_fn(x, x_alpha)

x_int = stochastic_round(x_q / x_delta).to(torch.int32)
# print(x_int.size())
x_int_b0 = x_int[0,:,:,:]
# print(x_int_b0.size())
X = torch.reshape(x_int_b0, (x_int_b0.size(0),-1)).float().cuda()
# print(X.size())


# ---------- INT psum simulation (Hardware-like) ----------
output_int = F.conv2d(x_int.float(), weight_int.float(), bias=None,
                      stride=conv_layer.stride,
                      padding=conv_layer.padding)
# print(output_int.size())

output_recovered = torch.relu(output_int * (x_delta * w_delta))

output_fp32 = torch.relu(conv_layer(x))

difference = torch.abs(output_fp32 - output_recovered)
print('Mean difference:', difference.mean().item())
# np.savetxt("weight_int_layer29.txt", weight_int.cpu().numpy().flatten(), fmt="%d", delimiter=",")
# np.savetxt("output_recovered.txt", output_recovered.cpu().numpy().flatten(), fmt="%d", delimiter=",")



Mean difference: 1.5890691429376602e-08


In [None]:
import torch


bit_precision = 4
file = open('activation_final.txt', 'w') #write to file
file.write('#time0row7[msb-lsb],time0row6[msb-lst],....,time0row0[msb-lst]#\n')
file.write('#time1row7[msb-lsb],time1row6[msb-lst],....,time1row0[msb-lst]#\n')
file.write('#................#\n')

for i in range(X.size(1)):  # time step
    for j in range(X.size(0)): # row #
        X_bin = '{0:04b}'.format(round(X[7-j,i].item()))
        for k in range(bit_precision):
            file.write(X_bin[k])        
        #file.write(' ')  # for visibility with blank between words, you can use
    file.write('\n')
file.close()

p_nij = range(X.size(1))
psum = torch.zeros(8,16,9).cuda()
print(W_int[0,0,0])

print(psum.size())

psum = torch.zeros(8, 16, 9).cuda()

for kij in range(9):
    for nij in range(16):
        # Use raw signed weights, do NOT convert them to 0..15
        weight = W_int[:, :, kij].float().cuda()   # keep negative values intact
        x_input = X[:, nij].unsqueeze(0)          # shape [1,8]
        psum[:, nij, kij] = torch.matmul(weight, x_input.squeeze(0))  # matmul preserves sign



kij = 8

psum_int = psum[:,:,kij]
print(W_int[0,0,0])
print(X[0,0])
print(psum_int[0,0])

bit_precision = 16

bit_precision = 4
file = open('weight_kij_8.txt', 'w') #write to file
file.write('#time0row7[msb-lsb],time0row6[msb-lst],....,time0row0[msb-lst]#\n')
file.write('#time1row7[msb-lsb],time1row6[msb-lst],....,time1row0[msb-lst]#\n')
file.write('#................#\n')

for i in range(W.size(1)):  
    for j in range(W.size(0)): 
       if (W[7-j,i] < 0):
        W[7-j,i] = W[7-j,i] + 16
       W_bin = '{0:04b}'.format(round(W[7-j,i].item()))
       for k in range(bit_precision):
           file.write(W_bin[k])        
       #file.write(' ')  # for visibility with blank between words, you can use
    file.write('\n')
file.close()

def int_to_unsigned_bits(x, bits=4):
    """Convert integer x to unsigned binary string of length `bits` (0 to 2^bits-1)."""
    max_val = (1 << bits) - 1  # 15 for 4 bits
    x_clipped = max(min(int(x), max_val), 0)  # clip to 0..15
    return format(x_clipped, '0{}b'.format(bits))

def write_activation_stream_row_major_unsigned(x_int, filename="x_int_layer29_hw_row_unsigned.txt",
                                               batch_index=0, bits=4):
    """
    Converts quantized tensor x_int to 4-bit unsigned binary stream, **row by row**.
    x_int: [B, 8, H, W]
    batch_index: which batch to export
    bits: bit precision
    """
    assert x_int.shape[1] == 8, "This helper expects 8 channels."
    X = x_int[batch_index].cpu().detach().clone()  # [C=8, H, W]
    C, H, W = X.shape

    with open(filename, 'w') as f:
        f.write('# Activation stream (row-major, unsigned): batch_index={}, 8 channels, {}-bit\n'.format(batch_index, bits))
        f.write('# Each line: one cycle, channels 7->0 concatenated\n')
        f.write('# Scanning order: for h in range(H): for w in range(W)\n')

        for h in range(H):        # row-first
            for w in range(W):    # column inside the row
                line = ''
                for ch in reversed(range(C)):  # ch7 -> ch0
                    val = X[ch, h, w].item()
                    line += int_to_unsigned_bits(val, bits)
                f.write(line + '\n')

    print(f"Written hardware stream (row-major, unsigned) to {filename}")
    return filename

# Usage
write_activation_stream_row_major_unsigned(x_int, filename="x_int_layer29_hw_row_unsigned.txt", batch_index=0, bits=4)

# ---------- Function to convert signed int to 2's complement bits ----------
def int_to_signed_bits(x, bits=4):
    """Convert integer x in range [-2^(bits-1), 2^(bits-1)-1] to 2's complement binary string."""
    min_val = -(1 << (bits-1))
    max_val = (1 << (bits-1)) - 1
    x_clipped = max(min(int(x), max_val), min_val)
    if x_clipped < 0:
        x_clipped = (1 << bits) + x_clipped
    return format(x_clipped, '0{}b'.format(bits))

# ---------- Write weights row-major for weight-stationary array ----------
def write_weights_row_major_signed(weight_int, filename="weight_layer29_hw_row.txt", bits=4):
    """
    Converts conv weights to 4-bit signed binary stream, row by row (input channel dimension is row).
    weight_int: [out_channels, in_channels, kH, kW]
    """
    O, I, kH, kW = weight_int.shape
    with open(filename, 'w') as f:
        f.write(f'# Weight stream (row-major, signed): {O}x{I}x{kH}x{kW}, {bits}-bit\n')
        f.write('# Each line: one cycle, input channels 0->7 concatenated\n')
        # For weight-stationary: scan output channels (rows) and kernel spatial positions
        for oc in range(O):
            for ic in range(I):
                for kh in range(kH):
                    for kw in range(kW):
                        val = weight_int[oc, ic, kh, kw].item()
                        bits_str = int_to_signed_bits(val, bits)
                        f.write(bits_str)
                    f.write('\n')  # each line is one kernel element across channels
    # print(f"Written weight stream (row-major, signed) to {filename}")
    return filename

# ---------- Usage ----------
write_weights_row_major_signed(weight_int, filename="weight_layer29_hw_row.txt", bits=4)

# ---------- Write INT or Recovered outputs ----------
def write_output_stream_row_major(output_tensor, filename="output_hw_row.txt", bits=4, signed=False, batch_index=0):
    """
    Converts output tensor to 4-bit row-major binary stream for hardware.
    output_tensor: [B, C, H, W]
    signed: whether to use signed 2's complement (-8..7) or unsigned (0..15)
    """
    X = output_tensor[batch_index].cpu().detach().clone()  # [C, H, W]
    C, H, W = X.shape

    if signed:
        int_to_bits = int_to_signed_bits
    else:
        # unsigned helper
        def int_to_bits(x, bits=4):
            x_clipped = max(min(int(round(x)), (1<<bits)-1), 0)
            return format(x_clipped, '0{}b'.format(bits))

    with open(filename, 'w') as f:
        f.write(f'# Output stream (row-major, {"signed" if signed else "unsigned"}): {C}x{H}x{W}, {bits}-bit\n')
        f.write('# Each line: one cycle, channels 7->0 concatenated\n')
        for h in range(H):        # row-first
            for w in range(W):    # column inside row
                line = ''
                for ch in reversed(range(C)):  # ch7->ch0
                    val = X[ch, h, w].item()
                    line += int_to_bits(val, bits)
                f.write(line + '\n')

    # print(f"Written output stream (row-major) to {filename}")
    return filename

# ---------- Usage ----------
write_output_stream_row_major(output_int, filename="output_int_hw_row.txt", bits=4, signed=True)
write_output_stream_row_major(output_recovered, filename="output_recovered_hw_row.txt", bits=4, signed=False)

In [43]:
import torch
import torch.nn.utils.prune as prune
PATH = "result/VGG16_quant/model_best.pth.tar"
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['state_dict'])
device = torch.device("cuda")

model.cuda()
criterion = nn.CrossEntropyLoss().cuda()

# print("Accuracy of the pre-trained model:")
# validate(testloader, model, criterion)

# print("\n Applying 80% Unstructured Pruning:")
# for name, module in model.named_modules():
#     if isinstance(module, QuantConv2d):
#         prune.random_unstructured(module, name='weight', amount=0.8)
        
# print("Checking the accuracy after performing 80% unstructured pruning:")
# validate(testloader, model, criterion)

# print("\n Training the unstructured-pruned model for gaining back accuracy :")
best_prec = 0
epochs = 250
optimizer = torch.optim.SGD(model.parameters(), lr=5e-3, momentum=0.9, weight_decay=1e-4)

# for epoch in range(epochs):
#     train(trainloader, model, criterion, optimizer, epoch)
#     prec = validate(testloader, model, criterion)
#     is_best = prec > best_prec
#     best_prec = max(prec, best_prec)
    
# print(f"Best accuracy after re-training the unstructured-pruned model:{best_prec:.3f}%")

# for name, module in model.named_modules():
#     if isinstance(module, QuantConv2d):
#         prune.remove(module, 'weight')

print("\n Loading the original model again for structured pruning: ")
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['state_dict'])
model.cuda()

print("\nApplying 50% Structured Pruning:")
for name, module in model.named_modules():
    if isinstance(module, QuantConv2d):
        prune.ln_structured(module, name='weight', amount=0.5, n=1, dim=0)
        
print("Checking the accuracy after performing 50% structured pruning:")
validate(testloader, model, criterion)

print("\n Training the structured-pruned model for gaining back accuracy :")
best_prec = 0
optimizer = torch.optim.SGD(model.parameters(), lr=5e-3, momentum=0.9, weight_decay=1e-4)

for epoch in range(epochs):
    train(trainloader, model, criterion, optimizer, epoch)
    prec = validate(testloader, model, criterion)
    is_best = prec > best_prec
    best_prec = max(prec, best_prec)
    
print(f"Best accuracy after finetuning the structured-pruned model: {best_prec:.3f}%")
for name, module in model.named_modules():
    if isinstance(module, QuantConv2d):
        prune.remove(module, 'weight')
        
# --- Save final pruned & fine-tuned model ---
save_path = "result/VGG16_quant/model_pruned_finetuned_pt1.pth.tar"

torch.save({
    'state_dict': model.state_dict(),
    'best_prec': best_prec,
}, save_path)

print("Saved pruned + fine-tuned model to:", save_path)


 Loading the original model again for structured pruning: 

Applying 50% Structured Pruning:
Checking the accuracy after performing 50% structured pruning:
Test: [0/79]	Time 0.235 (0.235)	Loss 2.6400 (2.6400)	Prec 10.156% (10.156%)
 * Prec 10.000% 

 Training the structured-pruned model for gaining back accuracy :
Epoch: [0][0/391]	Time 0.249 (0.249)	Data 0.209 (0.209)	Loss 2.4074 (2.4074)	Prec 15.625% (15.625%)
Epoch: [0][100/391]	Time 0.049 (0.051)	Data 0.001 (0.003)	Loss 2.1087 (2.2385)	Prec 20.312% (15.780%)
Epoch: [0][200/391]	Time 0.049 (0.050)	Data 0.001 (0.002)	Loss 1.8001 (2.1164)	Prec 33.594% (19.831%)
Epoch: [0][300/391]	Time 0.049 (0.050)	Data 0.001 (0.002)	Loss 1.6515 (1.9922)	Prec 39.062% (24.307%)
Test: [0/79]	Time 0.223 (0.223)	Loss 1.4444 (1.4444)	Prec 48.438% (48.438%)
 * Prec 46.560% 
Epoch: [1][0/391]	Time 0.295 (0.295)	Data 0.264 (0.264)	Loss 1.3570 (1.3570)	Prec 49.219% (49.219%)
Epoch: [1][100/391]	Time 0.049 (0.052)	Data 0.001 (0.004)	Loss 1.1242 (1.2156)	Prec 

In [45]:
PATH = "result/VGG16_quant/model_pruned_finetuned_pt1.pth.tar"
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['state_dict'])
device = torch.device("cuda")

model.cuda()

def compute_sparsity(model):
    total_zeros = 0
    total_params = 0
    layer_sparsity = {}

    print("\n----- Per-layer sparsity (zeros %) -----")

    for name, module in model.named_modules():
        if isinstance(module, QuantConv2d):
            W = module.weight.detach().cpu()
            zeros = torch.sum(W == 0).item()
            params = W.numel()

            sparsity = 100 * zeros / params
            layer_sparsity[name] = sparsity
            print(f"{name}: {sparsity:.2f}% sparse")

            total_zeros += zeros
            total_params += params

    total_sparsity = 100 * total_zeros / total_params
    print("\n----- Total model sparsity -----")
    print(f"Overall sparsity: {total_sparsity:.2f}%")

    return layer_sparsity, total_sparsity

compute_sparsity(model)


----- Per-layer sparsity (zeros %) -----
features.0: 50.00% sparse
features.3: 50.00% sparse
features.7: 50.00% sparse
features.10: 50.00% sparse
features.14: 50.00% sparse
features.17: 50.00% sparse
features.20: 50.00% sparse
features.24: 50.00% sparse
features.27: 50.00% sparse
features.29: 50.00% sparse
features.31: 50.00% sparse
features.33: 50.00% sparse
features.37: 50.00% sparse
features.40: 50.00% sparse
features.43: 50.00% sparse

----- Total model sparsity -----
Overall sparsity: 50.00%


({'features.0': 50.0,
  'features.3': 50.0,
  'features.7': 50.0,
  'features.10': 50.0,
  'features.14': 50.0,
  'features.17': 50.0,
  'features.20': 50.0,
  'features.24': 50.0,
  'features.27': 50.0,
  'features.29': 50.0,
  'features.31': 50.0,
  'features.33': 50.0,
  'features.37': 50.0,
  'features.40': 50.0,
  'features.43': 50.0},
 50.0)

In [22]:
def compute_structured_filter_sparsity(model):
    print("\n----- Structured Filter Sparsity -----")
    for name, module in model.named_modules():
        if isinstance(module, QuantConv2d):
            W = module.weight.detach().cpu()    # shape: [C_out, C_in, k, k]
            C_out = W.shape[0]

            # A filter is dead if *all weights* in that channel = 0
            dead_filters = 0
            for c in range(C_out):
                if torch.sum(W[c]) == 0:
                    dead_filters += 1

            sparsity = 100 * dead_filters / C_out
            print(f"{name}: {dead_filters}/{C_out} filters removed ({sparsity:.2f}%)")
compute_structured_filter_sparsity(model)


----- Structured Filter Sparsity -----
features.0: 32/64 filters removed (50.00%)
features.3: 32/64 filters removed (50.00%)
features.7: 64/128 filters removed (50.00%)
features.10: 64/128 filters removed (50.00%)
features.14: 128/256 filters removed (50.00%)
features.17: 128/256 filters removed (50.00%)
features.20: 128/256 filters removed (50.00%)
features.24: 256/512 filters removed (50.00%)
features.27: 4/8 filters removed (50.00%)
features.29: 4/8 filters removed (50.00%)
features.31: 256/512 filters removed (50.00%)
features.33: 256/512 filters removed (50.00%)
features.37: 256/512 filters removed (50.00%)
features.40: 256/512 filters removed (50.00%)
features.43: 256/512 filters removed (50.00%)


In [46]:
import torch
import torch.nn.utils.prune as prune
import torch.nn as nn

# ------------------------------
# Load your pruned + fine-tuned model
# ------------------------------
PATH = "result/VGG16_quant/model_pruned_finetuned_pt1.pth.tar"
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['state_dict'])
model.eval().cuda()

# ---------------- Compute Metrics ----------------

# Layer-wise and total sparsity
layer_sparsity, total_sparsity = compute_sparsity(model)
print("----- Per-layer sparsity (%) -----")
for layer, sp in layer_sparsity.items():
    print(f"{layer}: {sp:.2f}%")
print(f"\nTotal sparsity: {total_sparsity:.2f}%")

# Compute MACs
macs_after = compute_macs_vgg_quant(model)
print(f"Estimated MACs after pruning: {macs_after/1e6:.2f} M")

# Compute model size reduction (assume 4-bit weights for quantized model)
pruned_model_size = compute_model_size(model, bits_per_weight=4)
print(f"Pruned model size (int4): {pruned_model_size:.2f} MB")

# Memory bandwidth reduction is roughly proportional to model size reduction
print(f"Estimated memory bandwidth reduction: {total_sparsity:.2f}%")

# Final accuracy
def validate(model, testloader, criterion):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in testloader:
            images = images.cuda()
            labels = labels.cuda()
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return 100.0 * correct / total

accuracy = validate(model, testloader, criterion)
print(f"\nFinal accuracy after pruning & fine-tuning: {accuracy:.2f}%")
print('Number of MACs before pruning: ', compute_macs_vgg_quant(model, input_size=(3,32,32)))


----- Per-layer sparsity (zeros %) -----
features.0: 50.00% sparse
features.3: 50.00% sparse
features.7: 50.00% sparse
features.10: 50.00% sparse
features.14: 50.00% sparse
features.17: 50.00% sparse
features.20: 50.00% sparse
features.24: 50.00% sparse
features.27: 50.00% sparse
features.29: 50.00% sparse
features.31: 50.00% sparse
features.33: 50.00% sparse
features.37: 50.00% sparse
features.40: 50.00% sparse
features.43: 50.00% sparse

----- Total model sparsity -----
Overall sparsity: 50.00%
----- Per-layer sparsity (%) -----
features.0: 50.00%
features.3: 50.00%
features.7: 50.00%
features.10: 50.00%
features.14: 50.00%
features.17: 50.00%
features.20: 50.00%
features.24: 50.00%
features.27: 50.00%
features.29: 50.00%
features.31: 50.00%
features.33: 50.00%
features.37: 50.00%
features.40: 50.00%
features.43: 50.00%

Total sparsity: 50.00%
Estimated MACs after pruning: 12656.58 M
Pruned model size (int4): 2.95 MB
Estimated memory bandwidth reduction: 50.00%

Final accuracy after

In [None]:
import torch

# Make sure your model and captured hook are ready
x_q = captured['x']               # Already quantized activations from hook
weight_q = conv_layer.weight_q    # Already quantized weights

# Flatten and convert to CPU for saving
x_flat = x_q.detach().cpu().numpy().flatten()
w_flat = weight_q.detach().cpu().numpy().flatten()

# Save to text files
import os
save_dir = 'result/VGG16_quant/'
os.makedirs(save_dir, exist_ok=True)

x_path = os.path.join(save_dir, 'feature29_input_activations.txt')
w_path = os.path.join(save_dir, 'feature29_weights.txt')

# Save as float values
with open(x_path, 'w') as f:
    for val in x_flat:
        f.write(f"{val}\n")

with open(w_path, 'w') as f:
    for val in w_flat:
        f.write(f"{val}\n")

print(f"Saved {x_flat.shape[0]} input activations to {x_path}")
print(f"Saved {w_flat.shape[0]} weights to {w_path}")


In [None]:
w_bit = 4
weight_q = conv_layer.weight_q # quantized value is stored during the training
w_alpha = conv_layer.weight_quant.wgt_alpha.data  # alpha is defined in your model already. bring it out here
w_delta = 2 * w_alpha / (2**w_bit - 1)    # delta can be calculated by using alpha and w_bit
weight_int = torch.round(weight_q / w_delta).to(torch.int32) # w_int can be calculated by weight_q and w_delta
print(weight_int) # you should see clean integer numbers

In [None]:
x_bit = 4    
x = x = captured['x']   # input of the 2nd conv layer
x_alpha  = conv_layer.act_alpha.data
x_delta = 2 * x_alpha / (2**x_bit - 1)

act_quant_fn = act_quantization(x_bit) # define the quantization function
x_q = act_quant_fn(x, x_alpha)         # create the quantized value for x

x_int = torch.round(x_q / x_delta).to(torch.int32)
print(x_int) # you should see clean integer numbers 

In [None]:
conv_int = torch.nn.Conv2d(in_channels = 64, out_channels=64, kernel_size = 3, bias = False)
conv_int.weight = torch.nn.parameter.Parameter(weight_int)

output_int =  F.conv2d(x_int.float(), weight_int.float(), bias=None,
                      stride=conv_layer.stride, padding=conv_layer.padding)    # output_int can be calculated with conv_int and x_int
output_recovered = output_int * (x_delta * w_delta)  # recover with x_delta and w_delta
print(output_recovered)

In [None]:
#### input floating number / weight quantized version

conv_ref = torch.nn.Conv2d(in_channels = 64, out_channels=64, kernel_size = 3, bias = False)
conv_ref.weight = model.features[3].weight_q 

output_ref = conv_ref(x)
print(output_ref)    

In [None]:
difference = abs( output_ref - output_recovered )
print(difference.mean())  ## It should be small, e.g.,2.3 in my trainned model

In [None]:
#### input floating number / weight floating number version

conv_ref = torch.nn.Conv2d(in_channels = 64, out_channels=64, kernel_size = 3, bias = False)
weight = model.features[3].weight
mean = weight.data.mean()
std = weight.data.std()
conv_ref.weight = torch.nn.parameter.Parameter(weight.add(-mean).div(std))

output_ref = conv_ref(x)
print(output_ref)


In [None]:
difference = abs( output_ref - output_recovered )
print(difference.mean())  ## It should be small, e.g.,2.3 in my trainned model