# **Tutorial on NEMO: NEural Minimizer for pytOrch**

*Hanna Müller (ETHZ)*

*Credits: Dr. Francesco Conti,  Lorenzo Lamberti, Leonardo Ravaglia, Marcello Zanghieri (University of Bologna)*

# **Setup** - Student Task 1
Let us start by installing dependencies...

In [None]:
#@title Install dependencies { form-width: "20%" }

# Every line below is a notebook shell escape (`!`) that invokes pip.

# these ones should already be available
!pip install numpy
!pip install tqdm
!pip install pillow
!pip install -q holoviews

# recommended versions for this tutorial
# torch / torchvision are pinned to "+cu101" builds and resolved through the
# wheel index passed with -f; pytorch-nemo is pinned to 0.0.8.
!pip install torch==1.5.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
!pip install torchvision==0.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
!pip install pytorch-nemo==0.0.8

... and import all packages, including NEMO itself:

In [None]:
#@title Imports { form-width: "25%" }

#basic
import numpy as np
from pandas import DataFrame
from copy import deepcopy
from tqdm import tqdm

#plotting
import matplotlib.pyplot as plt
import holoviews as hv
hv.notebook_extension('bokeh')
from holoviews import opts
from bokeh.models import HoverTool
from bokeh.plotting import show


#torch
import torch; print('\nPyTorch version in use:', torch.__version__, '\ncuda avail: ', torch.cuda.is_available())
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

#nemo!
import nemo

In [None]:
#@title Set device
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Device: %s' % device)

In [None]:
class ExampleNet(nn.Module):
    """Small CNN classifier: four conv/batch-norm/ReLU stages and two linear layers.

    Every activation is an `nn.ReLU` *module* (not the functional form), and
    all convolutions and linear layers are bias-free. With a (N, 1, 28, 28)
    input the last convolution yields 80 feature maps of 4x4, which is what
    `fc1` expects after flattening. The forward pass returns log-probabilities
    over 10 classes.
    """

    def __init__(self):
        super().__init__()

        # stage 1: keeps the spatial resolution (stride 1)
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=4, kernel_size=3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(num_features=4)
        self.relu1 = nn.ReLU()  # <== Module, not Function!

        # stages 2-4: each one halves the spatial resolution (stride 2)
        self.conv2 = nn.Conv2d(in_channels=4, out_channels=20, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(num_features=20)
        self.relu2 = nn.ReLU()  # <== Module, not Function!

        self.conv3 = nn.Conv2d(in_channels=20, out_channels=40, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(num_features=40)
        self.relu3 = nn.ReLU()  # <== Module, not Function!

        self.conv4 = nn.Conv2d(in_channels=40, out_channels=80, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn4 = nn.BatchNorm2d(num_features=80)
        self.relu4 = nn.ReLU()  # <== Module, not Function!

        # classifier head
        self.fc1 = nn.Linear(in_features=80 * 4 * 4, out_features=500, bias=False)
        self.fcrelu1 = nn.ReLU()  # <== Module, not Function!
        self.fc2 = nn.Linear(in_features=500, out_features=10, bias=False)

    def forward(self, x):
        """Return per-class log-probabilities for the input batch `x`."""
        stages = (
            (self.conv1, self.bn1, self.relu1),
            (self.conv2, self.bn2, self.relu2),
            (self.conv3, self.bn3, self.relu3),
            (self.conv4, self.bn4, self.relu4),
        )
        for conv, bn, relu in stages:
            x = relu(bn(conv(x)))

        features = x.flatten(start_dim=1)
        hidden = self.fcrelu1(self.fc1(features))
        logits = self.fc2(hidden)
        # the softmax operation does not need to be quantized, we can keep it as it is
        return F.log_softmax(logits, dim=1)

Let's define a couple of useful functions to print a per-layer summary of the model (including the number of parameters) and to measure the model size (in `kB`).

In [None]:
import pprint

#@title Model Summary
def print_summary(model):
    """Pretty-print the per-layer summary dictionary NEMO computes for `model`.

    The summary is requested for a (1, 28, 28) input and its 'dict' entry is
    printed in its existing key order.
    """
    layer_table = nemo.utils.get_summary(model, (1, 28, 28), verbose=True)['dict']
    pprint.pprint(layer_table, sort_dicts=False)

def network_size(model):
    """Return the parameter storage of `model` in kB, truncated to an int.

    Each layer listed in NEMO's summary contributes
    nb_params * bits / 8 / 1024, where bits is the layer's "W_bits" entry
    when it has one and 32 otherwise.
    """
    layers = nemo.utils.get_summary(model, (1, 28, 28), verbose=True)['dict']
    total_kb = sum(
        abs(info["nb_params"] * (info["W_bits"] if "W_bits" in info else 32.) / 8. / (1024.))
        for info in layers.values()
    )
    return int(total_kb)

# Build a freshly initialised ExampleNet on the selected device and report its
# layer summary and parameter size.
print("Full precision model statistics: \n")
model = ExampleNet().to(device)
print_summary(model)
# network_size() returns an int, so the decimals printed by %.2f are always ".00".
print("Just network size: %.2fkB" % network_size(model))


Then we define a function for validation.

In [None]:
#@title Define Metrics and validation function
# convenience class to keep track of averages
class Metric(object):
    """Named running average: accumulates values and exposes their mean."""

    def __init__(self, name):
        # label, running total, number of updates
        self.name, self.sum, self.n = name, 0, 0

    def update(self, value):
        """Add one observation to the running total."""
        self.sum += value
        self.n += 1

    @property
    def avg(self):
        """Mean of all observations so far (ZeroDivisionError if there are none)."""
        total, count = self.sum, self.n
        return total / count


def validate(model, device, dataloader, verbose=True, integer=False):
    """Evaluate `model` on `dataloader` and return its classification accuracy.

    Args:
        model: module called as `model(data)`; the prediction is the argmax of
            its output along dim 1.
        device: device the input and target batches are moved to.
        dataloader: iterable of `(data, target)` batches that supports `len()`.
        verbose: when False the tqdm progress bar is disabled.
        integer: when True every input batch is multiplied by 255 before the
            forward pass (support for production of non-negative integer data).

    Returns:
        Fraction (0..1) of correctly classified samples among all samples
        yielded by `dataloader`. Counting samples instead of averaging
        per-batch accuracies keeps the result exact when batches differ in
        size; for equally sized batches both definitions coincide.

    Raises:
        ZeroDivisionError: if `dataloader` yields no samples.
    """
    model.eval()
    correct = 0  # correctly classified samples so far
    seen = 0     # samples evaluated so far
    with tqdm(
        total=len(dataloader), desc='Validation', disable=not verbose,
        ) as t:
        with torch.no_grad():
            for data, target in dataloader:
                if integer:
                    # out-of-place, so the tensor handed out by the loader is not modified
                    data = data * 255
                data, target = data.to(device), target.to(device)
                output = model(data)
                pred = output.argmax(dim=1)  # index of the largest output per sample
                correct += pred.eq(target).sum().item()
                seen += target.numel()
                t.set_postfix({'acc': correct / seen})
                t.update(1)
    return correct / seen

Set up the dataloaders:
* a *calibration loader*, which loads a subset (for speed) of MNIST training set;
* a *validation loader*, which loads MNIST's "test" set (actually, a validation set).

In [None]:
# calibration set
Mcalib = 1024 # calibration set size
train_set = datasets.MNIST('./data/', train=True , download=True, transform=transforms.ToTensor())
# Mcalib training samples picked through an unseeded random permutation, so the
# subset differs from run to run.
calib_set = torch.utils.data.Subset(train_set, indices=np.random.permutation(len(train_set))[:Mcalib])
del train_set # we load a pretrained model, we won't train it in this script. Just needed for calibration!

# validation set
valid_set = datasets.MNIST('./data/', train=False, download=True, transform=transforms.ToTensor())


# set up the dataloaders
# worker / pinned-memory options are only passed when CUDA is available
kwargs = {'num_workers': 1, 'pin_memory': True} if torch.cuda.is_available() else {}
# drop_last=True discards the final incomplete batch: samples left over when the
# set size is not a multiple of 128 are never evaluated.
calib_loader = torch.utils.data.DataLoader(calib_set, batch_size=128, shuffle=False, drop_last=True, **kwargs)
valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=128, shuffle=False, drop_last=True, **kwargs)

# **FullPrecision (FP) stage** - Student Task 2

Download a pretrained model, and test it! Here we operate at what we call the ***FullPrecision* (FP)** stage: the regular PyTorch representation, which relies on real-valued tensors represented by `float32` in your CPU/GPU.

In [None]:
# Create ./content/, make it the working directory, then download the
# pretrained weights and store them as examplenet.pt.
!mkdir ./content/
%cd ./content/
!rm -rf examplenet.pt
!wget https://github.com/MarcelloZanghieri2/NeMO_tutorial/blob/main/smallernet_4.pt?raw=true
# the checksum is only printed; it is not compared against an expected value
!md5sum smallernet_4.pt?raw=true
!mv smallernet_4.pt?raw=true examplenet.pt

model = ExampleNet().to(device)
# NOTE(review): torch.load unpickles the downloaded file; only use this with a
# source you trust.
state_dict = torch.load('examplenet.pt', map_location='cpu')
# strict=True: the checkpoint keys must match ExampleNet's parameters/buffers exactly
model.load_state_dict(state_dict, strict=True)

acc = validate(model, device, valid_loader)
model_size = network_size(model)
print("\n\nFullPrecision accuracy: %.3f, model size: %.2fkB" % (acc, model_size))

# **FakeQuantized (FQ) stage** - Student Task 3



In [None]:
#@title Set N-bit Quantization { run: "auto", form-width: "20%", display-mode: "both" }

# bit-width applied uniformly through change_precision() below
Q =  16#@param {type:"integer"}

# Work on a deep copy so the full-precision `model` stays untouched.
model_q = nemo.transform.quantize_pact(deepcopy(model), dummy_input=torch.randn((1, 1, 28, 28)).to(device))
model_q.change_precision(bits=Q, scale_weights=True, scale_activations=True)
# Calibration pass: run the calibration subset inside statistics_act(), then
# call reset_alpha_act() before the actual evaluation.
with model_q.statistics_act():
    validate(model_q, device, calib_loader)
model_q.reset_alpha_act()
acc = validate(model_q, device, valid_loader)
model_size = network_size(model_q)
print("\nFakeQuantized @ %db: accuracy: %.3f, model_size %dkB" % (Q, acc, model_size))

## **Explore model accuracy vs bitwidth Q** - Student Task 4


### Hints and plotting function


In [None]:
# Sweep skeleton for the student task: the loop body is intentionally left
# empty, so this cell does not run until it is filled in.
# NOTE(review): the range starts at Q = 0 bits — confirm that change_precision()
# accepts it, or start the sweep at 1.
Qmin, Qmax = 0, 16
q_values = np.arange(Qmin, 1 + Qmax)
acc_array = []
model_size_array = []


for Q in q_values:
    # TODO
    # Expected outcome: one accuracy appended to acc_array and one model size
    # appended to model_size_array per Q (the plotting cell builds a DataFrame
    # with one row per entry of q_values).
acc_array = np.array(acc_array)

In [None]:
#@title Plot with Holoviews and Bokeh { form-width: "10%" }

# Plots accuracy, error rate and model size against the bit-width Q.
# The curves are built from explicit (x, y) pairs so that they stay aligned with
# the scatter points for any Qmin; a bare y-array would be plotted against the
# implicit index 0..N-1, which only matches q_values when Qmin == 0.

# default options to apply to your plots
opts.defaults(
    opts.Scatter(width=1000, height=500, show_grid=True, tools=['hover'], size=8, marker='circle'),
    opts.Curve(width=1000, height=500, tools=[], show_grid=True),
)

# first store your data in a Pandas DataFrame  (define dataframe keys, and the corresponding values)
source = DataFrame({ 'q_values' : q_values,
                     'acc_array' : acc_array,
                     'err_rate' : 1- acc_array,
                     'model_size_array' : model_size_array})

# Define which data you want to be able to hover and get information on in the form of (text label, "@pandas_dataframe_key_name)
hover = HoverTool(tooltips=[("q_values", "@q_values"), ("acc_array Rate", "@acc_array"), ("model_size_array", "@model_size_array"), ("err_rate", "@err_rate")])

# NOTE(review): the y labels below say '%' but the plotted values are fractions
# in [0, 1] (see ylim) — confirm which unit is intended.

#accuracy
acc_points = hv.Scatter(source, kdims=['q_values', 'acc_array'])
acc_points.opts( title='Accuracy and error-rate VS bitwidth', tools=[hover], xlim=(Qmin, Qmax), ylim=(0, 1), xlabel='Q (adim.)', ylabel='Accuracy (%)')
acc_curve = hv.Curve((q_values, acc_array), label='accuracy')

#err rate
err_rate_points = hv.Scatter(source, kdims=['q_values', 'err_rate'])
err_rate_points.opts(xlim=(Qmin, Qmax), ylim=(0, 1), xlabel='Q (adim.)', ylabel='Err-Rate (%)', color='red')
err_rate_curve = hv.Curve((q_values, 1 - acc_array), label='err-rate')
err_rate_curve.opts(color='red')

#render (accuracy and error rate share one figure)
fig = hv.render(acc_points*acc_curve*err_rate_points*err_rate_curve)
show(fig)


# model size
model_size_points = hv.Scatter(source, kdims=['q_values', 'model_size_array'])
model_size_points.opts(xlim=(Qmin, Qmax), xlabel='Q (adim.)', ylabel='model size [kB]', color='green')
model_size_curve = hv.Curve((q_values, model_size_array), label='model size [kB]')
model_size_curve.opts(color='green')

fig = hv.render(model_size_points*model_size_curve)
show(fig)

## **Mixed-Precision: non-uniform layer-wise quantization and precision dictionaries** - Student Task 5



In [None]:
# Per-layer precision dictionary (student task: replace every #TODO with a
# bit-width). Keys are the ExampleNet submodule names; conv/fc layers take a
# 'W_bits' entry and ReLU modules an 'x_bits' entry. The dictionary is passed
# as `min_prec_dict` to change_precision() right after this definition.
precision = {
    'conv1': {
        'W_bits' : #TODO
    },
    'relu1': {
        'x_bits' : #TODO
    },
    'conv2': {
        'W_bits' : #TODO
    },
    'relu2': {
        'x_bits' : #TODO
    },
    'conv3': {
        'W_bits' : #TODO
    },
    'relu3': {
        'x_bits' : #TODO
    },
    'conv4': {
        'W_bits' : #TODO
    },
    'relu4': {
        'x_bits' : #TODO
    },
    'fc1': {
        'W_bits' : #TODO
    },
    'fcrelu1': {
        'x_bits' : #TODO
    },
    'fc2': {
        'W_bits' : #TODO
    },
}

# Quantize a copy of the full-precision model with the per-layer `precision`
# dictionary, calibrate the activations, then evaluate.
model_q = nemo.transform.quantize_pact(deepcopy(model), dummy_input=torch.randn((1, 1, 28, 28)).to(device))
model_q.change_precision(bits=1, min_prec_dict=precision)
# Calibrate on the calibration subset, as the other quantization cells do, so
# that the validation set is only used for the final measurement.
with model_q.statistics_act():
    _ = validate(model_q, device, calib_loader) # blank validation is needed for calibrating
model_q.reset_alpha_act()
acc = validate(model_q, device, valid_loader)
model_size = network_size(model_q)
# validate() returns a fraction in [0, 1]; scale it to match the '%' in the message
print("\nFakeQuantized @ mixed-precision: accuracy: %.2f%%  model_size: %dkB" % (100 * acc, model_size))

---

## **Quantization for DORY: 8-bit** - Student Task 6


In [None]:
#@title Define precision dict for dory format: it supports just 8-bit quantization

# Precision dictionary for the DORY export (student task: replace every #TODO).
# Same layout as the mixed-precision dictionary: one entry per ExampleNet
# submodule, 'W_bits' for conv/fc layers and 'x_bits' for ReLU modules.
precision_dict_for_dory = {
    'conv1': {
        'W_bits' : #TODO
    },
    'relu1': {
        'x_bits' : #TODO
    },
    'conv2': {
        'W_bits' : #TODO
    },
    'relu2': {
        'x_bits' : #TODO
    },
    'conv3': {
        'W_bits' : #TODO
    },
    'relu3': {
        'x_bits' : #TODO
    },
    'conv4': {
        'W_bits' : #TODO
    },
    'relu4': {
        'x_bits' : #TODO
    },
    'fc1': {
        'W_bits' : #TODO
    },
    'fcrelu1': {
        'x_bits' : #TODO
    },
    'fc2': {
        'W_bits' : #TODO
    },
}

In [None]:
# Rebuild the fake-quantized model from the full-precision one with the DORY
# precision dictionary, calibrate on the calibration subset, then evaluate.
model_q = nemo.transform.quantize_pact(deepcopy(model), dummy_input=torch.randn((1, 1, 28, 28)).to(device))
model_q.change_precision(bits=1, min_prec_dict=precision_dict_for_dory, scale_weights=True, scale_activations=True)
with model_q.statistics_act():
    validate(model_q, device, calib_loader)
model_q.reset_alpha_act()
acc = validate(model_q, device, valid_loader)
print("\nFakeQuantized accuracy: %.3f\n\n" % acc)

# **QuantizedDeployable (QD) stage** - Student Task 7


In [None]:
# Switch model_q to the QuantizedDeployable stage (modifies model_q itself),
# print the resulting module and re-measure its accuracy.
model_q.qd_stage(eps_in=1./255)                  # eps_in is the input quantum, and must be set by the user
print('\n\n', model_q, '\n')
acc = validate(model_q, device, valid_loader)
print("\nQuantizedDeployable accuracy: %.3f" % acc)

We can look inside our model and parameters by using the following methods:

### ***Debug Hooks: methods to print activations, epsilon values and weights***

In [None]:
# validate and its arguments (model_q, device, valid_loader) are handed to the
# NEMO helper; the first two of its three return values are kept as the
# input / output activation buffers, and their keys are printed.
buf_in, buf_out, _ = nemo.utils.get_intermediate_activations(model_q, validate, model_q, device, valid_loader)
print('\n\n', buf_in.keys(), '\n', buf_out.keys())

In [None]:
# Collect the eps values NEMO reports for model_q, given the input quantum 1/255.
eps = nemo.utils.get_intermediate_eps(model_q, 1./255)
# The 'conv1' entry is overwritten with the input quantum itself.
eps['conv1'] = torch.tensor([1./255], device=device)
for k in eps:
    print(k, '\t', eps[k])

In [None]:
# Print every parameter of model_q that still requires gradients.
for name, param in model_q.named_parameters():
    if param.requires_grad:
        print(name, param)

In [None]:
# Dump the output activation buffers collected by get_intermediate_activations().
print(buf_out)

In [None]:
# Pass the collected input / output buffers together with the eps dictionary to
# NEMO's get_integer_activations() helper.
bi = nemo.utils.get_integer_activations(buf_in, eps)
bo = nemo.utils.get_integer_activations(buf_out, eps)

# **IntegerDeployable (ID) stage** - Student Task 8

ONNX export

In [None]:
# Switch model_q to the IntegerDeployable stage (modifies model_q itself).
model_q.id_stage()

# ONNX Export

nemo.utils.export_onnx('mnist_id_4dory.onnx', model_q, model_q, (1, 28, 28))
from IPython.display import FileLink, IFrame, display
# `import urllib` alone does not guarantee that the `parse` submodule is loaded;
# import it explicitly.
import urllib.parse
display(FileLink('mnist_id_4dory.onnx'))
url = urllib.parse.quote("https://lutzroeder.github.io/netron", safe=':/?=&')
# last expression of the cell: embeds the Netron web viewer
IFrame(url, width=800, height=400)

Intermediate activations export

In [None]:
# Activation buffers
# integer=True is passed on with the other arguments; validate() uses this flag
# to multiply its inputs by 255.
buf_in, buf_out, _ = nemo.utils.get_intermediate_activations(model_q, validate, model_q, device, valid_loader, integer=True)

# NOTE(review): an earlier cell already runs `%cd ./content/`; if it ran in the
# same session, this relative path points at ./content/content — confirm.
%cd ./content

# Save the input (one MNIST image - we pick the last one), not included in the buffer
t = buf_in['conv1'][0][-1].cpu().detach().numpy()
# one value per line with 3 decimals; every value is followed by ",\" and a line break
np.savetxt('input.txt', t.flatten(), '%.3f', newline=',\\\n', header = 'input (shape %s)' % str(list(t.shape)))
display(FileLink('input.txt'))

# Save the outputs
names = ['relu1', 'relu2', 'relu3', 'relu4', 'fcrelu1', 'fc2']
L = len(names)
for l in range(L):
    # last entry of the layer's output buffer, with axis 0 moved to the end
    # (presumably channels-first -> channels-last; confirm against the consumer)
    t = np.moveaxis(buf_out[names[l]][-1].cpu().detach().numpy(), 0, -1)
    np.savetxt('out_layer%d.txt' % l, t.flatten(), '%.3f', newline=',\\\n', header = names[l] + ' (shape %s)' % str(list(t.shape)))
    display(FileLink('out_layer%d.txt' % l))

### *Debug Hooks: methods to print activations, epsilon values and weights*

In [None]:
# The return value is not assigned: as the last expression of the cell it is
# shown as the cell output.
nemo.utils.get_intermediate_activations(model_q, validate, model_q, device, valid_loader, integer=True)

In [None]:
# eps values NEMO reports for model_q given the input quantum 1/255 (shown as
# the cell output).
nemo.utils.get_intermediate_eps(model_q, 1./255)

In [None]:
# Print every parameter of model_q that still requires gradients.
for name, param in model_q.named_parameters():
    if param.requires_grad:
        print(name, param)