# ResNet kvantizácia

Podľa: https://nervanasystems.github.io/distiller/prepare_model_quant.html

1. Replace direct tensor operations with modules

2. Replace re-used modules with dedicated instances

3. Replace torch.nn.functional calls with equivalent modules

4. Special cases - replace modules that aren't quantize-able with quantize-able variants


In [1]:
import torch
import torchvision
import distiller
from distiller.models import create_model


# Show the PyTorch and torchvision versions this notebook was run with.
print(torch.__version__)
print(torchvision.__version__)

1.3.1
0.4.2


In [2]:
# Build a pretrained ImageNet ResNet-18 through Distiller's model factory.
model = create_model(pretrained=True,dataset='imagenet',arch='resnet18') 
# Display the module tree (notebook cell output).
model

DataParallel(
  (module): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): DistillerBasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (add): EltwiseAdd()
        (relu2): ReLU(inplace=True)
      )
      (1): DistillerBasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bia

### Načítanie datasetu pre cpu a gpu (cuda)
Používame distiller funkcionalitu v distiller.apputils.load_data


In [20]:
# Root of the local ImageNet copy and the folder with its validation images.
DATASET_PATH = "/home/bohumil/FIIT/BP/BP/Zdroje_kod/imagenet"
val_images = "/".join((DATASET_PATH, "val", "images"))


# Adapted from <distiller_root>/jupyter/post_train_quant_convert_pytorch.ipynb
# Seed Distiller before the loaders are created.
distiller.set_seed(0)

subset_size = 1.0
# Fraction handed to load_data as `validation_split`.
val_split = 0.8

# Loader settings for the GPU runs.
batch_size_gpu = 32
workers_gpu = 4
train_loader_gpu, val_loader_gpu, test_loader_gpu, _ = distiller.apputils.load_data(
    'imagenet',
    DATASET_PATH,
    batch_size=batch_size_gpu,
    workers=workers_gpu,
    validation_split=val_split,
    fixed_subset=False,
    sequential=False,
    test_only=False,
)

In [4]:
# Adapted from <distiller_root>/jupyter/post_train_quant_convert_pytorch.ipynb
# Separate loaders for the CPU runs, used for the CPU vs GPU speed comparison.
distiller.set_seed(0)

# Loader settings for the CPU runs.
batch_size_cpu = 32
num_workers_cpu = 2
train_loader_cpu, val_loader_cpu, test_loader_cpu, _ = distiller.apputils.load_data(
    'imagenet',
    DATASET_PATH,
    batch_size_cpu,
    num_workers_cpu,
    validation_split=val_split,
    fixed_subset=True,
    test_only=False,
)

In [9]:
# z <distiller_root>/jupyter/post_train_quant_convert_pytorch.ipynb
import torchnet as tnt
import math
import numpy as np


def eval_model(data_loader, model, device='cpu', print_freq=10):
    """Run ``model`` over ``data_loader`` and print running and final metrics.

    Args:
        data_loader: Iterable of ``(inputs, target)`` batches that also exposes
            ``sampler`` and ``batch_size`` attributes.
        model: Module to evaluate. It is switched to eval mode but is NOT moved
            to ``device`` by this function.
        device: Device each batch and the loss criterion are moved to.
        print_freq: Running metrics are printed every ``print_freq`` batches.

    Returns:
        None. Top-1/Top-5 accuracy, mean loss and ``exp(mean loss)`` are printed.
    """
    loss_fn = torch.nn.CrossEntropyLoss().to(device)

    loss_meter = tnt.meter.AverageValueMeter()
    acc_meter = tnt.meter.ClassErrorMeter(accuracy=True, topk=(1, 5))

    n_samples = len(data_loader.sampler)
    per_batch = data_loader.batch_size
    n_steps = math.ceil(n_samples / per_batch)
    print('{0} samples ({1} per mini-batch)'.format(n_samples, per_batch))

    progress_fmt = '[{:3d}/{:3d}] Top1: {:.3f}  Top5: {:.3f}  Loss: {:.3f}'
    summary_fmt = 'Overall ==> Top1: {:.3f}  Top5: {:.3f}  Loss: {:.3f} PPL: {:.3f}'

    # Inference only: eval mode here, gradients disabled per batch below.
    model.eval()

    for batch_no, (inputs, target) in enumerate(data_loader, start=1):
        with torch.no_grad():
            inputs = inputs.to(device)
            target = target.to(device)
            output = model(inputs)

            # Accumulate the batch loss and the top-k accuracy counters.
            loss_meter.add(loss_fn(output, target).item())
            acc_meter.add(output.data, target)

        if batch_no % print_freq == 0:
            print(progress_fmt.format(batch_no, n_steps, acc_meter.value(1),
                                      acc_meter.value(5), loss_meter.mean),
                  flush=True)

    print('----------')
    final_loss = loss_meter.mean
    print(summary_fmt.format(acc_meter.value(1), acc_meter.value(5),
                             final_loss, np.exp(final_loss)),
          flush=True)

In [16]:
# Time the full-precision model on the GPU validation loader; skipped when CUDA is unavailable.
if torch.cuda.is_available():
    %time eval_model(val_loader_gpu,model,'cuda')

5831 samples (64 per mini-batch)
[ 10/ 92] Top1: 0.000  Top5: 0.781  Loss: 15.164
[ 20/ 92] Top1: 0.391  Top5: 1.406  Loss: 15.054
[ 30/ 92] Top1: 0.313  Top5: 1.458  Loss: 15.067
[ 40/ 92] Top1: 0.352  Top5: 1.289  Loss: 15.125
[ 50/ 92] Top1: 0.406  Top5: 1.313  Loss: 15.032
[ 60/ 92] Top1: 0.443  Top5: 1.380  Loss: 15.034
[ 70/ 92] Top1: 0.402  Top5: 1.272  Loss: 15.025
[ 80/ 92] Top1: 0.371  Top5: 1.250  Loss: 15.028
[ 90/ 92] Top1: 0.330  Top5: 1.285  Loss: 15.039
----------
Overall ==> Top1: 0.326  Top5: 1.286  Loss: 15.063 PPL: 3480915.028
CPU times: user 1min 14s, sys: 1.02 s, total: 1min 15s
Wall time: 1min 15s


In [10]:
# Take a non-parallel copy of the model and move that copy to the CPU.
cpu_model = distiller.make_non_parallel_copy(model).cpu()

In [None]:
# Time the full-precision CPU copy on the CPU validation loader.
%time eval_model(val_loader_cpu, cpu_model, 'cpu')

# Quantization

## 0. Collect activation statistics

The model returned by `create_model` is wrapped in `DataParallel`, so we use a non-parallel copy of it to collect the statistics.

In [11]:
# z distiller/examples/word_language_model/quantize_lstm.ipynb
import os
import torch
from distiller.data_loggers import collect_quant_stats, QuantCalibrationStatsCollector

#man_model = torch.load('./manual.checkpoint.pth.tar')
# Assign fully-qualified layer names on the model — presumably needed so the
# collected statistics can be keyed by layer name; TODO confirm.
distiller.utils.assign_layer_fq_names(cpu_model)
# NOTE(review): `collector` is not referenced again in this cell;
# collect_quant_stats below receives only the model.
collector = QuantCalibrationStatsCollector(cpu_model)

stats_file = './acts_quantization_stats.yaml'

# Run the calibration pass only when the statistics file does not exist yet.
if not os.path.isfile(stats_file):
    def eval_for_stats(model):
        # Calibration uses the CPU training loader with eval_model's default device and print frequency.
        eval_model(data_loader=train_loader_cpu,model=model)
    # presumably writes the statistics YAML into save_dir — TODO confirm the file name matches stats_file
    collect_quant_stats(cpu_model, eval_for_stats, save_dir='.')

3888 samples (32 per mini-batch)
[ 10/122] Top1: 1.250  Top5: 7.188  Loss: 15.298
[ 20/122] Top1: 0.938  Top5: 6.719  Loss: 15.130
[ 30/122] Top1: 0.938  Top5: 6.042  Loss: 15.375
[ 40/122] Top1: 0.859  Top5: 5.156  Loss: 15.362
[ 50/122] Top1: 1.062  Top5: 5.188  Loss: 15.352
[ 60/122] Top1: 1.146  Top5: 5.417  Loss: 15.259
[ 70/122] Top1: 1.161  Top5: 5.268  Loss: 15.249
[ 80/122] Top1: 1.211  Top5: 4.922  Loss: 15.283
[ 90/122] Top1: 1.215  Top5: 4.861  Loss: 15.335
[100/122] Top1: 1.281  Top5: 4.656  Loss: 15.327
[110/122] Top1: 1.278  Top5: 4.688  Loss: 15.331
[120/122] Top1: 1.354  Top5: 4.583  Loss: 15.333
----------
Overall ==> Top1: 1.337  Top5: 4.578  Loss: 15.353 PPL: 4654219.261
3888 samples (32 per mini-batch)
[ 10/122] Top1: 0.938  Top5: 3.438  Loss: 15.544
[ 20/122] Top1: 1.094  Top5: 3.750  Loss: 15.603
[ 30/122] Top1: 1.562  Top5: 4.688  Loss: 15.504
[ 40/122] Top1: 1.484  Top5: 4.844  Loss: 15.424
[ 50/122] Top1: 1.625  Top5: 4.937  Loss: 15.341
[ 60/122] Top1: 1.510 

Teraz máme získané štatistiky z trénovacieho datasetu. Podľa týchto štatistík môžeme nastaviť kvantizáciu.

Pokračujeme úpravou torchvision implementácie pre náš prípad.

## 1. Replace direct tensor operations with modules

*   v `_forward_impl` sme mali volanie `torch.flatten`
*   v `BasicBlock` sme mali aj operátor `+` (`out += identity`)

## 2. Replace re-used modules with dedicated instances

*   V `BasicBlock` sa viackrát používa tá istá inštancia `nn.ReLU`

## 3. Replace `torch.nn.functional` calls with equivalent modules
 

Taktiez dolezite je volanie

```python
...

_resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
                   **kwargs)
                   
...

def _resnet(arch, block, layers, pretrained, progress, **kwargs):
    model = ResNet(block, layers, **kwargs)

```

# Evaluate quantized model

### we should also save model for re-use ?
Skúsim len spustiť kvantizáciu ResNet18 podľa quantization_jupyters/resnet18_imagenet_post_train.yaml.
Implementácia ResNet v Distilleri už všade používa moduly, teda nie je nutné vytvárať nový, korektný model.
**Preto môžem rovno spustiť kvantizáciu.**

In [12]:
from copy import deepcopy
from distiller.quantization import PostTrainLinearQuantizer
import argparse
import os

Predpripravy, ziskanie statistik modelu pred kvantizaciou, kvantizovanie modelu

In [14]:
# deepcopy(cpu_model)
def make_quantizer(cpu_model, config_file_path, dummy_input=None):
    """Create and prepare a post-training linear quantizer for a copy of ``cpu_model``.

    Args:
        cpu_model: Full-precision model. It is deep-copied before quantization,
            so the object passed in is left untouched.
        config_file_path: Path to the quantizer YAML config (``~`` is expanded).
        dummy_input: Optional tensor handed to ``quantizer.prepare_model``.
            Defaults to one zero-filled ImageNet-sized image in PyTorch's
            NCHW layout, i.e. shape ``(1, 3, 224, 224)``.

    Returns:
        Tuple ``(stats_before_prepare, quantizer)``: a deep copy of the
        quantizer's activation statistics taken before ``prepare_model`` ran,
        and the prepared quantizer (quantized model in ``quantizer.model``).
    """
    # Start from Distiller's default post-training quantization arguments.
    parser = argparse.ArgumentParser()
    distiller.quantization.add_post_train_quant_args(parser)
    args = parser.parse_args(args=[])

    # The activation-statistics file has to be named inside the config file:
    # args.qe_stats_file = os.path.expanduser(stats_file)

    # Once a config file is given, ALL OTHER quantization args are ignored.
    args.qe_config_file = os.path.expanduser(config_file_path)

    model_copy = deepcopy(cpu_model)
    quantizer = PostTrainLinearQuantizer.from_args(model_copy, args)

    # Snapshot the statistics now so they can be compared after prepare_model.
    stats_before_prepare = deepcopy(quantizer.model_activation_stats)

    if dummy_input is None:
        # PyTorch convolutions take (batch, channels, height, width). The
        # previous value, zeros(64, 3, 7, 7), was the shape of conv1's weight
        # tensor rather than the shape of a model input.
        dummy_input = torch.zeros(1, 3, 224, 224)

    quantizer.prepare_model(dummy_input)

    return stats_before_prepare, quantizer


## Porovnajme štatistiky oboch modelov
Porovnanie štatistík aktivácií pred a po kvantizácii (`prepare_model`) ResNet18

In [None]:
import pprint
pp = pprint.PrettyPrinter(indent=1)

# The previous key, 'rnn.cells.0.eltwiseadd_gate', is a layer of the LSTM
# example this cell was copied from and does not occur in ResNet-18.
# 'layer1.0.add' is the residual add of the first block — assumes the stats
# are keyed by fully-qualified layer name; TODO confirm against the YAML file.
layer_name = 'layer1.0.add'

# make_quantizer's first return value is bound to `stats_before_quantization`
# in the cell that builds the quantizer, so that is the name read here.
print('Stats BEFORE prepare_model:')
pp.pprint(stats_before_quantization[layer_name]['output'])

print('\nStats AFTER to prepare_model:')
pp.pprint(quantizer.model_activation_stats[layer_name]['output'])

In [15]:
# NOTE(review): stats_file is not passed to make_quantizer; the statistics
# file is expected to be named inside the YAML config — confirm both paths agree.
stats_file = '/home/bohumil/FIIT/BP/BP/Zdroje_kod/quantization_jupyters/acts_quantization_stats.yaml'
config_file = '/home/bohumil/FIIT/BP/BP/Zdroje_kod/quantization_jupyters/resnet18_imagenet_post_train.yaml'
# Quantize a copy of cpu_model according to config_file.
stats_before_quantization, quantizer = make_quantizer(cpu_model,config_file)
# Display the quantized module tree (notebook cell output).
quantizer.model

ResNet(
  (conv1): RangeLinearQuantParamLayerWrapper(
    weights_quant_settings=(num_bits=8 ; quant_mode=ASYMMETRIC_UNSIGNED ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=True)
    output_quant_settings=(num_bits=8 ; quant_mode=ASYMMETRIC_UNSIGNED ; clip_mode=AVG ; clip_n_stds=None ; clip_half_range=True ; per_channel=False)
    accum_quant_settings=(num_bits=16 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)
      inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None
    scale_approx_mult_bits=None
    preset_activation_stats=True
      output_scale=75.812683, output_zero_point=0.000000
    weights_scale=PerCh, weights_zero_point=PerCh
    (wrapped_module): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
  )
  (bn1): Identity()
  (relu): Identity()
  (maxpool): RangeLinearFakeQuantWrapper(
    output_quant_settings=(num_bits=8 ; quant_mode=ASYMMETRIC_UNSIGNED ; clip_m

### Base - full precision cuda model

In [27]:
# Baseline: time the full-precision model on the GPU validation loader, reporting every 60 batches.
%time eval_model(val_loader_gpu, model, 'cuda', print_freq=60)

15551 samples (32 per mini-batch)
[ 60/486] Top1: 0.365  Top5: 1.042  Loss: 15.124
[120/486] Top1: 0.339  Top5: 1.068  Loss: 15.157
[180/486] Top1: 0.330  Top5: 1.146  Loss: 15.099
[240/486] Top1: 0.313  Top5: 1.172  Loss: 15.091
[300/486] Top1: 0.292  Top5: 1.208  Loss: 15.050
[360/486] Top1: 0.339  Top5: 1.337  Loss: 15.032
[420/486] Top1: 0.335  Top5: 1.317  Loss: 15.013
[480/486] Top1: 0.306  Top5: 1.289  Loss: 15.012
----------
Overall ==> Top1: 0.315  Top5: 1.286  Loss: 15.019 PPL: 3331843.514
CPU times: user 3min 44s, sys: 2.65 s, total: 3min 47s
Wall time: 3min 39s


In [23]:
# Time the non-parallel model copy on the CPU validation loader.
# NOTE(review): `device` is not assigned in this cell (it is set to 'cuda' in
# a later cell), and nn.Module.to() moves cpu_model itself rather than a copy,
# so after this cell cpu_model lives on `device` — confirm this is intended.
%time eval_model(val_loader_cpu, cpu_model.to(device), device, print_freq=10)

10000 samples (64 per mini-batch)
[ 10/157] Top1: 0.000  Top5: 0.000  Loss: 10.656
[ 20/157] Top1: 0.156  Top5: 0.313  Loss: 10.582
[ 30/157] Top1: 0.156  Top5: 0.417  Loss: 10.580
[ 40/157] Top1: 0.156  Top5: 0.508  Loss: 10.512
[ 50/157] Top1: 0.125  Top5: 0.594  Loss: 10.518
[ 60/157] Top1: 0.104  Top5: 0.547  Loss: 10.550
[ 70/157] Top1: 0.112  Top5: 0.625  Loss: 10.538
[ 80/157] Top1: 0.098  Top5: 0.645  Loss: 10.557
[ 90/157] Top1: 0.087  Top5: 0.608  Loss: 10.570
[100/157] Top1: 0.078  Top5: 0.625  Loss: 10.568
[110/157] Top1: 0.099  Top5: 0.625  Loss: 10.591
[120/157] Top1: 0.104  Top5: 0.638  Loss: 10.590
[130/157] Top1: 0.096  Top5: 0.673  Loss: 10.589
[140/157] Top1: 0.089  Top5: 0.636  Loss: 10.602
[150/157] Top1: 0.094  Top5: 0.635  Loss: 10.598
----------
Overall ==> Top1: 0.090  Top5: 0.630  Loss: 10.611
CPU times: user 11min 11s, sys: 1.11 s, total: 11min 12s
Wall time: 11min 8s


### 8bit quantization evaluation
Mame nastavenie:

```python
bits_activations: 8
bits_parameters: 8
bits_accum: 16
```

In [21]:
# Copy the quantized model to the GPU so quantizer.model itself stays where it is.
quantizer_gpu = deepcopy(quantizer.model).to('cuda')
# Time the quantized copy on the GPU validation loader.
%time eval_model(val_loader_gpu, quantizer_gpu, 'cuda', print_freq=60)

15551 samples (32 per mini-batch)
[ 60/486] Top1: 0.000  Top5: 0.417  Loss: 7.366
[120/486] Top1: 0.052  Top5: 0.443  Loss: 7.343
[180/486] Top1: 0.087  Top5: 0.556  Loss: 7.341
[240/486] Top1: 0.104  Top5: 0.560  Loss: 7.335
[300/486] Top1: 0.104  Top5: 0.562  Loss: 7.336
[360/486] Top1: 0.104  Top5: 0.564  Loss: 7.332
[420/486] Top1: 0.089  Top5: 0.528  Loss: 7.331
[480/486] Top1: 0.104  Top5: 0.553  Loss: 7.330
----------
Overall ==> Top1: 0.103  Top5: 0.553  Loss: 7.330 PPL: 1525.112
CPU times: user 11min 7s, sys: 2.8 s, total: 11min 10s
Wall time: 11min 1s


In [10]:
# Time the quantized model on the CPU validation loader.
%time eval_model(val_loader_cpu, quantizer.model, 'cpu', print_freq=10)

15551 samples (32 per mini-batch)
[ 10/486] Top1: 0.000  Top5: 0.000  Loss: 7.285
[ 20/486] Top1: 0.000  Top5: 0.000  Loss: 7.287
[ 30/486] Top1: 0.000  Top5: 0.000  Loss: 7.297
[ 40/486] Top1: 0.078  Top5: 0.078  Loss: 7.292
[ 50/486] Top1: 0.062  Top5: 0.125  Loss: 7.292
[ 60/486] Top1: 0.052  Top5: 0.104  Loss: 7.290
[ 70/486] Top1: 0.045  Top5: 0.089  Loss: 7.298
[ 80/486] Top1: 0.078  Top5: 0.195  Loss: 7.282
[ 90/486] Top1: 0.069  Top5: 0.174  Loss: 7.279
[100/486] Top1: 0.062  Top5: 0.156  Loss: 7.282
[110/486] Top1: 0.057  Top5: 0.142  Loss: 7.279
[120/486] Top1: 0.052  Top5: 0.156  Loss: 7.288
[130/486] Top1: 0.072  Top5: 0.168  Loss: 7.287
[140/486] Top1: 0.067  Top5: 0.179  Loss: 7.281
[150/486] Top1: 0.083  Top5: 0.187  Loss: 7.280
[160/486] Top1: 0.098  Top5: 0.215  Loss: 7.282
[170/486] Top1: 0.110  Top5: 0.221  Loss: 7.273
[180/486] Top1: 0.104  Top5: 0.208  Loss: 7.277
[190/486] Top1: 0.099  Top5: 0.197  Loss: 7.282
[200/486] Top1: 0.094  Top5: 0.187  Loss: 7.278
[210/4

### 4bit quantization
Mame nastavenie:

```python
bits_activations: 4
bits_parameters: 4
bits_accum: 8
```

In [22]:
# Re-point config_file at the 4-bit post-training quantization settings.
config_file = '/home/bohumil/FIIT/BP/BP/Zdroje_kod/quantization_jupyters/resnet18_imagenet_post_train_4bit.yaml'
# Quantize a fresh copy of cpu_model with the 4-bit config.
stats_before_quantization_2, quantizer_4bit = make_quantizer(cpu_model,config_file)

In [25]:
# Copy the 4-bit quantized model to the GPU so quantizer_4bit.model itself stays where it is.
quantizer_gpu_4bit = deepcopy(quantizer_4bit.model).to('cuda')
# Time the 4-bit copy on the GPU validation loader.
%time eval_model(val_loader_gpu, quantizer_gpu_4bit, 'cuda', print_freq=60)

15551 samples (32 per mini-batch)
[ 60/486] Top1: 0.156  Top5: 0.781  Loss: 7.427
[120/486] Top1: 0.130  Top5: 0.677  Loss: 7.435
[180/486] Top1: 0.104  Top5: 0.625  Loss: 7.425
[240/486] Top1: 0.091  Top5: 0.651  Loss: 7.422
[300/486] Top1: 0.094  Top5: 0.667  Loss: 7.418
[360/486] Top1: 0.104  Top5: 0.694  Loss: 7.419
[420/486] Top1: 0.119  Top5: 0.744  Loss: 7.425
[480/486] Top1: 0.124  Top5: 0.729  Loss: 7.422
----------
Overall ==> Top1: 0.129  Top5: 0.746  Loss: 7.422 PPL: 1671.588
CPU times: user 12min 2s, sys: 2.33 s, total: 12min 4s
Wall time: 11min 56s


# Not used, yet...
Distiller's ResNet implementation already **wraps** every operation in a module, so the manual re-implementation below is not needed (yet).

In [None]:
import torch.nn as nn
import distiller.modules

def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """Return a bias-free 3x3 ``nn.Conv2d`` whose padding equals its dilation."""
    conv_kwargs = dict(
        kernel_size=3,
        stride=stride,
        padding=dilation,
        dilation=dilation,
        groups=groups,
        bias=False,
    )
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)


def conv1x1(in_planes, out_planes, stride=1):
    """Return a bias-free 1x1 ``nn.Conv2d`` with the given stride."""
    pointwise = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        bias=False,
    )
    return pointwise


class BasicBlock(nn.Module):
    """Residual block with two 3x3 convolutions, written with modules only.

    The residual addition and the final activation each have their own
    module instance (``add`` and ``relu2``) instead of using ``+=`` and
    re-using ``relu``.
    """

    expansion = 1
    __constants__ = ['downsample']

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        """Build the block.

        Args:
            inplanes: Number of input channels.
            planes: Number of output channels of both convolutions.
            stride: Stride of the first convolution.
            downsample: Optional module applied to the input before the
                residual addition.
            groups: Must be 1.
            base_width: Must be 64.
            dilation: Must not exceed 1.
            norm_layer: Normalization layer factory; ``nn.BatchNorm2d`` when None.

        Raises:
            ValueError: If ``groups != 1`` or ``base_width != 64``.
            NotImplementedError: If ``dilation > 1``.
        """
        super().__init__()
        norm_layer = nn.BatchNorm2d if norm_layer is None else norm_layer
        if not (groups == 1 and base_width == 64):
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")

        # Both self.conv1 and self.downsample reduce the resolution when stride != 1.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

        # Added for quantization purposes:
        # (1) module that replaces `out += identity`
        self.add = distiller.modules.EltwiseAdd(inplace=True)
        # (2) dedicated activation that replaces the second use of self.relu
        self.relu2 = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ``relu2(add(bn2(conv2(relu(bn1(conv1(x))))), shortcut))``."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # The shortcut is the raw input unless a downsample module was given.
        shortcut = x if self.downsample is None else self.downsample(x)

        # (1) replaces `out += identity`; (2) replaces `out = self.relu(out)`
        return self.relu2(self.add(out, shortcut))
    
class Bottleneck(nn.Module):
    """Residual bottleneck block (1x1 -> 3x3 -> 1x1), written with modules only.

    Every activation has its own ``nn.ReLU`` instance (``relu``, ``relu2``,
    ``relu3``) and the residual addition is done by the ``add`` module.
    """

    expansion = 4
    __constants__ = ['downsample']

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        """Build the block.

        Args:
            inplanes: Number of input channels.
            planes: Base channel count; the block outputs
                ``planes * expansion`` channels.
            stride: Stride of the 3x3 convolution.
            downsample: Optional module applied to the input before the
                residual addition.
            groups: Groups of the 3x3 convolution.
            base_width: Scales the inner width as ``planes * base_width / 64``.
            dilation: Dilation of the 3x3 convolution.
            norm_layer: Normalization layer factory; ``nn.BatchNorm2d`` when None.
        """
        super(Bottleneck, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

        # Added for quantization purposes:
        # (2) dedicated activation after bn2
        self.relu2 = nn.ReLU(inplace=True)
        # (1) module that replaces `out += identity`
        self.add = distiller.modules.EltwiseAdd(inplace=True)
        # (2) dedicated activation after the residual addition
        self.relu3 = nn.ReLU(inplace=True)

    def forward(self, x):
        """Run the three conv/norm stages, add the shortcut and activate."""
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        # Replaces the re-used `self.relu(out)`. The module has to be CALLED;
        # assigning `self.relu2` itself would pass a module object on to conv3.
        out = self.relu2(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        # Replaces `out += identity`.
        out = self.add(out, identity)
        # Replaces the re-used `self.relu(out)`; called, not merely referenced.
        out = self.relu3(out)

        return out



class ResNet(nn.Module):
    """torchvision-style ResNet in which flattening is done by a module.

    The ``torch.flatten(x, 1)`` call of the original forward pass is replaced
    by an ``nn.Flatten`` instance so that every operation is a module.
    """

    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        """Build the network.

        Args:
            block: Residual block class; must expose an ``expansion`` attribute
                and accept the arguments passed in ``_make_layer``.
            layers: Four integers, the number of blocks in layer1..layer4.
            num_classes: Output size of the final linear layer.
            zero_init_residual: Zero the weight of the last norm layer of every
                ``Bottleneck`` / ``BasicBlock``.
            groups: Forwarded to every block.
            width_per_group: Forwarded to every block as ``base_width``.
            replace_stride_with_dilation: None or three booleans, one each for
                layer2..layer4, selecting dilation instead of stride.
            norm_layer: Normalization layer factory; ``nn.BatchNorm2d`` when None.

        Raises:
            ValueError: If ``replace_stride_with_dilation`` is given and does
                not have exactly three elements.
        """
        super(ResNet, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Added for quantization purposes:
        # (1) module form of `torch.flatten(x, 1)`. nn.Flatten takes start_dim
        # in its constructor and only the tensor in forward().
        self.flatten = nn.Flatten(1)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        """Return an ``nn.Sequential`` of ``blocks`` instances of ``block``.

        Only the first block receives ``stride`` and the optional downsample
        branch; ``self.inplanes`` and ``self.dilation`` are updated in place.
        """
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            # Trade the stride for dilation: keep the resolution, widen the receptive field.
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            # The shortcut has to match the main branch in resolution and channel count.
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def _forward_impl(self, x):
        """Stem, four residual stages, global average pool, flatten, classifier."""
        # See note [TorchScript super()]
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        # (1) replaces `x = torch.flatten(x, 1)`
        x = self.flatten(x)
        x = self.fc(x)

        return x

    def forward(self, x):
        """Return the class scores for the input batch ``x``."""
        return self._forward_impl(x)


In [None]:
# Use the GPU when one is present, otherwise stay on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

def model_conversion(pytorch_model_):
    """Build the module-only ResNet-18 and copy the weights of ``pytorch_model_`` into it.

    The earlier version copied ``decoder.weight`` / ``decoder.bias``, which are
    attributes of the LSTM language model this cell was adapted from; a ResNet
    has no ``decoder``. The whole ``state_dict`` is copied instead.

    Args:
        pytorch_model_: Source ResNet-18, optionally wrapped in
            ``nn.DataParallel``. Assumes its parameter and buffer names match
            those of the ``ResNet(BasicBlock, [2, 2, 2, 2])`` built here —
            ``load_state_dict`` raises if they do not.

    Returns:
        The new model, on ``device`` and in eval mode.
    """
    # DataParallel prefixes every key with 'module.', so unwrap it first.
    if isinstance(pytorch_model_, nn.DataParallel):
        source = pytorch_model_.module
    else:
        source = pytorch_model_

    model = ResNet(BasicBlock,[2, 2, 2, 2]).to(device)
    model.load_state_dict(source.state_dict())
    model.eval()

    return model

man_model = model_conversion(model)
# NOTE: torch.save on the model object pickles the whole module, so loading
# it later needs the class definitions above to be importable.
torch.save(man_model, 'manual.checkpoint.pth.tar')
man_model