# ResNet kvantizácia

Podľa: https://nervanasystems.github.io/distiller/prepare_model_quant.html

1. Replace direct tensor operations with modules

* Replace re-used modules with dedicated instances

* Replace torch.nn.functional calls with equivalent modules

* Special cases - replace modules that aren't quantize-able with quantize-able variants


In [1]:
import torch
import torchvision
import distiller
from distiller.models import create_model


print(torch.__version__)
print(torchvision.__version__)

1.3.1
0.4.2


In [2]:
model = create_model(pretrained=True,dataset='imagenet',arch='resnet18') 
model

DataParallel(
  (module): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): DistillerBasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu1): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (add): EltwiseAdd()
        (relu2): ReLU(inplace=True)
      )
      (1): DistillerBasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bia

### Načítanie datasetu pre cpu a gpu (cuda)
Používame distiller funkcionalitu v distiller.apputils.load_data


In [3]:
DATASET_PATH = "/home/bohumil/FIIT/BP/BP/Zdroje_kod/imagenet"
val_images = DATASET_PATH + "/val/images"


# z <distiller_root>/jupyter/post_train_quant_convert_pytorch.ipynb
distiller.set_seed(0)

subset_size = 1.0
val_split = 0.5

batch_size_gpu = 30
workers_gpu = 1

# use train for STATS and val for EVAL 

train_loader_gpu, val_loader_gpu , test_loader_gpu, _ = distiller.apputils.load_data('imagenet', DATASET_PATH, batch_size=batch_size_gpu, workers=workers_gpu, 
                             validation_split=val_split, fixed_subset=False, sequential=False, 
                             test_only=False)

In [None]:
# z <distiller_root>/jupyter/post_train_quant_convert_pytorch.ipynb
# for CPU vs GPU speed comparison
distiller.set_seed(0)

batch_size_cpu = 32
num_workers_cpu = 1
train_loader_cpu, val_loader_cpu, test_loader_cpu, _ = distiller.apputils.load_data(
    'imagenet', DATASET_PATH, batch_size_cpu, num_workers_cpu,
    validation_split=val_split, fixed_subset=True, test_only=False)

In [4]:
# z <distiller_root>/jupyter/post_train_quant_convert_pytorch.ipynb
import torchnet as tnt
import math
import numpy as np


def eval_model(data_loader, model, device='cpu', print_freq=10):
    # print('Evaluation model ', model.arch)
    
    criterion = torch.nn.CrossEntropyLoss().to(device)
    
    loss = tnt.meter.AverageValueMeter()
    classerr = tnt.meter.ClassErrorMeter(accuracy=True, topk=(1, 5))

    total_samples = len(data_loader.sampler)
    batch_size = data_loader.batch_size
    total_steps = math.ceil(total_samples / batch_size)
    print('{0} samples ({1} per mini-batch)'.format(total_samples, batch_size))

    # Switch to evaluation mode
    model.eval()

    for step, (inputs, target) in enumerate(data_loader):
        with torch.no_grad():
            inputs, target = inputs.to(device), target.to(device)
            # compute output from model
            output = model(inputs)

            # compute loss and measure accuracy
            loss.add(criterion(output, target).item())
            classerr.add(output.data, target)
            
            if (step + 1) % print_freq == 0:
                print('[{:3d}/{:3d}] Top1: {:.3f}  Top5: {:.3f}  Loss: {:.3f}'.format(
                      step + 1, total_steps, classerr.value(1), classerr.value(5), loss.mean), flush=True)
    print('----------')
    print('Overall ==> Top1: {:.3f}  Top5: {:.3f}  Loss: {:.3f} PPL: {:.3f}'.format(
        classerr.value(1), classerr.value(5), loss.mean, np.exp(loss.mean)), flush=True)

In [None]:
if torch.cuda.is_available():
    %time eval_model(val_loader_gpu,model,'cuda')

In [5]:
cpu_model = distiller.make_non_parallel_copy(model).cpu()

In [None]:
%time eval_model(val_loader_cpu, cpu_model, 'cpu')

# Quantization

## 0. Collect activation statistics

Resnet *as is* contains parallel modules, we need to use non-parallel model copy for stats

In [None]:
# z distiller/examples/word_language_model/quantize_lstm.ipynb
import os
import torch
from distiller.data_loggers import collect_quant_stats, QuantCalibrationStatsCollector

#man_model = torch.load('./manual.checkpoint.pth.tar')
distiller.utils.assign_layer_fq_names(cpu_model)
collector = QuantCalibrationStatsCollector(cpu_model)

stats_file = './acts_quantization_stats.yaml'

if not os.path.isfile(stats_file):
    def eval_for_stats(model):
        eval_model(data_loader=train_loader_gpu,model=model, print_freq=30)
    collect_quant_stats(cpu_model, eval_for_stats, save_dir='.')

Teraz mame ziskane statistiky z train datasetu. Podla tychto statistik mozeme nastavit kvantizaciu.

Pokracujeme upravenim torchvision impl. na nas pripad

## 1. Replace direct tensor operations with modules

*   mali sme v forward_imp vyuzittie `torch.flatten`
*   mali sme aj + v `BasicBlock`

## 2. Replace re-used modules with dedicated instances

*   V BasicBlock viac krat `nn.Relu`

## 3. Replace `torch.nn.functional` calls with equivalent modules
 

Taktiez dolezite je volanie

```python
...

_resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
                   **kwargs)
                   
...

def _resnet(arch, block, layers, pretrained, progress, **kwargs):
    model = ResNet(block, layers, **kwargs)

```

# Evaluate quantized model

### we should also save model for re-use ?
Skusim len spustit kvantizaciu Resnet18 podla quantization_jupyters/resnet18_imagenet_post_train.yaml
Implementacia Resnet v Distiller uz pouziva vsade modules, teda nie je nutne vytvaranie noveho korektneho modelu.
**Preto mozem rovno pustit kvantizaciu**

In [6]:
from copy import deepcopy
from distiller.quantization import PostTrainLinearQuantizer, ClipMode
import argparse
import os

Predpripravy, ziskanie statistik modelu pred kvantizaciou, kvantizovanie modelu

In [7]:
# deepcopy(cpu_model)
def make_quantizer(cpu_model, config_file_path):
    """returns stats_before_prepare, quantizer"""
    parser = argparse.ArgumentParser()
    
    # distiller.quantization.add_post_train_quant_args(parser)
    
    args = parser.parse_args(args=[])
    
    #config present, ALL OTHER args are IGNORED
    args.qe_config_file = os.path.expanduser(config_file_path)
    
    cp = deepcopy(cpu_model)
    
    quantizer = PostTrainLinearQuantizer.from_args(cp, args)
    
    # PostTrainLinearQuantizer()
    
    # Quantizer magic
    stats_before_prepare = deepcopy(quantizer.model_activation_stats)
    
    # dummy input of (batch_size, height, width, depth)
    # https://towardsdatascience.com/understanding-input-and-output-shapes-in-convolution-network-keras-f143923d56ca
    dummy_input = distiller.get_dummy_input(input_shape=model.input_shape)

    quantizer.prepare_model(dummy_input)
    
    return stats_before_prepare, quantizer


## Porovnajme statistiky oboch modelov
Porovnanie statistiky pred a po kvantizacii ResNet18

In [8]:
import pprint
pp = pprint.PrettyPrinter(indent=1)
print('Stats BEFORE prepare_model:')
pp.pprint(stats_before_prepare['rnn.cells.0.eltwiseadd_gate']['output'])

print('\nStats AFTER to prepare_model:')
pp.pprint(quantizer.model_activation_stats['rnn.cells.0.eltwiseadd_gate']['output'])

Stats BEFORE prepare_model:


NameError: name 'stats_before_prepare' is not defined

In [9]:
stats_file = '/home/bohumil/FIIT/BP/BP/Zdroje_kod/quantization_jupyters/acts_quantization_stats.yaml'
config_file = '/home/bohumil/FIIT/BP/BP/Zdroje_kod/quantization_jupyters/resnet18_imagenet_post_train.yaml'

# dummy_input = distiller.get_dummy_input(input_shape=model.input_shape)

stats_before_quantization, quantizer = make_quantizer(model,config_file,)
quantizer.model

DataParallel(
  (module): ResNet(
    (conv1): RangeLinearQuantParamLayerWrapper(
      weights_quant_settings=(num_bits=8 ; quant_mode=ASYMMETRIC_UNSIGNED ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=True)
      output_quant_settings=(num_bits=8 ; quant_mode=ASYMMETRIC_UNSIGNED ; clip_mode=AVG ; clip_n_stds=None ; clip_half_range=True ; per_channel=False)
      accum_quant_settings=(num_bits=16 ; quant_mode=SYMMETRIC ; clip_mode=NONE ; clip_n_stds=None ; clip_half_range=False ; per_channel=False)
        inputs_quant_auto_fallback=True, forced_quant_settings_for_inputs=None
      scale_approx_mult_bits=None
      preset_activation_stats=True
        output_scale=81.144836, output_zero_point=0.000000
      weights_scale=PerCh, weights_zero_point=PerCh
      (wrapped_module): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    )
    (bn1): Identity()
    (relu): Identity()
    (maxpool): RangeLinearFakeQuantWrapper(
      output_quant_settin

### Base - full precision cuda model

Just 3881 samples, to save time... 

In [None]:
%time eval_model(train_loader_gpu, model, 'cuda', print_freq=60)

In [None]:
%time eval_model(val_loader_cpu, cpu_model.to(device), device, print_freq=10)

### 8bit quantization evaluation
Mame nastavenie:

```python
bits_activations: 8
bits_parameters: 8
bits_accum: 16
```

In [10]:
import distiller.quantization as quant
from distiller.quantization import ClipMode
from copy import deepcopy

quant_mode = {'activations': 'ASYMMETRIC_UNSIGNED', 'weights': 'SYMMETRIC'}
# quant_mode = {'activations': 'ASYMMETRIC_UNSIGNED', 'weights': 'ASYMMETRIC_UNSIGNED'}
# overides =  { 'fc' : {'clip_acts': ClipMode.NONE}} 
# stats_file = "../examples/quantization/post_train_quant/stats/resnet18_quant_stats.yaml"
stats_file = '/home/bohumil/FIIT/BP/BP/Zdroje_kod/quantization_jupyters/acts_quantization_stats.yaml'

dummy_input = distiller.get_dummy_input(input_shape=model.input_shape)

quantizer = quant.PostTrainLinearQuantizer(
    deepcopy(model), bits_activations=8, bits_parameters=8, bits_accum=32, mode=quant_mode, 
    clip_acts= ClipMode.NONE, model_activation_stats=stats_file, overrides=None, 
)
quantizer.prepare_model(dummy_input)

In [13]:
#tu by mal by loss okolo 15
#pre bits_accum = 32
%time eval_model(val_loader_gpu, quantizer.model, 'cuda', print_freq=30)

9719 samples (30 per mini-batch)
[ 30/324] Top1: 0.000  Top5: 1.222  Loss: 14.955
[ 60/324] Top1: 0.000  Top5: 1.056  Loss: 15.076


KeyboardInterrupt: 

Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe


In [8]:
#toto je quantizer zhora, je presnou kopiou podla post_train_quant_convert_pytorch
#teraz som tam zmenil quant mode=ASYMMETRIC_UNSIGNED, per_channel_wts=True a ClipActs=AVG a bits_accum=16
%time eval_model(val_loader_gpu, quantizer.model, 'cuda', print_freq=30)

9719 samples (30 per mini-batch)
[ 30/324] Top1: 0.000  Top5: 0.444  Loss: 7.266


Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe


KeyboardInterrupt: 

In [11]:
# quant_mode = {'activations': 'ASYMMETRIC_UNSIGNED', 'weights': 'SYMMETRIC'}
# bits_activations=8, bits_parameters=8, bits_accum=32, mode=quant_mode, 
#     clip_acts= ClipMode.NONE, model_activation_stats=stats_file, overrides=None, 
%time eval_model(val_loader_gpu, quantizer.model, 'cuda', print_freq=30)

9719 samples (30 per mini-batch)
[ 30/324] Top1: 0.000  Top5: 1.000  Loss: 14.914


KeyboardInterrupt: 

toto je eval pre *moj* quantizer, lenze tam je nejaka chyba lebo loss je 7

In [10]:
%time eval_model(val_loader_gpu, quantizer.model, 'cuda', print_freq=30)

9719 samples (30 per mini-batch)
[ 30/324] Top1: 0.000  Top5: 0.222  Loss: 7.449
[ 60/324] Top1: 0.000  Top5: 0.222  Loss: 7.472
[ 90/324] Top1: 0.000  Top5: 0.296  Loss: 7.483
[120/324] Top1: 0.083  Top5: 0.444  Loss: 7.489
[150/324] Top1: 0.089  Top5: 0.444  Loss: 7.482


KeyboardInterrupt: 

In [None]:
train_loader_gpu

In [None]:
from PIL import Image
from imagenet_classes import imagenet_classes

def eval_model(data_loader, model, device='cpu', print_freq=10):
    # print('Evaluation model ', model.arch)
    
    criterion = torch.nn.CrossEntropyLoss().to(device)
    
    loss = tnt.meter.AverageValueMeter()
    classerr = tnt.meter.ClassErrorMeter(accuracy=True, topk=(1, 5))
    apmeter = tnt.meter.APMeter()

    total_samples = len(data_loader.sampler)
    batch_size = data_loader.batch_size
    total_steps = math.ceil(total_samples / batch_size)
    print('{0} samples ({1} per mini-batch)'.format(total_samples, batch_size))

    # Switch to evaluation mode
    model.eval()

    for step, (inputs, target) in enumerate(data_loader):
        with torch.no_grad():
            inputs, target = inputs.to(device), target.to(device)
            # compute output from model
            output = model(inputs)

            # compute loss and measure accuracy
            out = criterion(output, target)
            
            # print(output[0])
            # The output has unnormalized scores. To get probabilities, you can run a softmax on it.
            softmax = torch.nn.functional.softmax(output[0], dim=0)
            target_meter = np.zeros(1000)
            target_meter[target[0]] = 1
            apmeter.add(softmax, target_meter)
            # print(softmax)
            list = softmax.tolist()
            m = max(list)
            print(' Mame tensor {4}:\n {0} \n Softmax:\n {1} \n Maximalny prvok je {2} a jeho index {3} \n Target je {5}'
                  .format(output, softmax, m, list.index(m), output.shape, target))
            samples = data_loader.dataset.samples[0]
            classes = data_loader.dataset.classes
            truth = samples[1]
            truth_class = classes[truth]
            print('Image is class {0} => {1}\n'.format(truth_class, imagenet_classes[truth]))
            img = Image.open(samples[0])
            img.show()
            print('Predicted  class index {0} which is {1} => {2}'.format(list.index(m), classes[list.index(m)], imagenet_classes[list.index(m)]) )
            
            
            item = out.item()
            loss.add(item)
            # loss.add(criterion(output, target).item())
            data = output.data
            # classerr.add(output.data, target)
            classerr.add(data, target)
            
            if (step + 1) % print_freq == 0:
                print('[{:3d}/{:3d}] Top1: {:.3f}  Top5: {:.3f}  Loss: {:.3f}'.format(
                      step + 1, total_steps, classerr.value(1), classerr.value(5), loss.mean), flush=True)
            break
        break
    print('----------')
    print('Overall ==> Top1: {:.3f}  Top5: {:.3f}  Loss: {:.3f} PPL: {:.3f}'.format(
        classerr.value(1), classerr.value(5), loss.mean, np.exp(loss.mean)), flush=True)
        

quantizer_gpu = deepcopy(quantizer.model).to('cuda')
eval_model(train_loader_gpu, quantizer_gpu, 'cuda', print_freq=1)

In [None]:
%time eval_model(train_loader_cpu, quantizer.model, 'cpu', print_freq=60)

Zmena v config-u

```python
overrides:
  fc:
    clip_acts: NONE  # Don't clip activations in last layer before softmax
```

In [None]:
config_file = '/home/bohumil/FIIT/BP/BP/Zdroje_kod/quantization_jupyters/resnet18_imagenet_post_train.yaml'
stats_before_quantization, quantizer_fc = make_quantizer(cpu_model,config_file)


In [None]:
quantizer_gpu = deepcopy(quantizer_fc.model).to('cuda')
%time eval_model(train_loader_gpu, quantizer_gpu, 'cuda', print_freq=60)

In [None]:
%time eval_model(train_loader_cpu, quantizer_fc.model, 'cpu', print_freq=60)

### 4bit quantization
Mame nastavenie:

```python
bits_activations: 4
bits_parameters: 4
bits_accum: 8
```

In [None]:
config_file = '/home/bohumil/FIIT/BP/BP/Zdroje_kod/quantization_jupyters/resnet18_imagenet_post_train_4bit.yaml'
stats_before_quantization_2, quantizer_4bit = make_quantizer(cpu_model,config_file)

In [None]:
quantizer_gpu_4bit = deepcopy(quantizer_4bit.model).to('cuda')
%time eval_model(val_loader_gpu, quantizer_gpu_4bit, 'cuda', print_freq=60)

# Second try on quantization

Problem is with prepare quantizer function... 
Teraz skusim to urobit **presne** podla navodu

In [6]:
import logging
def config_notebooks_logger():
    logging.config.fileConfig('logging.conf')
    msglogger = logging.getLogger()
    msglogger.info('Logging configured successfully')
    return msglogger

In [7]:
import argparse

msglogger = config_notebooks_logger()

parser = argparse.ArgumentParser()
distiller.quantization.add_post_train_quant_args(parser)
args = parser.parse_args(args= [])
args.qe_config_file = '/home/bohumil/FIIT/BP/BP/Zdroje_kod/quantization_jupyters/resnet18_imagenet_post_train.yaml'


Logging configured successfully


Correct way of getting statistics

In [10]:
from distiller.data_loggers import collect_quant_stats, QuantCalibrationStatsCollector, collector_context

args.qe_calibration = 0.2
if args.qe_calibration:
    
    cpu_model = distiller.make_non_parallel_copy(model).cpu()
    
    distiller.utils.assign_layer_fq_names(cpu_model)
    msglogger.info("Generating quantization calibration stats based on {0} users".format(args.qe_calibration))
    collector = distiller.data_loggers.QuantCalibrationStatsCollector(cpu_model)
    with collector_context(collector):
        eval_model(train_loader_gpu,cpu_model,'cuda',print_freq=30)
        # Here call your model evaluation function, making sure to execute only
        # the portion of the dataset specified by the qe_calibration argument
    yaml_path = './act_quantization_stats.yaml'
    collector.save(yaml_path)


Generating quantization calibration stats based on 0.2 users


9720 samples (30 per mini-batch)


RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [11]:
args.quantize_eval = True

if args.quantize_eval:
    quantizer = distiller.quantization.PostTrainLinearQuantizer.from_args(model, args)
    # dummy = distiller.get_dummy_input(model.input_shape)
    dummy = distiller.get_dummy_input(input_shape=model.input_shape)
    quantizer.prepare_model(dummy)
    eval_model(val_loader_gpu, quantizer.model, 'cuda', print_freq=30)


Reading configuration from: /home/bohumil/FIIT/BP/BP/Zdroje_kod/quantization_jupyters/resnet18_imagenet_post_train.yaml
Found component of class PostTrainLinearQuantizer: Name: post_train_quantizer ; Section: quantizers
Loading activation stats from: /home/bohumil/FIIT/BP/BP/Zdroje_kod/quantization_jupyters/acts_quantization_stats.yaml
Preparing model for quantization using PostTrainLinearQuantizer
Applying batch-norm folding ahead of post-training quantization
Propagating output statistics from BN modules to folded modules
Optimizing output statistics for modules followed by ReLU/Tanh/Sigmoid
Updated stats saved to ./quant_stats_after_prepare_model.yaml
Per-layer quantization parameters saved to ./layer_quant_params.yaml


9719 samples (30 per mini-batch)
[ 30/324] Top1: 0.000  Top5: 0.556  Loss: 7.446


KeyboardInterrupt: 

In [None]:
# Not used, yet...
Distiller impl. of resnet is **wrapped** in modules 

In [None]:
import torch.nn as nn
import distiller.modules

def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=dilation, groups=groups, bias=False, dilation=dilation)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class BasicBlock(nn.Module):
    expansion = 1
    __constants__ = ['downsample']

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(BasicBlock, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride
        
        # Added
        """  for quantization purposes """
        # (1)
        self.add = distiller.modules.EltwiseAdd(inplace=True)
        # (2)
        self.relu2 = nn.ReLU(inplace=True)

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        # (1) out += identity
        out = self.add(out,identity)
        # (2) out = self.relu(out)
        out = self.relu2(out)

        return out
    
class Bottleneck(nn.Module):
    expansion = 4
    __constants__ = ['downsample']

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(Bottleneck, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride
        
        # Added
        """  for quantization purposes """
        #(2)
        self.relu2 = nn.ReLU(inplace=True)
        #(1)
        self.add = distiller.modules.EltwiseAdd(inplace=True)
        #(2)
        self.relu3 = nn.ReLU(inplace=True)

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        # out = self.relu(out)
        out = self.relu2

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        # out += identity
        out = self.add(out,identity)
        # out = self.relu(out)
        out = self.relu3

        return out



class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(ResNet, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        
        # Added
        """  for quantization purposes """
        # (1)
        self.flatten = distiller.modules.Flatten()

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def _forward_impl(self, x):
        # See note [TorchScript super()]
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        # (1) x = torch.flatten(x, 1)
        x = self.flatten(x,1)
        x = self.fc(x)

        return x

    def forward(self, x):
        return self._forward_impl(x)


In [None]:
device = 'cuda'

def model_conversion(pytorch_model_):
    model = ResNet(BasicBlock,[2, 2, 2, 2]).to(device)
    # model = DistillerRNNModel(nlayers=nlayers, ninp=ninp, nhid=nhid, ntoken=ntoken, tie_weights=tie_weights).to(device)
    model.eval()    
    # model.encoder.weight = nn.Parameter(pytorch_model_.encoder.weight.clone().detach())
    model.decoder.weight = nn.Parameter(pytorch_model_.decoder.weight.clone().detach())
    model.decoder.bias = nn.Parameter(pytorch_model_.decoder.bias.clone().detach())
    
    
    # model.rnn = LSTM.from_pytorch_impl(pytorch_model_.rnn)

    return model

man_model = model_conversion(model)
torch.save(man_model, 'manual.checkpoint.pth.tar')
man_model