# Calculating score for Micronet Challange

Remark: We don't have to account for Batch Norm parameters and FLOPS - https://tehnokv.com/posts/fusing-batchnorm-and-conv/ since they can be fused with the preceding convolutional layer.

`counting.py` script is taken from https://github.com/google-research/google-research/blob/master/micronet_challenge/counting.py

### Imports

In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import numpy as np
from efficientnet_pytorch import EfficientNet
from counting import Conv2D, FullyConnected, GlobalAvg, Add, DepthWiseConv2D, Scale, count_ops, get_conv_output_size

# ImageNet submission

Let's look at MobileNetV2 architecture which is a baseline architecture for calculating the score.

In [2]:
baseline_model = models.__dict__['mobilenet_v2'](width_mult=1.4)
print(baseline_model)

MobileNetV2(
  (features): Sequential(
    (0): ConvBNReLU(
      (0): Conv2d(3, 44, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(44, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(44, 44, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=44, bias=False)
          (1): BatchNorm2d(44, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace)
        )
        (1): Conv2d(44, 22, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(22, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(22, 132, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(132, eps=1e-05, momentum=0.1, affine=True, trac

Let's get all the operations in the MobileNetV2 architecture - as stated earlier in the remark we can ignore BatchNorm layers.

In [3]:
start_image_size = 224
#kernel_shape = k_size, _, c_in, c_out

In [4]:
def build_mobilenet_conv_op(module, input_size):
    padding = 'same' if module.padding==(1,1) else 'valid'
    if module.groups == module.in_channels:
        op_type = DepthWiseConv2D
        kernel_shape=module.kernel_size + (module.in_channels, 1)
    else:
        op_type = Conv2D
        kernel_shape=module.kernel_size + (module.in_channels, module.out_channels)
    return op_type(input_size=input_size, 
                kernel_shape=kernel_shape, 
                strides=module.stride,
                padding=padding,
                use_bias=False,
                activation='relu')


def get_mobilenet_ops(model, input_size):
    # First operation before blocks is a convolution
    op = Conv2D(input_size=input_size, kernel_shape=(3, 3, 3, 44), 
                strides=(2, 2), padding='same', 
                use_bias=False, activation='relu')
    ops = [op]
    input_size = get_conv_output_size(input_size, op.kernel_shape[0], 'same', op.strides[0])
    
    # Get all relevant operations from blocks
    for child in model.modules():
        # If child is of block type, get all operations inside the block
        if isinstance(child, models.mobilenet.InvertedResidual):
            for block_child in child.modules():
                if isinstance(block_child, nn.Conv2d):
                    op = build_mobilenet_conv_op(block_child, input_size)
                    ops.append(op)
                    input_size = get_conv_output_size(input_size, op.kernel_shape[0], op.padding, op.strides[0])
            # Account for residual connection
            if child.use_res_connect:
                op = Add(input_size=input_size,
                         n_channels=op.kernel_shape[3])
                ops.append(op)
    
    # There is one last conv2D after all the blocks  
    last_conv = Conv2D(input_size=input_size, kernel_shape=(1, 1, 448, 1792), 
                strides=(1, 1), padding='valid', 
                use_bias=False, activation='relu')
    # After all of the blocks pooling layer and linear layer follows
    pool_op = GlobalAvg(input_size=input_size,
                       n_channels=1792)
    dense_op = FullyConnected(kernel_shape=(1792, 1000),
                       use_bias=True,
                       activation=None)
    ops.extend([last_conv, pool_op, dense_op])
    return ops

In [5]:
model_ops_baseline = get_mobilenet_ops(model=baseline_model, input_size=start_image_size)

### Counts for the baseline MobileNetV2

In [6]:
ADD_BIT_BASE = 32
MUL_BIT_BASE = 32
BASELINE_PARAMETER_BITS = 32

Helper function

In [7]:
def process_counts(total_params, total_mults, total_adds, mul_bits, add_bits):
    # converting to Mbytes.
    total_params = int(total_params) / 8. / 1e6
    total_mults = total_mults * mul_bits / MUL_BIT_BASE / 1e6
    total_adds = total_adds * add_bits / ADD_BIT_BASE  / 1e6
    return total_params, total_mults, total_adds

In [8]:
total_params_baseline, total_mults_baseline, total_adds_baseline = [0] * 3
for op in model_ops_baseline:
    param_count, flop_mults, flop_adds = count_ops(op=op, sparsity=0, param_bits=BASELINE_PARAMETER_BITS)
    total_params_baseline += param_count
    total_mults_baseline += flop_mults
    total_adds_baseline += flop_adds

total_params_baseline, total_mults_baseline, total_adds_baseline = process_counts(
                                             total_params = total_params_baseline,
                                             total_mults = total_mults_baseline,
                                             total_adds = total_adds_baseline, 
                                             mul_bits = MUL_BIT_BASE,
                                             add_bits = ADD_BIT_BASE)

### Counts for our pruned model of EfficientNetB2

First let's load the architecture.

In [9]:
model = model = EfficientNet.from_name('efficientnet-b2')
print(model)

EfficientNet(
  (_conv_stem): Conv2dStaticSamePadding(
    3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False
    (static_padding): ZeroPad2d(padding=(0, 1, 0, 1), value=0.0)
  )
  (_bn0): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
  (_blocks): ModuleList(
    (0): MBConvBlock(
      (_depthwise_conv): Conv2dStaticSamePadding(
        32, 32, kernel_size=(3, 3), stride=[1, 1], groups=32, bias=False
        (static_padding): ZeroPad2d(padding=(1, 1, 1, 1), value=0.0)
      )
      (_bn1): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
      (_se_reduce): Conv2dStaticSamePadding(
        32, 8, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_se_expand): Conv2dStaticSamePadding(
        8, 32, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_project_conv): Conv2dStaticSamePadding(
        32, 16, kernel_size=

Let's get all the operations in the EfficientNet architecture - as stated earlier in the remark we can ignore BatchNorm layers.

In [10]:
start_image_size = 260
#kernel_shape = k_size, _, c_in, c_out

Helper function

In [11]:
def get_mb_block_ops(mb_block, input_size):
    ops = []
    mask = []
    m_name = '_expand_conv'
    if hasattr(mb_block, m_name):
        module = getattr(mb_block, m_name)
        op = Conv2D(input_size=input_size,
               kernel_shape=module.kernel_size + (module.in_channels, module.out_channels),
               strides=module.stride,
               padding='same',
               use_bias=True,
               activation='swish')
        ops.append(op)
        mask.append(1)
        
    m_name = '_depthwise_conv'
    module = getattr(mb_block, m_name)
    op = DepthWiseConv2D(input_size=input_size,
            kernel_shape=module.kernel_size + (module.in_channels, 1),
            strides=module.stride,
            padding='same',
            use_bias=True,
            activation='swish')
    ops.append(op)
    mask.append(1)
    input_size = get_conv_output_size(input_size, op.kernel_shape[0], 'same', op.strides[0])
    
    if mb_block.has_se:
        se_reduce = getattr(mb_block, '_se_reduce')
        se_expand = getattr(mb_block, '_se_expand')
        op = GlobalAvg(input_size = input_size,
                       n_channels=se_reduce.in_channels)
        ops.append(op)
        mask.append(0)
        # input size is 1
        op = Conv2D(input_size=1,
               kernel_shape=se_reduce.kernel_size + (se_reduce.in_channels, se_reduce.out_channels),
               strides=se_reduce.stride,
               padding='same',
               use_bias=True,
               activation='swish')
        ops.append(op)
        mask.append(1)
        op = Conv2D(input_size=1,
               kernel_shape=se_expand.kernel_size + (se_expand.in_channels, se_expand.out_channels),
               strides=se_expand.stride,
               padding='same',
               use_bias=True,
               activation='sigmoid')
        ops.append(op)
        mask.append(1)
        op = Scale(input_size = input_size,
                   n_channels=se_reduce.in_channels)
        ops.append(op)
        mask.append(0)
    
    m_name = '_project_conv'
    module = getattr(mb_block, m_name)
    op = Conv2D(input_size=input_size,
               kernel_shape=module.kernel_size + (module.in_channels, module.out_channels),
               strides=module.stride,
               padding='same',
               use_bias=True,
               activation=None)
    ops.append(op)
    mask.append(1)
    
    if mb_block.id_skip:
        if type(mb_block._block_args.stride) is not list:
            stride = [mb_block._block_args.stride]
        else:
            stride = mb_block._block_args.stride
        if all(s == 1 for s in stride):
            if mb_block._block_args.input_filters == mb_block._block_args.output_filters:
                op = Add(input_size=input_size,
                        n_channels=se_reduce.in_channels)
                ops.append(op)
                mask.append(0)
    
    return ops, input_size, mask

In [12]:
def get_efficientnet_ops(model, input_size):
    ops = []
    mask_for_sparsity = []
    # First operation before blocks is a convolution
    m_name = '_conv_stem'
    module = getattr(model, m_name)
    op = Conv2D(input_size=input_size,
               kernel_shape=module.kernel_size + (module.in_channels, module.out_channels),
               strides=module.stride,
               padding='same',
               use_bias=True,
               activation='swish')
    ops.append(op)
    mask_for_sparsity.append(1)
    input_size = get_conv_output_size(input_size, op.kernel_shape[0], 'same', op.strides[0])
    
    # Iterate over blocks
    blocks = getattr(model, '_blocks')
    for i in range(23):
        mb_block = getattr(blocks, str(i))
        block_ops, input_size, mask = get_mb_block_ops(mb_block, input_size)
        ops.extend(block_ops)
        mask_for_sparsity.extend(mask)
    # Conv head
    m_name = '_conv_head'
    module = getattr(model, m_name)
    conv_head = Conv2D(input_size=9,
               kernel_shape=module.kernel_size + (module.in_channels, module.out_channels),
               strides=module.stride,
               padding='same',
               use_bias=True,
               activation='swish')
    
    # Average pooling
    pool = GlobalAvg(input_size=9,
                   n_channels=module.out_channels)
    # Linear layer
    linear = FullyConnected(kernel_shape=(module.out_channels, 1000),
                           use_bias=True,
                           activation=None)
    ops.extend([conv_head, pool, linear])
    mask_for_sparsity.extend([1,0,1])
    return ops, mask_for_sparsity

In [13]:
model_ops, mask_for_sparsities = get_efficientnet_ops(model=model, input_size=start_image_size)

Now let's calculate the sparsities for each layer. Our method prunes the architecture on per-layer basis, hence the sparsity levels across layers differ. We can calculate the sparsity for each layer using `weigth_mask` (bit mask for the model weight). The percentage of 0s is equivalent to the sparsity level of a given layer. (Note: the order of layers in the `weight_mask` is the same as the order of layers in the `model_ops` list).

In [14]:
path = "weightsmasks_effnetB2.bin"
weight_mask = torch.load(path, map_location='cpu')
total_weights_per_layer = [l.numel() for l in weight_mask]
zeros_per_layer = [torch.sum(l==0).item() for l in weight_mask]
sparsity_per_layer = [zeros/total for (zeros, total) in zip(zeros_per_layer, total_weights_per_layer)]
# Pad with zeros for non-convolutional and non-linear layers
sparsity_per_layer_temp = sparsity_per_layer
sparsity_per_layer = []
index = 0
for i in mask_for_sparsities:
    if i == 0:
        sparsity_per_layer.append(0)
    else:
        sparsity_per_layer.append(sparsity_per_layer_temp[index])
        index += 1

For our pruned model we are using the 'freebie' 16-bit quantization.

In [15]:
PARAMETER_BITS = 16
ADD_BITS = 32
MULT_BITS = 16

In [16]:
total_params, total_mults, total_adds = [0] * 3
for op, sparsity in zip(model_ops, sparsity_per_layer):
    param_count, flop_mults, flop_adds = count_ops(op, sparsity, PARAMETER_BITS)
    total_params += param_count
    total_mults += flop_mults
    total_adds += flop_adds
    
total_params, total_mults, total_adds = process_counts(total_params = total_params,
                                                           total_mults = total_mults,
                                                           total_adds = total_adds, 
                                                           mul_bits = MULT_BITS,
                                                           add_bits = ADD_BITS)

### Parameter storage score

In [17]:
param_score = total_params/total_params_baseline
print('Our parameter score is: {}.'.format(param_score))

Our parameter score is: 0.3540558413686507.


### Math operations score

In [18]:
total_flops_baseline = total_adds_baseline + total_mults_baseline
total_flops = total_adds + total_mults
flops_score = total_flops/total_flops_baseline
print('Our math operations score is: {}.'.format(flops_score))

Our math operations score is: 0.5392571128829731.


### Total score

In [19]:
total_score = param_score + flops_score
print('Our total score is: {}'.format(total_score))

Our total score is: 0.8933129542516238
