In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import copy
import sys

sys.path.append('./mcunet')

from mcunet.gumbel_module.gumbel_net import GumbelMCUNets

from mcunet.tinynas.nn.modules import MBInvertedConvLayer
from mcunet.tinynas.nn.networks import MobileInvertedResidualBlock
from mcunet.model_zoo import build_model

from mcunet.utils import MyModule, MyNetwork, SEModule, build_activation, get_same_padding, sub_filter_start_end
from mcunet.tinynas.nn.modules import ZeroLayer, set_layer_from_config

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ori_model, img_size, desc = build_model(net_id='mcunet-in4', pretrained=True)

In [3]:
gubmel_config = {'global_expand_ratio_list':[1,3,5,6], 'global_kernel_size_list':[3,5,7], 'gumbel_feature_extract_block':2}
gumbel_model = GumbelMCUNets.build_from_config(ori_model.config, gubmel_config)
gumbel_model.load_pretrained_mcunet_param(ori_model)

0 {'name': 'MobileInvertedResidualBlock', 'mobile_inverted_conv': {'name': 'MBInvertedConvLayer', 'in_channels': 32, 'out_channels': 16, 'kernel_size': 3, 'stride': 1, 'expand_ratio': 1, 'mid_channels': None, 'act_func': 'relu6', 'use_se': False}, 'shortcut': None}
1 {'name': 'MobileInvertedResidualBlock', 'mobile_inverted_conv': {'name': 'MBInvertedConvLayer', 'in_channels': 16, 'out_channels': 24, 'kernel_size': 7, 'stride': 2, 'expand_ratio': 3, 'mid_channels': 48, 'act_func': 'relu6', 'use_se': False}, 'shortcut': None}
load first_conv.conv.weight params (torch.Size([32, 3, 3, 3]))
load first_conv.bn.weight params (torch.Size([32]))
load first_conv.bn.bias params (torch.Size([32]))
load blocks.0.mobile_inverted_conv.depth_conv.conv.weight params (torch.Size([32, 1, 3, 3]))
load blocks.0.mobile_inverted_conv.depth_conv.bn.weight params (torch.Size([32]))
load blocks.0.mobile_inverted_conv.depth_conv.bn.bias params (torch.Size([32]))
load blocks.0.mobile_inverted_conv.point_linear.co

In [4]:
class test_gumbel(nn.Module):
    def __init__(self):
        super().__init__()
        self.test_layer = nn.Linear(20, 4)
    
    def forward(self, x):
        gumbel_input = self.test_layer(x)
        if self.training:
            gumbel_out = F.gumbel_softmax(gumbel_input, tau=1, hard=True, eps=1e-10, dim=-1)
        else:
            index = gumbel_input.max(dim=-1, keepdim=True)[1]
            gumbel_out = torch.zeros_like(gumbel_input, memory_format=torch.legacy_contiguous_format).scatter_(-1, index, 1.0)
        
        return gumbel_out


In [5]:
inputs = torch.randn(5, 20)
gumbel = test_gumbel()

# train
gumbel.train()
for i in range(5):
    out = gumbel(inputs)
    print(f"{i} iter -> ", out)

# test
gumbel.eval()
for i in range(5):
    out = gumbel(inputs)
    print(f"{i} iter -> ", out)


0 iter ->  tensor([[0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.]], grad_fn=<AddBackward0>)
1 iter ->  tensor([[0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.]], grad_fn=<AddBackward0>)
2 iter ->  tensor([[0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.]], grad_fn=<AddBackward0>)
3 iter ->  tensor([[0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.]], grad_fn=<AddBackward0>)
4 iter ->  tensor([[0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.]], grad_fn=<AddBackward0>)
0 iter ->  tensor([[0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.]])
1 iter ->  tensor([[0., 0., 

In [6]:
gumbel.training

False

In [7]:
inputs = torch.randn(16, 3, 160, 160)

out = gumbel_model.forward_original(inputs)
out2 = ori_model.forward(inputs)
print(out - out2)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], grad_fn=<SubBackward0>)


In [8]:
gumbel_model.forward(inputs)

gumbel_model.eval()
print(gumbel_model.training)
out_origin = gumbel_model.forward(inputs)
for i in range(10):
    print(out_origin - gumbel_model.forward(inputs))


False
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], grad_fn=<SubBackward0>)
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], grad_fn=<SubBackward0>)
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], grad_fn=<SubBackward0>)
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 

In [13]:
from torchprofile import profile_macs

total_mac = profile_macs(gumbel_model.cuda(), torch.randn(2, 3,160, 160).cuda())
print(total_mac)

291112064




In [24]:
from torchprofile.utils.flatten import Flatten
import warnings
with warnings.catch_warnings(record=True):
    graph, _ = torch.jit._get_trace_graph(Flatten(gumbel_model.cuda()), torch.randn(2,3,160,160).cuda(), None)

In [23]:
variables = dict()
for x in graph.nodes():
    for v in list(x.inputs()):
        if 'tensor' in v.type().kind().lower():
            print(v.debugName(), v.type().scalarType(), v.type().sizes())

input.1 Float [2, 3, 160, 160]
1 Float [32, 3, 3, 3]
input.3 Float [2, 32, 80, 80]
2 Float [32]
3 Float [32]
4 Float [32]
5 Float [32]
input.5 Float [2, 32, 80, 80]
input.7 Float [2, 32, 80, 80]
7 Float [32, 1, 3, 3]
input.9 Float [2, 32, 80, 80]
8 Float [32]
9 Float [32]
10 Float [32]
11 Float [32]
input.11 Float [2, 32, 80, 80]
input.13 Float [2, 32, 80, 80]
13 Float [16, 32, 1, 1]
input.15 Float [2, 16, 80, 80]
14 Float [16]
15 Float [16]
16 Float [16]
17 Float [16]
input.17 Float [2, 16, 80, 80]
19 Float [48, 16, 1, 1]
input.19 Float [2, 48, 80, 80]
20 Float [48]
21 Float [48]
22 Float [48]
23 Float [48]
input.21 Float [2, 48, 80, 80]
input.23 Float [2, 48, 80, 80]
25 Float [48, 1, 7, 7]
input.25 Float [2, 48, 40, 40]
26 Float [48]
27 Float [48]
28 Float [48]
29 Float [48]
input.27 Float [2, 48, 40, 40]
input.29 Float [2, 48, 40, 40]
31 Float [24, 48, 1, 1]
input.31 Float [2, 24, 40, 40]
32 Float [24]
33 Float [24]
34 Float [24]
35 Float [24]
input.33 Float [2, 24, 40, 40]
1218 Flo

In [27]:
for x in graph.nodes():
    if 'mul_' in x.kind().lower():
        print(x)

%1302 : Float(2, 120, 40, 40, strides=[192000, 1600, 40, 1], requires_grad=1, device=cuda:0) = aten::mul_(%1287, %1301) # /home/pdh/torch/NAS/TinyML_NAS/mcunet/mcunet/gumbel_module/gumbel_net.py:172:0

%1385 : Float(2, 24, 40, 40, strides=[38400, 1600, 40, 1], requires_grad=1, device=cuda:0) = aten::mul_(%1370, %1384) # /home/pdh/torch/NAS/TinyML_NAS/mcunet/mcunet/gumbel_module/gumbel_net.py:177:0

%1488 : Float(2, 72, 40, 40, strides=[115200, 1600, 40, 1], requires_grad=1, device=cuda:0) = aten::mul_(%1473, %1487) # /home/pdh/torch/NAS/TinyML_NAS/mcunet/mcunet/gumbel_module/gumbel_net.py:177:0

%1666 : Float(2, 96, 40, 40, strides=[153600, 1600, 40, 1], requires_grad=1, device=cuda:0) = aten::mul_(%1651, %1665) # /home/pdh/torch/NAS/TinyML_NAS/mcunet/mcunet/gumbel_module/gumbel_net.py:142:0

%1758 : Float(2, 96, 40, 40, strides=[153600, 1600, 40, 1], requires_grad=1, device=cuda:0) = aten::mul_(%1743, %1757) # /home/pdh/torch/NAS/TinyML_NAS/mcunet/mcunet/gumbel_module/gumbel_net.py:15

In [9]:
ori_model_size = sum([p.numel() for p in ori_model.parameters()]) * 4 / 2**20
gumbel_model_size = sum([p.numel() for p in gumbel_model.parameters()]) * 4 / 2**20
print("Ori model size : %.1f MB" % ori_model_size)
print("Gumbel model size : %.1f MB" % gumbel_model_size)

from torchinfo import summary

summary(gumbel_model, input_size=(4, 3, 160, 160), col_width=16, col_names=['kernel_size', 'output_size', 'num_params', 'mult_adds', 'params_percent'], depth=2)


Ori model size : 6.6 MB
Gumbel model size : 8.2 MB


Layer (type:depth-idx)                             Kernel Shape     Output Shape     Param #          Mult-Adds        Param %
GumbelMCUNets                                      --               [4, 1000]        --               --                    --
├─ConvLayer: 1-1                                   3                [4, 32, 80, 80]  --               --                    --
│    └─Conv2d: 2-1                                 [3, 3]           [4, 32, 80, 80]  864              22,118,400         0.04%
│    └─BatchNorm2d: 2-2                            --               [4, 32, 80, 80]  64               256                0.00%
│    └─ReLU6: 2-3                                  --               [4, 32, 80, 80]  --               --                    --
├─ModuleList: 1-8                                  --               --               (recursive)      --               (recursive)
│    └─MobileInvertedResidualBlock: 2-4            --               [4, 16, 80, 80]  896              20,48

In [10]:
print("before forward grad : ", gumbel_model.gumbel_fc1.weight.grad)
out = gumbel_model(torch.randn(32, 3, 160, 160))

out.sum().backward()

print("after forward grad : \n", gumbel_model.gumbel_fc1.weight.grad)

before forward grad :  None


RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

In [None]:
for n, p in net.named_parameters():
    if has_deep_attr(model, n):
        print(n)

first_conv.conv.weight
first_conv.bn.weight
first_conv.bn.bias
blocks.0.mobile_inverted_conv.depth_conv.conv.weight
blocks.0.mobile_inverted_conv.depth_conv.bn.weight
blocks.0.mobile_inverted_conv.depth_conv.bn.bias
blocks.0.mobile_inverted_conv.point_linear.conv.weight
blocks.0.mobile_inverted_conv.point_linear.bn.weight
blocks.0.mobile_inverted_conv.point_linear.bn.bias
blocks.1.mobile_inverted_conv.inverted_bottleneck.conv.weight
blocks.1.mobile_inverted_conv.inverted_bottleneck.bn.weight
blocks.1.mobile_inverted_conv.inverted_bottleneck.bn.bias
blocks.1.mobile_inverted_conv.depth_conv.conv.weight
blocks.1.mobile_inverted_conv.depth_conv.bn.weight
blocks.1.mobile_inverted_conv.depth_conv.bn.bias
blocks.1.mobile_inverted_conv.point_linear.conv.weight
blocks.1.mobile_inverted_conv.point_linear.bn.weight
blocks.1.mobile_inverted_conv.point_linear.bn.bias
blocks.2.mobile_inverted_conv.inverted_bottleneck.conv.weight
blocks.2.mobile_inverted_conv.inverted_bottleneck.bn.weight
blocks.2.mo

In [None]:
model

ProxylessNASNets(
  (first_conv): ConvLayer(
    (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (act): ReLU6(inplace=True)
  )
  (blocks): ModuleList(
    (0): MobileInvertedResidualBlock(
      (mobile_inverted_conv): MBInvertedConvLayer(
        (depth_conv): Sequential(
          (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (bn): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (act): ReLU6(inplace=True)
        )
        (point_linear): Sequential(
          (conv): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
      )
    )
    (1): MobileInvertedResidualBlock(
      (mobile_inverted_conv): MBInvertedConvLayer(
        (inverted_b

In [None]:
mbconv_test = MBGumbelInvertedConvLayer.build_from_config(m.mobile_inverted_conv.config)
mbconv_test.config

{'name': 'MBGumbelInvertedConvLayer',
 'in_channels': 16,
 'out_channels': 24,
 'kernel_size': 7,
 'kernel_size_list': [7, 5, 3],
 'stride': 2,
 'expand_ratio': 3,
 'expand_ratio_list': [1, 3],
 'mid_channels': 48,
 'act_func': 'relu6',
 'use_se': False}

In [None]:
inputs = torch.randn(2, 16, 32, 32)
gumbel_inputs = torch.randn(2, 4, 8, 8)
gumbel_inputs.requires_grad = True
gumbel_layer = nn.Linear(4*8*8, 5)
gumbel_output = gumbel_layer(gumbel_inputs.view(2, -1))
gumbel_index = F.gumbel_softmax(gumbel_output, tau=1, hard=True)
print(gumbel_index)
out = mbconv_test.forward(torch.randn(2, 16, 32, 32))

tensor([[0., 0., 0., 0., 1.],
        [0., 1., 0., 0., 0.]], grad_fn=<AddBackward0>)


In [None]:
inputs = torch.randn(2, 16, 32, 32)
gumbel_inputs = torch.randn(2, 4, 8, 8)
gumbel_inputs.requires_grad = True
gumbel_layer = nn.Linear(4*8*8, 5)
gumbel_output = gumbel_layer(gumbel_inputs.view(2, -1))
gumbel_index = F.gumbel_softmax(gumbel_output, tau=1, hard=True)
print(gumbel_index)
out = mbconv_test.forward(torch.randn(2, 16, 32, 32), gumbel_index)
out.sum().backward()

tensor([[0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.]], grad_fn=<AddBackward0>)
gumbel shape :  torch.Size([2, 5])
1 6 5 7 torch.Size([48, 1, 7, 7])
1 4 3 5 torch.Size([48, 1, 7, 7])


In [None]:
gumbel_layer.weight.grad

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [None]:
original_mbconv_test_weight = copy.deepcopy(mbconv_test.depth_conv.conv.weight)
print(original_mbconv_test_weight)

In [None]:
print(m.mobile_inverted_conv.depth_conv.conv.weight)

In [None]:
for n, p in m.mobile_inverted_conv.named_parameters():
    if has_deep_attr(mbconv_test, n):
        print(n, p)
        set_deep_attr(mbconv_test, n, p)
        print('------------------')

In [None]:
for n, p in m.mobile_inverted_conv.named_parameters():
    if has_deep_attr(mbconv_test, n):
        print(n)
        print(get_deep_attr(mbconv_test, n) - p)

In [None]:
mbconv_test.forward(torch.randn(1,32,16,16), gumbel=1)

In [None]:
bn_layer = nn.BatchNorm2d(16)

In [None]:
x = torch.randn(1, 12, 32, 32)

In [None]:
feature_dim = 12
out = F.batch_norm(x, bn_layer.running_mean[:feature_dim], bn_layer.running_var[:feature_dim], bn_layer.weight[:feature_dim], bn_layer.bias[:feature_dim])

In [None]:
out.sum().backward()

In [None]:
bn_layer.weight.grad

In [None]:
model, img_size, desc = build_model(net_id='mcunet-in4', pretrained=True)

backup_model = copy.deepcopy(model)
model_copy = build_model(net_id='mcunet-in4', pretrained=False)[0]

for (n1, p1), (n2, p2) in zip(backup_model.named_parameters(), model_copy.named_parameters()):
    if n1 == n2:
        print((p1 - p2).sum())

In [None]:
for n, p in model.named_parameters():
    if has_deep_attr(model_copy, n):
        print(n)
        set_deep_attr(model_copy, n, p)

In [None]:
for (n1, p1), (n2, p2) in zip(backup_model.named_parameters(), model_copy.named_parameters()):
    if n1 == n2:
        print((p1-p2).sum())