In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from torch.profiler import profile, record_function, ProfilerActivity
from utils import memory_cost_profiler

In [2]:
class hswish(nn.Module):
    def forward(self, x):
        out = x * F.relu6(x+3, inplace = True) /6

        return out

class hsigmoid(nn.Module):
    def forward(self,x ):
        out = F.relu6(x+3, inplace = True) / 6
        
        return out

class SeModule(nn.Module):

    def __init__(self, in_size, reduction = 4):
        super(SeModule, self).__init__()
        self.se = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_size, in_size // reduction, kernel_size = 1, stride = 1, padding = 0, bias = False),
            nn.BatchNorm2d(in_size // reduction),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_size // reduction, in_size, kernel_size = 1, stride = 1, padding = 0, bias = False),
            nn.BatchNorm2d(in_size),
            hsigmoid()
        )

    def forward(self, x):
        return x * self.se(x)


In [3]:
class Block(nn.Module):
    
    def __init__(self, kernel_size, in_size, expand_size, out_size, nolinear, semodule, stride):
        super(Block, self).__init__()
        self.stride = stride
        self.se = semodule

        self.conv1 = nn.Conv2d(in_size, expand_size, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(expand_size)
        self.nolinear1 = nolinear
        self.conv2 = nn.Conv2d(expand_size, expand_size, kernel_size=kernel_size, stride=stride, padding=kernel_size//2, groups=expand_size, bias=False)
        self.bn2 = nn.BatchNorm2d(expand_size)
        self.nolinear2 = nolinear
        self.conv3 = nn.Conv2d(expand_size, out_size, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn3 = nn.BatchNorm2d(out_size)

        self.shortcut = nn.Sequential()
        if stride == 1 and in_size != out_size:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_size, out_size, kernel_size=1, stride=1, padding=0, bias=False),
                nn.BatchNorm2d(out_size),
            )

    def forward(self, x):
        out = self.nolinear1(self.bn1(self.conv1(x)))
        out = self.nolinear2(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        if self.se != None:
            out = self.se(out)
        out = out + self.shortcut(x) if self.stride==1 else out
        return out

In [1]:
class MobileNetV3_Large(nn.Module):
    def __init__(self, num_classes=1000):
        super(MobileNetV3_Large, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(16)
        self.hs1 = hswish()

        self.bneck = nn.Sequential(
            Block(3, 16, 16, 16, nn.ReLU(inplace=True), None, 1),
            Block(3, 16, 64, 24, nn.ReLU(inplace=True), None, 2),
            Block(3, 24, 72, 24, nn.ReLU(inplace=True), None, 1),
            Block(5, 24, 72, 40, nn.ReLU(inplace=True), SeModule(40), 2),
            Block(5, 40, 120, 40, nn.ReLU(inplace=True), SeModule(40), 1),
            Block(5, 40, 120, 40, nn.ReLU(inplace=True), SeModule(40), 1),
            Block(3, 40, 240, 80, hswish(), None, 2),
            Block(3, 80, 200, 80, hswish(), None, 1),
            Block(3, 80, 184, 80, hswish(), None, 1),
            Block(3, 80, 184, 80, hswish(), None, 1),
            Block(3, 80, 480, 112, hswish(), SeModule(112), 1),
            Block(3, 112, 672, 112, hswish(), SeModule(112), 1),
            Block(5, 112, 672, 160, hswish(), SeModule(160), 1),
            Block(5, 160, 672, 160, hswish(), SeModule(160), 2),
            Block(5, 160, 960, 160, hswish(), SeModule(160), 1),
        )


        self.conv2 = nn.Conv2d(160, 960, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(960)
        self.hs2 = hswish()
        self.linear3 = nn.Linear(960, 1280)
        self.bn3 = nn.BatchNorm1d(1280)
        self.hs3 = hswish()
        self.linear4 = nn.Linear(1280, num_classes)
        self.init_params()

    def init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def forward(self, x):
        out = self.hs1(self.bn1(self.conv1(x)))
        out = self.bneck(out)
        out = self.hs2(self.bn2(self.conv2(out)))
        out = F.avg_pool2d(out, 7)
        out = out.view(out.size(0), -1)
        out = self.hs3(self.bn3(self.linear3(out)))
        out = self.linear4(out)
        return out



class MobileNetV3_Small(nn.Module):
    def __init__(self, num_classes=1000):
        super(MobileNetV3_Small, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(16)
        self.hs1 = hswish()

        self.bneck = nn.Sequential(
            Block(3, 16, 16, 16, nn.ReLU(inplace=True), SeModule(16), 2),
            Block(3, 16, 72, 24, nn.ReLU(inplace=True), None, 2),
            Block(3, 24, 88, 24, nn.ReLU(inplace=True), None, 1),
            Block(5, 24, 96, 40, hswish(), SeModule(40), 2),
            Block(5, 40, 240, 40, hswish(), SeModule(40), 1),
            Block(5, 40, 240, 40, hswish(), SeModule(40), 1),
            Block(5, 40, 120, 48, hswish(), SeModule(48), 1),
            Block(5, 48, 144, 48, hswish(), SeModule(48), 1),
            Block(5, 48, 288, 96, hswish(), SeModule(96), 2),
            Block(5, 96, 576, 96, hswish(), SeModule(96), 1),
            Block(5, 96, 576, 96, hswish(), SeModule(96), 1),
        )


        self.conv2 = nn.Conv2d(96, 576, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(576)
        self.hs2 = hswish()
        self.linear3 = nn.Linear(576, 1280)
        self.bn3 = nn.BatchNorm1d(1280)
        self.hs3 = hswish()
        self.linear4 = nn.Linear(1280, num_classes)
        self.init_params()

    def init_params(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def forward(self, x):
        print(x.shape)
        out = self.hs1(self.bn1(self.conv1(x)))
        out = self.bneck(out)
        out = self.hs2(self.bn2(self.conv2(out)))
        out = F.avg_pool2d(out, 7)
        out = out.view(out.size(0), -1)
        out = self.hs3(self.bn3(self.linear3(out)))
        out = self.linear4(out)
        return out



def test():
    net = MobileNetV3_Large()
    x = torch.randn(2,3,224,224)
    y = net(x)
    print(y)

NameError: name 'nn' is not defined

In [5]:
model = MobileNetV3_Large()
inputs = torch.randn(2,3,224,224)

with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:
    with record_function("model_inference"):
        model(inputs)

In [18]:
print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))

--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
         aten::native_batch_norm        52.24%     261.553ms        54.33%     272.050ms       4.122ms            66  
        aten::mkldnn_convolution        37.83%     189.400ms        37.92%     189.868ms       2.921ms            65  
                 model_inference         3.44%      17.212ms        99.70%     499.194ms     499.194ms             1  
                       aten::add         0.74%       3.721ms         0.88%       4.414ms      41.642us           106  
                       aten::sum         0.73%       3.672ms         0.82%       4.098ms      55.378us            74  
                       aten::mul         0.45%  

In [19]:
with profile(activities=[ProfilerActivity.CPU],
        profile_memory=True, record_shapes=True) as prof:
    model(inputs)

[W CPUAllocator.cpp:305] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event


In [20]:
print(prof.key_averages().table(sort_by="self_cpu_memory_usage", row_limit=10))

--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     aten::empty         1.27%       6.147ms         1.27%       6.147ms      11.300us      74.46 Mb      74.46 Mb           544  
                       aten::add         4.42%      21.373ms         4.57%      22.088ms     208.377us      16.41 Mb      16.41 Mb           106  
                       aten::mul         2.95%      14.279ms         2.95%      14.279ms     492.379us      14.23 Mb      14.23 Mb            29  
             aten::empty_strided         0.31%       1.495ms         0.31%       1.495ms       9.286us      12.83 Mb  

In [27]:
from torchsummary import summary

In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # PyTorch v0.4.0
model = model.to(device)

summary(model, (3,224,224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 16, 112, 112]             432
       BatchNorm2d-2         [-1, 16, 112, 112]              32
            hswish-3         [-1, 16, 112, 112]               0
            Conv2d-4         [-1, 16, 112, 112]             256
       BatchNorm2d-5         [-1, 16, 112, 112]              32
              ReLU-6         [-1, 16, 112, 112]               0
            Conv2d-7         [-1, 16, 112, 112]             144
       BatchNorm2d-8         [-1, 16, 112, 112]              32
              ReLU-9         [-1, 16, 112, 112]               0
           Conv2d-10         [-1, 16, 112, 112]             256
      BatchNorm2d-11         [-1, 16, 112, 112]              32
            Block-12         [-1, 16, 112, 112]               0
           Conv2d-13         [-1, 64, 112, 112]           1,024
      BatchNorm2d-14         [-1, 64, 1

In [21]:
class MyConv2d(nn.Conv2d):
	"""
	Conv2d with Weight Standardization
	https://github.com/joe-siyuan-qiao/WeightStandardization
	"""

	def __init__(self, in_channels, out_channels, kernel_size, stride=1,
	             padding=0, dilation=1, groups=1, bias=True):
		super(MyConv2d, self).__init__(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias)
		self.WS_EPS = None

	def weight_standardization(self, weight):
		if self.WS_EPS is not None:
			weight_mean = weight.mean(dim=1, keepdim=True).mean(dim=2, keepdim=True).mean(dim=3, keepdim=True)
			weight = weight - weight_mean
			std = weight.view(weight.size(0), -1).std(dim=1).view(-1, 1, 1, 1) + self.WS_EPS
			weight = weight / std.expand_as(weight)
		return weight

	def forward(self, x):
		if self.WS_EPS is None:
			return super(MyConv2d, self).forward(x)
		else:
			return F.conv2d(x, self.weight_standardization(self.weight), self.bias,
			                self.stride, self.padding, self.dilation, self.groups)

	def __repr__(self):
		return super(MyConv2d, self).__repr__()[:-1] + ', ws_eps=%s)' % self.WS_EPS

In [23]:
class Hswish(nn.Module):

	def __init__(self, inplace=True):
		super(Hswish, self).__init__()
		self.inplace = inplace

	def forward(self, x):
		return x * F.relu6(x + 3., inplace=self.inplace) / 6.

	def __repr__(self):
		return 'Hswish()'


class Hsigmoid(nn.Module):

	def __init__(self, inplace=True):
		super(Hsigmoid, self).__init__()
		self.inplace = inplace

	def forward(self, x):
		return F.relu6(x + 3., inplace=self.inplace) / 6.

	def __repr__(self):
		return 'Hsigmoid()'

In [8]:
with profile(activities= [ProfilerActivity.CPU], record_shapes= True) as prof:
    model(inputs)

prof.export_chrome_trace("trace.json")

In [25]:
with profile(activities=[ProfilerActivity.CPU], profile_memory = True, record_shapes = True) as prof:
    model(inputs)

print(prof.key_averages(group_by_stack_n = 10).table(sort_by = "self_cpu_memory_usage"))

--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                            Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
--------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                     aten::empty         2.23%       5.833ms         2.23%       5.833ms      10.722us      74.46 Mb      74.46 Mb           544  
                       aten::add         2.27%       5.943ms         2.57%       6.715ms      63.349us      16.41 Mb      16.41 Mb           106  
                       aten::mul         0.91%       2.379ms         0.91%       2.379ms      82.034us      14.23 Mb      14.23 Mb            29  
             aten::empty_strided         0.57%       1.485ms         0.57%       1.485ms       9.224us      12.83 Mb  

In [19]:
def trace_handler(prof):
    print(prof.key_averages().table(
        sort_by = "self_cpu_memory_usage", row_limit = -1)
    )

In [26]:
with torch.profiler.profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU
    ],

    # In this example with wait=1, warmup=1, active=2,
    # profiler will skip the first step/iteration,
    # start warming up on the second, record
    # the third and the forth iterations,
    # after which the trace will become available
    # and on_trace_ready (when set) is called;
    # the cycle repeats starting with the next step

    schedule=torch.profiler.schedule(
        wait=1,
        warmup=1,
        active=2),
    
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./log')

    ) as p:
    model(inputs)

#print(p.key_averages(group_by_input_shape= True).table(sort_by= "self_cpu_memory_usage",row_limit = -1))