In [2]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import tensorwatch as tw
from tensorboardX import SummaryWriter
from torchviz import make_dot
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import numpy as np

nclasses = 62 # number of output classes. NOTE(review): GTSRB has 43 classes and the later cells use 43 -- confirm which dataset 62 is meant for

class Net(nn.Module):
    """CNN classifier with a spatial transformer front-end and an SE channel gate.

    Pipeline: ``stn`` (learned affine re-sampling of the input) -> three
    conv / leaky-ReLU / max-pool / batch-norm stages with 2D dropout ->
    squeeze-and-excitation style channel re-weighting -> two fully connected
    layers -> ``log_softmax``.

    The hard-coded sizes ``10 * 7 * 7`` (localisation regressor) and ``2250``
    (``fc1``) match 3x43x43 inputs, the size this notebook profiles with.
    The number of outputs is read from the module-level ``nclasses``.
    """

    def __init__(self):
        super(Net, self).__init__()

        # CNN layers
        self.conv1 = nn.Conv2d(3, 100, kernel_size=5)
        self.bn1 = nn.BatchNorm2d(100)
        self.conv2 = nn.Conv2d(100, 150, kernel_size=3)
        self.bn2 = nn.BatchNorm2d(150)
        self.conv3 = nn.Conv2d(150, 250, kernel_size=3)
        self.bn3 = nn.BatchNorm2d(250)
        self.conv_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(2250, 350)      # 2250 = 250 channels * 3 * 3
        self.fc2 = nn.Linear(350, nclasses)
        self.filters = 250                   # channel count seen by the SE gate

        # Localisation network of the spatial transformer.
        self.localization = nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=7),
            nn.MaxPool2d(2, stride=2),
            nn.ReLU(True),
            nn.Conv2d(8, 10, kernel_size=5),
            nn.MaxPool2d(2, stride=2),
            nn.ReLU(True)
            )

        # Regressor for the 3 * 2 affine matrix.
        # The sub-network (fully connected or convolutional, plus a final
        # regression layer) produces the transformation parameters theta;
        # for a 2D affine transform theta is a 6-dimensional (2x3) output.
        self.fc_loc = nn.Sequential(
            nn.Linear(10 * 7 * 7, 32),                # 490 -> 32
            nn.ReLU(True),
            nn.Linear(32, 3 * 2)                      # 32 -> 6
            )

        # Initialize the weights/bias with identity transformation
        self.fc_loc[2].weight.data.zero_()
        self.fc_loc[2].bias.data.copy_(torch.tensor([1, 0, 0, 0, 1, 0], dtype=torch.float))

        # SENet-style gate: global average pool -> bottleneck (filters // 16)
        # -> sigmoid weights, one per channel.
        self.se = nn.Sequential(
            nn.AdaptiveAvgPool2d((1,1)),
            nn.Conv2d(self.filters, self.filters//16, kernel_size=1),
            nn.ReLU(),
            nn.Conv2d(self.filters//16, self.filters, kernel_size=1),
            nn.Sigmoid()
        )

    def stn(self, x):
        """Spatial transformer forward pass.

        The transformer has three parts: the localisation network, the grid
        generator (``F.affine_grid``) and the sampler (``F.grid_sample``).
        Returns a tensor with the same size as ``x``.
        """
        xs = self.localization(x)
        # Flatten per sample. Keeping the batch dimension explicit makes a
        # wrongly sized input fail in fc_loc with a clear shape error instead
        # of being silently regrouped by view(-1, ...).
        xs = xs.view(xs.size(0), -1)
        theta = self.fc_loc(xs)
        theta = theta.view(-1, 2, 3)
        grid = F.affine_grid(theta, x.size())
        x = F.grid_sample(x, grid)
        return x

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, nclasses)``."""
        # transform the input
        x = self.stn(x)

        # Perform forward pass
        x = self.bn1(F.max_pool2d(F.leaky_relu(self.conv1(x)),2))
        x = self.conv_drop(x)
        x = self.bn2(F.max_pool2d(F.leaky_relu(self.conv2(x)),2))
        x = self.conv_drop(x)
        x = self.bn3(F.max_pool2d(F.leaky_relu(self.conv3(x)),2))
        x = self.conv_drop(x)

        # Channel re-weighting: (batch, 250, 1, 1) gate broadcast over H x W.
        channel_gate = self.se(x)
        x = x * channel_gate

        # Per-sample flatten; a wrong input size now raises in fc1 instead of
        # silently changing the batch dimension.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

In [None]:
 from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F

class ChannelAttention(nn.Module):
    """Channel-attention branch returning separate average- and max-pooled descriptors.

    Both globally pooled descriptors pass through the same 1x1-conv bottleneck
    (``in_planes -> in_planes // ratio -> in_planes``, no bias).

    Args:
        in_planes: number of input (and output) channels.
        ratio: channel reduction factor of the bottleneck (default 16).

    ``forward`` returns the tuple ``(avg_out, max_out)``, each of shape
    ``(batch, in_planes, 1, 1)``. In this variant the sigmoid is *not*
    applied; ``self.sigmoid`` is kept only as an attribute.
    """

    def __init__(self, in_planes, ratio=16):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        # Honour `ratio`; it used to be ignored in favour of a hard-coded 16.
        hidden_planes = in_planes // ratio
        self.fc1   = nn.Conv2d(in_planes, hidden_planes, 1, bias=False)
        self.relu1 = nn.ReLU()
        self.fc2   = nn.Conv2d(hidden_planes, in_planes, 1, bias=False)

        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x))))
        max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x))))
        # The two branches are deliberately returned un-merged and un-squashed.
        return avg_out, max_out

# ======================= spatial ==============================
class SpatialAttention(nn.Module):
    """Spatial attention gate.

    The channel-wise mean and channel-wise max of the input are stacked into a
    2-channel map, convolved down to one channel with a "same"-padded
    bias-free convolution, and squashed with a sigmoid.

    Args:
        kernel_size: 3 or 7; any other value fails the assertion.

    ``forward`` maps ``(batch, C, H, W)`` to ``(batch, 1, H, W)``.
    """

    def __init__(self, kernel_size=7):
        super(SpatialAttention, self).__init__()

        assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
        # Padding that keeps H x W unchanged for the two supported kernels.
        same_padding = {3: 1, 7: 3}[kernel_size]

        self.conv1 = nn.Conv2d(2, 1, kernel_size, padding=same_padding, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # keepdim=True keeps both statistics 4-D so they can be concatenated.
        channel_mean = x.mean(dim=1, keepdim=True)
        channel_max = x.max(dim=1, keepdim=True)[0]
        pooled = torch.cat((channel_mean, channel_max), dim=1)
        return self.sigmoid(self.conv1(pooled))

nclasses = 43  # number of output classes (GTSRB has 43 classes)

def mish_fun(x):
    """Functional Mish activation: ``x * tanh(softplus(x))``, element-wise."""
    gate = torch.tanh(F.softplus(x))
    return x * gate

class Mish(nn.Module):
    """Module form of the Mish activation, usable inside ``nn.Sequential``.

    Holds no parameters; ``forward`` computes ``x * tanh(softplus(x))``.
    """

    def forward(self, x):
        softplus_x = F.softplus(x)
        return x * torch.tanh(softplus_x)

class Net(nn.Module):
    """STN + CNN classifier with concatenated channel-attention branches.

    Pipeline: ``stn`` -> three conv / Mish / max-pool / batch-norm stages ->
    channel attention (average- and max-pooled branches applied separately and
    concatenated along the channel axis) -> ``conv_c`` fusion stage ->
    ``conv4`` + global average pool -> spatial attention -> linear classifier
    -> ``log_softmax``.

    The localisation regressor is sized for 3x43x43 inputs (``10 * 7 * 7``).
    Relies on the module-level ``nclasses``, ``ChannelAttention``,
    ``SpatialAttention``, ``Mish`` and ``mish_fun``.
    """

    def __init__(self):
        super(Net, self).__init__()

        # CNN layers
        self.conv1 = nn.Conv2d(3, 100, kernel_size=3)
        self.bn1 = nn.BatchNorm2d(100)
        self.conv2 = nn.Conv2d(100, 150, kernel_size=3)
        self.bn2 = nn.BatchNorm2d(150)
        self.conv3 = nn.Conv2d(150, 250, kernel_size=3)
        self.bn3 = nn.BatchNorm2d(250)
        self.conv4 = nn.Conv2d(250, 250, kernel_size=2)
        self.bn4 = nn.BatchNorm2d(250)
        # Fuses the two concatenated attention branches (2 * 250 channels).
        self.conv_c = nn.Conv2d(500, 250, kernel_size=3, padding=2)
        self.bn_c = nn.BatchNorm2d(250)

        self.conv_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(250, nclasses)
        self.filters = 250
        self.glob = nn.AdaptiveAvgPool2d((1, 1))

        #  attention
        self.ca = ChannelAttention(self.filters)
        self.sa = SpatialAttention()

        # Localisation network of the spatial transformer.
        self.localization = nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=7),
            nn.MaxPool2d(2, stride=2),
            Mish(),
            nn.Conv2d(8, 10, kernel_size=5),
            nn.MaxPool2d(2, stride=2),
            Mish()
        )

        # Regressor for the 3 * 2 affine matrix.
        # The sub-network (fully connected or convolutional, plus a final
        # regression layer) produces the transformation parameters theta;
        # for a 2D affine transform theta is a 6-dimensional (2x3) output.
        self.fc_loc = nn.Sequential(
            nn.Linear(10 * 7 * 7, 32),  # 490 -> 32
            Mish(),
            nn.Linear(32, 3 * 2)  # 32 -> 6
        )

        # Initialize the weights/bias with identity transformation
        self.fc_loc[2].weight.data.zero_()
        self.fc_loc[2].bias.data.copy_(torch.tensor([1, 0, 0, 0, 1, 0], dtype=torch.float))

    def stn(self, x):
        """Spatial transformer forward pass.

        The transformer has three parts: the localisation network, the grid
        generator (``F.affine_grid``) and the sampler (``F.grid_sample``).
        Returns a tensor with the same size as ``x``.
        """
        xs = self.localization(x)
        # Per-sample flatten so a wrongly sized input fails in fc_loc with a
        # clear shape error instead of being regrouped by view(-1, ...).
        xs = xs.view(xs.size(0), -1)
        theta = self.fc_loc(xs)
        theta = theta.view(-1, 2, 3)
        grid = F.affine_grid(theta, x.size())
        x = F.grid_sample(x, grid)
        return x

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, nclasses)``."""
        # transform the input
        x = self.stn(x)

        # Perform forward pass
        x = self.bn1(F.max_pool2d(mish_fun(self.conv1(x)), 2))
        x = self.conv_drop(x)
        x = self.bn2(F.max_pool2d(mish_fun(self.conv2(x)), 2))
        x = self.conv_drop(x)
        x = self.bn3(F.max_pool2d(mish_fun(self.conv3(x)), 2))
        x = self.conv_drop(x)

        # Channel attention: re-weight the features with each branch.
        avg_pool, max_pool = self.ca(x)
        avg_pool_out = x * avg_pool
        max_pool_out = x * max_pool

        # Concatenate along the channel axis (dim=1) to get the 500 channels
        # conv_c expects. The former call `torch.cat((a, b), -1, x)` passed a
        # stray third positional argument and used the width axis.
        x = torch.cat((avg_pool_out, max_pool_out), 1)
        x = self.bn_c(F.max_pool2d(mish_fun(self.conv_c(x)), 2))
        x = self.conv_drop(x)

        x = self.bn4(self.glob(mish_fun(self.conv4(x))))
        x = self.conv_drop(x)

        # Spatial attention on the pooled (batch, 250, 1, 1) features.
        spatial_gate = self.sa(x)
        x = x * spatial_gate
        x = x.view(-1, 250)
        x = self.fc1(x)
        return F.log_softmax(x, dim=1)

# Instantiate the network defined above with freshly initialised weights.
model = Net()

In [8]:
 from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F

class ChannelAttention(nn.Module):
    """Channel-attention branch returning separate average- and max-pooled descriptors.

    Both globally pooled descriptors pass through the same 1x1-conv bottleneck
    (``in_planes -> in_planes // ratio -> in_planes``, no bias).

    Args:
        in_planes: number of input (and output) channels.
        ratio: channel reduction factor of the bottleneck (default 16).

    ``forward`` returns the tuple ``(avg_out, max_out)``, each of shape
    ``(batch, in_planes, 1, 1)``. In this variant the sigmoid is *not*
    applied; ``self.sigmoid`` is kept only as an attribute.
    """

    def __init__(self, in_planes, ratio=16):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        # Honour `ratio`; it used to be ignored in favour of a hard-coded 16.
        hidden_planes = in_planes // ratio
        self.fc1   = nn.Conv2d(in_planes, hidden_planes, 1, bias=False)
        self.relu1 = nn.ReLU()
        self.fc2   = nn.Conv2d(hidden_planes, in_planes, 1, bias=False)

        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x))))
        max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x))))
        # The two branches are deliberately returned un-merged and un-squashed.
        return avg_out, max_out

# ======================= spatial ==============================
class SpatialAttention(nn.Module):
    """Spatial attention gate.

    Stacks the per-pixel channel mean and channel max into a 2-channel map,
    reduces it to one channel with a bias-free, size-preserving convolution,
    and applies a sigmoid.

    Args:
        kernel_size: 3 or 7; any other value fails the assertion.

    ``forward`` maps ``(batch, C, H, W)`` to ``(batch, 1, H, W)``.
    """

    def __init__(self, kernel_size=7):
        super(SpatialAttention, self).__init__()

        assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
        # Size-preserving padding for the two supported kernels.
        if kernel_size == 7:
            pad = 3
        else:
            pad = 1

        self.conv1 = nn.Conv2d(2, 1, kernel_size, padding=pad, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # keepdim=True keeps both statistics 4-D so they can be concatenated.
        stats = (
            x.mean(dim=1, keepdim=True),
            x.max(dim=1, keepdim=True)[0],
        )
        attention_logits = self.conv1(torch.cat(stats, dim=1))
        return self.sigmoid(attention_logits)

nclasses = 43  # number of output classes (GTSRB has 43 classes)

def mish_fun(x):
    """Apply the Mish activation, ``x * tanh(softplus(x))``, element-wise."""
    smooth = F.softplus(x)
    return x * torch.tanh(smooth)

class Mish(nn.Module):
    """Parameter-free Mish activation layer for use inside ``nn.Sequential``."""

    def forward(self, x):
        # x * tanh(softplus(x))
        return x * torch.tanh(F.softplus(x))

class Net(nn.Module):
    """STN + CNN classifier whose channel-attention branches are fused by a 1x1 conv.

    Pipeline: ``stn`` -> three conv / Mish / max-pool / batch-norm stages ->
    channel attention (average- and max-pooled branches concatenated along the
    channel axis) -> ``conv_1`` (1x1) fusion -> ``conv4`` + global average
    pool -> spatial attention -> linear classifier -> ``log_softmax``.

    The localisation regressor is sized for 3x43x43 inputs (``10 * 7 * 7``).
    Relies on the module-level ``nclasses``, ``ChannelAttention``,
    ``SpatialAttention``, ``Mish`` and ``mish_fun``.
    """

    def __init__(self):
        super(Net, self).__init__()

        # CNN layers
        self.conv1 = nn.Conv2d(3, 100, kernel_size=3)
        self.bn1 = nn.BatchNorm2d(100)
        self.conv2 = nn.Conv2d(100, 150, kernel_size=3)
        self.bn2 = nn.BatchNorm2d(150)
        self.conv3 = nn.Conv2d(150, 250, kernel_size=3)
        self.bn3 = nn.BatchNorm2d(250)
        self.conv4 = nn.Conv2d(250, 250, kernel_size=2)
        self.bn4 = nn.BatchNorm2d(250)
        # NOTE: conv_c is not used by forward() (the 1x1 conv_1 replaced it).
        # It is kept so parameter names / state_dict keys stay unchanged.
        self.conv_c = nn.Conv2d(500, 250, kernel_size=3, padding=1)
        self.conv_1 = nn.Conv2d(500, 250, kernel_size=1)
        self.bn_c = nn.BatchNorm2d(250)

        self.conv_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(250, nclasses)
        self.filters = 250
        self.glob = nn.AdaptiveAvgPool2d((1, 1))

        #  attention
        self.ca = ChannelAttention(self.filters)
        self.sa = SpatialAttention()

        # Localisation network of the spatial transformer.
        self.localization = nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=7),
            nn.MaxPool2d(2, stride=2),
            Mish(),
            nn.Conv2d(8, 10, kernel_size=5),
            nn.MaxPool2d(2, stride=2),
            Mish()
        )

        # Regressor for the 3 * 2 affine matrix.
        # The sub-network (fully connected or convolutional, plus a final
        # regression layer) produces the transformation parameters theta;
        # for a 2D affine transform theta is a 6-dimensional (2x3) output.
        self.fc_loc = nn.Sequential(
            nn.Linear(10 * 7 * 7, 32),  # 490 -> 32
            Mish(),
            nn.Linear(32, 3 * 2)  # 32 -> 6
        )

        # Initialize the weights/bias with identity transformation
        self.fc_loc[2].weight.data.zero_()
        self.fc_loc[2].bias.data.copy_(torch.tensor([1, 0, 0, 0, 1, 0], dtype=torch.float))

    def stn(self, x):
        """Spatial transformer forward pass.

        The transformer has three parts: the localisation network, the grid
        generator (``F.affine_grid``) and the sampler (``F.grid_sample``).
        Returns a tensor with the same size as ``x``.
        """
        xs = self.localization(x)
        # Per-sample flatten so a wrongly sized input fails in fc_loc with a
        # clear shape error instead of being regrouped by view(-1, ...).
        xs = xs.view(xs.size(0), -1)
        theta = self.fc_loc(xs)
        theta = theta.view(-1, 2, 3)
        grid = F.affine_grid(theta, x.size())
        x = F.grid_sample(x, grid)
        return x

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, nclasses)``."""
        # transform the input
        x = self.stn(x)

        # Perform forward pass
        x = self.bn1(F.max_pool2d(mish_fun(self.conv1(x)), 2))
        x = self.conv_drop(x)
        x = self.bn2(F.max_pool2d(mish_fun(self.conv2(x)), 2))
        x = self.conv_drop(x)
        x = self.bn3(F.max_pool2d(mish_fun(self.conv3(x)), 2))
        x = self.conv_drop(x)

        # Channel attention: re-weight the features with each branch and
        # stack the two results along the channel axis (250 + 250 = 500).
        avg_pool, max_pool = self.ca(x)
        avg_pool_out = x * avg_pool
        max_pool_out = x * max_pool
        attended = torch.cat((avg_pool_out, max_pool_out), 1)

        # 1x1 fusion back to 250 channels (debug shape print removed).
        x = self.bn_c(mish_fun(self.conv_1(attended)))
        x = self.conv_drop(x)

        x = self.bn4(self.glob(mish_fun(self.conv4(x))))
        x = self.conv_drop(x)

        # Spatial attention on the pooled (batch, 250, 1, 1) features.
        spatial_gate = self.sa(x)
        x = x * spatial_gate
        x = x.view(-1, 250)
        x = self.fc1(x)
        return F.log_softmax(x, dim=1)

# Instantiate the network and ask tensorwatch for its per-layer statistics.
# The list is the input size handed to model_stats; the result table reports
# the first layer's input as "3 43 43", so the leading 16 is presumably the
# batch size -- TODO confirm.
model = Net()
import tensorwatch as tw
tw.model_stats(model, [16, 3, 43, 43])

[MAdd]: Mish is not supported!
[Flops]: Mish is not supported!
[Memory]: Mish is not supported!
[MAdd]: Mish is not supported!
[Flops]: Mish is not supported!
[Memory]: Mish is not supported!
[MAdd]: Mish is not supported!
[Flops]: Mish is not supported!
[Memory]: Mish is not supported!
[MAdd]: Dropout2d is not supported!
[Flops]: Dropout2d is not supported!
[Memory]: Dropout2d is not supported!
[MAdd]: Dropout2d is not supported!
[Flops]: Dropout2d is not supported!
[Memory]: Dropout2d is not supported!
[MAdd]: Dropout2d is not supported!
[Flops]: Dropout2d is not supported!
[Memory]: Dropout2d is not supported!
[MAdd]: Dropout2d is not supported!
[Flops]: Dropout2d is not supported!
[Memory]: Dropout2d is not supported!
[MAdd]: Dropout2d is not supported!
[Flops]: Dropout2d is not supported!
[Memory]: Dropout2d is not supported!
[MAdd]: Dropout2d is not supported!
[Flops]: Dropout2d is not supported!
[Memory]: Dropout2d is not supported!
[MAdd]: Dropout2d is not supported!
[Flops]: D

Unnamed: 0,module name,input shape,output shape,params,memory(MB),MAdd,Flops,MemRead(B),MemWrite(B),duration[%],MemR+W(B)
0,conv1,3 43 43,100 41 41,2800.0,0.64,9077400.0,4706800.0,33388.0,672400.0,2.01%,705788.0
1,bn1,100 20 20,100 20 20,200.0,0.15,160000.0,80000.0,160800.0,160000.0,1.72%,320800.0
2,conv2,100 20 20,150 18 18,135150.0,0.19,87480000.0,43788600.0,700600.0,194400.0,3.87%,895000.0
3,bn2,150 9 9,150 9 9,300.0,0.05,48600.0,24300.0,49800.0,48600.0,1.61%,98400.0
4,conv3,150 9 9,250 7 7,337750.0,0.05,33075000.0,16549750.0,1399600.0,49000.0,3.43%,1448600.0
5,bn3,250 3 3,250 3 3,500.0,0.01,9000.0,4500.0,11000.0,9000.0,1.60%,20000.0
6,conv4,250 3 3,250 2 2,250250.0,0.0,2000000.0,1001000.0,1010000.0,4000.0,3.83%,1014000.0
7,bn4,250 1 1,250 1 1,500.0,0.0,1000.0,500.0,3000.0,1000.0,3.29%,4000.0
8,conv_c,500 3 3,250 3 3,1125250.0,0.01,20250000.0,10127250.0,4519000.0,9000.0,4.10%,4528000.0
9,bn_c,250 3 3,250 3 3,500.0,0.01,9000.0,4500.0,11000.0,9000.0,1.73%,20000.0


In [3]:
# Stand-alone copy of the spatial-transformer localisation network
# (ReLU variant), built only so it can be profiled in isolation.
localization = nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=7),
            nn.MaxPool2d(2, stride=2),
            nn.ReLU(True),
            nn.Conv2d(8, 10, kernel_size=5),
            nn.MaxPool2d(2, stride=2),
            nn.ReLU(True)
            )

# Per-layer statistics for the same input size used for the full models.
# Relies on `nn` and `tw` having been imported by an earlier cell.
tw.model_stats(localization, [16, 3, 43, 43])

Unnamed: 0,module name,input shape,output shape,params,memory(MB),MAdd,Flops,MemRead(B),MemWrite(B),duration[%],MemR+W(B)
0,0.0,3 43 43,8 37 37,1184.0,0.04,3219888.0,1620896.0,26924.0,43808.0,21.62%,70732.0
1,1.0,8 37 37,8 18 18,0.0,0.01,7776.0,10952.0,43808.0,10368.0,22.42%,54176.0
2,2.0,8 18 18,8 18 18,0.0,0.01,2592.0,2592.0,10368.0,10368.0,8.76%,20736.0
3,3.0,8 18 18,10 14 14,2010.0,0.01,784000.0,393960.0,18408.0,7840.0,15.31%,26248.0
4,4.0,10 14 14,10 7 7,0.0,0.0,1470.0,1960.0,7840.0,1960.0,18.03%,9800.0
5,5.0,10 7 7,10 7 7,0.0,0.0,490.0,490.0,1960.0,1960.0,13.85%,3920.0
total,,,,3194.0,0.07,4016216.0,2030850.0,1960.0,1960.0,100.00%,185612.0


In [8]:
import torchvision
import tensorwatch as tw
from cnn_finetune import make_model

# Baseline for comparison: a pretrained ResNet-18 built by cnn_finetune with a
# 43-way classifier, profiled with the same input size as the custom nets.
# Alternatives that were tried and left disabled:
# model = torchvision.models.resnet18(pretrained=True)
# model = make_model('resnet50', num_classes=2, pretrained=True, input_size=(320, 320), classifier_factory=make_classifier)
# NOTE(review): `make_classifier` is not defined in the visible cells.
model = make_model('resnet18', num_classes=43, pretrained=True)
tw.model_stats(model, [16, 3, 43, 43])

[MAdd]: AdaptiveAvgPool2d is not supported!
[Flops]: AdaptiveAvgPool2d is not supported!
[Memory]: AdaptiveAvgPool2d is not supported!
[MAdd]: AdaptiveAvgPool2d is not supported!
[Flops]: AdaptiveAvgPool2d is not supported!
[Memory]: AdaptiveAvgPool2d is not supported!


Unnamed: 0,module name,input shape,output shape,params,memory(MB),MAdd,Flops,MemRead(B),MemWrite(B),duration[%],MemR+W(B)
0,_features.0,3 43 43,64 22 22,9408.0,0.12,9075968.0,4553472.0,59820.0,123904.0,1.98%,183724.0
1,_features.1,64 22 22,64 22 22,128.0,0.12,123904.0,61952.0,124416.0,123904.0,1.47%,248320.0
2,_features.2,64 22 22,64 22 22,0.0,0.12,30976.0,30976.0,123904.0,123904.0,1.26%,247808.0
3,_features.3,64 22 22,64 11 11,0.0,0.03,61952.0,30976.0,123904.0,30976.0,1.96%,154880.0
4,_features.4.0.conv1,64 11 11,64 11 11,36864.0,0.03,8913344.0,4460544.0,178432.0,30976.0,1.75%,209408.0
5,_features.4.0.bn1,64 11 11,64 11 11,128.0,0.03,30976.0,15488.0,31488.0,30976.0,0.86%,62464.0
6,_features.4.0.relu,64 11 11,64 11 11,0.0,0.03,7744.0,7744.0,30976.0,30976.0,1.17%,61952.0
7,_features.4.0.conv2,64 11 11,64 11 11,36864.0,0.03,8913344.0,4460544.0,178432.0,30976.0,1.91%,209408.0
8,_features.4.0.bn2,64 11 11,64 11 11,128.0,0.03,30976.0,15488.0,31488.0,30976.0,0.82%,62464.0
9,_features.4.1.conv1,64 11 11,64 11 11,36864.0,0.03,8913344.0,4460544.0,178432.0,30976.0,1.76%,209408.0


In [2]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F

class ChannelAttention(nn.Module):
    """Channel attention gate (sum of average- and max-pooled branches).

    Both globally pooled descriptors pass through the same 1x1-conv bottleneck
    (``in_planes -> in_planes // ratio -> in_planes``, no bias); the two
    results are added and squashed with a sigmoid.

    Args:
        in_planes: number of input (and output) channels.
        ratio: channel reduction factor of the bottleneck (default 16).

    ``forward`` returns per-channel weights of shape ``(batch, in_planes, 1, 1)``.
    """

    def __init__(self, in_planes, ratio=16):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)

        # Honour `ratio`; it used to be ignored in favour of a hard-coded 16.
        hidden_planes = in_planes // ratio
        self.fc1   = nn.Conv2d(in_planes, hidden_planes, 1, bias=False)
        self.relu1 = nn.ReLU()
        self.fc2   = nn.Conv2d(hidden_planes, in_planes, 1, bias=False)

        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x))))
        max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x))))
        out = avg_out + max_out
        return self.sigmoid(out)

# ======================= spatial ==============================
class SpatialAttention(nn.Module):
    """Spatial attention gate.

    The channel-wise mean and max of the input form a 2-channel map that a
    bias-free, size-preserving convolution reduces to a single channel; a
    sigmoid turns it into per-pixel weights.

    Args:
        kernel_size: 3 or 7; any other value fails the assertion.

    ``forward`` maps ``(batch, C, H, W)`` to ``(batch, 1, H, W)``.
    """

    def __init__(self, kernel_size=7):
        super(SpatialAttention, self).__init__()

        assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
        # 7 -> 3 and 3 -> 1: padding that leaves H x W unchanged.
        same_padding = {7: 3, 3: 1}[kernel_size]

        self.conv1 = nn.Conv2d(2, 1, kernel_size, padding=same_padding, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # keepdim=True keeps both statistics 4-D so they can be concatenated.
        mean_map = x.mean(dim=1, keepdim=True)
        max_map = x.max(dim=1, keepdim=True)[0]
        descriptor = torch.cat((mean_map, max_map), dim=1)
        return self.sigmoid(self.conv1(descriptor))

nclasses = 43  # number of output classes (GTSRB has 43 classes)

def mish_fun(x):
    """Functional Mish activation.

    Computes ``x * tanh(softplus(x))`` element-wise, where
    ``softplus(x) = log(1 + exp(x))``.
    """
    softplus_x = F.softplus(x)
    return x * torch.tanh(softplus_x)

class Mish(nn.Module):
    """Layer wrapper around the Mish activation; it has no parameters."""

    def forward(self, x):
        activated = torch.tanh(F.softplus(x))
        return x * activated

class Net(nn.Module):
    """STN + CNN classifier with channel and spatial attention (Mish activations).

    Pipeline: ``stn`` -> three conv / Mish / max-pool / batch-norm / dropout
    stages -> channel-attention gate -> ``conv4`` + global average pool ->
    spatial-attention gate -> linear classifier -> ``log_softmax``.

    The localisation regressor is sized for 3x43x43 inputs (``10 * 7 * 7``).
    Relies on the module-level ``nclasses``, ``ChannelAttention``,
    ``SpatialAttention``, ``Mish`` and ``mish_fun``.
    """

    def __init__(self):
        super(Net, self).__init__()

        # Convolutional trunk.
        self.conv1 = nn.Conv2d(3, 100, kernel_size=3)
        self.bn1 = nn.BatchNorm2d(100)
        self.conv2 = nn.Conv2d(100, 150, kernel_size=3)
        self.bn2 = nn.BatchNorm2d(150)
        self.conv3 = nn.Conv2d(150, 250, kernel_size=3)
        self.bn3 = nn.BatchNorm2d(250)
        self.conv4 = nn.Conv2d(250, 250, kernel_size=2)
        self.bn4 = nn.BatchNorm2d(250)
        self.conv_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(250, nclasses)
        self.filters = 250
        self.glob = nn.AdaptiveAvgPool2d((1, 1))

        # Attention gates.
        self.ca = ChannelAttention(self.filters)
        self.sa = SpatialAttention()

        # Localisation network of the spatial transformer.
        self.localization = nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=7),
            nn.MaxPool2d(2, stride=2),
            Mish(),
            nn.Conv2d(8, 10, kernel_size=5),
            nn.MaxPool2d(2, stride=2),
            Mish()
        )

        # Regressor producing the six entries of the 2x3 affine matrix theta.
        self.fc_loc = nn.Sequential(
            nn.Linear(10 * 7 * 7, 32),
            Mish(),
            nn.Linear(32, 3 * 2)
        )

        # Start from the identity transform: zero weights, identity bias.
        regressor_head = self.fc_loc[2]
        regressor_head.weight.data.zero_()
        regressor_head.bias.data.copy_(torch.tensor([1, 0, 0, 0, 1, 0], dtype=torch.float))

    def stn(self, x):
        """Spatial transformer: localisation net -> affine grid -> sampler.

        Returns a re-sampled tensor with the same size as ``x``.
        """
        features = self.localization(x).view(-1, 10 * 7 * 7)
        theta = self.fc_loc(features).view(-1, 2, 3)
        sampling_grid = F.affine_grid(theta, x.size())
        return F.grid_sample(x, sampling_grid)

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, nclasses)``."""
        x = self.stn(x)

        # Three identical stages: conv -> Mish -> 2x2 max-pool -> BN -> dropout.
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        for conv, bn in stages:
            x = self.conv_drop(bn(F.max_pool2d(mish_fun(conv(x)), 2)))

        # Channel attention gate.
        x = x * self.ca(x)

        # Final conv, squeezed to 1x1 by global average pooling.
        x = self.conv_drop(self.bn4(self.glob(mish_fun(self.conv4(x)))))

        # Spatial attention gate.
        x = x * self.sa(x)

        logits = self.fc1(x.view(-1, 250))
        return F.log_softmax(logits, dim=1)

# Instantiate the network and ask tensorwatch for its per-layer statistics.
# The list is the input size handed to model_stats; the result table reports
# the first layer's input as "3 43 43", so the leading 16 is presumably the
# batch size -- TODO confirm.
model = Net()
import tensorwatch as tw

tw.model_stats(model, [16, 3, 43, 43])

[MAdd]: Mish is not supported!
[Flops]: Mish is not supported!
[Memory]: Mish is not supported!
[MAdd]: Mish is not supported!
[Flops]: Mish is not supported!
[Memory]: Mish is not supported!
[MAdd]: Mish is not supported!
[Flops]: Mish is not supported!
[Memory]: Mish is not supported!
[MAdd]: Dropout2d is not supported!
[Flops]: Dropout2d is not supported!
[Memory]: Dropout2d is not supported!
[MAdd]: Dropout2d is not supported!
[Flops]: Dropout2d is not supported!
[Memory]: Dropout2d is not supported!
[MAdd]: Dropout2d is not supported!
[Flops]: Dropout2d is not supported!
[Memory]: Dropout2d is not supported!
[MAdd]: AdaptiveAvgPool2d is not supported!
[Flops]: AdaptiveAvgPool2d is not supported!
[Memory]: AdaptiveAvgPool2d is not supported!
[MAdd]: AdaptiveMaxPool2d is not supported!
[Flops]: AdaptiveMaxPool2d is not supported!
[Memory]: AdaptiveMaxPool2d is not supported!
[MAdd]: Sigmoid is not supported!
[Flops]: Sigmoid is not supported!
[Memory]: Sigmoid is not supported!
[MAd

Unnamed: 0,module name,input shape,output shape,params,memory(MB),MAdd,Flops,MemRead(B),MemWrite(B),duration[%],MemR+W(B)
0,conv1,3 43 43,100 41 41,2800.0,0.64,9077400.0,4706800.0,33388.0,672400.0,2.49%,705788.0
1,bn1,100 20 20,100 20 20,200.0,0.15,160000.0,80000.0,160800.0,160000.0,1.44%,320800.0
2,conv2,100 20 20,150 18 18,135150.0,0.19,87480000.0,43788600.0,700600.0,194400.0,2.90%,895000.0
3,bn2,150 9 9,150 9 9,300.0,0.05,48600.0,24300.0,49800.0,48600.0,0.14%,98400.0
4,conv3,150 9 9,250 7 7,337750.0,0.05,33075000.0,16549750.0,1399600.0,49000.0,2.16%,1448600.0
5,bn3,250 3 3,250 3 3,500.0,0.01,9000.0,4500.0,11000.0,9000.0,0.13%,20000.0
6,conv4,250 3 3,250 2 2,250250.0,0.0,2000000.0,1001000.0,1010000.0,4000.0,1.12%,1014000.0
7,bn4,250 1 1,250 1 1,500.0,0.0,1000.0,500.0,3000.0,1000.0,0.68%,4000.0
8,conv_drop,250 1 1,250 1 1,0.0,0.0,0.0,0.0,0.0,0.0,0.03%,0.0
9,fc1,250,43,10793.0,0.0,21457.0,10750.0,44172.0,172.0,0.13%,44344.0


In [None]:
class Net(nn.Module):
    """Baseline CNN classifier with a spatial transformer front-end (no attention).

    Pipeline: ``stn`` -> three conv / leaky-ReLU / max-pool / batch-norm
    stages with 2D dropout -> two fully connected layers -> ``log_softmax``.

    The hard-coded sizes ``10 * 7 * 7`` and ``2250`` match 3x43x43 inputs.
    The number of outputs is read from the module-level ``nclasses``.
    """

    def __init__(self):
        super(Net, self).__init__()

        # Convolutional trunk. The draft had a dangling `self.conv =`
        # (a SyntaxError) and never created the layers forward() uses.
        # They are restored with the sizes of the first Net in this notebook,
        # which yield the 250 * 3 * 3 = 2250 features fc1 expects.
        # NOTE(review): confirm these are the intended layer sizes.
        self.conv1 = nn.Conv2d(3, 100, kernel_size=5)
        self.bn1 = nn.BatchNorm2d(100)
        self.conv2 = nn.Conv2d(100, 150, kernel_size=3)
        self.bn2 = nn.BatchNorm2d(150)
        self.conv3 = nn.Conv2d(150, 250, kernel_size=3)
        self.bn3 = nn.BatchNorm2d(250)
        self.conv_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(2250, 350)
        self.fc2 = nn.Linear(350, nclasses)

        # Localisation network of the spatial transformer.
        self.localization = nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=7),
            nn.MaxPool2d(2, stride=2),
            nn.ReLU(True),
            nn.Conv2d(8, 10, kernel_size=5),
            nn.MaxPool2d(2, stride=2),
            nn.ReLU(True)
            )

        # Regressor for the 3 * 2 affine matrix.
        # The sub-network (fully connected or convolutional, plus a final
        # regression layer) produces the transformation parameters theta;
        # for a 2D affine transform theta is a 6-dimensional (2x3) output.
        self.fc_loc = nn.Sequential(
            nn.Linear(10 * 7 * 7, 32),                # 490 -> 32
            nn.ReLU(True),
            nn.Linear(32, 3 * 2)                      # 32 -> 6
            )

        # Initialize the weights/bias with identity transformation
        self.fc_loc[2].weight.data.zero_()
        self.fc_loc[2].bias.data.copy_(torch.tensor([1, 0, 0, 0, 1, 0], dtype=torch.float))

    def stn(self, x):
        """Spatial transformer forward pass.

        The transformer has three parts: the localisation network, the grid
        generator (``F.affine_grid``) and the sampler (``F.grid_sample``).
        Returns a tensor with the same size as ``x``.
        """
        xs = self.localization(x)
        # Per-sample flatten so a wrongly sized input fails in fc_loc with a
        # clear shape error instead of being regrouped by view(-1, ...).
        xs = xs.view(xs.size(0), -1)
        theta = self.fc_loc(xs)
        theta = theta.view(-1, 2, 3)
        grid = F.affine_grid(theta, x.size())
        x = F.grid_sample(x, grid)
        return x

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, nclasses)``."""
        # transform the input
        x = self.stn(x)

        # Perform forward pass
        x = self.bn1(F.max_pool2d(F.leaky_relu(self.conv1(x)),2))
        x = self.conv_drop(x)
        x = self.bn2(F.max_pool2d(F.leaky_relu(self.conv2(x)),2))
        x = self.conv_drop(x)
        x = self.bn3(F.max_pool2d(F.leaky_relu(self.conv3(x)),2))
        x = self.conv_drop(x)

        # Per-sample flatten; a wrong input size raises in fc1 instead of
        # silently changing the batch dimension.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

In [4]:
import torchvision
import tensorwatch as tw
from cnn_finetune import make_model

def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """Build a bias-free 3x3 convolution whose padding equals its dilation."""
    conv_options = dict(
        kernel_size=3,
        stride=stride,
        padding=dilation,
        groups=groups,
        bias=False,
        dilation=dilation,
    )
    return nn.Conv2d(in_planes, out_planes, **conv_options)


def conv1x1(in_planes, out_planes, stride=1):
    """Build a bias-free 1x1 (pointwise) convolution."""
    pointwise = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
    return pointwise

class BasicBlock(nn.Module):
    """Two-layer residual block (3x3 conv -> BN -> ReLU -> 3x3 conv -> BN, plus shortcut).

    Args:
        inplanes: input channels.
        planes: output channels of both convolutions.
        stride: stride of the first convolution.
        downsample: optional module applied to the input to form the shortcut.
        groups, base_width: must be 1 and 64 respectively (``ValueError`` otherwise).
        dilation: must not exceed 1 (``NotImplementedError`` otherwise).
        norm_layer: normalisation layer factory; defaults to ``nn.BatchNorm2d``.
    """

    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(BasicBlock, self).__init__()
        norm_layer = nn.BatchNorm2d if norm_layer is None else norm_layer
        if not (groups == 1 and base_width == 64):
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")

        # When stride != 1 both conv1 and the downsample path reduce resolution.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # Residual branch.
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Shortcut branch: identity unless a downsample module was supplied.
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

class ResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(ResNet, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        
        # =====================================================
        self.localization = nn.Sequential(
            nn.Conv2d(3, 8, kernel_size=7),
            nn.MaxPool2d(2, stride=2),
            nn.ReLU(True),
            nn.Conv2d(8, 10, kernel_size=5),
            nn.MaxPool2d(2, stride=2),
            nn.ReLU(True)
            )

        # Regressor for the 3 * 2 affine matrix
        # 子网络（全连接或卷积网络，再加上一个回归层）用来生成空间变换的参数θ，θ的形式可以多样，
        # 如需实现2D仿射变换，θ 就是一个6维（2x3）向量的输出
        self.fc_loc = nn.Sequential(
            nn.Linear(10 * 7 * 7, 32),                # 160*32
            nn.ReLU(True),
            nn.Linear(32, 3 * 2)                      # 32*6
            )
   
        # Initialize the weights/bias with identity transformation
        self.fc_loc[2].weight.data.zero_()
        self.fc_loc[2].bias.data.copy_(torch.tensor([1, 0, 0, 0, 1, 0], dtype=torch.float))


        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)
    
    
     # Spatial transformer network forward function
    # 整个空间变换器包含三个部分，本地网络(Localisation Network)、网格生成器(Grid Genator)和采样器(Sampler)
    def stn(self, x):
        xs = self.localization(x)         # torch.Size([1, 10, 4, 4])
        xs = xs.view(-1, 10 * 7 * 7)               # 361, 160
        theta = self.fc_loc(xs)
        theta = theta.view(-1, 2, 3)
        grid = F.affine_grid(theta, x.size())
        x = F.grid_sample(x, grid)
        return x


    def forward(self, x):
        x = self.stn(x)
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x


def _resnet(arch, block, layers, pretrained, progress, **kwargs):
    """Build a ``ResNet`` and optionally initialise it from a downloaded checkpoint.

    Args:
        arch: key used to look up the checkpoint URL in ``model_urls``.
        block: residual block class passed to ``ResNet``.
        layers: number of blocks per stage, passed to ``ResNet``.
        pretrained: if True, download a checkpoint and load what fits.
        progress: forwarded to the downloader.
        **kwargs: forwarded to ``ResNet``.

    Returns:
        The constructed ``ResNet``.
    """
    model = ResNet(block, layers, **kwargs)
    if pretrained:
        # Imported locally: no visible cell brings this helper into scope.
        from torch.hub import load_state_dict_from_url

        # NOTE(review): `model_urls` is not defined in the visible cells; it
        # must map `arch` to a checkpoint URL before pretrained=True can work.
        state_dict = load_state_dict_from_url(model_urls[arch],
                                              progress=progress)

        # This ResNet adds STN parameters (localization / fc_loc), and its
        # `fc` may have a different num_classes. A strict load of a checkpoint
        # that lacks those entries cannot succeed, so only entries whose name
        # and shape match are loaded; the rest keep their fresh initialisation.
        own_state = model.state_dict()
        compatible = {
            name: tensor for name, tensor in state_dict.items()
            if name in own_state and tensor.shape == own_state[name].shape
        }
        model.load_state_dict(compatible, strict=False)
    return model


def resnet18(pretrained=False, progress=True, **kwargs):
    """Build the 18-layer variant: ``BasicBlock`` with two blocks in each of the four stages.

    ``pretrained``, ``progress`` and any extra keyword arguments are passed
    straight through to ``_resnet``.
    """
    blocks_per_stage = [2, 2, 2, 2]
    return _resnet('resnet18', BasicBlock, blocks_per_stage, pretrained, progress, **kwargs)

# Build the STN-augmented ResNet-18 with random weights and profile it.
# NOTE(review): num_classes is left at the ResNet default (1000) here, whereas
# the cnn_finetune baseline above was built with num_classes=43 -- confirm
# whether the two are meant to be comparable.
model = resnet18(pretrained = False)

tw.model_stats(model, [16, 3, 43, 43])

torch.Size([1, 490])
torch.Size([1, 2, 3])
[MAdd]: AdaptiveAvgPool2d is not supported!
[Flops]: AdaptiveAvgPool2d is not supported!
[Memory]: AdaptiveAvgPool2d is not supported!
[MAdd]: AdaptiveAvgPool2d is not supported!
[Flops]: AdaptiveAvgPool2d is not supported!
[Memory]: AdaptiveAvgPool2d is not supported!


Unnamed: 0,module name,input shape,output shape,params,memory(MB),MAdd,Flops,MemRead(B),MemWrite(B),duration[%],MemR+W(B)
0,conv1,3 43 43,64 22 22,9408.0,0.12,9075968.0,4553472.0,59820.0,123904.0,2.62%,183724.0
1,bn1,64 22 22,64 22 22,128.0,0.12,123904.0,61952.0,124416.0,123904.0,1.20%,248320.0
2,relu,64 22 22,64 22 22,0.0,0.12,30976.0,30976.0,123904.0,123904.0,0.71%,247808.0
3,maxpool,64 22 22,64 11 11,0.0,0.03,61952.0,30976.0,123904.0,30976.0,1.84%,154880.0
4,layer1.0.conv1,64 11 11,64 11 11,36864.0,0.03,8913344.0,4460544.0,178432.0,30976.0,2.99%,209408.0
5,layer1.0.bn1,64 11 11,64 11 11,128.0,0.03,30976.0,15488.0,31488.0,30976.0,0.94%,62464.0
6,layer1.0.relu,64 11 11,64 11 11,0.0,0.03,7744.0,7744.0,30976.0,30976.0,0.67%,61952.0
7,layer1.0.conv2,64 11 11,64 11 11,36864.0,0.03,8913344.0,4460544.0,178432.0,30976.0,1.51%,209408.0
8,layer1.0.bn2,64 11 11,64 11 11,128.0,0.03,30976.0,15488.0,31488.0,30976.0,0.96%,62464.0
9,layer1.1.conv1,64 11 11,64 11 11,36864.0,0.03,8913344.0,4460544.0,178432.0,30976.0,1.48%,209408.0
