[View in Colaboratory](https://colab.research.google.com/github/maxmatical/pytorch-projects/blob/master/SENet_v2.ipynb)

In [1]:
# install pytorch 0.4.1 with gpu
# The wheel URL is assembled from two pieces computed below:
#   platform    - interpreter/ABI tag string built from the wheel helpers
#   accelerator - 'cu80' when /opt/bin/nvidia-smi exists, otherwise 'cpu'

from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# '<impl><version>-<abi>' fragment used in the wheel file name
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

# presence of the nvidia-smi binary is used as the "GPU runtime" signal
accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

# NOTE(review): the wheel is fetched over plain http and torchvision is unpinned - confirm this is acceptable
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision


tcmalloc: large alloc 1073750016 bytes == 0x574e8000 @  0x7fbe5fdad1c4 0x46d6a4 0x5fcbcc 0x4c494d 0x54f3c4 0x553aaf 0x54e4c8 0x54f4f6 0x553aaf 0x54efc1 0x54f24d 0x553aaf 0x54efc1 0x54f24d 0x553aaf 0x54efc1 0x54f24d 0x551ee0 0x54e4c8 0x54f4f6 0x553aaf 0x54efc1 0x54f24d 0x551ee0 0x54efc1 0x54f24d 0x551ee0 0x54e4c8 0x54f4f6 0x553aaf 0x54e4c8


In [2]:
# Sanity-check the installation: report the torch version, whether CUDA is
# usable, and the CUDA version torch reports.
import torch
print(torch.__version__)
print(torch.cuda.is_available())
print('Torch', torch.__version__, 'CUDA', torch.version.cuda)
# prints the device object constructed from the string 'cuda:0'
print('Device:', torch.device('cuda:0'))


0.4.1
True
Torch 0.4.1 CUDA 8.0.61
Device: cuda:0


In [3]:
  
# install required packages (torchsummary provides the `summary` helper imported below)
!pip install torchsummary

Collecting torchsummary
  Downloading https://files.pythonhosted.org/packages/2a/61/21b44bb29aedb820fec4716a102e802397f0c21512764a9d98206c17417d/torchsummary-1.4-py3-none-any.whl
Installing collected packages: torchsummary
Successfully installed torchsummary-1.4


In [0]:
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np
import math

import torch.nn as nn
import torch.nn.functional as F

from torchsummary import summary

#import ipdb

In [0]:
# Preprocessing applied to every image: convert to a tensor, then normalise
# each of the three channels with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

In [0]:
# CIFAR-10 training split (downloaded into ./data on first use), served in
# shuffled mini-batches of 64 by two worker processes.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=64, shuffle=True, num_workers=2)

# CIFAR-10 test split with the same preprocessing; kept in dataset order.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=64, shuffle=False, num_workers=2)


In [0]:
class SEBlock(nn.Module):
    """Squeeze-and-Excitation gate: rescales every channel of a feature map.

    The input is average-pooled to one value per channel, passed through a
    two-layer bottleneck (Linear -> LeakyReLU -> Linear -> Sigmoid) and the
    resulting per-channel weights are multiplied back onto the input.

    Args:
        n_channels: number of channels C of the incoming (B, C, H, W) tensor.
        reduction_ratio: divisor applied to ``n_channels`` to obtain the
            bottleneck width. The width is clamped to at least 1.
    """

    def __init__(self, n_channels, reduction_ratio = 16):
        super(SEBlock, self).__init__()
        # Integer division gives 0 when n_channels < reduction_ratio, which
        # would create an empty bottleneck and a gate that ignores its input.
        hidden_channels = max(1, n_channels // reduction_ratio)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc1 = nn.Linear(n_channels, hidden_channels)
        self.relu = nn.LeakyReLU(inplace = True)
        self.fc2 = nn.Linear(hidden_channels, n_channels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input):
        """Return ``input`` scaled channel-wise by the learned gate.

        Args:
            input: tensor of shape (B, C, H, W) with C == n_channels.

        Returns:
            Tensor with the same shape as ``input``.
        """
        b, c, _, _ = input.size()
        # squeeze: (B, C, H, W) -> (B, C)
        out = self.avg_pool(input).view(b, c)
        # excite: per-channel weights in (0, 1)
        out = self.relu(self.fc1(out))
        out = self.sigmoid(self.fc2(out))
        # reshape to (B, C, 1, 1) so the weights broadcast over H and W
        out = out.view(b, c, 1, 1)
        out = out * input

        return out


In [0]:
class preact_se_res_block(nn.Module):
    """Pre-activation residual block with a Squeeze-and-Excitation gate.

    Main path: BN -> ReLU -> 3x3 conv (strided) -> BN -> ReLU -> 3x3 conv -> SE.
    The gated result is added to the shortcut; no activation follows the sum.

    Args:
        in_channels: channels of the incoming tensor.
        out_channels: channels produced by both convolutions.
        stride: stride of the first convolution.
        downsample: optional module applied to the block input to build the
            shortcut; when ``None`` the input itself is the shortcut.
        reduction_ratio: passed through to the SE gate.
    """
    expansion = 1  # output channels = out_channels * expansion

    def __init__(self, in_channels, out_channels, stride = 1, downsample = None, reduction_ratio = 16):
        super().__init__()
        self.bn0 = nn.BatchNorm2d(in_channels)
        # only this first convolution is strided
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               padding=1, bias=False)
        # registered as a submodule, but forward() does not apply it
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.reduction_ratio = reduction_ratio
        self.stride = stride
        self.se = SEBlock(out_channels * self.expansion, reduction_ratio)

    def forward(self, x):
        """Run the block on ``x`` and return main path + shortcut."""
        # first pre-activated convolution (carries the stride)
        features = self.conv1(self.relu(self.bn0(x)))
        # second pre-activated convolution
        features = self.conv2(self.relu(self.bn1(features)))
        # channel re-weighting
        gated = self.se(features)

        # shortcut: the raw input, or its projection when one was supplied
        shortcut = x if self.downsample is None else self.downsample(x)

        gated += shortcut
        return gated
        

                        

In [0]:
# basic block (no preact)
class SE_Res_Block_Basic(nn.Module):
    """Post-activation residual block with a Squeeze-and-Excitation gate.

    Main path: 3x3 conv (strided) -> BN -> ReLU -> 3x3 conv -> BN -> SE.
    The gated result is added to the shortcut and passed through ReLU.

    Args:
        in_channels: channels of the incoming tensor.
        out_channels: channels produced by both convolutions.
        stride: stride of the first convolution (the only strided layer).
        downsample: optional module applied to the block input to build the
            shortcut; when ``None`` the input itself is the shortcut.
        reduction_ratio: passed through to the SE gate.
    """
    expansion = 1  # output channels = out_channels * expansion

    def __init__(self, in_channels, out_channels, stride = 1, downsample = None, reduction_ratio = 16):
        super(SE_Res_Block_Basic, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        # The second convolution must keep stride 1: striding here as well
        # would downsample twice, and the main path would no longer match the
        # shortcut, which is strided only once.
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.se = SEBlock(out_channels, reduction_ratio)
        self.downsample = downsample
        self.stride = stride
        self.reduction_ratio = reduction_ratio

    def forward(self, x):
        """Run the block on ``x`` and return ReLU(main path + shortcut)."""
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        # channel re-weighting before the residual sum
        out = self.se(out)

        # project the input when its shape differs from the main path output
        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

In [0]:
# using pre-activation res blocks
class SE_Res_Block_bottleneck(nn.Module):
    """Pre-activation bottleneck residual block with a Squeeze-and-Excitation gate.

    Main path: BN -> ReLU -> 1x1 conv -> BN -> ReLU -> 3x3 conv (strided)
    -> BN -> ReLU -> 1x1 conv (expands to out_channels * expansion) -> SE.
    The gated result is added to the shortcut; no activation follows the sum.

    Args:
        in_channels: channels of the incoming tensor.
        out_channels: width of the two inner convolutions.
        stride: stride of the 3x3 convolution.
        downsample: optional module applied to the block input to build the
            shortcut; when ``None`` the input itself is the shortcut.
        reduction_ratio: passed through to the SE gate.
    """
    expansion = 4  # output channels = out_channels * expansion

    def __init__(self, in_channels, out_channels, stride = 1, downsample = None, reduction_ratio = 16):
        super().__init__()
        expanded_channels = out_channels * self.expansion

        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        # bn1 runs on the block input, hence in_channels
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, expanded_channels, kernel_size=1, bias=False)
        # bn3 runs before conv3, hence out_channels rather than the expanded width
        self.bn3 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride
        self.reduction_ratio = reduction_ratio
        self.se = SEBlock(expanded_channels, reduction_ratio)

    def forward(self, x):
        """Run the block on ``x`` and return main path + shortcut."""
        # 1x1 reduce
        features = self.conv1(self.relu(self.bn1(x)))
        # 3x3 convolution, carries the stride
        features = self.conv2(self.relu(self.bn2(features)))
        # 1x1 expand
        features = self.conv3(self.relu(self.bn3(features)))
        # channel re-weighting
        gated = self.se(features)

        # shortcut: the raw input, or its projection when shapes differ
        shortcut = x if self.downsample is None else self.downsample(x)

        gated += shortcut
        return gated



In [0]:
# SENet
n_classes = 10

class preact_SENet(nn.Module):
    """Residual classifier assembled from a configurable (SE) block type.

    Layout: 7x7 stem conv -> BN -> ReLU -> four stages of ``block`` with base
    widths 16, 32, 64, 128 (stages 2-4 start with stride 2) -> global average
    pooling -> linear classifier.

    Args:
        block: residual block class. It is instantiated as
            ``block(in_channels, out_channels, stride, downsample, reduction_ratio)``
            and must expose an integer class attribute ``expansion``.
        layers: sequence of four ints, the number of blocks in each stage.
        num_classes: number of outputs of the final linear layer.
        reduction_ratio: SE reduction ratio handed to every block.
    """

    def __init__(self, block, layers, num_classes = n_classes, reduction_ratio = 16): # layer is a list
        super(preact_SENet, self).__init__()
        # initial conv layer similar to resnet, stride 1 keeps the input resolution
        self.conv1 = nn.Conv2d(3, 16, 7, stride=1, padding = 3, bias = False)
        # running channel count fed to the next block; starts at the stem width
        self.in_channels = 16
        self.batchnorm1 = nn.BatchNorm2d(16) # match outchannel for conv1
        self.relu = nn.ReLU(inplace = True)
        ################
        self.layer1 = self.make_layer(block, 16, layers[0], reduction_ratio = reduction_ratio)
        self.layer2 = self.make_layer(block, 32, layers[1], stride = 2, reduction_ratio = reduction_ratio)
        self.layer3 = self.make_layer(block, 64, layers[2], stride = 2, reduction_ratio = reduction_ratio)
        self.layer4 = self.make_layer(block, 128, layers[3], stride = 2, reduction_ratio = reduction_ratio)
        ################
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        # registered but not used by forward()
        self.maxpool = nn.AdaptiveMaxPool2d(1)
        # in_features = out_channel from last layer * expansion
        self.linear = nn.Linear(128*block.expansion, num_classes)
        self.reduction_ratio = reduction_ratio

    #######################
    # make layers
    #######################

    def make_layer(self, block, out_channels, blocks, reduction_ratio, stride=1):
        """Build one stage: ``blocks`` residual blocks chained in a Sequential.

        Args:
            block: residual block class (see the class docstring).
            out_channels: base width of the stage; the stage emits
                ``out_channels * block.expansion`` channels.
            blocks: number of residual blocks in the stage.
            reduction_ratio: SE reduction ratio given to every block.
            stride: stride of the first block; later blocks use the block's
                default stride.

        Returns:
            ``nn.Sequential`` holding the stage. ``self.in_channels`` is
            updated to the stage's output channel count as a side effect.
        """
        stage_channels = out_channels * block.expansion
        downsample = None

        # The first block needs a projection shortcut whenever it changes the
        # spatial size (stride) or the channel count.
        if (stride != 1) or (self.in_channels != stage_channels):
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, stage_channels, kernel_size=1, stride = stride, bias = False),
                nn.BatchNorm2d(stage_channels))

        layers = [block(self.in_channels, out_channels, stride, downsample, reduction_ratio)]
        self.in_channels = stage_channels

        # The remaining blocks keep shape, so they need no projection. They
        # must still receive the requested reduction ratio; omitting it would
        # silently fall back to the block's default.
        for _ in range(1, blocks):
            layers.append(block(self.in_channels, out_channels, reduction_ratio = reduction_ratio))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Map a batch of 3-channel images to ``num_classes`` scores per image."""
        # stem
        out = self.conv1(x)
        out = self.batchnorm1(out)
        out = self.relu(out)
        # residual stages
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        # adaptive avg pooling to get (-1, out_channel(last layer), 1, 1)
        out = self.avg_pool(out)
        out = out.view(out.size(0), -1) # flatten
        out = self.linear(out) # output layer

        return out





In [16]:
# Build the two SE-ResNet variants, two blocks per stage.
net2 = preact_SENet(preact_se_res_block, layers = [2,2,2,2])
net3 = preact_SENet(SE_Res_Block_bottleneck, layers = [2,2,2,2])

# enable GPU
use_cuda = True

if use_cuda and torch.cuda.is_available():
    net2.cuda()
    net3.cuda()

# The weight-init, optimizer, scheduler and training cells further down all
# operate on a variable called `net`, so it has to be bound before they run.
# NOTE(review): choosing net2 here is an assumption - point `net` at net3 to
# train the bottleneck variant instead.
net = net2

# check if models are on cuda
print(next(net2.parameters()).is_cuda)
print(next(net3.parameters()).is_cuda)

True
True


In [17]:
# layer-by-layer summary of net2 for an input of size (3, 32, 32)
summary(net2, (3, 32, 32))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 32, 32]           2,352
       BatchNorm2d-2           [-1, 16, 32, 32]              32
              ReLU-3           [-1, 16, 32, 32]               0
       BatchNorm2d-4           [-1, 16, 32, 32]              32
              ReLU-5           [-1, 16, 32, 32]               0
            Conv2d-6           [-1, 16, 32, 32]           2,304
       BatchNorm2d-7           [-1, 16, 32, 32]              32
              ReLU-8           [-1, 16, 32, 32]               0
            Conv2d-9           [-1, 16, 32, 32]           2,304
AdaptiveAvgPool2d-10             [-1, 16, 1, 1]               0
           Linear-11                    [-1, 1]              17
        LeakyReLU-12                    [-1, 1]               0
           Linear-13                   [-1, 16]              32
          Sigmoid-14                   

In [18]:
# layer-by-layer summary of net3 (the bottleneck-block model) for an input of size (3, 32, 32)
summary(net3, (3, 32, 32))


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 32, 32]           2,352
       BatchNorm2d-2           [-1, 16, 32, 32]              32
              ReLU-3           [-1, 16, 32, 32]               0
       BatchNorm2d-4           [-1, 16, 32, 32]              32
              ReLU-5           [-1, 16, 32, 32]               0
            Conv2d-6           [-1, 16, 32, 32]             256
       BatchNorm2d-7           [-1, 16, 32, 32]              32
              ReLU-8           [-1, 16, 32, 32]               0
            Conv2d-9           [-1, 16, 32, 32]           2,304
      BatchNorm2d-10           [-1, 16, 32, 32]              32
             ReLU-11           [-1, 16, 32, 32]               0
           Conv2d-12           [-1, 64, 32, 32]           1,024
AdaptiveAvgPool2d-13             [-1, 64, 1, 1]               0
           Linear-14                   

In [0]:
# weight initialization
# new init weight        
def init_weight(m):
    """Initialise one module in place; meant to be passed to ``Module.apply``.

    * ``nn.Conv2d`` / ``nn.Linear``: weights drawn with He (Kaiming) normal
      initialisation; biases are left untouched.
    * ``nn.BatchNorm2d``: scale set to 1 and shift set to 0, so the layer
      starts out as a plain normalisation.
    * any other module: left untouched.

    Args:
        m: the module to initialise.
    """
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(m.weight) #he initialize, can use xavier instead
    elif isinstance(m, nn.BatchNorm2d):
        # weight/bias are None when the layer was built with affine=False
        if m.weight is not None:
            nn.init.constant_(m.weight, 1)
        if m.bias is not None:
            # a shift of 0 (not 1) keeps freshly normalised activations centred
            nn.init.constant_(m.bias, 0)

In [0]:
# apply initializers
# `net` must already be bound to the model that is going to be trained;
# Module.apply calls init_weight on every submodule of it.
net.apply(init_weight)

In [0]:
# define loss and optimizer
import torch.optim as optim

learning_rate = 0.01

# classification loss on the raw network outputs
criterion = nn.CrossEntropyLoss()

# SGD with Nesterov momentum and weight decay over every parameter of `net`
optimizer = optim.SGD(
    net.parameters(),
    lr=learning_rate,
    momentum=0.9,
    nesterov=True,
    weight_decay=0.01,
)


In [0]:
# LR scheduler
from torch.optim.lr_scheduler import _LRScheduler

class CosineAnnealingLR_with_Restart(_LRScheduler):
    r"""Set the learning rate of each parameter group using a cosine annealing
    schedule, where :math:`\eta_{max}` is set to the initial lr and
    :math:`T_{cur}` is the number of epochs since the last restart in SGDR:

    .. math::

        \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})(1 +
        \cos(\frac{T_{cur}}{T_{max}}\pi))

    When last_epoch=-1, sets initial lr as lr.

    It has been proposed in
    `SGDR: Stochastic Gradient Descent with Warm Restarts`_. The original pytorch
    implementation only implements the cosine annealing part of SGDR,
    I added my own implementation of the restarts part.

    A cycle of length ``Te`` covers ``T_cur = 0 .. Te - 1``. The ``step()``
    call that would move ``T_cur`` to ``Te`` starts the next cycle instead:
    ``T_cur`` goes back to 0, the learning rate returns to its initial value
    and the cycle length is multiplied by ``T_mult``.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        T_max (int): Maximum number of iterations. (LENGTH OF 1 CYCLE)
        T_mult (float): Increase T_max by a factor of T_mult
        eta_min (float): Minimum learning rate. Default: 0.
        last_epoch (int): The index of last epoch. Default: -1.
        model (pytorch model): The model to save.
        out_dir (str): Directory to save snapshots
        take_snapshot (bool): Whether to save snapshots at every restart

    Attributes:
        Te: length of the current cycle in epochs.
        current_epoch: epochs elapsed since the last restart (``T_cur``).
        lr_history: list of the learning-rate lists returned by ``get_lr``.

    .. _SGDR\: Stochastic Gradient Descent with Warm Restarts:
        https://arxiv.org/abs/1608.03983
    """

    def __init__(self, optimizer, T_max, T_mult, model, out_dir, take_snapshot, eta_min=0, last_epoch=-1):
        self.T_max = T_max
        self.T_mult = T_mult
        self.Te = self.T_max
        self.eta_min = eta_min
        self.current_epoch = last_epoch

        self.model = model
        self.out_dir = out_dir
        self.take_snapshot = take_snapshot

        self.lr_history = []

        # Must come last: the base constructor calls step() once, which needs
        # every attribute above and moves current_epoch from -1 to 0.
        super(CosineAnnealingLR_with_Restart, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the cosine-annealed rate for each param group and record it
        in ``lr_history``."""
        new_lrs = [self.eta_min + (base_lr - self.eta_min) *
                   (1 + math.cos(math.pi * self.current_epoch / self.Te)) / 2
                   for base_lr in self.base_lrs]

        self.lr_history.append(new_lrs)
        return new_lrs

    def _restart(self):
        """Close the finished cycle: optionally snapshot the model, reset the
        in-cycle counter and lengthen the next cycle."""
        print("restart at epoch {:03d}".format(self.last_epoch + 1))

        if self.take_snapshot:
            torch.save({
                'epoch': self.T_max,
                'state_dict': self.model.state_dict()
            }, self.out_dir + "/" + 'snapshot_e_{:03d}.pth.tar'.format(self.T_max))

        ## reset epochs since the last reset
        self.current_epoch = 0

        ## reset the next goal; never let the cycle length collapse to 0,
        ## get_lr divides by it
        self.Te = max(1, int(self.Te * self.T_mult))
        self.T_max = self.T_max + self.Te

    def step(self, epoch=None):
        """Advance the schedule by one epoch and write the new learning rates
        into the optimizer's param groups.

        Args:
            epoch: value to store in ``last_epoch``; defaults to
                ``last_epoch + 1``.
        """
        if epoch is None:
            epoch = self.last_epoch + 1
        self.last_epoch = epoch
        self.current_epoch += 1

        # Handle the restart BEFORE computing the rate. Doing it afterwards
        # would run the restart epoch at eta_min (0 by default) and the rate
        # would never come back to its maximum.
        if self.current_epoch >= self.Te:
            self._restart()

        new_lrs = self.get_lr()
        for param_group, lr in zip(self.optimizer.param_groups, new_lrs):
            param_group['lr'] = lr
        # newer PyTorch releases read this attribute in get_last_lr()
        self._last_lr = list(new_lrs)

In [0]:
# T_max  = number of epochs in the first cycle, i.e. before the learning rate restarts
# T_mult = factor by which the cycle length grows after every restart
#
# schedules to try:
#   1st training cycle: T_max = 3, T_mult = 1 for 3 cycles (9 epochs)
#   2nd training cycle: T_max = 3, T_mult = 2 for 3 cycles (21 epochs)

scheduler = CosineAnnealingLR_with_Restart(
    optimizer,
    T_max=2,
    T_mult=2,
    model=net,
    out_dir='blank',      # only read when take_snapshot is True
    take_snapshot=False,
)

In [0]:
# modified model training to keep track of train/val loss
n_epochs = 6

# DataLoader batches arrive on the CPU; they have to be moved to wherever the
# model's parameters live, otherwise a CUDA model is fed CPU tensors.
device = next(net.parameters()).device

for epoch in range(n_epochs):
    # ---- training pass -------------------------------------------------
    net.train()  # BatchNorm uses batch statistics and updates running stats
    running_loss = 0.0       # loss accumulated since the last progress print
    total_train_loss = 0.0   # loss accumulated over the whole epoch
    for i, train_data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = train_data
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print loss per n minibatches
        batch_loss = loss.item()
        running_loss += batch_loss
        total_train_loss += batch_loss
        if i % 500 == 499:    # print every 500 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 500))
            running_loss = 0.0

    # ---- evaluation pass -----------------------------------------------
    # eval mode makes BatchNorm use its running statistics and stops test
    # batches from altering them
    net.eval()
    correct = 0
    total = 0
    total_test_loss = 0.0
    with torch.no_grad():
        for test_images, test_labels in testloader:
            test_images, test_labels = test_images.to(device), test_labels.to(device)
            test_outputs = net(test_images)
            test_loss = criterion(test_outputs, test_labels)
            total_test_loss += test_loss.item()
            # index of the highest score = predicted class
            _, predicted = torch.max(test_outputs, 1)
            total += test_labels.size(0)
            correct += (predicted == test_labels).sum().item()

    # for printing average loss every epoch
    print("===> Epoch {} Complete: Train Avg. Loss: {:.4f}".format(epoch+1, total_train_loss / len(trainloader)))
    print("===> Epoch {} Complete: Test Avg. Loss: {:.4f}".format(epoch+1, total_test_loss / len(testloader)))
    print('Accuracy of the network on the 10000 test images: %d %%' % (100 * correct / total))

    # Constructing the scheduler already applied the learning rate for the
    # first epoch (the base class calls step() once), so advance it only
    # after an epoch has been trained; stepping first would skip that rate.
    scheduler.step()
print('Finished Training')