[View in Colaboratory](https://colab.research.google.com/github/maxmatical/pytorch-projects/blob/master/DenseNet_v3.ipynb)

In [0]:
# Install PyTorch 0.4.1: the CUDA 8.0 build when a GPU runtime is detected, the CPU build otherwise.

from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Wheel platform tag assembled from the interpreter's implementation, version and ABI tags.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

# The presence of /opt/bin/nvidia-smi is used as the signal that a GPU is attached.
accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

# NOTE(review): the wheel is fetched over plain http and torchvision is unpinned — confirm this is acceptable.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision


tcmalloc: large alloc 1073750016 bytes == 0x567e0000 @  0x7f902def41c4 0x46d6a4 0x5fcbcc 0x4c494d 0x54f3c4 0x553aaf 0x54e4c8 0x54f4f6 0x553aaf 0x54efc1 0x54f24d 0x553aaf 0x54efc1 0x54f24d 0x553aaf 0x54efc1 0x54f24d 0x551ee0 0x54e4c8 0x54f4f6 0x553aaf 0x54efc1 0x54f24d 0x551ee0 0x54efc1 0x54f24d 0x551ee0 0x54e4c8 0x54f4f6 0x553aaf 0x54e4c8


In [0]:
import torch
# Sanity-check the installed build: library version, CUDA availability and CUDA toolkit version.
print(torch.__version__)
print(torch.cuda.is_available())
print('Torch', torch.__version__, 'CUDA', torch.version.cuda)
# This only constructs a device handle for printing; nothing is moved to the GPU here.
print('Device:', torch.device('cuda:0'))


0.4.1
True
Torch 0.4.1 CUDA 8.0.61
Device: cuda:0


In [0]:
  
# Install required packages (torchsummary provides the layer-by-layer `summary` printout).
# NOTE(review): the version is not pinned; the captured output shows torchsummary 1.4 being installed.
!pip install torchsummary

Collecting torchsummary
  Downloading https://files.pythonhosted.org/packages/2a/61/21b44bb29aedb820fec4716a102e802397f0c21512764a9d98206c17417d/torchsummary-1.4-py3-none-any.whl
Installing collected packages: torchsummary
Successfully installed torchsummary-1.4


In [0]:
import torch
import torchvision
import torchvision.transforms as transforms
import numpy as np
import math

import torch.nn as nn
import torch.nn.functional as F

from torchsummary import summary

#import ipdb

In [0]:
# Preprocessing shared by the train and test sets: convert each image to a
# tensor, then normalise every RGB channel with mean 0.5 and std 0.5.
_CHANNEL_MEAN = (0.5, 0.5, 0.5)
_CHANNEL_STD = (0.5, 0.5, 0.5)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD),
])

In [0]:
# CIFAR-10 training split: stored under ./data (downloaded if missing),
# served in shuffled mini-batches of 64 by two worker processes.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=64, shuffle=True, num_workers=2)

# CIFAR-10 test split: same preprocessing and batch size, fixed order.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=64, shuffle=False, num_workers=2)


Files already downloaded and verified
Files already downloaded and verified


In [0]:
class DenseBlock(nn.Module):
    def __init__(self, in_channels, out_channels, dropout=0.2, stride = 1):
        super(DenseBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.leaky_relu = nn.LeakyReLU(inplace=True)
        self.conv1 = nn.Conv2d(in_channels, 4*out_channels, 1, stride = 1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(4*out_channels)
        self.conv2 = nn.Conv2d(4*out_channels, out_channels, 3, stride = 1, padding=1, bias = False)
        self.dropout_prob = dropout
        self.stride = stride
        
    def forward(self, input):
        out = self.conv1(self.leaky_relu(self.bn1(input)))
        out = F.dropout(out, p=self.dropout_prob, inplace=False, training = self.training)
        out = self.conv2(self.leaky_relu(self.bn2(out)))
        out = F.dropout(out, p=self.dropout_prob, inplace=False, training = self.training)
        out = torch.cat([out,input],1)
        return out

        

In [0]:
class TransitionBlock(nn.Module):
    """Transition between dense stages: BN -> LeakyReLU -> 1x1 conv -> dropout -> 2x2 average pool.

    The 1x1 convolution maps ``in_channels`` to ``out_channels`` and the
    pooling (kernel 2, stride 2) halves both spatial dimensions.

    Args:
        in_channels: number of channels of the incoming feature map.
        out_channels: number of channels after the 1x1 convolution.
        dropout: drop probability applied after the convolution (only
            active while the module is in training mode).
    """

    def __init__(self, in_channels, out_channels, dropout=0.2):
        super(TransitionBlock, self).__init__()
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.leaky_relu = nn.LeakyReLU(inplace=True)
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1,
                               stride=1, padding=0, bias=False)
        self.dropout_prob = dropout
        self.avgpool = nn.AvgPool2d(kernel_size=2, stride=2)

    def forward(self, input):
        """Return the channel-reduced, spatially down-sampled feature map."""
        activated = self.leaky_relu(self.bn1(input))
        reduced = self.conv1(activated)
        reduced = F.dropout(reduced, p=self.dropout_prob,
                            training=self.training, inplace=False)
        return self.avgpool(reduced)
    

In [0]:
# DenseNet
n_classes = 10 
growth_rate = 12 # growth rate
compression_rate = 0.5 # theta
class DenseNet(nn.Module):
    def __init__(self, block, layers, dropout =0.2, num_classes = n_classes,k=growth_rate, theta = compression_rate): # layer is a list
        super(DenseNet, self).__init__()
        
        # defining initial in_plane
        in_channel = 2*k
        
        #self.layer = self.make_layer(block, in_channel, k, n_layers, dropout)

        self.dropout = dropout
        

        #initial conv layers
        self.conv1 = nn.Conv2d(3, in_channel, 7, padding = 3, stride = 2) #shoudl take 224 to 112
        self.avgpool1 = nn.AvgPool2d(3, padding = 1, stride = 2)
        
        #####################
        # making denseblocks
        #####################
        
        self.layer1 = self.make_layer(block, in_channel, k, layers[0], dropout) #1st argument is num of dense blocks
        in_channel = int(in_channel+layers[0]*k)
        self.trans1 = TransitionBlock(in_channel, int(math.floor(in_channel*theta)), dropout)
        in_channel = int(math.floor(in_channel*theta))
        
        self.layer2 = self.make_layer(block, in_channel, k, layers[1], dropout) #1st argument is num of dense blocks
        in_channel = int(in_channel+layers[1]*k)
        self.trans2 = TransitionBlock(in_channel, int(math.floor(in_channel*theta)), dropout)
        in_channel = int(math.floor(in_channel*theta))

        self.layer3 = self.make_layer(block,in_channel, k, layers[2], dropout) #1st argument is num of dense blocks
        in_channel = int(in_channel+layers[2]*k)
        self.trans3 = TransitionBlock(in_channel, int(math.floor(in_channel*theta)), dropout)
        in_channel = int(math.floor(in_channel*theta))
        
        self.layer4 = self.make_layer(block, in_channel, k, layers[3], dropout) #1st argument is num of dense blocks
        in_channel = int(in_channel+layers[3]*k)
        
        # pooling and classification
        self.bn = nn.BatchNorm2d(in_channel)
        self.leaky_relu = nn.LeakyReLU(inplace=True)
        self.adaptive_avg_pool = nn.AdaptiveAvgPool2d(1)
        self.adaptive_max_pool = nn.AdaptiveMaxPool2d(1)
        self.linear = nn.Linear(in_channel*2, n_classes) 
        
        
    #####################
    # function for making layers
    #####################
    def make_layer(self, block, in_channel, k, n_layers, dropout):
        layers = []
        for i in range(n_layers):
            layers.append(block(in_channel+i*k, k, dropout))
        return nn.Sequential(*layers)

        
    def forward(self, input):
        out = self.avgpool1(self.conv1(input))
        out = self.layer1(out)
        out = self.trans1(out)
        out = self.trans2(self.layer2(out))
        out = self.trans3(self.layer3(out))
        out = self.leaky_relu(self.bn(self.layer4(out)))
        out_a = self.adaptive_avg_pool(out)
        out_a = out_a.view(out_a.size(0), -1) 
        out_b = self.adaptive_max_pool(out)
        out_b = out_b.view(out_b.size(0), -1) 
        
        out = torch.cat([out_a, out_b],1)
        out = self.linear(out) # output layer

        
        return out
    

    


In [0]:
# Build the network with one dense layer in each of the four stages; this instance is the one summarised and trained below.
net = DenseNet(DenseBlock, layers = [1,1,1,1], dropout=0.2)

In [0]:
# tests
# Tiny 3-channel instances of each building block, for inspecting their layer shapes in isolation.
T = TransitionBlock(3,3)
D = DenseBlock(3, 3)



In [0]:

# Layer-by-layer summary for a 3x224x224 input; uncomment one of the alternatives to inspect another module.



#summary(net, (3, 224, 224))

#summary(D, (3, 224, 224))

summary(T, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
       BatchNorm2d-1          [-1, 3, 224, 224]               6
         LeakyReLU-2          [-1, 3, 224, 224]               0
            Conv2d-3          [-1, 3, 224, 224]               9
         AvgPool2d-4          [-1, 3, 112, 112]               0
Total params: 15
Trainable params: 15
Non-trainable params: 0
----------------------------------------------------------------


In [0]:
# Print the per-layer output shapes and parameter counts of the full network for a 3x224x224 input.
summary(net, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 24, 112, 112]           3,552
         AvgPool2d-2           [-1, 24, 56, 56]               0
       BatchNorm2d-3           [-1, 24, 56, 56]              48
         LeakyReLU-4           [-1, 24, 56, 56]               0
            Conv2d-5           [-1, 48, 56, 56]           1,152
       BatchNorm2d-6           [-1, 48, 56, 56]              96
         LeakyReLU-7           [-1, 48, 56, 56]               0
            Conv2d-8           [-1, 12, 56, 56]           5,184
        DenseBlock-9           [-1, 36, 56, 56]               0
      BatchNorm2d-10           [-1, 36, 56, 56]              72
        LeakyReLU-11           [-1, 36, 56, 56]               0
           Conv2d-12           [-1, 18, 56, 56]             648
        AvgPool2d-13           [-1, 18, 28, 28]               0
  TransitionBlock-14           [-1, 18,

In [0]:
# weight initialization
# new init weight        
def init_weight(m):
    """Initialise a single module in place; intended for ``net.apply(init_weight)``.

    * ``nn.Conv2d`` and ``nn.Linear``: He (Kaiming) normal weights; biases
      are left as they are.
    * ``nn.BatchNorm2d`` (exact type match): weight and bias are both set to 1.
    * Any other module is left untouched.
    """
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        # He initialisation; Xavier could be used instead.
        nn.init.kaiming_normal_(m.weight)
        return

    if type(m) == nn.BatchNorm2d:
        nn.init.constant_(m.weight, 1)
        # NOTE(review): a BatchNorm bias of 1 is unusual (0 is the common
        # choice) — confirm this is intentional.
        nn.init.constant_(m.bias, 1)

In [0]:
# Apply init_weight to net and every one of its submodules (nn.Module.apply recurses through children).
net.apply(init_weight)

In [0]:
# Loss function and optimiser: cross-entropy, trained with Nesterov-momentum
# SGD and L2 weight decay. `learning_rate` is the base rate the scheduler
# anneals from.
import torch.optim as optim

learning_rate = 0.01
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    net.parameters(),
    lr=learning_rate,
    momentum=0.9,
    nesterov=True,
    weight_decay=0.01,
)


In [0]:
# LR scheduler
from torch.optim.lr_scheduler import _LRScheduler

class CosineAnnealingLR_with_Restart(_LRScheduler):
    r"""Set the learning rate of each parameter group using a cosine annealing
    schedule, where :math:`\eta_{max}` is set to the initial lr and
    :math:`T_{cur}` is the number of epochs since the last restart in SGDR:

    .. math::

        \eta_t = \eta_{min} + \frac{1}{2}(\eta_{max} - \eta_{min})(1 +
        \cos(\frac{T_{cur}}{T_{max}}\pi))

    When last_epoch=-1, sets initial lr as lr.

    It has been proposed in
    `SGDR: Stochastic Gradient Descent with Warm Restarts`_. The original pytorch
    implementation only implements the cosine annealing part of SGDR,
    I added my own implementation of the restarts part.

    Every call to :meth:`step` advances the within-cycle counter by one and
    recomputes the learning rates. When the counter reaches the current cycle
    length, an optional snapshot of ``model`` is written, the counter is reset
    to 0 and the cycle length is multiplied by ``T_mult``.

    Args:
        optimizer (Optimizer): Wrapped optimizer.
        T_max (int): Maximum number of iterations. (LENGTH OF 1 CYCLE)
        T_mult (float): Increase T_max by a factor of T_mult
        eta_min (float): Minimum learning rate. Default: 0.
        last_epoch (int): The index of last epoch. Default: -1.
        model (pytorch model): The model to save.
        out_dir (str): Directory to save snapshots
        take_snapshot (bool): Whether to save snapshots at every restart

    Attributes:
        Te: length of the current cycle, in calls to ``step``.
        T_max: cumulative step count at which the current cycle ends; it is
            used as the ``epoch`` value and file-name suffix of snapshots.
        current_epoch: steps taken since the last restart.
        lr_history: one entry (a list with one lr per parameter group) for
            every call to ``get_lr``.

    .. _SGDR\: Stochastic Gradient Descent with Warm Restarts:
        https://arxiv.org/abs/1608.03983
    """
    # The docstring is a raw string: it contains LaTeX, and in a normal string
    # "\f" (as in "\frac") is a form-feed character while "\e", "\c" and "\:"
    # are invalid escape sequences.

    def __init__(self, optimizer, T_max, T_mult, model, out_dir, take_snapshot, eta_min=0, last_epoch=-1):
        self.T_max = T_max
        self.T_mult = T_mult
        self.Te = self.T_max
        self.eta_min = eta_min
        self.current_epoch = last_epoch

        self.model = model
        self.out_dir = out_dir
        self.take_snapshot = take_snapshot

        self.lr_history = []

        # All attributes read by step()/get_lr() must exist before this call,
        # because the base-class constructor itself triggers a step.
        super(CosineAnnealingLR_with_Restart, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return the annealed lr of every parameter group and record it in ``lr_history``."""
        new_lrs = [self.eta_min + (base_lr - self.eta_min) *
                (1 + math.cos(math.pi * self.current_epoch / self.Te)) / 2

                for base_lr in self.base_lrs]

        self.lr_history.append(new_lrs)
        return new_lrs

    def step(self, epoch=None):
        """Advance one epoch, update the optimizer's learning rates and restart when a cycle ends.

        Args:
            epoch: value to store as ``last_epoch``; defaults to
                ``last_epoch + 1``. The within-cycle counter is always
                advanced by exactly one, whatever ``epoch`` is.
        """
        if epoch is None:

            epoch = self.last_epoch + 1
        self.last_epoch = epoch
        self.current_epoch += 1

        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

        ## restart
        # NOTE(review): the counter is reset to 0 here but incremented again
        # before the next lr is computed, so after the constructor's own step
        # the schedule is only ever evaluated at 1..Te and the full base lr is
        # not revisited after a restart — confirm this is intended.
        if self.current_epoch == self.Te:
            print("restart at epoch {:03d}".format(self.last_epoch + 1))

            if self.take_snapshot:
                torch.save({
                    'epoch': self.T_max,
                    'state_dict': self.model.state_dict()
                }, self.out_dir + "/" + 'snapshot_e_{:03d}.pth.tar'.format(self.T_max))

            ## reset epochs since the last reset
            self.current_epoch = 0

            ## reset the next goal
            self.Te = int(self.Te * self.T_mult)
            self.T_max = self.T_max + self.Te

In [0]:
# T_max  = number of epochs before the learning rate is restarted
# T_mult = factor by which the cycle length grows after each restart
#
# Settings to try:
#   1st training cycle: T_max = 3, T_mult = 1 for 3 cycles (9 epochs)
#   2nd training cycle: T_max = 3, T_mult = 2 for 3 cycles (21 epochs)

scheduler = CosineAnnealingLR_with_Restart(
    optimizer,
    T_max=2,
    T_mult=2,
    model=net,
    out_dir='blank',
    take_snapshot=False,
)

In [0]:
# Train for n_epochs, tracking the average train and test loss (and test accuracy) per epoch.
n_epochs = 6

for epoch in range(n_epochs):
    # The scheduler is stepped at the start of every epoch, before any optimizer step.
    # NOTE(review): recent PyTorch releases expect scheduler.step() after the epoch's
    # optimizer.step() calls — left as-is to keep the schedule used with 0.4.1.
    scheduler.step()

    # Training mode: dropout is active and BatchNorm uses/updates batch statistics.
    net.train()
    running_loss = 0.0       # loss accumulated since the last progress print
    total_train_loss = 0.0   # loss accumulated over the whole epoch
    for i, train_data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = train_data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print loss per n minibatches
        batch_loss = loss.item()
        running_loss += batch_loss
        total_train_loss += batch_loss
        if i % 500 == 499:    # print every 500 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 500))
            running_loss = 0.0

    # Evaluation mode: disables dropout and makes BatchNorm use its running
    # statistics. Without this switch the test metrics below would be computed
    # with dropout still active.
    net.eval()
    correct = 0
    total = 0
    total_test_loss = 0.0
    with torch.no_grad():
        for test_data in testloader:
            test_images, test_labels = test_data
            test_outputs = net(test_images)
            test_loss = criterion(test_outputs, test_labels)
            total_test_loss += test_loss.item()
            # Predicted class = index of the largest score along dim 1.
            _, predicted = torch.max(test_outputs, 1)
            total += test_labels.size(0)
            correct += (predicted == test_labels).sum().item()

    # for printing average loss every epoch
    print("===> Epoch {} Complete: Train Avg. Loss: {:.4f}".format(epoch+1, total_train_loss / len(trainloader)))
    print("===> Epoch {} Complete: Test Avg. Loss: {:.4f}".format(epoch+1, total_test_loss / len(testloader)))
    print('Accuracy of the network on the 10000 test images: %d %%' % (100 * correct / total))
print('Finished Training')