In [1]:
# Import libraries

import torch
import time
import copy
import torchvision
import torch.nn as nn
from torch import optim
import tensorflow as tf
import torch.nn.functional as F
from torchvision import datasets
from torchsummary import summary
from torch.autograd import Variable
from torchvision.transforms import ToTensor
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, ConcatDataset

# **Data - CIFAR-10**

---



---



In [2]:
# Training-time augmentation pipeline: pad by 4 px, flip horizontally at
# random, take a random 32x32 crop, then convert the image to a tensor.
transform = transforms.Compose(
    [
        transforms.Pad(4),
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32),
        transforms.ToTensor(),
    ]
)

In [3]:
# CIFAR-10 splits. The training split goes through the augmentation
# pipeline and is the one that requests the download; the test split is
# only converted to tensors.
train_dataset = torchvision.datasets.CIFAR10(
    root='./data/',
    train=True,
    transform=transform,
    download=True,
)

test_dataset = torchvision.datasets.CIFAR10(
    root='./data/',
    train=False,
    transform=transforms.ToTensor(),
)

Files already downloaded and verified


In [4]:
# Mini-batch loaders (100 samples per batch). Training batches are
# reshuffled; test batches keep the dataset order.
train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True)

test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)

In [5]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device  # bare expression so the notebook displays the selected device

device(type='cpu')

# **Model**

---



In [6]:
def weights_init(module):
  """Re-initialise an ``nn.Linear`` layer reproducibly.

  Intended as a callback for ``nn.Module.apply``: any module that is not
  an ``nn.Linear`` is left untouched. For a linear layer the global torch
  RNG is re-seeded with 1, the weights are drawn from a normal
  distribution (mean 0, std 1.0) and the bias, when present, is zeroed.
  """
  if not isinstance(module, nn.Linear):
    return

  # Re-seeding right before the draw makes the sampled weights repeatable.
  seed = 1
  torch.manual_seed(seed)
  nn.init.normal_(module.weight, mean=0, std=1.0)

  bias = module.bias
  if bias is not None:
    nn.init.constant_(bias, 0)

In [7]:
class ResBlock(nn.Module):
    """Residual block built from two 3x3 convolutions.

    With ``downsample`` set, the first convolution uses stride 2 and the
    skip path is a stride-2 1x1 convolution followed by batch norm;
    otherwise the first convolution uses stride 1 and the skip path is an
    empty ``nn.Sequential`` (identity).
    """

    def __init__(self, in_channels, out_channels, downsample):
        super().__init__()
        # Creation order (conv1, shortcut, conv2, bn1, bn2) fixes both the
        # order of the state-dict entries and the order in which the random
        # initial weights are drawn.
        first_stride = 2 if downsample else 1
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=first_stride, padding=1)

        if downsample:
            skip_layers = [
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=2),
                nn.BatchNorm2d(out_channels),
            ]
        else:
            skip_layers = []
        self.shortcut = nn.Sequential(*skip_layers)

        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

    def forward(self, input):
        """Return ``relu(relu(bn2(conv2(relu(bn1(conv1(x)))))) + shortcut(x))``."""
        relu = nn.functional.relu
        skip = self.shortcut(input)
        hidden = relu(self.bn1(self.conv1(input)))
        hidden = relu(self.bn2(self.conv2(hidden)))
        return relu(hidden + skip)

In [8]:
import torch
import torch.nn as nn

class CustomDropout(nn.Module):
    """Label-aware dropout.

    Dimension 1 of the input is split into ``num_of_labels`` consecutive
    groups of ``size(1) // num_of_labels`` units. During training every
    unit is dropped with probability ``p``, except the units in the group
    that belongs to the sample's own label, which are always kept. Kept
    activations are not rescaled. In evaluation mode the input is returned
    unchanged and the labels are ignored.

    :parameter
    num_of_labels: number of label groups the layer is divided into (> 0)
    p: probability to drop. Bigger p -> Drop more (must lie in [0, 1])

    :raises
    ValueError: if ``num_of_labels`` is not positive or ``p`` is outside [0, 1]
    """

    def __init__(self, num_of_labels, p=1.0):
        super(CustomDropout, self).__init__()
        if num_of_labels <= 0:
            raise ValueError("num_of_labels must be positive, got {}".format(num_of_labels))
        if not 0.0 <= p <= 1.0:
            raise ValueError("dropout probability has to be between 0 and 1, got {}".format(p))
        self.p = p
        self.num_of_labels = num_of_labels

    def extra_repr(self):
        # Makes print(module) show the configuration instead of "CustomDropout()".
        return 'num_of_labels={}, p={}'.format(self.num_of_labels, self.p)

    def forward(self, batch_input, batch_labels):
        """Apply the label-aware mask.

        :parameter
        batch_input: tensor whose dim 0 is the batch and dim 1 the layer units
        batch_labels: integer labels, one per sample (non-negative)

        :return
        a new tensor in training mode (the input is never modified in
        place); the input tensor itself in evaluation mode
        """
        if not self.training:
            return batch_input

        batch_size = batch_input.size(0)
        layer_size = batch_input.size(1)
        portion_size = layer_size // self.num_of_labels

        labels = batch_labels[:batch_size].reshape(-1)
        if labels.numel() != batch_size:
            raise ValueError(
                "expected one label per sample ({}), got {}".format(batch_size, labels.numel())
            )
        labels = labels.to(device=batch_input.device, dtype=torch.long)

        # 1 = keep, 0 = drop; every unit is kept with probability (1 - p).
        mask = torch.bernoulli(torch.full_like(batch_input, 1.0 - self.p))

        # Units in [label * portion, (label + 1) * portion) are always kept.
        positions = torch.arange(layer_size, device=batch_input.device)
        group_start = (labels * portion_size).unsqueeze(1)
        protected = (positions >= group_start) & (positions < group_start + portion_size)
        # Broadcast the (batch, layer) pattern over any trailing dimensions.
        trailing = [1] * (batch_input.dim() - 2)
        protected = protected.reshape(batch_size, layer_size, *trailing)
        mask = mask.masked_fill(protected, 1.0)

        # Out-of-place multiply: the caller's tensor stays intact and
        # autograd never sees an in-place write.
        return batch_input * mask

## **ResNet**

---



In [9]:
class ResNet(nn.Module):
    """ResNet-style classifier assembled from a pluggable residual block.

    Layout: a stem (7x7 stride-2 conv -> 3x3 stride-2 max-pool -> batch norm
    -> ReLU), four stages of residual blocks with 64/128/256/512 channels
    (stages 2-4 start with a downsampling block), global average pooling
    and a linear layer. Optionally a ``CustomDropout`` is applied to the
    output of the linear layer.

    Blocks are registered as ``conv2_*`` (layer1), ``conv3_*`` (layer2),
    ``conv4_*`` (layer3) and ``conv5_*`` (layer4); these names are part of
    the state-dict keys.

    Args:
        in_channels: number of channels of the input images.
        resblock: callable ``resblock(in_channels, out_channels, downsample=...)``
            returning an ``nn.Module``.
        repeat: sequence of four ints, number of blocks per stage.
        outputs: size of the final linear layer.
        DROPOUT: when true, attach ``CustomDropout`` after the linear layer.
        num_of_labels: number of label groups handed to ``CustomDropout``.
        dropout_p: drop probability handed to ``CustomDropout``.
    """

    def __init__(self, in_channels, resblock, repeat, outputs=1000, DROPOUT=True,
                 num_of_labels=10, dropout_p=0.5):
        super().__init__()
        self.layer0 = nn.Sequential(
            nn.Conv2d(in_channels, 64, kernel_size=7, stride=2, padding=3),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )

        filters = [64, 64, 128, 256, 512]

        self.layer1 = self._make_stage('conv2', resblock, filters[0], filters[1], repeat[0], downsample=False)
        self.layer2 = self._make_stage('conv3', resblock, filters[1], filters[2], repeat[1], downsample=True)
        self.layer3 = self._make_stage('conv4', resblock, filters[2], filters[3], repeat[2], downsample=True)
        self.layer4 = self._make_stage('conv5', resblock, filters[3], filters[4], repeat[3], downsample=True)

        self.gap = torch.nn.AdaptiveAvgPool2d(1)
        self.fc = torch.nn.Linear(filters[4], outputs)
        self.dropout = CustomDropout(num_of_labels=num_of_labels, p=dropout_p) if DROPOUT else None

    @staticmethod
    def _make_stage(prefix, resblock, in_channels, out_channels, num_blocks, downsample):
        """Build one stage: a first block (optionally downsampling) plus ``num_blocks - 1`` same-width blocks."""
        stage = nn.Sequential()
        stage.add_module('%s_1' % prefix, resblock(in_channels, out_channels, downsample=downsample))
        for i in range(1, num_blocks):
            stage.add_module('%s_%d' % (prefix, i + 1), resblock(out_channels, out_channels, downsample=False))
        return stage

    def forward(self, input, input_labels=None):
        """Compute the network output for a batch.

        Args:
            input: image batch fed to the stem convolution.
            input_labels: per-sample labels, forwarded to the dropout layer.
                Only required while the dropout layer is in training mode;
                may be omitted for inference or when dropout is disabled.

        Raises:
            ValueError: if the dropout layer is active (training mode) and
                no labels were given.
        """
        features = self.layer0(input)
        features = self.layer1(features)
        features = self.layer2(features)
        features = self.layer3(features)
        features = self.layer4(features)
        features = self.gap(features)
        features = torch.flatten(features, start_dim=1)
        logits = self.fc(features)

        if self.dropout is not None:
            if self.dropout.training and input_labels is None:
                raise ValueError("input_labels are required while the dropout layer is in training mode")
            logits = self.dropout(logits, input_labels)

        return logits

# **Train & Test**

In [10]:
# Shared training settings. The optimizer and the LR scheduler are built
# next to each model, because they need that model's parameters.

criterion = nn.CrossEntropyLoss()  # loss handed to train_model
epochs = 100  # number of training epochs (1 was the alternative value noted here)

In [11]:
def count_parameters(model):
    """Return the total element count of the model's trainable parameters.

    Parameters with ``requires_grad`` set to ``False`` are not counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [12]:
def train_model(model, dataloaders, criterion, optimizer, num_epochs=50, is_inception=False,
                scheduler=None, input_size=(112, 112), log_every=1):
    """Train ``model`` and evaluate it on the test split after every epoch.

    Each epoch runs a ``'train'`` phase (forward, backward, optimizer step)
    followed by a ``'test'`` phase (forward only). The model is called as
    ``model(inputs, labels)``. The model returned is the one from the last
    epoch; the best test accuracy is only reported, its weights are not
    restored.

    Args:
        model: network to optimise; batches are moved to the device of its
            first parameter (CPU if it has no parameters).
        dataloaders: mapping with ``'train'`` and ``'test'`` loaders; each
            loader must expose ``.dataset`` with a length.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimizer stepping the model parameters.
        num_epochs: number of epochs to run.
        is_inception: unused; kept so existing calls keep working.
        scheduler: object whose ``step(test_loss)`` is called once per epoch
            after the test phase. When ``None``, a module-level variable
            named ``lr_scheduler`` is used if it exists (legacy behaviour);
            otherwise no scheduling takes place.
        input_size: target ``(height, width)`` every batch is resized to with
            ``transforms.functional.resize`` before it is moved to the
            device; ``None`` skips the resize.
        log_every: print the batch loss every ``log_every`` batches;
            ``0``/``None`` silences the per-batch output.

    Returns:
        ``(model, val_acc_history)`` where ``val_acc_history`` holds one
        0-dim double tensor (test accuracy in [0, 1]) per epoch.
    """
    if scheduler is None:
        # Legacy fallback: earlier versions read the notebook global directly.
        scheduler = globals().get('lr_scheduler')

    # Send the batches to wherever the model lives instead of relying on a
    # global `device` that may not match it.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device('cpu')

    since = time.time()
    val_acc_history = []
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        for phase in ['train', 'test']:  # Each epoch has a training and an evaluation phase
            is_train = phase == 'train'
            model.train(is_train)        # train mode for 'train', eval mode for 'test'

            running_loss = 0.0
            running_corrects = torch.zeros((), dtype=torch.long, device=model_device)

            for batch_idx, (inputs, labels) in enumerate(dataloaders[phase], start=1):
                if input_size is not None:
                    inputs = transforms.functional.resize(inputs, input_size)
                inputs = inputs.to(model_device)
                labels = labels.to(model_device)

                optimizer.zero_grad()  # Zero the parameter gradients

                with torch.set_grad_enabled(is_train):  # Track history only while training
                    outputs = model(inputs, labels)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)

                    if is_train:  # Backward + optimize only in the training phase
                        loss.backward()
                        optimizer.step()

                # Statistics (the batch loss is a mean, so weight it by batch size)
                batch_loss = loss.item()
                running_loss += batch_loss * inputs.size(0)
                running_corrects += torch.sum(preds == labels)
                if log_every and batch_idx % log_every == 0:
                    print(batch_loss)

            num_samples = len(dataloaders[phase].dataset)
            epoch_loss = running_loss / num_samples

            if phase == 'test' and scheduler is not None:  # Adjust learning rate based on test loss
                scheduler.step(epoch_loss)

            epoch_acc = running_corrects.double() / num_samples

            print('{}\t Loss: {:.4f}\t Acc: {:.4f}%'.format(phase, epoch_loss, epoch_acc*100))

            if phase == 'test':
                val_acc_history.append(epoch_acc)
                if epoch_acc > best_acc:
                    best_acc = epoch_acc

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best test Acc: {:.4f}%'.format(best_acc*100))

    return model, val_acc_history

## **Train & Test Models**

---



### **ResNet34 - Group Dropout**

---



In [None]:
# ResNet-34 layout ([3, 4, 6, 3] blocks per stage) with the label-aware dropout enabled.
# NOTE(review): outputs=1000 although CIFAR-10 has 10 classes -- presumably intentional,
# so that each label owns a group of 100 output units in CustomDropout; confirm.
res34_d = ResNet(3, ResBlock, [3, 4, 6, 3], outputs=1000, DROPOUT=True)
res34_d.apply(weights_init)  # seeded N(0, 1) re-initialisation of the nn.Linear layer
# Use the notebook-wide `device` rather than a second, hand-written device expression.
res34_d.to(device)

In [13]:
# Number of trainable parameters of the dropout ResNet-34 built above.
count_parameters(res34_d) # 21,806,184 (count recorded by the author)

CustomDropout()


ResNet(
  (layer0): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): ReLU()
  )
  (layer1): Sequential(
    (conv2_1): ResBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (shortcut): Sequential()
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (conv2_2): ResBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (shortcut): Sequential()
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=

In [None]:
# Adam with a small learning rate and weight decay. The plateau scheduler
# must keep the name `lr_scheduler`: train_model looks it up as a global
# and steps it with the test loss after every epoch.
optimizer = optim.Adam(res34_d.parameters(), lr=0.0001, weight_decay=1e-4)
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)

res34_d, _ = train_model(
    model=res34_d,
    dataloaders={"train": train_loader, "test": test_loader},
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=epochs,
)

Epoch 0/99
----------




85.865478515625
92.10539245605469
66.04986572265625
58.3069953918457
52.94820785522461
37.41429901123047
42.38365173339844
39.15611267089844
44.498355865478516
30.18720245361328
35.27131271362305
24.94654083251953
27.680124282836914
21.88913345336914
19.67936134338379
17.713895797729492
23.779987335205078
11.991691589355469
25.427087783813477
21.119979858398438
16.383203506469727
19.78763198852539
21.80624771118164
17.806995391845703
