In [1]:
# loading and normalizing cifar10
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

from torchsummary import summary

In [2]:
# Preprocessing used for both the train and the test split: convert each image
# to a tensor, then normalize every RGB channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# Alternative pipeline with data augmentation (currently disabled):
#transform = transforms.Compose(
#    [transforms.ToTensor(),
#    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
#    transforms.Pad(4),
#    transforms.RandomHorizontalFlip(),
#    transforms.RandomCrop(32)])



In [3]:
# CIFAR-10 training split, served in shuffled mini-batches of 64
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=64, shuffle=True, num_workers=2)

# CIFAR-10 test split, served in a fixed order with the same batch size
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=64, shuffle=False, num_workers=2)


Files already downloaded and verified
Files already downloaded and verified


In [4]:
## creating a function that initializes weights
def init_weight_conv(m):
    """Apply He (Kaiming) normal initialization to a convolution's weight.

    Intended to be passed to ``Module.apply``. Only modules whose exact type is
    ``nn.Conv2d`` are touched; the bias (if any) is left as it is.

    Args:
        m: the module to (possibly) initialize in place.
    """
    if type(m) is not nn.Conv2d:
        return
    # He initialization; Xavier would be a possible alternative
    nn.init.kaiming_normal_(m.weight)
    #m.bias.data.fill_(0.001) # optional bias
        
def init_weight_linear(m):
    """Apply He (Kaiming) normal initialization to a linear layer's weight.

    Intended to be passed to ``Module.apply``. Only modules whose exact type is
    ``nn.Linear`` are touched; the bias is left as it is.

    Args:
        m: the module to (possibly) initialize in place.
    """
    if type(m) is not nn.Linear:
        return
    nn.init.kaiming_normal_(m.weight)
    #m.bias.data.fill_(0.001)

# new init weight        
def init_weight(m):
    """Initialize a single module in place; meant to be passed to ``net.apply``.

    * ``nn.Conv2d`` and ``nn.Linear``: He (Kaiming) normal weights. The bias,
      if present, is left untouched.
    * ``nn.BatchNorm2d``: scale (``weight``) set to 1 and shift (``bias``) set
      to 0, so each batch-norm layer starts out as a plain normalization.

    Modules of any other type are left unchanged.

    Args:
        m: the module to (possibly) initialize.
    """
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(m.weight) # He initialize, can use xavier instead
        #nn.init.constant_(m.bias, 0.001) # optional bias
    elif isinstance(m, nn.BatchNorm2d):
        # Batch-norm layers built with affine=False have no learnable
        # weight/bias (both are None), so there is nothing to initialize.
        if m.weight is not None:
            nn.init.constant_(m.weight, 1)
        if m.bias is not None:
            # The shift starts at 0 (it was 1 before, which pushed every
            # normalized activation up by one before the following ReLU).
            nn.init.constant_(m.bias, 0)

In [5]:
# residual block
class residual_block(nn.Module):
    """Basic residual block: two 3x3 convolutions, each followed by batch norm.

    The block input (passed through ``downsample`` when one is given) is added
    to the output of the second batch norm, and a final ReLU is applied.

    Args:
        in_channels: number of channels of the input tensor.
        out_channels: number of channels produced by both convolutions.
        stride: stride of the first convolution (the second always uses 1).
        downsample: optional module applied to the input to build the shortcut;
            ``None`` means the input itself is used.
    """
    # multiplier between ``out_channels`` and the block's real output width
    expansion = 1

    def __init__(self, in_channels, out_channels, stride = 1, downsample = None):
        super().__init__()
        # only the first convolution is strided, so it performs any spatial reduction
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        # main path: conv -> bn -> relu -> conv -> bn
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # shortcut path: identity unless a downsample module was supplied
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)
        

                        

In [6]:
# bottleneck block

class bottleneck(nn.Module):
    """Bottleneck residual block: 1x1 conv -> 3x3 conv -> 1x1 conv, each with batch norm.

    The last 1x1 convolution widens the tensor to ``out_channels * expansion``
    channels. The block input (passed through ``downsample`` when one is
    given) is added to that result and a final ReLU is applied.

    Args:
        in_channels: number of channels of the input tensor.
        out_channels: width of the two inner convolutions; the block outputs
            ``out_channels * expansion`` channels.
        stride: stride of the 3x3 convolution (both 1x1 convolutions use 1).
        downsample: optional module applied to the input to build the shortcut;
            ``None`` means the input itself is used.
    """
    # the block's output has ``out_channels * expansion`` channels
    expansion = 4

    def __init__(self, in_channels, out_channels, stride = 1, downsample = None):
        super().__init__()
        widened = out_channels * self.expansion

        # 1x1 convolution, never strided
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        # 3x3 convolution; this is the only layer that receives the stride
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        # 1x1 convolution expanding the channel count
        self.conv3 = nn.Conv2d(out_channels, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        # stored for inspection only; forward() does not read it
        self.stride = stride

    def forward(self, x):
        # main path
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        # shortcut path: identity unless a downsample module was supplied
        shortcut = x if self.downsample is None else self.downsample(x)

        out += shortcut
        return self.relu(out)

                        


In [7]:
# ResNet
n_classes = 10

class ResNet(nn.Module):
    """ResNet-style classifier: 7x7 stem convolution, four residual stages,
    global average pooling and a linear output layer.

    The four stages use ``base_channels``, ``2*base_channels``,
    ``4*base_channels`` and ``8*base_channels`` as block width; stages 2-4
    start with a stride-2 block. With the defaults this is the original
    16/32/64/128 network with an unstrided stem.

    Args:
        block: residual block class (e.g. ``residual_block`` or ``bottleneck``).
            It must expose a class attribute ``expansion`` and be constructible
            as ``block(in_channels, out_channels, stride, downsample)``.
        layers: sequence whose first four entries give the number of blocks in
            stages 1-4.
        num_classes: number of outputs of the final linear layer.
        base_channels: output width of the stem convolution and block width of
            the first stage (default 16).
        stem_stride: stride of the stem convolution (default 1).
    """
    def __init__(self, block, layers, num_classes = n_classes, base_channels = 16, stem_stride = 1):
        super(ResNet, self).__init__()
        # stem: 7x7 convolution + batch norm + ReLU (no max-pool afterwards)
        self.conv1 = nn.Conv2d(3, base_channels, 7, stride=stem_stride, padding = 3, bias = False)
        # running channel count fed to the next block; updated by make_layer
        self.in_channels = base_channels
        self.batchnorm1 = nn.BatchNorm2d(base_channels)
        self.relu = nn.ReLU(inplace = True)
        # residual stages: the width doubles from one stage to the next
        self.layer1 = self.make_layer(block, base_channels, layers[0])
        self.layer2 = self.make_layer(block, base_channels * 2, layers[1], stride = 2)
        self.layer3 = self.make_layer(block, base_channels * 4, layers[2], stride = 2)
        self.layer4 = self.make_layer(block, base_channels * 8, layers[3], stride = 2)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        # in_features = width of the last stage * block expansion
        self.linear = nn.Linear(base_channels * 8 * block.expansion, num_classes)

    def make_layer(self, block, out_channels, blocks, stride=1):
        """Build one stage made of ``blocks`` consecutive residual blocks.

        Args:
            block: residual block class.
            out_channels: block width; the stage outputs
                ``out_channels * block.expansion`` channels.
            blocks: number of blocks in the stage.
            stride: stride of the first block (the remaining blocks use 1).

        Returns:
            An ``nn.Sequential`` holding the blocks. ``self.in_channels`` is
            updated to the stage's output channel count.
        """
        downsample = None

        # The first block needs a projection shortcut whenever its output
        # differs from its input in spatial size or in channel count.
        if (stride != 1) or (self.in_channels != out_channels * block.expansion):
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels*block.expansion, kernel_size=1, stride = stride, bias = False),
                nn.BatchNorm2d(out_channels*block.expansion))

        # first block handles the stride / channel change, the rest keep the shape
        layers = []
        layers.append(block(self.in_channels, out_channels, stride, downsample))
        self.in_channels = out_channels*block.expansion

        for i in range(1, blocks):
            layers.append(block(self.in_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the class scores (one row of ``num_classes`` values per sample)."""
        # stem
        out = self.conv1(x)
        out = self.batchnorm1(out)
        out = self.relu(out)
        # residual stages
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.avg_pool(out) # adaptive avg pooling to get (-1, out_channel(last layer), 1, 1)
        out = out.view(out.size(0), -1) # flatten
        out = self.linear(out) # output layer

        return out





In [8]:
# smaller resnet
n_classes = 10

class ResNet_small(nn.Module):
    """Narrow ResNet variant: stride-2 7x7 stem with 4 channels, followed by
    four residual stages of width 4, 8, 16 and 32, global average pooling and
    a linear output layer.

    Args:
        block: residual block class exposing a class attribute ``expansion``
            and constructible as ``block(in_channels, out_channels, stride, downsample)``.
        layers: sequence whose first four entries give the number of blocks in
            stages 1-4.
        num_classes: number of outputs of the final linear layer.
    """
    def __init__(self, block, layers, num_classes = n_classes):
        super().__init__()
        stem_width = 4
        # stem: strided 7x7 convolution + batch norm + ReLU (no max-pool afterwards)
        self.conv1 = nn.Conv2d(3, stem_width, kernel_size=7, stride=2, padding=3, bias=False)
        # channel count fed to the next block; make_layer keeps it up to date
        self.in_channels = stem_width
        self.batchnorm1 = nn.BatchNorm2d(stem_width)
        self.relu = nn.ReLU(inplace=True)
        # residual stages; every stage after the first halves the resolution
        self.layer1 = self.make_layer(block, 4, layers[0])
        self.layer2 = self.make_layer(block, 8, layers[1], stride=2)
        self.layer3 = self.make_layer(block, 16, layers[2], stride=2)
        self.layer4 = self.make_layer(block, 32, layers[3], stride=2)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        # input width = channels of the last stage times the block expansion
        self.linear = nn.Linear(32 * block.expansion, num_classes)

    def make_layer(self, block, out_channels, blocks, stride=1):
        """Assemble one stage of ``blocks`` residual blocks.

        Only the first block receives ``stride`` and, when the shapes of its
        input and output differ, a 1x1-conv + batch-norm projection shortcut.
        ``self.in_channels`` is updated to the stage's output channel count.
        """
        expanded = out_channels * block.expansion

        # projection shortcut when the dimensions do not match up
        downsample = None
        if stride != 1 or self.in_channels != expanded:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, expanded, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(expanded),
            )

        stage = [block(self.in_channels, out_channels, stride, downsample)]
        self.in_channels = expanded
        stage.extend(block(self.in_channels, out_channels) for _ in range(1, blocks))
        return nn.Sequential(*stage)

    def forward(self, x):
        """Return the class scores (one row of ``num_classes`` values per sample)."""
        # stem
        out = self.relu(self.batchnorm1(self.conv1(x)))
        # residual stages, in order
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)
        # global average pool -> (batch, channels, 1, 1), then flatten and classify
        out = self.avg_pool(out)
        return self.linear(out.view(out.size(0), -1))





In [9]:
# Pick the architecture to train. Alternatives that were tried:
#net = ResNet(residual_block, layers = [2,2,2,2])
#net = ResNet_small(residual_block, layers = [1,1,1,1])

# small network built from bottleneck blocks, with 2/1/2/1 blocks per stage
net = ResNet_small(block=bottleneck, layers=[2, 1, 2, 1])

In [10]:
# visualize network
# optional debugging helpers: list every parameter tensor's size, or print the module tree
#for i, weights in enumerate(list(net.parameters())):
#    print('i:',i,'weights:',weights.size())

#print(net)

# Layer-by-layer table of output shapes and parameter counts for a 3x256x256 input.
# NOTE(review): CIFAR-10 images are 3x32x32, so the activation shapes listed here
# differ from what the network sees during training; parameter counts are unaffected.
summary(net, (3, 256, 256))


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 4, 128, 128]             588
       BatchNorm2d-2          [-1, 4, 128, 128]               8
              ReLU-3          [-1, 4, 128, 128]               0
            Conv2d-4          [-1, 4, 128, 128]              16
       BatchNorm2d-5          [-1, 4, 128, 128]               8
              ReLU-6          [-1, 4, 128, 128]               0
            Conv2d-7          [-1, 4, 128, 128]             144
       BatchNorm2d-8          [-1, 4, 128, 128]               8
              ReLU-9          [-1, 4, 128, 128]               0
           Conv2d-10         [-1, 16, 128, 128]              64
      BatchNorm2d-11         [-1, 16, 128, 128]              32
           Conv2d-12         [-1, 16, 128, 128]              64
      BatchNorm2d-13         [-1, 16, 128, 128]              32
             ReLU-14         [-1, 16, 1

In [None]:
# apply initializers
# Module.apply calls init_weight on every submodule of net and on net itself;
# it returns net, so the notebook displays the module tree as this cell's output.
net.apply(init_weight)

In [None]:
# define loss and optimizer
# (torch.optim is imported as `optim` in the import cell at the top of the notebook)
learning_rate = 0.001
# cross-entropy between the network's raw class scores and the integer labels
criterion = nn.CrossEntropyLoss()
# Adam over all network parameters
optimizer = optim.Adam(net.parameters(), lr=learning_rate)
# alternative: SGD with Nesterov momentum and weight decay
#optimizer = optim.SGD(net.parameters(), lr = learning_rate, momentum=0.9, nesterov= True, weight_decay= 0.01)


In [None]:
# Train for n_epochs; after every epoch evaluate on the test set and report the
# average train loss, average test loss and test accuracy.
n_epochs = 2

for epoch in range(n_epochs):
    # Training mode: the BatchNorm layers normalize with the statistics of the
    # current mini-batch and update their running estimates.
    net.train()
    running_loss = 0.0      # loss accumulated since the last progress print
    total_train_loss = 0.0  # loss accumulated over the whole epoch
    for i, train_data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = train_data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print loss per n minibatches
        batch_loss = loss.item()
        running_loss += batch_loss
        total_train_loss += batch_loss
        if i % 50 == 49:    # print every 50 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 50))
            running_loss = 0.0

    # Evaluation mode: BatchNorm switches to its running statistics, so the
    # result does not depend on how the test set is batched and the test data
    # does not leak into the running estimates used later.
    net.eval()
    correct = 0
    total = 0
    total_test_loss = 0.0
    with torch.no_grad():
        for test_data in testloader:
            test_images, test_labels = test_data
            test_outputs = net(test_images)
            test_loss = criterion(test_outputs, test_labels)
            total_test_loss += test_loss.item()
            # predicted class = index of the largest score in each row
            _, predicted = torch.max(test_outputs, 1)
            total += test_labels.size(0)
            correct += (predicted == test_labels).sum().item()

    # for printing average loss every epoch
    print("===> Epoch {} Complete: Train Avg. Loss: {:.4f}".format(epoch+1, total_train_loss / len(trainloader)))
    print("===> Epoch {} Complete: Test Avg. Loss: {:.4f}".format(epoch+1, total_test_loss / len(testloader)))
    print('Accuracy of the network on the 10000 test images: %d %%' % (100 * correct / total))
print('Finished Training')