# ResNet-18 on CIFAR-10

### Define ResNet-18

ResNet code from https://github.com/kuangliu/pytorch-cifar [MIT License]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions plus a skip connection.

    The skip path is an empty ``nn.Sequential`` (identity) when the stride is
    1 and the channel count is unchanged; otherwise it is a strided 1x1
    convolution followed by batch normalisation so both paths have matching
    shapes when they are added.
    """

    # Ratio between the block's output channels and ``planes``.
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        # Main path: only the first convolution applies the stride.
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Skip path: identity unless the output shape differs from the input.
        if stride == 1 and in_planes == out_planes:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        hidden = self.bn2(self.conv2(hidden))
        hidden += self.shortcut(x)
        return F.relu(hidden)


class Bottleneck(nn.Module):
    """Residual block with a 1x1 -> 3x3 -> 1x1 convolution stack.

    The final 1x1 convolution widens the output to ``expansion * planes``
    channels. The skip path is an empty ``nn.Sequential`` (identity) when the
    stride is 1 and the input already has that many channels; otherwise it is
    a strided 1x1 convolution followed by batch normalisation.
    """

    # Ratio between the block's output channels and ``planes``.
    expansion = 4

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        # Main path: the stride is applied by the middle 3x3 convolution.
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        # Skip path: identity unless the output shape differs from the input.
        if stride == 1 and in_planes == out_planes:
            self.shortcut = nn.Sequential()
        else:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        hidden = F.relu(self.bn2(self.conv2(hidden)))
        hidden = self.bn3(self.conv3(hidden))
        hidden += self.shortcut(x)
        return F.relu(hidden)


class ResNet(nn.Module):
    """ResNet with a 3x3 stem, four residual stages and a linear classifier.

    ``block`` is the residual block class (it must expose an ``expansion``
    class attribute and accept ``(in_planes, planes, stride)``),
    ``num_blocks`` gives the number of blocks in each of the four stages and
    ``num_classes`` is the size of the output layer.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Channel count fed to the next block; advanced by _make_layer.
        self.in_planes = 64

        # Stem: a single stride-1 3x3 convolution on 3-channel input.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)

        # (planes, stride of the first block) for layer1 .. layer4.
        stage_specs = ((64, 1), (128, 2), (256, 2), (512, 2))
        for idx, (planes, stride) in enumerate(stage_specs):
            stage = self._make_layer(block, planes, num_blocks[idx], stride=stride)
            setattr(self, f"layer{idx + 1}", stage)

        self.linear = nn.Linear(512*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage; only its first block uses ``stride``, the rest use 1."""
        block_strides = [stride, *([1] * (num_blocks - 1))]
        stage_blocks = []
        for block_stride in block_strides:
            stage_blocks.append(block(self.in_planes, planes, block_stride))
            # The next block consumes this block's (expanded) output channels.
            self.in_planes = planes * block.expansion
        return nn.Sequential(*stage_blocks)

    def forward(self, x):
        """Return the classifier outputs for a batch of images ``x``."""
        features = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            features = stage(features)
        pooled = F.avg_pool2d(features, 4)
        flat = pooled.view(pooled.size(0), -1)
        return self.linear(flat)


def ResNet18():
    """Build a ResNet of BasicBlocks with two blocks per stage and the default class count."""
    return ResNet(block=BasicBlock, num_blocks=[2, 2, 2, 2])

In [None]:
# Defining utility progress bar

import os
import sys
import time
import math

# Assumed terminal width; progress_bar uses it for padding and cursor repositioning.
term_width = 50

# Number of character cells that make up the bar itself.
TOTAL_BAR_LENGTH = 65.0

# Timestamps of the previous progress_bar call and of the current bar's start.
begin_time = last_time = time.time()
def progress_bar(current, total, msg=None):
    """Draw a one-line text progress bar on stdout.

    ``current`` is the zero-based index of the step just completed and
    ``total`` the number of steps. ``msg`` is optional text appended after the
    timing information. The line ends with a carriage return so that the next
    call overwrites it, except on the last step where a newline is written.
    """
    global last_time, begin_time
    if current == 0:
        begin_time = time.time()  # Reset for new bar.

    done_cells = int(TOTAL_BAR_LENGTH*current/total)
    todo_cells = int(TOTAL_BAR_LENGTH - done_cells) - 1

    # Negative repeat counts yield empty strings, like an empty range() would.
    sys.stdout.write(' [' + '=' * done_cells + '>' + '.' * todo_cells + ']')

    now = time.time()
    step_time = now - last_time
    last_time = now
    tot_time = now - begin_time

    pieces = [
        '  Step: %s' % format_time(step_time),
        ' | Tot: %s' % format_time(tot_time),
    ]
    if msg:
        pieces.append(' | ' + msg)
    msg = ''.join(pieces)

    sys.stdout.write(msg)
    sys.stdout.write(' ' * (term_width-int(TOTAL_BAR_LENGTH)-len(msg)-3))

    # Go back to the center of the bar.
    sys.stdout.write('\b' * (term_width-int(TOTAL_BAR_LENGTH/2)+2))
    sys.stdout.write(' %d/%d ' % (current+1, total))

    line_end = '\r' if current < total-1 else '\n'
    sys.stdout.write(line_end)
    sys.stdout.flush()

def format_time(seconds):
    """Format a duration in seconds as a short string such as ``'1h1m'``.

    The duration is split into days, hours, minutes, seconds and
    milliseconds; only the two largest non-zero units are shown. ``'0ms'`` is
    returned when no unit is positive.
    """
    days = int(seconds / 3600 / 24)
    seconds -= days * 3600 * 24
    hours = int(seconds / 3600)
    seconds -= hours * 3600
    minutes = int(seconds / 60)
    seconds -= minutes * 60
    whole_seconds = int(seconds)
    seconds -= whole_seconds
    millis = int(seconds * 1000)

    units = (
        (days, 'D'),
        (hours, 'h'),
        (minutes, 'm'),
        (whole_seconds, 's'),
        (millis, 'ms'),
    )
    # Keep at most the two most significant units that are actually present.
    shown = [f"{amount}{suffix}" for amount, suffix in units if amount > 0][:2]
    return ''.join(shown) or '0ms'

### Experiment Code

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn

import torchvision
import torchvision.transforms as transforms

import os

from datetime import datetime

# Seed PyTorch's global random number generator so repeated runs start from the same state.
torch.manual_seed(1000)


def _clipped_step_size(algo, eta, gamma, delta, grad_norm):
    """Return the step size for one GClip / d-GClip update.

    GClip uses ``min(eta, eta * gamma / grad_norm)``; d-GClip additionally
    bounds the ratio ``gamma / grad_norm`` from below by ``delta``. A zero
    gradient norm makes that ratio unbounded, so ``eta`` is returned directly
    instead of dividing by zero.
    """
    if grad_norm == 0:
        return eta
    ratio = gamma / grad_norm
    if algo == "d-gclip":
        ratio = max(delta, ratio)
    return min(eta, eta * ratio)


def main(experiment):
    """Train ResNet-18 on CIFAR-10 as configured by ``experiment``.

    ``experiment`` must provide an ``initial_settings`` mapping with the keys
    ``"algo"``, ``"eta"``, ``"gamma"``, ``"delta"``, ``"etaReduction"`` and
    ``"epochs"``, and a ``setResults(losses, step_sizes, accuracies)`` method.

    ``"adam"`` trains with Adam; every other algorithm name trains with SGD.
    For ``"gclip"`` and ``"d-gclip"`` the SGD learning rate is recomputed
    before every step from the norm of the current gradient.

    Per epoch the function records the sum of the batch training losses, the
    step size (the last clipped step size, or ``eta``) and the test accuracy
    as a fraction in [0, 1], and hands the three lists to
    ``experiment.setResults``. Returns ``None``.
    """

    # Set processing device. If GPU is available prefer cuda for better performance
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Data
    print('==> Preparing data..')
    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])

    # NOTE(security): this disables HTTPS certificate verification for the
    # whole process, not only for the dataset download below. It is kept
    # because removing it could break the download on machines with
    # certificate problems; prefer fixing the local certificate store.
    import ssl
    ssl._create_default_https_context = ssl._create_unverified_context

    data_root = '../resnetexperiment-master/data'
    trainset = torchvision.datasets.CIFAR10(
        root=data_root, train=True, download=True, transform=transform_train)
    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=512, shuffle=True, num_workers=4)

    testset = torchvision.datasets.CIFAR10(
        root=data_root, train=False, download=True, transform=transform_test)
    testloader = torch.utils.data.DataLoader(
        testset, batch_size=1000, shuffle=False, num_workers=2)

    # Model
    print('==> Building model..')
    net = ResNet18()
    net = net.to(device)
    if device == 'cuda':
        net = torch.nn.DataParallel(net)
        cudnn.benchmark = True

    # Loss Function
    criterion = nn.CrossEntropyLoss()

    # Hyperparameters
    algo = experiment.initial_settings["algo"].lower()
    eta = experiment.initial_settings["eta"]
    gamma = experiment.initial_settings["gamma"]
    delta = experiment.initial_settings["delta"]
    reduce = experiment.initial_settings["etaReduction"]
    uses_clipping = algo in ("gclip", "d-gclip")

    # Store training information
    accuracies = []
    losses = []
    step_sizes = []

    if algo == "adam":
        optimizer = optim.Adam(net.parameters(), lr=eta)
    else:
        optimizer = optim.SGD(net.parameters(), lr=eta)


    def test(epoch):
        """
        Return the fraction of test samples classified correctly.

        ``epoch`` is accepted for the caller's convenience and is not used.
        """
        net.eval()
        test_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch_idx, (inputs, targets) in enumerate(testloader):
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = net(inputs)
                loss = criterion(outputs, targets)

                # test_loss only feeds the optional progress bar below.
                test_loss += loss.item()
                _, predicted = outputs.max(1)
                total += targets.size(0)
                correct += predicted.eq(targets).sum().item()

                # progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                #                 % (test_loss/(batch_idx+1), 100.*correct/total, correct, total))
        return correct / total


    # Defined up front so the per-epoch report never hits an unbound name,
    # even if the loader yields no batches.
    h = eta
    norm_grad_f = float("nan")

    for epoch in range(0, experiment.initial_settings["epochs"]):
        print('\nEpoch: %d' % epoch)
        net.train()
        train_loss = 0
        correct = 0
        total = 0
        for batch_idx, (inputs, targets) in enumerate(trainloader):
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            # With max_norm=inf nothing is clipped; the call is used only for
            # the total gradient norm it returns.
            norm_grad_f = torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=float("inf")).item()

            if uses_clipping:
                # The learning rate must be set before optimizer.step() below.
                h = _clipped_step_size(algo, eta, gamma, delta, norm_grad_f)
                for g in optimizer.param_groups:
                    g["lr"] = h


            # correct/total only feed the optional progress bar below.
            train_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            # if batch_idx % 10 == 0 or batch_idx == 97:
                # progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                #             % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))

            optimizer.step()


        print(f"Epoch {epoch} | Step size: {h if uses_clipping else eta} | norm =", norm_grad_f)
        # Epochs are zero-based: this fires after the 100th and 150th epoch.
        if reduce and (epoch == 99 or epoch == 149): # If step size reduction has been set
            eta *= 0.1
            for g in optimizer.param_groups:
                g["lr"] = eta
            print("reduced eta to:", eta)

        if epoch % 10 == 0:
            now = datetime.now()
            current_time = now.strftime("%H:%M:%S")
            print("Current Time =", current_time)

        losses.append(train_loss)
        if uses_clipping:
            step_sizes.append(h)
        else:
            step_sizes.append(eta)

        accuracies.append(test(epoch))


    # Save data
    experiment.setResults(losses, step_sizes, accuracies)

### Run Experiment

In [None]:
class Experiment:
    """Settings of one training run together with the results it produced."""

    def __init__(self, algo, eta, gamma, delta, epochs, etaReduction) -> None:
        # Hyperparameters exactly as passed in, keyed by name.
        self.initial_settings = {
            "algo": algo,
            "eta": eta,
            "gamma": gamma,
            "delta": delta,
            "epochs": epochs,
            "etaReduction": etaReduction,
        }
        # Filled in place by setResults once the run has finished.
        self.results = {}

    def setResults(self, losses, step_sizes, test_accuracies, gradients = None, test_losses = None):
        """Store the recorded metrics in ``self.results`` (updated in place)."""
        self.results.update(
            losses=losses,
            step_sizes=step_sizes,
            gradients=gradients,
            test_losses=test_losses,
            test_accuracies=test_accuracies,
        )

    def __str__(self):
        settings, results = self.initial_settings, self.results
        return f"Experiment with variables: {settings}.\nResults: {results}"


# DEFINE EXPERIMENTS
# Here you can set the experiments to run, which will run one after the other until completion
# Supported algorithms are: d-GClip ("d-gclip"), GClip ("gclip"), SGD ("gd") and Adam ("adam"); any other name also falls back to SGD
# For d-GClip the parameters represent: eta, gamma, delta
# For GClip the parameters represent: eta, gamma, [third one is ignored]
# For Adam and SGD the first parameter is step size, and the second and third are ignored
# Fourth parameter is the number of epochs
# Fifth parameter defines if step size reduction should be applied after epochs 100 and 150 (divide by 10)
experiments = [
    Experiment("gd", 1, None, None, 200, True),
    Experiment("d-gclip", 5, 0.25, 1e-3, 200, True),
    Experiment("d-gclip", 5, 0.25, 1e-8, 200, True),
    Experiment("gclip", 5, 0.25, None, 200, True),
    Experiment("adam", 0.0001, None, None, 200, True),
]


# Run the experiments one after the other, keeping a [settings, results]
# pair per run so the plotting cells can use them afterwards.
experiment_results = []
for exp in experiments:
    print("Initiating experiment:", exp.initial_settings)
    main(exp)

    outcome = [exp.initial_settings, exp.results]
    print("Experiment ended. Results:")
    print(outcome)

    # Save results to later produce graphs
    experiment_results.append(outcome)

## Generate Results Graphs

In [None]:
# Dump every stored [settings, results] pair, prefixed by its length.
for entry in experiment_results:
    print(len(entry), entry)

### Graph Plotting Utility Function

In [None]:
def getExperimentString(exp):
    r"""
    Returns the formatted legend label for an experiment's settings.

    ``exp`` is a settings mapping with the keys "algo", "eta", "gamma" and
    "delta". The algorithm name is matched case-insensitively; an unknown
    name is used as given. The label is the display name of the algorithm
    followed, in parentheses, by eta and then gamma and delta when they are
    truthy, separated by semicolons.

    Eg. Given the settings of Experiment("d-gclip", 5, 0.25, 1e-8, 5, False)
    it will return the string: "$\delta$-GClip (5;0.25;1e-08)".
    """
    algo = exp["algo"].lower()
    delta = exp["delta"]

    # Raw strings keep the backslash of the LaTeX macro without relying on an
    # invalid "\d" escape sequence.
    if algo == "d-gclip" or (algo == "gclip" and delta):
        name = r"$\delta$-GClip"
    elif algo == "gclip":
        name = "GClip"
    elif algo == "gd":
        name = "SGD"
    elif algo == "adam":
        name = "Adam"
    else:
        name = exp["algo"]

    params = [str(exp["eta"])]
    if exp["gamma"]:
        params.append(str(exp["gamma"]))
    if delta:
        # str(1e-3) is "0.001"; spell it like the other deltas (e.g. "1e-08").
        params.append("1e-03" if delta == 1e-3 else str(delta))
    return f"{name} ({';'.join(params)})"

### Plot Graphs

In [None]:
import matplotlib.pyplot as plt

fig, axs = plt.subplots(2, 2, figsize=(10, 5))

# Number of final epochs shown in the bottom (zoomed) row.
tail_len = 50

for settings, results in experiment_results:
    label = getExperimentString(settings)
    losses = results["losses"]
    accuracies_pct = [acc * 100 for acc in results["test_accuracies"]]

    # One-based epoch numbers, derived from the data instead of assuming 200
    # epochs; the tail is sliced the same way so both rows share one axis.
    epochs = range(1, len(losses) + 1)
    tail_epochs = epochs[-tail_len:]

    # Plot the four graphs
    axs[0,0].plot(epochs, losses, label=label)
    axs[0,1].plot(epochs, accuracies_pct, label=label)
    axs[1,0].plot(tail_epochs, losses[-tail_len:], label=label)
    axs[1,1].plot(tail_epochs, accuracies_pct[-tail_len:], label=label)


# Top left
axs[0,0].set_ylabel("Training Loss (log)")
axs[0,0].set_xlabel("Epochs")
axs[0,0].set_yscale("log")

# Top right
axs[0,1].set_ylabel("Test Accuracy (%)")
axs[0,1].set_xlabel("Epochs")
axs[0,1].set_ylim(80)

# Bottom left
axs[1,0].set_ylabel("Training Loss")
axs[1,0].set_xlabel("Epochs (last 50)")

# Bottom right
axs[1,1].set_ylabel("Test Accuracy (%)")
axs[1,1].set_xlabel("Epochs (last 50)")
axs[1,1].set_ylim(89)

handles, labels = axs[0, 1].get_legend_handles_labels()
# Reorder the second and third legend entries. Handles and labels are swapped
# together: swapping only the labels would attach each label to the other
# experiment's curve.
if len(handles) > 2:
    handles[1], handles[2] = handles[2], handles[1]
    labels[1], labels[2] = labels[2], labels[1]
fig.legend(handles, labels, loc="lower center", ncol=5, bbox_to_anchor=(0.5, -0.06))
fig.suptitle("ResNet-18 on CIFAR-10 (LR scheduling, no weight-decay)", fontsize=16)
fig.subplots_adjust(hspace=0.3, wspace=0.2)
plt.show()