In [1]:
# Edit the path below so that the working directory is the repository folder in your Google Drive.
%cd drive/MyDrive/fall22_dl_mini_project/

/content/drive/MyDrive/fall22_dl_mini_project


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn

import torchvision
import torchvision.transforms as transforms
from torchsummary import summary
from tqdm.notebook import tqdm
import os

In [3]:
from models import * 

In [6]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [8]:
# Print a per-layer summary (output shapes, parameter counts) of ResNet18 for a 3x32x32 input.
# NOTE(review): ResNet18 is presumably provided by the `from models import *` star-import — confirm.
summary(ResNet18().to(device), (3, 32, 32))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 32, 32]           1,728
       BatchNorm2d-2           [-1, 64, 32, 32]             128
            Conv2d-3           [-1, 64, 32, 32]          36,864
       BatchNorm2d-4           [-1, 64, 32, 32]             128
            Conv2d-5           [-1, 64, 32, 32]          36,864
       BatchNorm2d-6           [-1, 64, 32, 32]             128
        BasicBlock-7           [-1, 64, 32, 32]               0
            Conv2d-8           [-1, 64, 32, 32]          36,864
       BatchNorm2d-9           [-1, 64, 32, 32]             128
           Conv2d-10           [-1, 64, 32, 32]          36,864
      BatchNorm2d-11           [-1, 64, 32, 32]             128
       BasicBlock-12           [-1, 64, 32, 32]               0
           Conv2d-13          [-1, 128, 16, 16]          73,728
      BatchNorm2d-14          [-1, 128,

In [9]:
class BasicBlock(nn.Module):
    """Two-convolution residual block with an additive skip connection.

    The main path is conv3x3 -> BN -> ReLU -> conv3x3 -> BN. Its result is
    summed with the shortcut branch and passed through a final ReLU.

    Args:
        in_planes: Number of channels of the incoming feature map.
        planes: Number of channels produced by both 3x3 convolutions.
        stride: Stride of the first 3x3 convolution (and of the projection
            shortcut, when one is needed).
    """

    # Ratio between the block's output channels and `planes`.
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()
        out_planes = self.expansion * planes

        # Main path.
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Shortcut: identity when shapes already match, otherwise a
        # 1x1 convolution + batch norm that matches stride and channel count.
        shape_changes = stride != 1 or in_planes != out_planes
        if shape_changes:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ReLU(main_path(x) + shortcut(x))."""
        hidden = F.relu(self.bn1(self.conv1(x)))
        hidden = self.bn2(self.conv2(hidden))
        return F.relu(hidden + self.shortcut(x))

In [10]:
# Modified ResNet-18: the number of channels in every layer is halved (32/64/128/256 instead of 64/128/256/512).

class ModifiedResNet(nn.Module):
    """ResNet-style classifier with stage widths 32, 64, 128 and 256.

    The network is a 3x3 stem convolution, four stages of residual blocks
    (the last three downsample with stride 2), global average pooling and a
    linear classifier.

    Args:
        block: Residual block class. It must accept
            ``(in_planes, planes, stride)`` and expose an ``expansion``
            class attribute.
        num_blocks: Sequence of four ints, the number of blocks per stage.
        num_classes: Size of the output logits vector.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super(ModifiedResNet, self).__init__()
        # Channel count fed to the next block; updated by _make_layer.
        self.in_planes = 32

        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layer1 = self._make_layer(block, 32, num_blocks[0], stride=1)
        self.layer2 = self._make_layer(block, 64, num_blocks[1], stride=2)
        self.layer3 = self._make_layer(block, 128, num_blocks[2], stride=2)
        self.layer4 = self._make_layer(block, 256, num_blocks[3], stride=2)
        self.linear = nn.Linear(256*block.expansion, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage: the first block uses `stride`, the rest use stride 1."""
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for block_stride in strides:
            layers.append(block(self.in_planes, planes, block_stride))
            # The next block consumes what this one produced.
            self.in_planes = planes * block.expansion
        return nn.Sequential(*layers)

    def forward(self, x):
        """Map a batch of 3-channel images to a (batch, num_classes) logits tensor."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        # Global average pooling. For 32x32 inputs the feature map here is 4x4,
        # so this equals the previous fixed avg_pool2d(out, 4); unlike the fixed
        # kernel it also yields a 1x1 map for other input resolutions, keeping
        # the flattened size equal to what `self.linear` expects.
        out = F.adaptive_avg_pool2d(out, 1)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out

In [11]:
# Instantiate the half-width network with two BasicBlocks in each of its four stages.
modified_model = ModifiedResNet(BasicBlock, [2, 2, 2, 2])

In [12]:
# Print the per-layer summary of the modified network for a 3x32x32 input,
# for comparison with the ResNet18 summary printed earlier.
summary(modified_model.to(device), (3, 32, 32))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 32, 32]             864
       BatchNorm2d-2           [-1, 32, 32, 32]              64
            Conv2d-3           [-1, 32, 32, 32]           9,216
       BatchNorm2d-4           [-1, 32, 32, 32]              64
            Conv2d-5           [-1, 32, 32, 32]           9,216
       BatchNorm2d-6           [-1, 32, 32, 32]              64
        BasicBlock-7           [-1, 32, 32, 32]               0
            Conv2d-8           [-1, 32, 32, 32]           9,216
       BatchNorm2d-9           [-1, 32, 32, 32]              64
           Conv2d-10           [-1, 32, 32, 32]           9,216
      BatchNorm2d-11           [-1, 32, 32, 32]              64
       BasicBlock-12           [-1, 32, 32, 32]               0
           Conv2d-13           [-1, 64, 16, 16]          18,432
      BatchNorm2d-14           [-1, 64,

# Load data

In [13]:
import sklearn

In [14]:
# Per-channel mean / std handed to Normalize in both pipelines.
norm_mean = (0.4914, 0.4822, 0.4465)
norm_std = (0.2023, 0.1994, 0.2010)

# Training pipeline: random 32x32 crop after 4-pixel padding and a random
# horizontal flip, then tensor conversion and normalization.
train_steps = [
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
]
transform_train = transforms.Compose(train_steps)

# Evaluation pipeline: no augmentation, only tensor conversion and normalization.
eval_steps = [
    transforms.ToTensor(),
    transforms.Normalize(norm_mean, norm_std),
]
transform_test = transforms.Compose(eval_steps)

In [15]:
# Arguments shared by all three CIFAR-10 dataset objects.
cifar_common = dict(root='./data', download=True)

# Training split with augmentation.
trainset = torchvision.datasets.CIFAR10(train=True, transform=transform_train, **cifar_common)
# The same training split, but with the evaluation transform: used as the
# validation view so that held-out folds are scored without augmentation.
validset = torchvision.datasets.CIFAR10(train=True, transform=transform_test, **cifar_common)
# Official test split with the evaluation transform.
testset = torchvision.datasets.CIFAR10(train=False, transform=transform_test, **cifar_common)

# Human-readable names for the ten class labels.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [16]:
# Build the half-width network and place it on the selected device.
model = ModifiedResNet(BasicBlock, [2, 2, 2, 2]).to(device)
if device == 'cuda':
    # On GPU: turn on cuDNN benchmark mode and wrap the model in DataParallel.
    cudnn.benchmark = True
    model = torch.nn.DataParallel(model)

In [17]:
# Loss used by train() and test(), which read `criterion` as a module-level global.
criterion = nn.CrossEntropyLoss()
# SGD with momentum and weight decay over the parameters of the `model` built above.
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
# Cosine-annealed learning-rate schedule configured with T_max=200.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
# NOTE(review): train_from_scratch() and train_all() construct their own optimizer and
# scheduler, so these two objects look unused by the training runs below — confirm.

# training

In [18]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)`` of ints; fractional parts are truncated.
    """
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [46]:
# Training
def train(epoch, model, trainloader, optimizer):
    """Run one training epoch and report the average loss and accuracy.

    Reads the module-level globals `device` (where batches are moved) and
    `criterion` (the loss function).

    Args:
        epoch: Index of the current epoch. Not used inside this function.
        model: Network to optimise; switched to training mode.
        trainloader: Iterable of ``(inputs, targets)`` batches that supports ``len()``.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Tuple ``(mean_batch_loss, accuracy_percent)``, where the loss is the
        mean of the per-batch losses.

    Raises:
        ValueError: If `trainloader` yields no batches. Previously this
            surfaced as an UnboundLocalError on the loop index.
    """
    model.train()
    train_loss = 0.0
    correct = 0
    total = 0
    n_batches = 0
    for inputs, targets in tqdm(trainloader, total=len(trainloader)):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        # Predicted class = index of the largest logit.
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        n_batches += 1

    if n_batches == 0:
        raise ValueError('trainloader yielded no batches; cannot compute training metrics')

    return train_loss/n_batches, 100.*correct/total

In [58]:
def test(epoch, model, testloader, fold = None):
    """Evaluate `model`, checkpoint it on improvement and update early-stop state.

    Reads the module-level globals `device` and `criterion`, and reads and
    writes the module-level globals `best_acc` and `patience`. The caller must
    initialise both before the first call.

    Args:
        epoch: Epoch index stored in the checkpoint.
        model: Network to evaluate; switched to eval mode.
        testloader: Iterable of ``(inputs, targets)`` batches.
        fold: Fold identifier used in the checkpoint file name. ``None``
            selects the ``trainAll`` checkpoint name.

    Returns:
        Tuple ``(mean_batch_loss, accuracy_percent)``.

    Raises:
        ValueError: If `testloader` yields no samples. Previously this
            surfaced as a ZeroDivisionError.
    """
    global best_acc
    global patience
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    n_batches = 0
    with torch.no_grad():
        for inputs, targets in testloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            n_batches += 1

    if n_batches == 0 or total == 0:
        raise ValueError('testloader yielded no samples; cannot compute evaluation metrics')

    # Save checkpoint when accuracy improves; otherwise count a stale epoch.
    acc = 100.*correct/total
    if acc > best_acc:
        state = {'model': model.state_dict(),'acc': acc,'epoch': epoch}
        # `fold is None` (rather than `not fold`) so that a fold id of 0 still
        # gets its own per-fold checkpoint file.
        if fold is None:
            ckpt_path = './checkpoint/trainAll_ckpt.pth'
        else:
            ckpt_path = f'./checkpoint/fold{fold}_ckpt.pth'
        # Do not rely on the caller having created the directory.
        os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
        torch.save(state, ckpt_path)
        best_acc = acc
        patience = 0
    else:
        patience += 1

    return test_loss/n_batches, acc

In [59]:
from sklearn.model_selection import KFold
from torch.utils.data import SubsetRandomSampler
import numpy as np

def train_from_scratch(N_EPOCHS, N_FOLD=5, lr=0.1, isTest=False, N_patience=5):
  """Train a fresh ModifiedResNet on each fold of a shuffled k-fold split.

  For every fold a new model, SGD optimizer and cosine-annealing scheduler are
  created. After each epoch the held-out fold is scored with test(), which
  saves './checkpoint/fold{k}_ckpt.pth' whenever validation accuracy improves
  and maintains the global `best_acc` / `patience` counters.

  Args:
    N_EPOCHS: Maximum number of epochs per fold; also the scheduler's T_max.
    N_FOLD: Number of cross-validation folds.
    lr: Initial learning rate for SGD.
    isTest: When True, only indices 0..999 of the training set are split
      into folds (a quick smoke run) instead of the whole set.
    N_patience: Stop a fold once this many consecutive epochs have passed
      without a validation-accuracy improvement. Defaults to 5, the value
      that was previously hard-coded.
  """
  global best_acc
  global patience # to record how many epochs are not improving

  # make the directory for storing checkpoint (no error if it already exists)
  os.makedirs('checkpoint', exist_ok=True)

  # Do k-fold cross validation
  splits = KFold(n_splits = N_FOLD, shuffle = True)

  if isTest:
    dataset_len = 1000
  else:
    dataset_len = len(trainset)

  for fold, (train_idx,val_idx) in enumerate(splits.split(np.arange(dataset_len))):

    print('Fold {}'.format(fold + 1))
    # Training indices are drawn from the augmented dataset, validation indices
    # from the un-augmented view of the same training split.
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(val_idx)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, sampler=train_sampler)
    validloader = torch.utils.data.DataLoader(validset, batch_size=100, sampler=valid_sampler)

    model = ModifiedResNet(BasicBlock, [2, 2, 2, 2])
    # model = ResNet18()
    model.to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=N_EPOCHS)
    # Reset the early-stopping state shared with test() for this fold.
    patience = 0
    best_acc = 0
    for epoch in range(N_EPOCHS):
      start_time = time.time()
      if patience == N_patience:
        print(f"early stop at epoch {epoch}")
        break
      train_loss, train_acc = train(epoch, model, trainloader, optimizer)
      # Folds are numbered from 1 in checkpoint file names.
      valid_loss, valid_acc = test(epoch, model, validloader, fold+1)
      scheduler.step()
      end_time = time.time()
      epoch_mins, epoch_secs = epoch_time(start_time, end_time)
      # Printed after scheduler.step(), so this is the rate for the next epoch.
      print(f'lr={scheduler.get_last_lr()}')
      print(f'Epoch: {epoch+1} | Epoch Time: {epoch_mins}m {epoch_secs}s')
      print(f"epoch{epoch+1} train loss: {train_loss} train acc: {train_acc} valid acc: {valid_acc}")

In [60]:
# Run 5-fold cross-validation with at most 10 epochs per fold.
train_from_scratch(10, N_FOLD=5)

Fold 1


  0%|          | 0/313 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [63]:
def train_all(N_EPOCHS, lr=0.1, N_patience=5):
  """Train a fresh ModifiedResNet on the full training set, scoring on the test set.

  A new model, SGD optimizer and cosine-annealing scheduler are created.
  After each epoch test() is called on the test loader; it saves
  './checkpoint/trainAll_ckpt.pth' whenever accuracy improves and maintains
  the global `best_acc` / `patience` counters.

  NOTE(review): the test set drives both checkpoint selection and early
  stopping here, so the reported test accuracy is not an unbiased estimate.

  Args:
    N_EPOCHS: Maximum number of epochs; also the scheduler's T_max.
    lr: Initial learning rate for SGD.
    N_patience: Stop once this many consecutive epochs have passed without
      a test-accuracy improvement.
  """
  global best_acc
  global patience # to record how many epochs are not improving

  # make the directory for storing checkpoint (no error if it already exists)
  os.makedirs('checkpoint', exist_ok=True)

  # Reshuffle the training data every epoch. Without shuffle=True each epoch
  # replays the same batches in the same order, unlike the k-fold path, which
  # samples through SubsetRandomSampler.
  trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True)
  testloader = torch.utils.data.DataLoader(testset, batch_size=100)

  model = ModifiedResNet(BasicBlock, [2, 2, 2, 2])
  # model = ResNet18()
  model.to(device)
  optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
  scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=N_EPOCHS)
  # Reset the early-stopping state shared with test().
  patience = 0
  best_acc = 0
  for epoch in range(N_EPOCHS):
    start_time = time.time()
    if patience == N_patience:
      print(f"early stop at epoch {epoch}")
      break
    train_loss, train_acc = train(epoch, model, trainloader, optimizer)
    test_loss, test_acc = test(epoch, model, testloader)
    scheduler.step()
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # Printed after scheduler.step(), so this is the rate for the next epoch.
    print(f'lr={scheduler.get_last_lr()}')
    print(f'Epoch: {epoch+1} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f"epoch{epoch+1} train loss: {train_loss} train acc: {train_acc} test acc: {test_acc}")

In [64]:
# Train on the full training set for at most 20 epochs, stopping early after
# 5 consecutive epochs without a test-accuracy improvement.
train_all(20, lr=0.1, N_patience=5)

  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.0993844170297569]
Epoch: 1 | Epoch Time: 0m 35s
epoch1 train loss: 1.6933160721493499 train acc: 37.006 test acc: 48.76


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.09755282581475769]
Epoch: 2 | Epoch Time: 0m 35s
epoch2 train loss: 1.1841417461100137 train acc: 56.866 test acc: 58.44


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.0945503262094184]
Epoch: 3 | Epoch Time: 0m 35s
epoch3 train loss: 0.9338170655852999 train acc: 66.47 test acc: 69.29


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.0904508497187474]
Epoch: 4 | Epoch Time: 0m 35s
epoch4 train loss: 0.7771645590777287 train acc: 72.622 test acc: 72.65


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.08535533905932739]
Epoch: 5 | Epoch Time: 0m 35s
epoch5 train loss: 0.6710366169205102 train acc: 76.598 test acc: 76.74


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.07938926261462367]
Epoch: 6 | Epoch Time: 0m 35s
epoch6 train loss: 0.6065443777824606 train acc: 79.07 test acc: 79.96


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.07269952498697735]
Epoch: 7 | Epoch Time: 0m 35s
epoch7 train loss: 0.5582090838028647 train acc: 80.726 test acc: 77.79


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.06545084971874739]
Epoch: 8 | Epoch Time: 0m 35s
epoch8 train loss: 0.5103426897312369 train acc: 82.374 test acc: 79.71


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.05782172325201156]
Epoch: 9 | Epoch Time: 0m 36s
epoch9 train loss: 0.4748872126002446 train acc: 83.6 test acc: 80.26


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.05000000000000001]
Epoch: 10 | Epoch Time: 0m 35s
epoch10 train loss: 0.4407531162509528 train acc: 84.794 test acc: 79.45


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.04217827674798848]
Epoch: 11 | Epoch Time: 0m 35s
epoch11 train loss: 0.4020641315776064 train acc: 85.958 test acc: 82.86


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.03454915028125264]
Epoch: 12 | Epoch Time: 0m 35s
epoch12 train loss: 0.3693208368233098 train acc: 87.168 test acc: 84.8


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.027300475013022667]
Epoch: 13 | Epoch Time: 0m 35s
epoch13 train loss: 0.32925613693264133 train acc: 88.666 test acc: 85.7


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.020610737385376353]
Epoch: 14 | Epoch Time: 0m 35s
epoch14 train loss: 0.2914211930673751 train acc: 90.006 test acc: 86.51


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.014644660940672629]
Epoch: 15 | Epoch Time: 0m 35s
epoch15 train loss: 0.25428951360151897 train acc: 91.102 test acc: 87.29


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.009549150281252633]
Epoch: 16 | Epoch Time: 0m 35s
epoch16 train loss: 0.21744731642172466 train acc: 92.482 test acc: 87.83


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.005449673790581611]
Epoch: 17 | Epoch Time: 0m 35s
epoch17 train loss: 0.18164744849323922 train acc: 93.752 test acc: 89.37


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.0024471741852423235]
Epoch: 18 | Epoch Time: 0m 36s
epoch18 train loss: 0.14896439698041247 train acc: 94.992 test acc: 89.98


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.0006155829702431171]
Epoch: 19 | Epoch Time: 0m 35s
epoch19 train loss: 0.126872633576698 train acc: 95.71 test acc: 90.39


  0%|          | 0/391 [00:00<?, ?it/s]

lr=[0.0]
Epoch: 20 | Epoch Time: 0m 35s
epoch20 train loss: 0.11332056648038386 train acc: 96.292 test acc: 90.8
