In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [1]:
import glob

In [2]:
train_path = '../input/cifar/train/*.png'
test_path = '../input/cifar/test/*.png'

train_imgs = glob.glob(train_path)
test_imgs = glob.glob(test_path)
labels = []
with open("../input/cifar/labels.txt", 'r') as f:
    labels = f.readlines()
label_idx_dict = {label.replace('\n',''):idx for idx, label in enumerate(labels)}
label_idx_dict

{'airplane': 0,
 'automobile': 1,
 'bird': 2,
 'cat': 3,
 'deer': 4,
 'dog': 5,
 'frog': 6,
 'horse': 7,
 'ship': 8,
 'truck': 9}

In [3]:
import torch
import torchvision
import PIL
import time

In [4]:
transform_dict = {
    'train':torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
    ]),
    'test':torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
    ])
}

In [5]:
class CIFAR10Dataset(torch.utils.data.Dataset):
    def __init__(self,img_path, label_idx_dict,transform=None):
        super(CIFAR10Dataset,self).__init__()
        self.img_path = img_path
        self.label_idx_dict = label_idx_dict
        self.transform = transform
        
    def __getitem__(self,idx):
        img_p = self.img_path[idx]
        img = PIL.Image.open(img_p)
        label = img_p.split('/')[-1].split('_')[-1].replace('.png','')
        label_idx = self.label_idx_dict[label]
        
        if self.transform:
            img = self.transform(img)
        
        return img, label_idx
        
    def __len__(self):
        return len(self.img_path)

In [6]:
train_ds = CIFAR10Dataset(train_imgs, label_idx_dict, transform_dict['train'])
test_ds = CIFAR10Dataset(test_imgs, label_idx_dict, transform_dict['test'])

In [7]:
batch_size = 1024
num_epoch = 200

In [8]:
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [31]:
net = torchvision.models.mobilenet_v2(pretrained=True)
net.classifier[1] = torch.nn.Linear(1280,10)
model_name = net.__class__.__name__
net

MobileNetV2(
  (features): Sequential(
    (0): ConvBNReLU(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=Tr

In [None]:
net = torchvision.models.resnet50(pretrained=True)
net.fc = torch.nn.Linear(2048, 10)
net

In [32]:
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(),lr=5e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,T_max=num_epoch)

In [33]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
torch.backends.cudnn.benchmark = True
net.to(device)

MobileNetV2(
  (features): Sequential(
    (0): ConvBNReLU(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): ConvBNReLU(
          (0): Conv2d(16, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=Tr

In [None]:
hist = {'loss':[], 'acc':[], 'val_loss':[], 'val_acc':[]}
print("MODEL: {}".format(model_name))
for epoch in range(num_epoch):
    since = time.time()
    epoch_loss = 0.
    epoch_corrects = 0.
    epoch_imgs = 0.
    
    for batch in train_dl:
        imgs, labels = batch
        imgs = imgs.to(device)
        labels = labels.to(device)
        epoch_imgs += len(imgs)
        
        optimizer.zero_grad()
        outputs=net(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item() * imgs.size(0)
        epoch_corrects += sum(outputs.argmax(1)==labels)
        
    l = epoch_loss / epoch_imgs
    a = epoch_corrects / epoch_imgs

    hist['loss'].append(l)
    hist['acc'].append(a)
    
    print("Epoch {}: Loss: {:.4f} Acc: {:.4} Time: {}".format(epoch+1,l, a, time.time()-since))
    
    
    net.eval()
    val_loss = 0.
    val_corrects = 0.
    val_imgs = 0.
    
    for batch in test_dl:
        imgs, labels = batch
        imgs = imgs.to(device)
        labels = labels.to(device)
        val_imgs += len(labels)
        
        with torch.no_grad():
            outputs = net(imgs)
            loss = criterion(outputs, labels)
            
            val_loss += loss.item()*imgs.size(0)
            val_corrects += sum(outputs.argmax(1) == labels)
            
    l = val_loss / val_imgs
    a = val_corrects / val_imgs
    cur_lr = scheduler.optimizer.param_groups[0]['lr']
    
    hist['val_loss'].append(l)
    hist['val_acc'].append(a)
    
    print("Val: Loss: {:.4f} Acc: {:.4f} lr: {:.6f}".format(l, a, cur_lr))
    
    scheduler.step()

MODEL: MobileNetV2
Epoch 1: Loss: 1.7748 Acc: 0.3757 Time: 43.0116400718689
Val: Loss: 1.3341 Acc: 0.5303 lr: 0.000050
Epoch 2: Loss: 1.1094 Acc: 0.605 Time: 46.88619947433472
Val: Loss: 1.0313 Acc: 0.6335 lr: 0.000050
Epoch 3: Loss: 0.9126 Acc: 0.6769 Time: 44.9997763633728
Val: Loss: 0.9380 Acc: 0.6702 lr: 0.000050
Epoch 4: Loss: 0.8044 Acc: 0.7168 Time: 47.45250487327576
Val: Loss: 0.8854 Acc: 0.6856 lr: 0.000050
Epoch 5: Loss: 0.7205 Acc: 0.7469 Time: 46.69953894615173
Val: Loss: 0.8602 Acc: 0.6994 lr: 0.000050
Epoch 6: Loss: 0.6506 Acc: 0.7704 Time: 45.68670964241028
Val: Loss: 0.8325 Acc: 0.7100 lr: 0.000050
Epoch 7: Loss: 0.5893 Acc: 0.7935 Time: 44.32051682472229
Val: Loss: 0.8258 Acc: 0.7182 lr: 0.000050
Epoch 8: Loss: 0.5282 Acc: 0.8164 Time: 46.67786121368408
Val: Loss: 0.8261 Acc: 0.7205 lr: 0.000050
Epoch 9: Loss: 0.4814 Acc: 0.8342 Time: 46.08226156234741
Val: Loss: 0.8334 Acc: 0.7230 lr: 0.000050
Epoch 10: Loss: 0.4284 Acc: 0.8532 Time: 46.18557786941528
Val: Loss: 0.827

Epoch 83: Loss: 0.0002 Acc: 1.0 Time: 49.17996573448181
Val: Loss: 2.2623 Acc: 0.7268 lr: 0.000032
Epoch 84: Loss: 0.0002 Acc: 1.0 Time: 50.6673309803009
Val: Loss: 2.2683 Acc: 0.7265 lr: 0.000032
Epoch 85: Loss: 0.0002 Acc: 1.0 Time: 49.31101655960083
Val: Loss: 2.2747 Acc: 0.7265 lr: 0.000031
Epoch 86: Loss: 0.0002 Acc: 1.0 Time: 48.22125864028931
Val: Loss: 2.2795 Acc: 0.7274 lr: 0.000031
Epoch 87: Loss: 0.0002 Acc: 1.0 Time: 48.437172651290894
Val: Loss: 2.2860 Acc: 0.7269 lr: 0.000030
Epoch 88: Loss: 0.0002 Acc: 1.0 Time: 49.426344871520996
Val: Loss: 2.2913 Acc: 0.7268 lr: 0.000030
Epoch 89: Loss: 0.0002 Acc: 1.0 Time: 49.47222423553467
Val: Loss: 2.2971 Acc: 0.7269 lr: 0.000030
Epoch 90: Loss: 0.0002 Acc: 1.0 Time: 48.13154053688049
Val: Loss: 2.3030 Acc: 0.7261 lr: 0.000029
Epoch 91: Loss: 0.0002 Acc: 1.0 Time: 48.14185643196106
Val: Loss: 2.3083 Acc: 0.7264 lr: 0.000029
Epoch 92: Loss: 0.0002 Acc: 1.0 Time: 47.74266791343689
Val: Loss: 2.3137 Acc: 0.7273 lr: 0.000029
Epoch 93:

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

epochs = range(1, num_epoch+1)

plt.title('Loss')
plt.plot(epochs, hist['loss'],'ro', label='train')
plt.plot(epochs, hist['val_loss'], 'r',label='val')
plt.legend()

plt.savefig('./mobv2_loss.png')

plt.figure()
plt.title('Acc')
plt.plot(epochs, hist['acc'], 'bo', label='train')
plt.plot(epochs, hist['val_acc', 'b', label='val'])
plt.legend()

plt.savefig('./mobv2_acc.png')

## Discussion
Optimizer: Adam
Situation:
MobileNetv2 doesn't learn anything until 30 epochs in the first try(batch_size=1024, T_max=num_epoch/2)
But, in the second try(batch_size=5096), model learns a bit. 
Again, if batch_size=1024, the model doesn't learn anything.

Possible Causes:

1. T_max = num_epoch/2 causes this bad situation...? -> model stuck in the bad local optima.
2. Something wrong with code...?
3. lr=1e-3 is too big...?
4. lr=1e-3 is too small...?
5. Cosine Annealing LRScheduler should work with SGD as the original paper...? -> CosineAnnealingLR doesn't go well with Adam...?

Experiments:

1. T_max = num_epoch
2. ResNet50 instead of MobileNetv2
3. 
  * T_max = num_epoch/10 == 20
  * **lr=1e-5** 
4. Init lr=1e-1

Results:

1. Nothing happens.(Still doesn't learn)
2. ResNet50 Learns -> Code is fine...
3. 
  * Nothing happens.
  * **MobileNetv2 Learns!!**  
    Epoch 93: Loss: 0.0002 Acc: 1.0 Time: 48.925941944122314  
    Val: Loss: 2.3194 Acc: 0.7276 lr: 0.000028  
4. Nothing happens