# MNIST

- CNN関連で色々と試すのに向いている
  - 28x28なので学習時間が短い
  - 広範囲なベンチマークが存在する
- [courses/mnist.ipynb](https://github.com/fastai/courses/blob/master/deeplearning1/nbs/mnist.ipynb)
- [mnist/main.py](https://github.com/pytorch/examples/blob/master/mnist/main.py)
- [vision/mnist.py](https://github.com/pytorch/vision/blob/master/torchvision/datasets/mnist.py)
- [Pytorch tutorial](http://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html)

# MNISTデータのロード

In [18]:
import time
import os

import numpy as np
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt
from PIL import Image

import torch
import torchvision
from torch import nn
from torch import optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
from torchvision import transforms, datasets
torch.set_printoptions(precision=4, linewidth=100)

In [19]:
batch_size = 64

In [25]:
train_dataset = datasets.MNIST('../data', train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
valid_dataset = datasets.MNIST('../data', train=False, transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
datasets_dict = {'train': train_dataset, 'valid': valid_dataset}
dataset_sizes = {x: len(datasets_dict[x]) for x in ['train', 'valid']}
dataloaders = {x: torch.utils.data.DataLoader(datasets_dict[x], batch_size=batch_size,
                                             shuffle=False, num_workers=4)
              for x in ['train', 'valid']}

In [26]:
train_labels = train_dataset.train_labels
print(len(train_dataset))
print(train_labels)

60000

 5
 0
 4
⋮ 
 5
 6
 8
[torch.LongTensor of size 60000]



## 準備

In [5]:
use_gpu = torch.cuda.is_available()

In [6]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = model.state_dict()
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'valid']:
            if phase == 'train':
                scheduler.step()
                model.train(True)  # Set model to training mode
            else:
                model.train(False)  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for data in dataloaders[phase]:
                # get the inputs
                inputs, labels = data

                # wrap them in Variable
                if use_gpu:
                    inputs = Variable(inputs.cuda())
                    labels = Variable(labels.cuda())
                else:
                    inputs, labels = Variable(inputs), Variable(labels)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                outputs = model(inputs)
                _, preds = torch.max(outputs.data, 1)
                loss = criterion(outputs, labels)

                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                # statistics
                running_loss += loss.data[0] # loss.dataはsize 1 のTensorなので[0]を取る
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            # 最も良いモデルの重みを変数に保持
            if phase == 'valid' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = model.state_dict()

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

## CNN

- 最初: 12x12
- kernel_sizeを5にして、paddingをしていないので、各辺で(5-1)/2=2ピクセル減る
- dropoutでピクセルを減らすので、1/2になる

In [11]:
class NetNormal(nn.Module):    
    def __init__(self):
        super(NetNormal, self).__init__()
        # モノクロなので、in_channelsは1
        # out_channelsを10にして、10個のフィルターを掛ける
        # paddingが0なので、各辺で(5-1)/2=2マス減る => 28x28 -> 24x24
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        # 12x12 -> (12-4)x(12-4) -> 4x4 になっている
        # out_channelsが20なので、320
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        # maxpoolのカーネルサイズが2なので、24x24 -> 12x12
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, self.num_flat_features(x)) # fcに掛けるために、flattenする
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return x # F.log_softmax(x)
    
    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In [12]:
model_normal = NetNormal()

In [13]:
if use_gpu:
    model_normal = model_normal.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model_normal.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [27]:
model_normal = train_model(model_normal, criterion, optimizer, exp_lr_scheduler, num_epochs=3)

Epoch 0/2
----------
train Loss: 0.0057 Acc: 0.8908
valid Loss: 0.0019 Acc: 0.9604

Epoch 1/2
----------
train Loss: 0.0049 Acc: 0.9074
valid Loss: 0.0016 Acc: 0.9689

Epoch 2/2
----------
train Loss: 0.0043 Acc: 0.9192
valid Loss: 0.0014 Acc: 0.9722

Training complete in 0m 22s
Best val Acc: 0.972200


# Batchnorm + dropout

In [29]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv_bn1 = nn.BatchNorm2d(10)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv_bn2 = nn.BatchNorm2d(20)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc_bn = nn.BatchNorm1d(50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = self.conv_bn1(x)
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = self.conv_bn2(x)
        x = x.view(-1, self.num_flat_features(x)) # 320
        x = F.relu(self.fc1(x))
        x = self.fc_bn(x)
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return x # F.log_softmax(x)
    
    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In [30]:
model_with_bn = Net()

In [31]:
if use_gpu:
    model_with_bn = model_with_bn.cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model_with_bn.parameters(), lr=0.001, momentum=0.9)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [32]:
model_with_bn = train_model(model_with_bn, criterion, optimizer, exp_lr_scheduler, num_epochs=3)

Epoch 0/2
----------
train Loss: 0.0165 Acc: 0.7133
valid Loss: 0.0046 Acc: 0.9453

Epoch 1/2
----------
train Loss: 0.0067 Acc: 0.8929
valid Loss: 0.0022 Acc: 0.9629

Epoch 2/2
----------
train Loss: 0.0047 Acc: 0.9209
valid Loss: 0.0016 Acc: 0.9708

Training complete in 0m 22s
Best val Acc: 0.970800


## Ensemble

複数のモデルで予測して、そのスコアの平均を取る