# Preparation

### Load the MNIST dataset

In [1]:
import cupy as np
from struct import unpack
import gzip
import matplotlib.pyplot as plt

def read(filepath, show=False):
    """Load the data section of a gzip-compressed IDX file (MNIST images/labels).

    The IDX header is always parsed and skipped, so the returned array holds
    only the payload bytes, independent of ``show``. The header length is
    derived from the magic number (4 bytes of magic plus one 4-byte size per
    dimension), which makes the function work for label files (1 dimension)
    as well as image files (3 dimensions).

    Parameters
    ----------
    filepath : str
        Path to the ``.gz`` compressed IDX file.
    show : bool, optional
        When True, print the magic number and the dimension sizes.

    Returns
    -------
    array of uint8
        Flat 1-D array with the payload bytes; the caller reshapes it.

    Raises
    ------
    ValueError
        If the magic number does not describe an unsigned-byte IDX file.
    """
    with gzip.open(filepath, 'rb') as f:
        magic, = unpack('>I', f.read(4))
        # Magic layout: two zero bytes, the dtype code (0x08 = unsigned byte)
        # and the number of dimensions in the lowest byte.
        if magic >> 8 != 0x08:
            raise ValueError('Not an unsigned-byte IDX file: magic={}'.format(magic))
        ndim = magic & 0xFF
        dims = unpack('>{}I'.format(ndim), f.read(4 * ndim))
        if show:
            print('magic\t\t', magic)
            known_names = ('num', 'rows', 'cols')
            for axis, size in enumerate(dims):
                name = known_names[axis] if axis < len(known_names) else 'dim{}'.format(axis)
                print(name + '\t\t', size)
        content = np.frombuffer(f.read(), dtype=np.uint8)
    return content

# Paths use forward slashes throughout: they are accepted on Windows, Linux
# and macOS, whereas backslash-separated paths only resolve on Windows.
print('Train*****************************')
train_imgs = read('dataset/MNIST/train-images-idx3-ubyte.gz', show=True).reshape(-1, 28, 28)
train_labels = read('dataset/MNIST/train-labels-idx1-ubyte.gz')

print('Test******************************')
test_imgs = read('dataset/MNIST/t10k-images-idx3-ubyte.gz', show=True).reshape(-1, 28, 28)
test_labels = read('dataset/MNIST/t10k-labels-idx1-ubyte.gz')

# X_train, y_train = mnist_reader.load_mnist('fashion_mnist/data/fashion', kind='train')
# X_test, y_test = mnist_reader.load_mnist('fashion_mnist/data/fashion', kind='t10k')


Train*****************************
magic		 2051
num		 60000
rows		 28
cols		 28
Test******************************
magic		 2051
num		 10000
rows		 28
cols		 28


In [2]:
# Keep at most the last 60000 / 10000 entries, so that any leading bytes that
# are not labels are discarded.
train_labels = train_labels[-60000:]
test_labels = test_labels[-10000:]

# Report the shape of every array, then peek at the first ten training labels.
for caption, array in (
    ('train images shape\t', train_imgs),
    ('train labels shape\t', train_labels),
    ('test images shape\t', test_imgs),
    ('test labels shape\t', test_labels),
):
    print(caption, array.shape)
print(train_labels[:10])

train images shape	 (60000, 28, 28)
train labels shape	 (60000,)
test images shape	 (10000, 28, 28)
test labels shape	 (10000,)
[5 0 4 1 9 2 1 3 1 4]


### Import the MyDL package, which is built on NumPy

In [3]:
import MyDL
import MyDL.data
import MyDL.optimizer as optim
import MyDL.nn as nn

# MLP model

### Data preparation

In [4]:
# Flatten every 28x28 image to a 784-vector and scale the pixels to [0, 1].
# The results go into new names instead of overwriting `train_imgs` /
# `test_imgs`: later sections reshape and rescale those arrays themselves, so
# overwriting them here would get the pixels divided by 255 twice.
train_imgs_flat = train_imgs.reshape(-1, 28 * 28) / 255.0
test_imgs_flat = test_imgs.reshape(-1, 28 * 28) / 255.0

# First 50000 training samples are used for training, the rest for validation.
X_train_mytensor = MyDL.MyTensor(train_imgs_flat[:50000], requires_grad=False)
y_train_mytensor = MyDL.MyTensor(train_labels[:50000], requires_grad=False)
X_val_mytensor = MyDL.MyTensor(train_imgs_flat[50000:], requires_grad=False)
y_val_mytensor = MyDL.MyTensor(train_labels[50000:], requires_grad=False)
X_test_mytensor = MyDL.MyTensor(test_imgs_flat, requires_grad=False)
y_test_mytensor = MyDL.MyTensor(test_labels, requires_grad=False)

train_data = MyDL.data.Dataset(X_train_mytensor, y_train_mytensor)
val_data = MyDL.data.Dataset(X_val_mytensor, y_val_mytensor)
test_data = MyDL.data.Dataset(X_test_mytensor, y_test_mytensor)

### Network Structure

In [5]:
class MLP3(nn.NeuralNetwork):
    """Three-layer fully connected classifier for flattened 28x28 images.

    Layer order in ``forward``: BN1 -> fc1 -> BN2 -> activation -> fc2 ->
    BN3 -> activation -> fc3 -> softmax.

    Parameters
    ----------
    hidden_size1 : int
        Output width of the first linear layer.
    hidden_size2 : int
        Output width of the second linear layer.
    activation : str
        Either ``'relu'`` or ``'tanh'``; anything else raises ``ValueError``.
    """

    def __init__(self, hidden_size1=100, hidden_size2=10, activation='relu'):
        super().__init__()
        self.hidden_size1 = hidden_size1
        self.hidden_size2 = hidden_size2
        self.activ_func = activation

        # 784 inputs -> hidden_size1 -> hidden_size2 -> 10 class scores.
        self.fc1 = nn.Linear(784, hidden_size1, initialize='random')
        self.fc2 = nn.Linear(hidden_size1, hidden_size2, initialize='random')
        self.fc3 = nn.Linear(hidden_size2, 10, initialize='random')

        # Pick the activation layer matching the requested name.
        for known_name, layer_cls in (('relu', nn.ReLU), ('tanh', nn.Tanh)):
            if activation == known_name:
                self.activation = layer_cls()
                break
        else:
            raise ValueError('Unknown activation function')

        self.softmax = nn.Softmax()
        self.BN1 = nn.BatchNorm1d()
        self.BN2 = nn.BatchNorm1d()
        self.BN3 = nn.BatchNorm1d()

    def forward(self, x):
        """Run the network on ``x`` and return the softmax output."""
        hidden = self.fc1(self.BN1(x))
        hidden = self.activation(self.BN2(hidden))
        hidden = self.fc2(hidden)
        hidden = self.activation(self.BN3(hidden))
        return self.softmax(self.fc3(hidden))

### Train the model and search for the best hyperparameters

In [6]:
continue_if_exists = False
highest_val_acc = 0
# Defined up front so the names exist even if no run beats `highest_val_acc`.
best_model_name = None
best_hyperparams = None
num_epochs = 10
# Grid search; the commented-out values are further candidates.
for activ_func in ['relu']:#, 'tanh']:
    for hidden_size1, hidden_size2 in [(100, 10)]:#, (10, 10)]:
        for lambda_L2 in [0.0]:#, 0.0001, 0.001]:
            for lr in [0.1]:#, 0.01, 0.001]:
                model_name = 'MLP3_({},{})_{}_L2-{}_lr-{}'.format(hidden_size1, hidden_size2, activ_func, lambda_L2, lr)
                print(f'model: {model_name}')
                model = MLP3(hidden_size1=hidden_size1, hidden_size2=hidden_size2, activation=activ_func)
                criterion = nn.CrossEntropyLoss()
                optimizer = optim.Adam(model.params, lr=lr, decay_rate=0.2)
                # `model_name` is passed explicitly, matching the ResNet
                # training call, so each configuration is tracked under its
                # own name rather than whatever default MyDL.train uses.
                result = MyDL.train(model, criterion, optimizer,
                                    train_data, val_data, num_epochs=num_epochs,
                                    batch_size=256, lambda_L2=lambda_L2,
                                    result_path='figure/results',
                                    model_path='figure/model_params',
                                    model_name=model_name,
                                    continue_if_exists=continue_if_exists, calc_val_loss_every_iteration=False)
                # Skip saving only when training was continued although
                # continuation was not requested.
                if continue_if_exists or not result['continued_train']:
                    MyDL.save_result(**result, path='figure/results')
                # Track the configuration with the best final validation accuracy.
                if result['val_acc_epoch'][-1] > highest_val_acc:
                    highest_val_acc = result['val_acc_epoch'][-1]
                    best_model_name = model_name
                    best_hyperparams = (hidden_size1, hidden_size2, activ_func, lambda_L2, lr)

model: MLP3_(100,10)_relu_L2-0.0_lr-0.1
iter 49	 loss MyTensor(0.36280099106607955)
iter 99	 loss MyTensor(0.4233962428085796)
iter 149	 loss MyTensor(0.25499650579117844)
Epoch 1/10. Training Loss:   0.372 	 Accuracy: 0.894
            Validation Loss: 0.220 	 Accuracy: 0.937
iter 49	 loss MyTensor(0.16290208217226193)
iter 99	 loss MyTensor(0.1344292861009829)


KeyboardInterrupt: 

In [14]:
# Debug check: which array type backs the first parameter of the model?
first_param = model.params[0]
print(type(first_param.data))

<class 'numpy.ndarray'>


### Display the best model

In [None]:
import os

# Both training cells write their histories with path='figure/results', so the
# history is read from that same directory (the bare 'results' directory is
# never written by this notebook).
# NOTE(review): assumes MyDL.save_result stores '<model_name>.npz' under that
# path — confirm against MyDL.
result_dir = os.path.join('figure', 'results')
print(f'Best model: {best_model_name}')
with np.load(os.path.join(result_dir, f'{best_model_name}.npz')) as result:
    train_loss = result['train_loss_epoch']
    val_loss = result['val_loss_epoch']
    train_acc = result['train_acc_epoch']
    val_acc = result['val_acc_epoch']
# Report the metrics of the final epoch.
print(f'Train loss: {train_loss[-1]:.3}  Val loss: {val_loss[-1]:.3}  Train acc: {train_acc[-1]:.3}  Val acc: {val_acc[-1]:.3}')
print(f'Hyperparameters: {best_hyperparams}')

Best model: MLP3_(100,10)_tanh_L2-0.0_lr-0.1
Train loss: 0.385  Val loss: 0.424  Train acc: 0.864  Val acc: 0.851
Hyperparameters: (100, 10, 'tanh', 0.0, 0.1)


# ResNet

### Data Preparation

In [7]:
def _as_unit_range_nchw(images):
    """Reshape images to (N, 1, 28, 28) and scale integer pixels to [0, 1].

    The division by 255 is applied only while the array still has an integer
    dtype. An array that was already converted to floats (by an earlier cell
    or by a previous run of this cell) is only reshaped, so the pixels are
    never divided by 255 twice and the cell can be re-run safely.
    Assumes float input is already in [0, 1].
    """
    images = images.reshape(-1, 1, 28, 28)
    if images.dtype.kind in 'ui':
        images = images / 255.0
    return images


train_imgs = _as_unit_range_nchw(train_imgs)
test_imgs = _as_unit_range_nchw(test_imgs)

print(train_imgs.shape)

(60000, 1, 28, 28)


In [8]:
# Split point: samples before index 50000 train the model, the rest validate it.
train_part = slice(None, 50000)
val_part = slice(50000, None)

# Inputs and labels are wrapped as constant tensors (no gradients tracked).
X_train_mytensor = MyDL.MyTensor(train_imgs[train_part], requires_grad=False)
y_train_mytensor = MyDL.MyTensor(train_labels[train_part], requires_grad=False)
X_val_mytensor = MyDL.MyTensor(train_imgs[val_part], requires_grad=False)
y_val_mytensor = MyDL.MyTensor(train_labels[val_part], requires_grad=False)
X_test_mytensor = MyDL.MyTensor(test_imgs, requires_grad=False)
y_test_mytensor = MyDL.MyTensor(test_labels, requires_grad=False)

# Pair every input tensor with its label tensor.
train_data = MyDL.data.Dataset(X_train_mytensor, y_train_mytensor)
val_data = MyDL.data.Dataset(X_val_mytensor, y_val_mytensor)
test_data = MyDL.data.Dataset(X_test_mytensor, y_test_mytensor)

### Network Structure

In [10]:
# Residual block: two 3x3 conv + BN stages with a skip connection.
class ResiduleBlock(nn.NeuralNetwork):
    """Basic residual block.

    Main path: conv3x3(stride) -> BN -> ReLU -> conv3x3 -> BN.
    The input is added to the main path output and a final ReLU is applied.
    When ``stride != 1`` or the channel count changes, the input first goes
    through a 1x1 convolution plus batch norm.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2D(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2D(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # A projection on the skip path is needed whenever the main path
        # changes the stride or the number of channels.
        self.cross_block = stride != 1 or in_channels != out_channels
        if self.cross_block:
            self.conv_shortcut = nn.Conv2D(in_channels, out_channels, kernel_size=1,
                                           padding=0, stride=stride, bias=False)
            self.bn_shortcut = nn.BatchNorm2d(out_channels)

    def forward(self, x):
        """Apply the block to ``x`` and return the activated sum of both paths."""
        main_path = self.bn1(self.conv1(x))
        main_path = nn.ReLU.forward(main_path)
        main_path = self.bn2(self.conv2(main_path))

        shortcut = x
        if self.cross_block:
            shortcut = self.bn_shortcut(self.conv_shortcut(shortcut))

        # Residual connection followed by the final activation.
        return nn.ReLU.forward(main_path + shortcut)
    
class ResNetMNIST(nn.NeuralNetwork):
    """Small ResNet for single-channel 28x28 images.

    Pipeline:
        input (batch, 1, 28, 28)
        -> 3x3 conv, 16 channels + BN + ReLU
        -> 2 residual blocks (16 channels, stride 1)
        -> 2 residual blocks (32 channels, first block stride 2)
        -> 2 residual blocks (64 channels, first block stride 2)
        -> full average pooling (length-64 vector per sample)
        -> fully connected layer (64 -> num_classes)
        -> softmax
    """

    def __init__(self, block, num_classes=10):
        super().__init__()
        # Channel count fed into the next residual block; updated by _make_layer.
        self.in_channels = 16

        self.conv = nn.Conv2D(1, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn = nn.BatchNorm2d(16)
        self.layer1 = self._make_layer(block, 16, 2, stride=1)
        self.layer2 = self._make_layer(block, 32, 2, stride=2)
        self.layer3 = self._make_layer(block, 64, 2, stride=2)
        self.avg_pool = nn.FullAveragePool2d()
        self.fc = nn.Linear(64, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride):
        """Build a stage of residual blocks wrapped in ``nn.Sequential``.

        Only the first block uses ``stride``; the remaining ones use stride 1.
        ``self.in_channels`` is set to ``out_channels`` after each block.
        """
        stage = []
        for block_stride in (stride,) + (1,) * (blocks - 1):
            stage.append(block(self.in_channels, out_channels, block_stride))
            self.in_channels = out_channels
        return nn.Sequential(*stage)

    def forward(self, x):
        """Run the network on ``x`` and return the softmax output."""
        features = nn.ReLU.forward(self.bn(self.conv(x)))
        for stage in (self.layer1, self.layer2, self.layer3):
            features = stage(features)
        pooled = self.avg_pool(features)  # (batch, c)
        scores = self.fc(pooled)
        return nn.Softmax.forward(scores)

### Train the model and search for the best hyperparameters

In [12]:
continue_if_exists = False
highest_val_acc = 0
num_epochs = 10

# Every (lambda_L2, lr) pair to try. Further candidates that are currently
# disabled: lambda_L2 in (0.0001, 0.001) and lr in (0.01, 0.001).
search_grid = [(l2, rate) for l2 in [0.0] for rate in [0.01]]

for lambda_L2, lr in search_grid:
    model_name = 'Resnet_L2-{}_lr-{}'.format(lambda_L2, lr)
    print(f'model: {model_name}')

    model = ResNetMNIST(ResiduleBlock)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.params, lr=lr, decay_rate=0.2)

    train_options = dict(
        num_epochs=num_epochs,
        batch_size=128,
        lambda_L2=lambda_L2,
        result_path='figure/results',
        model_path='figure/model_params',
        model_name=model_name,
        continue_if_exists=continue_if_exists,
        calc_val_loss_every_iteration=False,
    )
    result = MyDL.train(model, criterion, optimizer, train_data, val_data, **train_options)

    # Skip saving only when training was continued although continuation
    # was not requested.
    if continue_if_exists or not result['continued_train']:
        MyDL.save_result(**result, path='figure/results')

    # Remember the configuration with the best final validation accuracy.
    if result['val_acc_epoch'][-1] > highest_val_acc:
        highest_val_acc = result['val_acc_epoch'][-1]
        best_model_name = model_name
        best_hyperparams = (lambda_L2, lr)

model: Resnet_L2-0.0_lr-0.01
iter 49	 loss MyTensor(9.341877837019249)
iter 99	 loss MyTensor(6.61601163762726)
iter 149	 loss MyTensor(2.776984568809482)
iter 199	 loss MyTensor(2.6274939064114324)
iter 249	 loss MyTensor(2.704099060901166)
iter 299	 loss MyTensor(2.5743105314904113)
iter 349	 loss MyTensor(2.899393836165248)
Epoch 1/10. Training Loss:   4.688 	 Accuracy: 0.187
            Validation Loss: 2.622 	 Accuracy: 0.303
iter 49	 loss MyTensor(2.805325579825075)
iter 99	 loss MyTensor(2.454717106419736)
iter 149	 loss MyTensor(2.503880833466241)
iter 199	 loss MyTensor(2.7396336062173106)
iter 249	 loss MyTensor(2.575604162101395)
iter 299	 loss MyTensor(2.0813806054723925)
iter 349	 loss MyTensor(2.6142994119692298)
Epoch 2/10. Training Loss:   2.406 	 Accuracy: 0.351
            Validation Loss: 2.272 	 Accuracy: 0.369
iter 49	 loss MyTensor(2.650730584201835)
iter 99	 loss MyTensor(2.237478776213057)
iter 149	 loss MyTensor(2.829342451264705)
iter 199	 loss MyTensor(1.6655