# Convolutional Neural Network (CNN) for MNIST classification

In [129]:
import numpy as np
import torch
import time
import platform
import math

In [130]:
print(f'Pytorch version: {torch.__version__}')
print(f'cuda version: {torch.version.cuda}')
print(f'Python version: {platform.python_version()}')

Pytorch version: 2.9.1+cpu
cuda version: None
Python version: 3.12.9


In [131]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


## MNIST Data preparation

### Load MNIST data and normalize

In [132]:
import torchvision.transforms as transforms
from torchvision.datasets import MNIST
transform = transforms.Compose([
    transforms.ToTensor(),                  # Transform image to tensor
    transforms.Normalize((0.5,), (0.5,))    # Normalize
])

### Load dataset and create Dataloader

In [133]:
train_dataset = MNIST(
    root='mnist_data',      # File root dir
    train=True,             # Load train data
    transform=transform,    # Transform
    download=True           # Download dataset if not exist
)

test_dataset = MNIST(
    root='mnist_data',
    train=False,            # Load test data
    transform=transform,
    download=True
)

In [134]:
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True            # Shuffle the data
)

test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=64,
    shuffle=False
)

In [135]:
print(f"shape of train dataset: {train_loader.dataset.data.shape}")
print(f"shape of test dataset: {test_loader.dataset.data.shape}")

shape of train dataset: torch.Size([60000, 28, 28])
shape of test dataset: torch.Size([10000, 28, 28])


---
## LeNet -5 network

* Convolutional layers can be created as `nn.Conv2d(N, C, K)`. For input images of size `W×H`, the output feature maps have size `[W−K+1]x[H−K+1]`.  

* Maxpooling is implemented like any other non-linear function (such as ReLU or softmax). For input images of size `W×H`, the output feature maps have size `[W/L]×[H/L]`.  

* A fully connected layer can be created as `nn.Linear(M, N)`.

Architecture:  

(a) a convolutional layer connecting the input image to `6` feature maps with `5×5` convolutions (`K=5`) and followed by ReLU and maxpooling (`L=2`)  

(b) a convolutional layer connecting the `6` input channels to `16` output channels with `5×5` convolutions and followed by ReLU and maxpooling (`L=2`)  

(c) a fully-connected layer connecting `16` feature maps to `120` output units and followed by ReLU  

(d) a fully-connected layer connecting `120` inputs to `84` output units and followed by ReLU  

(e) a final linear layer connecting `84` inputs to `10` linear outputs (one for each of our digits)

First layer  
* input: `(28, 28, 1)`  
* after *padding*: `(32, 32, 1)`
* after convolution(kernel=`5x5`): `(28, 28, 6)` where `28=32-5+1`  
* after ReLU: `(28, 28, 6)`  
* after maxpooling(stride=`2x2`): `(14, 14, 6)` $\Rightarrow$ **OUTPUT**  


Second layer
* input: `(14, 14, 6)`
* after convolution(kernel=`5x5`): `(10, 10, 16)`
* after ReLU: `(10, 10, 16)`  
* after maxpooling(stride=`2x2`): `(5, 5, 16)` $\Rightarrow$ **OUTPUT**  


Third layer
* input: `(5, 5, 16)` $\Rightarrow$ `5x5x16=400`  
* after fully-connected: `(120, 1)`
* after ReLU: `(120, 1)` $\Rightarrow$ **OUTPUT**  


Fourth layer
* input: `(120, 1)`
* after fully-connected: `(84, 1)`
* after ReLU: `(84, 1)` $\Rightarrow$ **OUTPUT**  


Fifth layer
* input: `(84, 1)`
* after fully-connected: `(10, 1)`
* after ReLU: `(10, 1)` $\Rightarrow$ **OUTPUT**  

In [136]:
import torch.nn as nn
import torch.nn.functional as F

In [137]:
class LeNet(nn.Module):

    # network structure
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, 5, padding=2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1   = nn.Linear(16*5*5, 120)
        self.fc2   = nn.Linear(120, 84)
        self.fc3   = nn.Linear(84, 10)

    def forward(self, x):
        '''
        One forward pass through the network.
        
        Args:
            x: input
        '''
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        '''
        Get the number of features in a batch of tensors `x`.
        '''
        size = x.size()[1:]
        return np.prod(size)

### Check the network structure

In [138]:
net = LeNet()
print(net)

LeNet(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


### Check the network parameters

In [139]:
for name, param in net.named_parameters():
    print(name, param.size(), param.requires_grad)

conv1.weight torch.Size([6, 1, 5, 5]) True
conv1.bias torch.Size([6]) True
conv2.weight torch.Size([16, 6, 5, 5]) True
conv2.bias torch.Size([16]) True
fc1.weight torch.Size([120, 400]) True
fc1.bias torch.Size([120]) True
fc2.weight torch.Size([84, 120]) True
fc2.bias torch.Size([84]) True
fc3.weight torch.Size([10, 84]) True
fc3.bias torch.Size([10]) True


### The accuracy without backprop

In [140]:
# avoid tracking for gradient during testing and then save some computation time
yinit = []
ltest = []
with torch.no_grad():
    for item in test_loader:
        images, labels = item
        yinit.append(net(images))
        ltest.append(labels)

In [141]:
lpred = []
for y in yinit:
	_, pred = y.max(1)
	lpred.append(pred)

# Flatten ltest and lpred to ensure they are comparable
ltest_flat = torch.cat(ltest)
lpred_flat = torch.cat(lpred)

print(100 * (ltest_flat == lpred_flat).float().mean())

tensor(4.7300)


`ltest_flat == lpred_flat` generates a tensor with values of `0` and `1`, where `0` means inequal and `1` means equal. Therefore, `(ltest_flat == lpred_flat).float().mean()` implies the accuracy which is **4.73%**.

### (Mini-Batch) Stochastic Gradient Descent (SGD) with cross-entropy and momentum

**Note**: PyTorch’s CrossEntropyLoss is the composition of a softmax activation with the standard cross-entropy loss.

In [142]:
def backprop_deep(train_loader, net, T, B=100, gamma=.001, rho=.9, device=device):
    '''
    Args:
        train_loader: Train dataloader
        net: model
        T: training epochs
        B: expected minibatch size (for logging only)
        gamma: learning rate
        rho: momentum
        device: training device
    '''
    net.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(net.parameters(), lr=gamma, momentum=rho)

    for epoch in range(T):
        running_loss = 0.0
        for k, (inputs, labels) in enumerate(train_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            # Forward propagation
            outputs = net(inputs)

            # Error evaluation
            loss = criterion(outputs, labels)

            # Back propagation
            loss.backward()

            # Parameter update
            optimizer.step()

            # Print averaged loss per minibatch every 100 mini-batches
            # Compute and print statistics
            with torch.no_grad():
                running_loss += loss.item()
            if (k + 1) % 100 == 0:
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, k + 1, running_loss / 100))
                running_loss = 0.0

In [143]:
net = LeNet()

In [144]:
start = time.time()
backprop_deep(train_loader, net, T=3)
end = time.time()
print(f'It takes {end-start:.6f} seconds.')

[1,   100] loss: 2.302
[1,   200] loss: 2.296
[1,   300] loss: 2.288
[1,   400] loss: 2.273
[1,   500] loss: 2.247
[1,   600] loss: 2.184
[1,   700] loss: 1.929
[1,   800] loss: 1.111
[1,   900] loss: 0.625
[2,   100] loss: 0.428
[2,   200] loss: 0.366
[2,   300] loss: 0.320
[2,   400] loss: 0.287
[2,   500] loss: 0.269
[2,   600] loss: 0.261
[2,   700] loss: 0.226
[2,   800] loss: 0.207
[2,   900] loss: 0.199
[3,   100] loss: 0.182
[3,   200] loss: 0.183
[3,   300] loss: 0.167
[3,   400] loss: 0.167
[3,   500] loss: 0.148
[3,   600] loss: 0.142
[3,   700] loss: 0.143
[3,   800] loss: 0.133
[3,   900] loss: 0.135
It takes 28.220095 seconds.


### Evaluate on the testing dataset (CPU)

In [145]:
start = time.time()
backprop_deep(test_loader, net, T=3, device='cpu')
end = time.time()
print(f'It takes {end-start:.6f} seconds.')

[1,   100] loss: 0.133
[2,   100] loss: 0.124
[3,   100] loss: 0.113
It takes 4.554592 seconds.


In [148]:
y = []
with torch.no_grad():
    for item in test_loader:
        images, labels = item
        y.append(net(images))

y_flat = torch.cat(y)

In [149]:
print(100 * (ltest_flat==y_flat.max(1)[1]).float().mean())

tensor(96.6600)


The accuracy for 3 epochs is **96.66%**.

### Network on GPU

In [24]:
net_gpu = LeNet().to(device)

In [None]:
start = time.time()
backprop_deep(train_loader, net_gpu, T=10, device='cuda')
end = time.time()
print(f'It takes {end-start:.6f} seconds.')

[1,   100] loss: 2.298
[1,   200] loss: 2.288
[1,   300] loss: 2.274
[1,   400] loss: 2.253
[1,   500] loss: 2.201
[1,   600] loss: 2.031
[2,   100] loss: 1.437
[2,   200] loss: 0.852
[2,   300] loss: 0.606
[2,   400] loss: 0.470
[2,   500] loss: 0.402
[2,   600] loss: 0.357
[3,   100] loss: 0.311
[3,   200] loss: 0.294
[3,   300] loss: 0.251
[3,   400] loss: 0.219
[3,   500] loss: 0.227
[3,   600] loss: 0.210
[4,   100] loss: 0.194
[4,   200] loss: 0.181
[4,   300] loss: 0.164
[4,   400] loss: 0.168
[4,   500] loss: 0.173
[4,   600] loss: 0.151
[5,   100] loss: 0.143
[5,   200] loss: 0.142
[5,   300] loss: 0.131
[5,   400] loss: 0.136
[5,   500] loss: 0.127
[5,   600] loss: 0.115
[6,   100] loss: 0.116
[6,   200] loss: 0.122
[6,   300] loss: 0.110
[6,   400] loss: 0.105
[6,   500] loss: 0.106
[6,   600] loss: 0.104
[7,   100] loss: 0.111
[7,   200] loss: 0.094
[7,   300] loss: 0.097
[7,   400] loss: 0.092
[7,   500] loss: 0.097
[7,   600] loss: 0.087
[8,   100] loss: 0.081
[8,   200] 

### Re-evaluate on the testing dataset (GPU)

In [None]:
y = []
with torch.no_grad():
    for item in test_loader:
        images, labels = item
        y.append(net_gpu(images))

y_flat = torch.cat(y)

In [None]:
print(100 * (ltest_flat==y_flat.max(1)[1]).float().mean())

tensor(97.9800)


The accuracy for 10 epochs is **97.98%** which is higher than that for 3 epochs, **95.42%**.

In [None]:
start = time.time()
backprop_deep(train_loader, net_gpu, T=10, device='cuda')
end = time.time()
print(f'It takes {end-start:.6f} seconds.')

[1,   100] loss: 0.063
[2,   100] loss: 0.059
[3,   100] loss: 0.052
[4,   100] loss: 0.050
[5,   100] loss: 0.049
[6,   100] loss: 0.045
[7,   100] loss: 0.046
[8,   100] loss: 0.043
[9,   100] loss: 0.042
[10,   100] loss: 0.038
It takes 2.311700 seconds.
