# MNIST PyTorch demo

We load the MNIST dataset and train a model on it.  

## CPU version

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

In [2]:
# Training configuration used by the cells below
batch_size = 64       # samples per mini-batch handed to the data loaders
learning_rate = 1e-3  # step size given to the optimizer
num_epochs = 5        # number of passes over the training loader

In [3]:
# Preprocessing pipeline shared by the training and test datasets:
# convert each image to a tensor, then normalize its single channel
# with mean 0.5 and standard deviation 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

In [4]:
# Training split of MNIST, stored under ./data (fetched when requested via
# download=True), wrapped in a loader that yields shuffled mini-batches.
trainset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
trainloader = torch.utils.data.DataLoader(
    dataset=trainset,
    batch_size=batch_size,
    shuffle=True,
)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:12<00:00, 807700.39it/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 367641.58it/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:02<00:00, 811494.70it/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 3453685.42it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw






In [5]:
# Test split of MNIST from the same ./data root; the loader keeps the
# samples in their stored order (no shuffling).
testset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)
testloader = torch.utils.data.DataLoader(
    dataset=testset,
    batch_size=batch_size,
    shuffle=False,
)

In [6]:
# Fully connected classifier for flattened 28 x 28 inputs
class Net(nn.Module):
    """Three linear layers (784 -> 128 -> 64 -> 10) with ReLU after the first two."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=28 * 28, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=10)

    def forward(self, x):
        """Return the raw output of the last linear layer for the batch ``x``.

        The input is first reshaped so that every row holds 28 * 28 values.
        """
        flat = x.view(-1, 28 * 28)
        hidden = torch.relu(self.fc1(flat))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)


# Model instance used by the training and evaluation cells
net = Net()

In [7]:
# Training objective (cross-entropy on the network outputs) and the Adam
# optimizer that updates the network parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=learning_rate)

In [8]:
# Train for `num_epochs` passes over the training loader
for epoch in range(num_epochs):
    running_loss = 0.0  # loss summed since the last progress report
    for i, (inputs, labels) in enumerate(trainloader):
        # Clear the gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass: score the batch and compute its loss
        outputs = net(inputs)
        loss = criterion(outputs, labels)

        # Backward pass, then one optimizer update
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if (i + 1) % 100 == 0:  # report the mean loss of the last 100 mini-batches
            print(f'Epoch [{epoch + 1}, {i + 1}] loss: {running_loss / 100:.3f}')
            running_loss = 0.0

print('Finished Training')

Epoch [1, 100] loss: 1.138
Epoch [1, 200] loss: 0.447
Epoch [1, 300] loss: 0.397
Epoch [1, 400] loss: 0.352
Epoch [1, 500] loss: 0.338
Epoch [1, 600] loss: 0.301
Epoch [1, 700] loss: 0.295
Epoch [1, 800] loss: 0.268
Epoch [1, 900] loss: 0.282
Epoch [2, 100] loss: 0.243
Epoch [2, 200] loss: 0.234
Epoch [2, 300] loss: 0.201
Epoch [2, 400] loss: 0.219
Epoch [2, 500] loss: 0.202
Epoch [2, 600] loss: 0.197
Epoch [2, 700] loss: 0.188
Epoch [2, 800] loss: 0.177
Epoch [2, 900] loss: 0.173
Epoch [3, 100] loss: 0.160
Epoch [3, 200] loss: 0.167
Epoch [3, 300] loss: 0.162
Epoch [3, 400] loss: 0.146
Epoch [3, 500] loss: 0.155
Epoch [3, 600] loss: 0.138
Epoch [3, 700] loss: 0.136
Epoch [3, 800] loss: 0.119
Epoch [3, 900] loss: 0.133
Epoch [4, 100] loss: 0.115
Epoch [4, 200] loss: 0.116
Epoch [4, 300] loss: 0.130
Epoch [4, 400] loss: 0.126
Epoch [4, 500] loss: 0.122
Epoch [4, 600] loss: 0.112
Epoch [4, 700] loss: 0.125
Epoch [4, 800] loss: 0.101
Epoch [4, 900] loss: 0.122
Epoch [5, 100] loss: 0.100
E

In [9]:
# Measure classification accuracy over the test loader
correct = 0
total = 0
with torch.no_grad():  # gradients are not needed while scoring
    for images, labels in testloader:
        outputs = net(images)
        # Index of the largest output per row is the predicted class
        predicted = torch.max(outputs, dim=1)[1]
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the 10000 test images: {100 * correct / total:.2f}%')

Accuracy of the network on the 10000 test images: 96.09%


## GPU version

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

# Training configuration
batch_size = 64       # samples per mini-batch
learning_rate = 1e-3  # step size given to the optimizer
num_epochs = 5        # number of passes over the training loader

# Pick CUDA when it is available, otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

# Preprocessing shared by both splits: tensor conversion followed by
# normalization of the single channel with mean 0.5 and std 0.5
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

# Training split, served in shuffled mini-batches
trainset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
trainloader = torch.utils.data.DataLoader(
    dataset=trainset,
    batch_size=batch_size,
    shuffle=True,
)

# Test split, served in its stored order
testset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)
testloader = torch.utils.data.DataLoader(
    dataset=testset,
    batch_size=batch_size,
    shuffle=False,
)

# Fully connected classifier for flattened 28 x 28 inputs
class Net(nn.Module):
    """Three linear layers (784 -> 128 -> 64 -> 10) with ReLU after the first two."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=28 * 28, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=10)

    def forward(self, x):
        """Return the raw output of the last linear layer for the batch ``x``.

        The input is first reshaped so that every row holds 28 * 28 values.
        """
        flat = x.view(-1, 28 * 28)
        hidden = torch.relu(self.fc1(flat))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)

# Create the network, then move its parameters onto the selected device
net = Net()
net.to(device)

# Training objective and the Adam optimizer over the network parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=learning_rate)

# Train for `num_epochs` passes over the training loader
for epoch in range(num_epochs):
    running_loss = 0.0  # loss summed since the last progress report
    for i, (inputs, labels) in enumerate(trainloader):
        # Place the batch on the same device as the model
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Clear the gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass: score the batch and compute its loss
        outputs = net(inputs)
        loss = criterion(outputs, labels)

        # Backward pass, then one optimizer update
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if (i + 1) % 100 == 0:  # report the mean loss of the last 100 mini-batches
            print(f'Epoch [{epoch + 1}, {i + 1}] loss: {running_loss / 100:.3f}')
            running_loss = 0.0

print('Finished Training')

# Measure classification accuracy over the test loader
correct = 0
total = 0
with torch.no_grad():  # gradients are not needed while scoring
    for images, labels in testloader:
        # Place the batch on the same device as the model
        images = images.to(device)
        labels = labels.to(device)
        outputs = net(images)
        # Index of the largest output per row is the predicted class
        predicted = torch.max(outputs, dim=1)[1]
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the 10000 test images: {100 * correct / total:.2f}%')


Using device: cuda
Epoch [1, 100] loss: 1.015
Epoch [1, 200] loss: 0.472
Epoch [1, 300] loss: 0.402
Epoch [1, 400] loss: 0.325
Epoch [1, 500] loss: 0.307
Epoch [1, 600] loss: 0.300
Epoch [1, 700] loss: 0.280
Epoch [1, 800] loss: 0.269
Epoch [1, 900] loss: 0.249
Epoch [2, 100] loss: 0.221
Epoch [2, 200] loss: 0.217
Epoch [2, 300] loss: 0.190
Epoch [2, 400] loss: 0.193
Epoch [2, 500] loss: 0.188
Epoch [2, 600] loss: 0.201
Epoch [2, 700] loss: 0.182
Epoch [2, 800] loss: 0.183
Epoch [2, 900] loss: 0.163
Epoch [3, 100] loss: 0.153
Epoch [3, 200] loss: 0.146
Epoch [3, 300] loss: 0.142
Epoch [3, 400] loss: 0.151
Epoch [3, 500] loss: 0.134
Epoch [3, 600] loss: 0.134
Epoch [3, 700] loss: 0.129
Epoch [3, 800] loss: 0.133
Epoch [3, 900] loss: 0.142
Epoch [4, 100] loss: 0.109
Epoch [4, 200] loss: 0.117
Epoch [4, 300] loss: 0.118
Epoch [4, 400] loss: 0.106
Epoch [4, 500] loss: 0.128
Epoch [4, 600] loss: 0.112
Epoch [4, 700] loss: 0.109
Epoch [4, 800] loss: 0.100
Epoch [4, 900] loss: 0.120
Epoch [5,

## CNN instead of a fully connected network  

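We replace the first fully connected layers with two convolution + max-pooling stages; a small fully connected head (two linear layers) still produces the 10 class scores.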

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

# Training configuration
batch_size = 64       # samples per mini-batch
learning_rate = 1e-3  # step size given to the optimizer
num_epochs = 5        # number of passes over the training loader

# Pick CUDA when it is available, otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

# Preprocessing shared by both splits: tensor conversion followed by
# normalization of the single channel with mean 0.5 and std 0.5
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

# Training split, served in shuffled mini-batches
trainset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
trainloader = torch.utils.data.DataLoader(
    dataset=trainset,
    batch_size=batch_size,
    shuffle=True,
)

# Test split, served in its stored order
testset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)
testloader = torch.utils.data.DataLoader(
    dataset=testset,
    batch_size=batch_size,
    shuffle=False,
)

# Convolutional classifier
class CNN(nn.Module):
    """Two conv + ReLU + max-pool stages followed by a two-layer linear head.

    The first linear layer expects 64 * 7 * 7 features per sample, which is
    what the two stages produce for 28 x 28 single-channel inputs.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(in_features=64 * 7 * 7, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return the raw output of the last linear layer for the batch ``x``."""
        features = self.pool(torch.relu(self.conv1(x)))
        features = self.pool(torch.relu(self.conv2(features)))
        # Reshape so that every row holds the 64 * 7 * 7 values fc1 expects
        flat = features.view(-1, 64 * 7 * 7)
        hidden = torch.relu(self.fc1(flat))
        return self.fc2(hidden)

# Create the CNN, then move its parameters onto the selected device
net = CNN()
net.to(device)

# Training objective and the Adam optimizer over the network parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=learning_rate)

# Train for `num_epochs` passes over the training loader
for epoch in range(num_epochs):
    running_loss = 0.0  # loss summed since the last progress report
    for i, (inputs, labels) in enumerate(trainloader):
        # Place the batch on the same device as the model
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Clear the gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass: score the batch and compute its loss
        outputs = net(inputs)
        loss = criterion(outputs, labels)

        # Backward pass, then one optimizer update
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if (i + 1) % 100 == 0:  # report the mean loss of the last 100 mini-batches
            print(f'Epoch [{epoch + 1}, {i + 1}] loss: {running_loss / 100:.3f}')
            running_loss = 0.0

print('Finished Training')

# Measure classification accuracy over the test loader
correct = 0
total = 0
with torch.no_grad():  # gradients are not needed while scoring
    for images, labels in testloader:
        # Place the batch on the same device as the model
        images = images.to(device)
        labels = labels.to(device)
        outputs = net(images)
        # Index of the largest output per row is the predicted class
        predicted = torch.max(outputs, dim=1)[1]
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the 10000 test images: {100 * correct / total:.2f}%')


Using device: cuda
Epoch [1, 100] loss: 0.716
Epoch [1, 200] loss: 0.174
Epoch [1, 300] loss: 0.130
Epoch [1, 400] loss: 0.094
Epoch [1, 500] loss: 0.089
Epoch [1, 600] loss: 0.073
Epoch [1, 700] loss: 0.079
Epoch [1, 800] loss: 0.074
Epoch [1, 900] loss: 0.061
Epoch [2, 100] loss: 0.055
Epoch [2, 200] loss: 0.043
Epoch [2, 300] loss: 0.057
Epoch [2, 400] loss: 0.047
Epoch [2, 500] loss: 0.044
Epoch [2, 600] loss: 0.040
Epoch [2, 700] loss: 0.045
Epoch [2, 800] loss: 0.047
Epoch [2, 900] loss: 0.047
Epoch [3, 100] loss: 0.034
Epoch [3, 200] loss: 0.031
Epoch [3, 300] loss: 0.029
Epoch [3, 400] loss: 0.031
Epoch [3, 500] loss: 0.039
Epoch [3, 600] loss: 0.030
Epoch [3, 700] loss: 0.028
Epoch [3, 800] loss: 0.035
Epoch [3, 900] loss: 0.035
Epoch [4, 100] loss: 0.018
Epoch [4, 200] loss: 0.027
Epoch [4, 300] loss: 0.016
Epoch [4, 400] loss: 0.023
Epoch [4, 500] loss: 0.024
Epoch [4, 600] loss: 0.020
Epoch [4, 700] loss: 0.028
Epoch [4, 800] loss: 0.026
Epoch [4, 900] loss: 0.024
Epoch [5,

## Test the saving and loading of a trained model  

We save the entire model object (architecture and weights, not just the `state_dict`), reload it, and re-run the test-set evaluation to check that the accuracy is unchanged.  

In [23]:
# Serialize the whole model object (not only its parameters) to a file
model_path = 'cnn_mnist_model.pth'
torch.save(obj=net, f=model_path)
print(f'Entire model saved to {model_path}')


Entire model saved to cnn_mnist_model.pth


In [24]:
# Load the entire model back from disk.
# - map_location=device remaps the stored tensors onto the current device, so
#   a checkpoint written on a GPU machine can still be opened on a CPU-only one.
# - weights_only=False is stated explicitly because the file holds a pickled
#   nn.Module, which weights-only loading (the default in PyTorch >= 2.6) rejects.
# NOTE: whole-model checkpoints are unpickled on load, which can execute
# arbitrary code. Only load files you created yourself or fully trust.
# The model's class definition must already be available in this session.
net = torch.load(model_path, map_location=device, weights_only=False)
net = net.to(device)  # Ensure the model is moved to the correct device
net.eval()  # Set the model to evaluation mode
print('Model loaded and ready for evaluation')


Model loaded and ready for evaluation


In [25]:
# Re-evaluate the reloaded model on the test loader.
# The counters start from zero here so the result reflects this pass only,
# instead of adding to totals left behind by an earlier evaluation cell.
correct = 0
total = 0
with torch.no_grad():  # scoring only: skip building the autograd graph
    for images, labels in testloader:
        # Move data to the same device as the model
        images, labels = images.to(device), labels.to(device)
        outputs = net(images)
        # Index of the largest output per row is the predicted class
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the 10000 test images: {100 * correct / total:.2f}%')

Accuracy of the network on the 10000 test images: 99.18%
