In [99]:
import os
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.transforms as transforms

# Defining Model
Subclassing `nn.Module` is the most common way to build a neural network (NN) in PyTorch: layers are declared as attributes and stacked to form a network, and we keep full control over the forward pass by writing the `forward` method ourselves.

The Linear layer, also known as a fully connected layer or dense layer, is best represented by $f(wx + b)$, where $x$ represents a tensor containing the input features, $w$ and $b$ are the weight matrix and bias vector, respectively, and $f$ is the activation function.

Because each layer in a NN receives its input from the previous layer, its input dimensionality is fixed. Typically, we only need to choose the output dimensionality of each layer when designing a NN architecture. In this case, we'd like to define a model with two fully connected layers: one hidden layer and one output layer. The first takes $784$ features as input and projects them to $25$ neurons. Because we have $10$ class labels, the second layer receives the output of the previous layer (which has a size of $25$) and projects it to $10$ output neurons.

In [100]:
# a simple classifier
class NN(nn.Module):
    """Two-layer fully connected classifier: Linear -> ReLU -> Linear.

    Args:
        in_features: Size of the last dimension of the input passed to
            ``forward`` (the number of features per sample).
        num_classes: Number of output scores produced per sample.
        hidden_features: Width of the hidden layer. Defaults to 25, the value
            that used to be hard-coded, so existing ``NN(in_features,
            num_classes)`` calls and saved state dicts keep working.
    """

    def __init__(self, in_features, num_classes, hidden_features=25):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, num_classes)

    def forward(self, x):
        """Return the raw class scores for ``x``.

        ``x`` is fed directly to ``fc1``; no flattening is done here, so the
        caller must supply inputs whose last dimension is ``in_features``.
        No activation is applied to the output of ``fc2``.
        """
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [101]:
# Use the CUDA device when one is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [102]:
# Build the classifier (784 input features, 10 output classes) and place its
# parameters on the selected device.
model = NN(in_features=784, num_classes=10).to(device)

# Data Loader & Preprocessing
`torchvision` provides several downloadable datasets. All of them are subclasses of `torch.utils.data.Dataset` and can therefore be passed to the `torch.utils.data.DataLoader` class. Find out more about the torchvision datasets [here](https://pytorch.org/vision/0.8/datasets.html).

In [103]:
# load data
# Training split: downloaded into ./datasets if missing; every image is
# converted to a tensor by ToTensor.
train_data = datasets.MNIST(
    root='./datasets',
    train=True,
    transform=transforms.ToTensor(),
    download=True
)
# Shuffle the training batches so each epoch visits the samples in a new order.
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

# Held-out test split, prepared with the same transform.
test_data = datasets.MNIST(
    root='./datasets',
    train=False,
    transform=transforms.ToTensor(),
    download=True
)
# Evaluation does not benefit from shuffling; a fixed order keeps repeated
# evaluation runs iterating over identical batches.
test_loader = DataLoader(test_data, batch_size=64, shuffle=False)

# Hyperparameters & Optimizers

In [104]:
# Tunable settings for the experiment.
in_features, num_classes = 784, 10  # input width and number of target classes
learning_rate = 1e-3                # step size handed to the optimizer
num_epochs = 10                     # number of full passes over the training set

In [105]:
# Loss comparing the model's raw class scores with the integer target labels.
criterion = nn.CrossEntropyLoss()
# Adam optimizer over every parameter of `model`, using the configured step size.
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [106]:
# Per-epoch history: one zero-initialised slot per epoch for loss and accuracy.
loss_per_epoch = [0 for _ in range(num_epochs)]
acc = [0 for _ in range(num_epochs)]

# Training

In [107]:
for epoch in range(num_epochs):
    # Accumulate in locals and write the history lists once per epoch, so
    # re-running this cell overwrites old values instead of adding to them.
    running_loss = 0.0
    running_correct = 0

    for batch_idx, (data, targets) in enumerate(train_loader):
        # Moves the data to the GPU/CPU
        data = data.to(device)
        # Flattens every sample in the batch into a 1-D feature vector
        data = data.reshape(data.shape[0], -1)
        # Moves the target labels to GPU/CPU
        targets = targets.to(device)

        # Make predictions with the current parameters
        scores = model(data)
        # Calculates loss of the current minibatch
        loss = criterion(scores, targets)

        # Resets the gradients to zero
        optimizer.zero_grad()
        # Compute gradients of loss function with respect to parameters
        loss.backward()
        # Updates parameters
        optimizer.step()

        # The criterion reports the batch mean, so weight it by the batch size
        # to obtain a per-sample sum that can be averaged over the dataset.
        running_loss += loss.item() * data.size(0)
        # Count correct predictions (not the batch mean) so that dividing by
        # the dataset size below yields the true epoch accuracy; .item() keeps
        # the history as plain Python numbers rather than device tensors.
        running_correct += (torch.argmax(scores, dim=1) == targets).sum().item()

        print('Batch {}| Loss {}'.format(batch_idx, loss))

    num_samples = len(train_loader.dataset)
    loss_per_epoch[epoch] = running_loss / num_samples
    acc[epoch] = running_correct / num_samples

Batch 0| Loss 2.317640781402588
Batch 1| Loss 2.316059112548828
Batch 2| Loss 2.284973621368408
Batch 3| Loss 2.2830348014831543
Batch 4| Loss 2.241117238998413
Batch 5| Loss 2.2357306480407715
Batch 6| Loss 2.2427940368652344
Batch 7| Loss 2.1992528438568115
Batch 8| Loss 2.1978189945220947
Batch 9| Loss 2.1188087463378906
Batch 10| Loss 2.14697265625
Batch 11| Loss 2.164813280105591
Batch 12| Loss 2.091487169265747
Batch 13| Loss 2.0554986000061035
Batch 14| Loss 2.0829992294311523
Batch 15| Loss 2.079868793487549
Batch 16| Loss 2.100694179534912
Batch 17| Loss 2.0447051525115967
Batch 18| Loss 2.024963855743408
Batch 19| Loss 1.9874577522277832
Batch 20| Loss 1.9627504348754883
Batch 21| Loss 2.012830972671509
Batch 22| Loss 1.8261098861694336
Batch 23| Loss 1.951613426208496
Batch 24| Loss 1.9314908981323242
Batch 25| Loss 1.8572230339050293
Batch 26| Loss 1.914268136024475
Batch 27| Loss 1.7805200815200806
Batch 28| Loss 1.7670564651489258
Batch 29| Loss 1.800213098526001
Batch 30

# Saving & Loading Models
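Trained models can be saved to disk and reused in the future. When you call `torch.save(model, path)`, **you are saving the whole model object: its structure together with all of the learned parameters**. The object is serialized with Python's `pickle`, so the class definition (`NN` here) must still be available when the file is loaded. As a standard practice, we save models with the '.pt' or '.pth' file extensions.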

In [108]:
# Make sure a 'models' entry exists in the current working directory before
# any checkpoint is written, and report which case applied.
if 'models' in os.listdir():
    print('models directory already exists!')
else:
    os.mkdir('models')
    print('models directory created!')

models directory already exists!


In [109]:
# Serialize the complete module object (not just its weights) to disk.
torch.save(obj=model, f='models/ann.pth')

In [110]:
# Restore the complete module object that was written by torch.save.
# NOTE(review): newer PyTorch releases default torch.load to weights_only=True,
# which refuses fully pickled modules -- confirm against the installed version.
model = torch.load(f='models/ann.pth')
# Switch to evaluation mode; as the last expression of the cell this also
# displays the module summary.
model.eval()

NN(
  (fc1): Linear(in_features=784, out_features=25, bias=True)
  (fc2): Linear(in_features=25, out_features=10, bias=True)
)

Instead, if you wanted, you could also save just the parameters, not the architecture.

In [111]:
# Persist only the learned parameters (the state dict), not the module object.
torch.save(obj=model.state_dict(), f='models/ann_state.pth')

In [112]:
# Rebuild the architecture and place it on the active device; without .to(device)
# the fresh model stays on the CPU while later cells move their inputs to
# `device`, which fails with a device mismatch on a CUDA machine.
model = NN(784, 10).to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint written
# on a GPU can still be restored on a CPU-only machine.
model.load_state_dict(torch.load('models/ann_state.pth', map_location=device))

<All keys matched successfully>

# Model Evaluation

In [113]:
def check_accuracy(loader, model):
    """Print and return the fraction of samples that ``model`` classifies correctly.

    Args:
        loader: Iterable yielding ``(inputs, labels)`` batches. Each input
            batch is reshaped to ``(batch_size, -1)`` before the forward pass.
        model: ``nn.Module`` returning one score per class for every sample.

    Returns:
        The accuracy as a Python float in ``[0, 1]``; ``0.0`` when the loader
        yields no samples.
    """
    # Run on whatever device the model's parameters live on instead of relying
    # on a module-level variable that may disagree with the model's placement.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    correct = 0
    total = 0
    # Remember the current mode so evaluation does not silently flip a model
    # that was already in eval mode back into training mode.
    was_training = model.training
    model.eval()

    try:
        with torch.no_grad():
            for x, y in loader:
                if target_device is not None:
                    x = x.to(target_device)
                    y = y.to(target_device)
                x = x.reshape(x.shape[0], -1)

                scores = model(x)
                predictions = scores.argmax(dim=1)
                # .item() keeps the counter a plain int rather than a tensor.
                correct += (predictions == y).sum().item()
                total += predictions.size(0)
    finally:
        model.train(was_training)

    # Guard against an empty loader, which would otherwise divide by zero.
    accuracy = correct / total if total else 0.0
    print('Accuracy: ', accuracy)
    return accuracy

In [114]:
# Report accuracy on the held-out test batches; the trailing semicolon keeps
# the cell from echoing any return value.
check_accuracy(loader=test_loader, model=model);

Accuracy:  tensor(0.9599)
