In [155]:
import os
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.transforms as transforms

# Defining Model
The `nn.Module`, which allows layers to be stacked to form a network, is the most commonly used approach for building a NN in PyTorch. We now have more control over the forward pass.

The Linear layer, also known as a fully connected layer or dense layer, is best represented by $f(wx + b)$, where $x$ represents a tensor containing the input features, $w$ and $b$ are the weight matrix and bias vector, respectively, and $f$ is the activation function.

Because each layer in a NN receives input from the previous layer, its dimensionality is fixed. Typically, we only need to consider output dimensionality when designing a NN architecture. In this case, we'd like to define a model with two hidden layers. The first takes $784$ features as input and projects them to $50$ neurons. Because we have $10$ class labels, the second layer receives the output of the previous layer (which has a size of $25$) and projects it to three $10$ output neurons.

 initializing model parameters with random weights is necessary to break the symmetry during backpropagation—otherwise, a multilayer NN would be no more useful than a single-layer NN like logistic regression. When creating a PyTorch tensor, we can also use a random initialization scheme. `nn.init.xavier_normal_` and `nn.init.xavier_uniform_` are such two initialization methods. You can find many other initialization techniques in the `nn.init` module.

In [169]:
# a simple classifier
class NN(nn.Module):
    def __init__(self, in_features, num_classes):
        super(NN, self).__init__()
        self.fc1 = nn.Linear(in_features, 25)
        nn.init.xavier_normal_(self.fc1.weight)
        self.fc2 = nn.Linear(25, num_classes)
        nn.init.xavier_normal_(self.fc2.weight)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [170]:
# sets device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [171]:
# initializes and sends the model to appropriate GPU/CPU
model = NN(784, 10).to(device)

# Data Loader & Preprocessing
`torchvision` has several downloadable datasets. All of these are subclasses of `torch.utils.data.Dataset` and, therefore, can be used in `torch.utils.data.DataLoader` class. Find more about torchvision datasets at [here](https://pytorch.org/vision/0.8/datasets.html).

In [172]:
batch_size = 64

# load data
train_data = datasets.MNIST(
    root='./datasets',
    train=True,
    transform=transforms.ToTensor(),
    download=True
)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

test_data = datasets.MNIST(
    root='./datasets',
    train=False,
    transform=transforms.ToTensor(),
    download=True
)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [173]:
# Checks dimensions of each minibatch
x, y = next(iter(train_loader))
print(x.size())
print(y.size())

torch.Size([64, 1, 28, 28])
torch.Size([64])


In [174]:
# Checks how the target labels are encoded
y[0]

tensor(2)

# Hyperparameters & Optimizers

In [175]:
# sets hyperparameters
in_features = 784
num_classes = 10
learning_rate = 0.01
num_epochs = 5

In [176]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [177]:
loss_per_epoch = [0] * num_epochs
acc = [0] * num_epochs

# Training

In [178]:
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        # Moves the data to the GPU/CPU
        data = data.to(device)
        # Converts the 2D image into a 1-D vector
        data = data.reshape(data.shape[0], -1)
        # Moves the target labels to GPU/CPU
        targets = targets.to(device)

        # Make predictions with the current parameters
        scores = model(data)
        # Calculates loss of the current minibatch
        loss = criterion(scores, targets)

        # Resets the gradients to zero
        optimizer.zero_grad()
        # Compute gradients of loss function with respect to parameters
        loss.backward()
        # Updates parameters
        optimizer.step()

        loss_per_epoch[epoch] += loss.item() * data.size(0)
        correct = (torch.argmax(scores, dim=1) == targets).float()
        acc[epoch] += correct.mean()

    loss_per_epoch[epoch] /= len(train_loader.dataset)
    acc[epoch] /= (len(train_loader.dataset) / batch_size)
    print('Epoch {}| Loss {} | Accuracy {}'.format(epoch, loss_per_epoch[epoch], acc[epoch]))


Epoch 0| Loss 0.308759891919295 | Accuracy 0.9072499871253967
Epoch 1| Loss 0.2033080210407575 | Accuracy 0.9395166635513306
Epoch 2| Loss 0.1819783017973105 | Accuracy 0.9467499852180481
Epoch 3| Loss 0.17102455580830575 | Accuracy 0.9488666653633118
Epoch 4| Loss 0.16038994969328244 | Accuracy 0.9523166418075562


# Saving & Loading Models
Trained models can be saved to disk and reused in the future. When you call `save(model)`, **you are saving both the model architecture and all of the learned parameters**. As a standard practice, we can save models with the 'pt' or 'pth' file extensions.

In [179]:
if 'models' not in os.listdir():
    os.mkdir('models')
    print('models directory created!')
else:
    print('models directory already exists!')

models directory already exists!


In [180]:
torch.save(model, 'models/ann.pth')

In [181]:
model = torch.load('models/ann.pth')
model.eval()

NN(
  (fc1): Linear(in_features=784, out_features=25, bias=True)
  (fc2): Linear(in_features=25, out_features=10, bias=True)
)

Instead, if you wanted, you could also save just the parameters, not the architecture.

In [182]:
torch.save(model.state_dict(), 'models/ann_state.pth')

In [183]:
model = NN(784, 10)
model.load_state_dict(torch.load('models/ann_state.pth'))

<All keys matched successfully>

# Model Evaluation

In [184]:
def check_accuracy(loader, model):
    correct = 0
    total = 0
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x = x.to(device)
            x = x.reshape(x.shape[0], -1)
            y = y.to(device)

            scores = model(x)
            _, predictions = scores.max(1)
            correct += (predictions == y).sum()
            total += predictions.size(0)
        model.train()
        print('Accuracy: ', correct/total)

In [185]:
check_accuracy(test_loader, model)

Accuracy:  tensor(0.9456)
