In [1]:
import torch
import torch.nn.functional as F
from torch import nn
from torch import optim

# Ten hand-drawn 5x3 binary bitmaps, one per digit 0-9 (1 = pixel on, 0 = off).
# torch.tensor with an explicit dtype is used instead of the legacy
# FloatTensor / LongTensor constructors; the dtypes are still float32 / int64.
x = torch.tensor([
    # 0
    [[1, 1, 1],
     [1, 0, 1],
     [1, 0, 1],
     [1, 0, 1],
     [1, 1, 1]],

    # 1
    [[0, 0, 1],
     [0, 1, 1],
     [0, 0, 1],
     [0, 0, 1],
     [0, 0, 1]],

    # 2
    [[1, 1, 1],
     [0, 0, 1],
     [0, 1, 0],
     [1, 0, 0],
     [1, 1, 1]],

    # 3
    [[1, 1, 1],
     [0, 0, 1],
     [1, 1, 1],
     [0, 0, 1],
     [1, 1, 1]],

    # 4
    [[1, 0, 1],
     [1, 0, 1],
     [1, 1, 1],
     [0, 0, 1],
     [0, 0, 1]],

    # 5
    [[1, 1, 1],
     [1, 0, 0],
     [1, 1, 1],
     [0, 0, 1],
     [1, 1, 1]],

    # 6
    [[1, 1, 1],
     [1, 0, 0],
     [1, 1, 1],
     [1, 0, 1],
     [1, 1, 1]],

    # 7
    [[1, 1, 1],
     [0, 0, 1],
     [0, 1, 0],
     [1, 0, 0],
     [1, 0, 0]],

    # 8
    [[1, 1, 1],
     [1, 0, 1],
     [1, 1, 1],
     [1, 0, 1],
     [1, 1, 1]],

    # 9
    [[1, 1, 1],
     [1, 0, 1],
     [1, 1, 1],
     [0, 0, 1],
     [0, 0, 1]],
], dtype=torch.float32)

# Class labels: the bitmap at index i depicts the digit i
y = torch.tensor([0, 1, 2, 3, 4,
                  5, 6, 7, 8, 9], dtype=torch.int64)

print(x.shape, y.shape)

# Flatten every 5x3 bitmap into one row of 15 values
x = x.view(x.shape[0], -1)
print(x.shape, y.shape)


torch.Size([10, 5, 3]) torch.Size([10])
torch.Size([10, 15]) torch.Size([10])


In [2]:
# Feed-forward classifier: 15 inputs -> 13 -> 11 -> 10 output scores,
# with a ReLU after each of the two hidden layers
model = nn.Sequential(
    nn.Linear(15, 13),
    nn.ReLU(),
    nn.Linear(13, 11),
    nn.ReLU(),
    nn.Linear(11, 10),
)

# Loss used to compare the network output with the labels
criterion = nn.CrossEntropyLoss()

# Forward pass over all samples gives the logits ...
logits = model(x)
# ... which are scored against the labels
loss = criterion(logits, y)

print(loss)

tensor(2.3181, grad_fn=<NllLossBackward>)


In [3]:
## Solution

# Same 15 -> 13 -> 11 -> 10 feed-forward stack as before, now ending in a
# LogSoftmax over dim 1 so the model outputs log-probabilities
model = nn.Sequential(
    nn.Linear(15, 13),
    nn.ReLU(),
    nn.Linear(13, 11),
    nn.ReLU(),
    nn.Linear(11, 10),
    nn.LogSoftmax(dim=1),
)

# Negative log-likelihood loss, applied to the log-probabilities
criterion = nn.NLLLoss()

# Forward pass yields log-probabilities, which are scored against the labels
logps = model(x)
loss = criterion(logps, y)

print(loss)

tensor(2.3199, grad_fn=<NllLossBackward>)


In [4]:
# Rebuild the 15 -> 13 -> 11 -> 10 log-probability network with fresh layers
model = nn.Sequential(
    nn.Linear(15, 13),
    nn.ReLU(),
    nn.Linear(13, 11),
    nn.ReLU(),
    nn.Linear(11, 10),
    nn.LogSoftmax(dim=1),
)

criterion = nn.NLLLoss()

# Forward pass and loss only; no backward pass happens in this cell
logps = model(x)
loss = criterion(logps, y)

In [5]:
# Show the first layer's weight gradient on either side of the backward pass
first_layer = model[0]
print('Before backward pass: \n', first_layer.weight.grad)

# Backpropagate the loss through the network
loss.backward()

print('After backward pass: \n', first_layer.weight.grad)

Before backward pass: 
 None
After backward pass: 
 tensor([[-0.0037,  0.0035, -0.0077, -0.0060, -0.0041, -0.0089, -0.0037, -0.0037,
         -0.0077,  0.0000,  0.0000, -0.0077,  0.0035,  0.0035, -0.0077],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0017,  0.0017,  0.0100,  0.0000,  0.0083,  0.0100,  0.0000,  0.0017,
          0.0083,  0.0017,  0.0000,  0.0083,  0.0017,  0.0000,  0.0083],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0051,  0.0039,  0.0073,  0.0131,  0.0022,  0.0061,  0.0078, -0.0101,
          0.0100, -0.0063,  0.0000,  0.0100, -0.0031, -0.0087,  0.0017],
        [-0.0170, -0.0170, -0.0170,  0.0000,  0.0000, -0.0170,  0.0000, -0.0170,
          0.0000, -0.0170,  0.0000,  0.0000, -0.0170,  0.0000,  0.0000],
        [-0.0033, -0.010

## Training the network!

There's one last piece we need before we can start training: an optimizer that we'll use to update the weights with the gradients. We get these from PyTorch's [`optim` package](https://pytorch.org/docs/stable/optim.html). For example, we can use stochastic gradient descent with `optim.SGD`. You can see how to define an optimizer below.

In [6]:
# Optimizers require the parameters to optimize and a learning rate.
# `optim` comes from the import block at the top of the notebook, so every
# dependency is declared in one place.
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [10]:
# First-layer weights as they are before this step
first_layer = model[0]
print('Initial weights - ', first_layer.weight)

# The whole dataset is used as a single batch
images = x
labels = y

# Gradients are accumulated across backward passes, so reset them first
optimizer.zero_grad()

# Forward pass -> loss -> backward pass (no weight update in this cell)
output = model(images)
loss = criterion(output, labels)
loss.backward()
print('Gradient -', first_layer.weight.grad)

Initial weights -  Parameter containing:
tensor([[-3.1761e-01, -6.5274e-01, -4.6811e-01,  4.2604e-01,  1.4061e-01,
          1.2140e+00, -4.1593e-01,  2.6078e+00, -3.5011e-01,  1.4083e-01,
          5.2977e-02, -4.3784e-01, -5.3717e-01,  6.3714e-01,  1.0747e+00],
        [-2.3521e-01,  1.3545e-01,  1.2190e-01, -1.5839e-01,  5.8536e-01,
          1.1407e+00, -1.3661e+00,  9.6107e-01, -6.9325e-01,  4.6395e+00,
          2.2145e-01, -6.8543e-01,  9.3831e-01,  4.6617e-01,  2.1689e-01],
        [ 8.7146e-01,  8.1826e-01,  5.1984e-01,  7.3994e-01, -9.0716e-01,
         -5.5008e-01,  1.4793e+00, -1.5921e+00,  9.0604e-01, -1.0692e+00,
          1.4049e-01,  1.0482e+00,  3.8220e-01,  6.1204e-01,  2.5061e-01],
        [-1.5989e-01, -3.8103e-02,  1.8927e-01, -2.2989e-01,  3.4840e-02,
          1.6149e-01,  6.9545e-02,  4.8231e-02, -3.9307e-02,  4.6381e-03,
         -1.0283e-02, -2.2207e-01, -2.0040e-01, -7.7899e-02,  8.6175e-02],
        [ 1.4780e-01, -9.8956e-02,  9.4070e-02,  5.1845e-02,  6.336

In [8]:
# Take an update step with the current gradients and view the new weights
optimizer.step()
first_layer_weights = model[0].weight
print('Updated weights - ', first_layer_weights)

Updated weights -  Parameter containing:
tensor([[-3.1761e-01, -6.5274e-01, -4.6811e-01,  4.2604e-01,  1.4061e-01,
          1.2140e+00, -4.1593e-01,  2.6078e+00, -3.5011e-01,  1.4083e-01,
          5.2977e-02, -4.3784e-01, -5.3717e-01,  6.3714e-01,  1.0747e+00],
        [-2.3521e-01,  1.3545e-01,  1.2190e-01, -1.5839e-01,  5.8536e-01,
          1.1407e+00, -1.3661e+00,  9.6107e-01, -6.9325e-01,  4.6395e+00,
          2.2145e-01, -6.8543e-01,  9.3831e-01,  4.6617e-01,  2.1689e-01],
        [ 8.7146e-01,  8.1826e-01,  5.1984e-01,  7.3994e-01, -9.0716e-01,
         -5.5008e-01,  1.4793e+00, -1.5921e+00,  9.0604e-01, -1.0692e+00,
          1.4049e-01,  1.0482e+00,  3.8220e-01,  6.1204e-01,  2.5061e-01],
        [-1.5989e-01, -3.8103e-02,  1.8927e-01, -2.2989e-01,  3.4840e-02,
          1.6149e-01,  6.9545e-02,  4.8231e-02, -3.9307e-02,  4.6381e-03,
         -1.0283e-02, -2.2207e-01, -2.0040e-01, -7.7899e-02,  8.6175e-02],
        [ 1.4780e-01, -9.8956e-02,  9.4070e-02,  5.1845e-02,  6.336

In [7]:
# Network for the training run: 15 -> 7 -> 5 -> 10 log-probabilities
model = nn.Sequential(
    nn.Linear(15, 7),
    nn.ReLU(),
    nn.Linear(7, 5),
    nn.ReLU(),
    nn.Linear(5, 10),
    nn.LogSoftmax(dim=1),
)

criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.05)
epochs = 10000
x_train = x
y_train = y

for e in range(epochs):
    # Reset every epoch; each epoch processes exactly one batch
    running_loss = 0

    # Training pass: clear gradients, forward, loss, backward, weight update
    optimizer.zero_grad()
    output = model(x_train)
    loss = criterion(output, y_train)
    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    if e % 500 == 0:
        print(f"epoch:{e}, Training loss: {running_loss}")

    # Apply the same random permutation to samples and labels
    idx = torch.randperm(x_train.shape[0])
    x_train, y_train = x_train[idx], y_train[idx]

# The loop contains no break, so this always runs once after the last epoch
print(f"epoch:{e}, Final Training loss: {running_loss}")


epoch:0, Training loss: 2.306408405303955
epoch:500, Training loss: 1.4523013830184937
epoch:1000, Training loss: 0.8487351536750793
epoch:1500, Training loss: 0.5815219283103943
epoch:2000, Training loss: 0.4624112546443939
epoch:2500, Training loss: 0.4174840450286865
epoch:3000, Training loss: 0.3949768543243408
epoch:3500, Training loss: 0.381454735994339
epoch:4000, Training loss: 0.3723662197589874
epoch:4500, Training loss: 0.36590707302093506
epoch:5000, Training loss: 0.36107224225997925
epoch:5500, Training loss: 0.35734423995018005
epoch:6000, Training loss: 0.3543396592140198
epoch:6500, Training loss: 0.20576202869415283
epoch:7000, Training loss: 0.049119800329208374
epoch:7500, Training loss: 0.023886673152446747
epoch:8000, Training loss: 0.01552845723927021
epoch:8500, Training loss: 0.011382844299077988
epoch:9000, Training loss: 0.008950436487793922
epoch:9500, Training loss: 0.007335667498409748
epoch:9999, Final Training loss: 0.006221638526767492


With the network trained, we can check out its predictions.

In [11]:
# Pick one sample index at random. The upper bound of torch.randint is
# exclusive, so it has to be the number of samples (x.shape[0]); a bound of 9
# would only ever draw 0-8 and the last digit could never be checked.
i = torch.randint(0, x.shape[0], (1,))
digit = x[i].view(1, 15)
# Show the chosen sample as a 5-row bitmap
print(digit.view(5, -1))
# Turn off gradients to speed up this part
with torch.no_grad():
    logps = model(digit)

# Output of the network are log-probabilities, need to take exponential for probabilities
ps = torch.exp(logps)
# The sample index doubles as the true label because y[i] == i for this dataset
print(f"Input digit:{i.item()}. Predicted digit:{torch.argmax(ps).item()}")

tensor([[0., 0., 1.],
        [0., 1., 1.],
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.]])
Input digit:1. Predicted digit:1


Now our network is brilliant. It can accurately predict the digits in our images. Next up you'll write the code for training a neural network on a more complex dataset.