In [1]:
import torch
import torchvision
from torchvision import transforms, datasets

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

%config Completer.use_jedi = False

In [2]:
train = datasets.MNIST(root='data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor()
                       ]))

test = datasets.MNIST(root='data', train=False, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor()
                       ]))


trainset = torch.utils.data.DataLoader(train, batch_size=10, shuffle=True)
testset = torch.utils.data.DataLoader(test, batch_size=10, shuffle=False)

In [3]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        # it will expect a flatten tensor 
        self.fc1 = nn.Linear(28*28, 64)  # 28x28 is the image size
        self.fc2 = nn.Linear(64, 64)  # 64 is the size of the previous layer
        self.fc3 = nn.Linear(64, 64)
        self.fc4 = nn.Linear(64, 10)  # 10 are the labels at the end
        
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        
        return F.log_softmax(x, dim=1)  # to get a prob distribution

net = Net()
net

Net(
  (fc1): Linear(in_features=784, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=10, bias=True)
)

In [4]:
X = torch.randn((28,28))
output = net(X)  # should break

RuntimeError: mat1 and mat2 shapes cannot be multiplied (28x28 and 784x64)

In [5]:
X = X.view(-1,28*28)  # -1 for 'any size', it can be the batch size
output = net(X)
output

tensor([[-2.2430, -2.2578, -2.4820, -2.2384, -2.3692, -2.2776, -2.2241, -2.3631,
         -2.2420, -2.3598]], grad_fn=<LogSoftmaxBackward>)

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

Using cuda device


In [7]:
model = Net().to(device)
model

Net(
  (fc1): Linear(in_features=784, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=10, bias=True)
)

In [8]:
X = torch.rand(28, 28, device=device).view(-1,28*28)
logits = model(X)
logits

tensor([[-2.4100, -2.3266, -2.3490, -2.1812, -2.4160, -2.1583, -2.1943, -2.3352,
         -2.3529, -2.3428]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)

# Training

In [7]:
optimizer = optim.Adam(net.parameters(), lr=0.001)

EPOCHS = 3

for epoch in range(EPOCHS): # 3 full passes over the data
    for X, y in trainset:  # `data` is a batch of data
        # X is the batch of features, y is the batch of targets.
        net.zero_grad()  # sets gradients to 0 before loss calc. You will do this likely every step.
        
        output = net(X.view(-1, 28*28))  # pass in the reshaped batch (recall they are 28x28 atm)
        
        loss = F.nll_loss(output, y)  # calc and grab the loss value
        
        loss.backward()  # apply this loss backwards thru the network's parameters
        optimizer.step()  # attempt to optimize weights to account for loss/gradients
        
    print(loss)  # print loss. We hope loss (a measure of wrong-ness) declines!

tensor(0.2205, grad_fn=<NllLossBackward>)
tensor(0.7928, grad_fn=<NllLossBackward>)
tensor(0.0028, grad_fn=<NllLossBackward>)


In [8]:
correct = 0
total = 0

with torch.no_grad():
    for data in testset:
        X, y = data
        output = net(X.view(-1,784))
        #print(output)
        for idx, i in enumerate(output):
            #print(torch.argmax(i), y[idx])
            if torch.argmax(i) == y[idx]:
                correct += 1
            total += 1

print("Accuracy: ", round(correct/total, 3))

Accuracy:  0.968
