In [12]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [64]:
learning_rate = 0.01
batch_size = 200
epochs = 10
log_interval = 10

In [70]:
train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batch_size, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('../data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])),
    batch_size=batch_size, shuffle=True)

In [60]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    
    # override the init method to construct the neural net
    def __init__(self):
        # start by calling super
        super(Net, self).__init__()
        self.fc1 = nn.Linear(28*28, 200)
        self.fc2 = nn.Linear(200, 200)
        self.fc3 = nn.Linear(200, 10)
        
    def forward(self, x):
        # Feed the input to the first layer, then apply a relu to it
        x = F.relu(self.fc1(x)) 
        
        # Feed the input to the second layer, then apply a relu to it
        x = F.relu(self.fc2(x))
        
        # Feed the input to the third layer
        x = self.fc3(x)
        
        # Usually use a softmax activation function in multi-class
        # learning problems where a set of features can be related to 
        # one of k classes
        
        # The softmax activation normalizes the output vector so that the
        # the sum of the vector is 1, which we can interpret as being
        # the probabilities that the set of features belongs to each class
        return F.log_softmax(x)

In [61]:
# Get a summary of the network
net = Net()
print(net)

Net(
  (fc1): Linear(in_features=784, out_features=200, bias=True)
  (fc2): Linear(in_features=200, out_features=200, bias=True)
  (fc3): Linear(in_features=200, out_features=10, bias=True)
)


In [65]:
# A stochastic gradient descent optimizer

# The learning rate controls the amount that the weights are updated during training
optimizer = optim.SGD(net.parameters(), lr = learning_rate, momentum = 0.9)

In [68]:
# Create a loss function
# The loss function maps the parameter values to a scalar that
# indicates how well the paramets accomplish the task of the network

criterion = nn.NLLLoss()

In [72]:
# Each epoch is a complete pass through the dataset
for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        
        # Variable class wraps a tensor and allows automatic
        # gradient computation on the tensor when the backward function
        # is called.  It contains the data of the tensor, the gradient of
        # the tensor, and a reference to whatever function created the variable
        data, target = Variable(data), Variable(target)
        
        # resize the data from (batch_size, 1, 28, 28) to (batch_size, 28*28)
        data = data.view(-1, 28*28)
        
        # for each batch, need to zero out the gradient
        optimizer.zero_grad()
        
        # apply the network
        net_out = net(data)
        loss = criterion(net_out, target) 
        loss.backward() # compute the gradient
        optimizer.step() # step in the direction of the gradient (update the network parameters)
        
        if batch_idx % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.dataset),
                           100. * batch_idx / len(train_loader), loss.data))

        







In [87]:
# Get one example from the test dataset

data, target = [ x[0] for x in iter(test_loader).next() ]

In [88]:
target # this is the correct answer

tensor(2)

In [89]:
data, target = Variable(data, volatile=True), Variable(target)
data = data.view(-1, 28 * 28)
net_out = net(data)

  """Entry point for launching an IPython kernel.


In [91]:
net_out

tensor([[-2.1919e+01, -1.2988e+01, -1.6888e-03, -6.7287e+00, -1.4883e+01,
         -1.6335e+01, -2.1311e+01, -7.6240e+00, -2.0260e+01, -2.6327e+01]],
       grad_fn=<LogSoftmaxBackward>)

In [90]:
net_out.argmax() # This is what the neural network predicts

tensor(2)