In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from collections import OrderedDict
import torch.nn.functional as F

# Seed PyTorch's global RNG so that runs started from a fresh kernel are repeatable.
torch.manual_seed(999)

class Net(nn.Module):
    """Fully connected classifier: Linear -> ReLU per hidden layer, then a linear head.

    Args:
        input_size: Number of features in each (already flattened) input sample.
        out_size: Number of output logits (one per class).
        hidden_sizes: Widths of the hidden layers, in order. The default
            ``(500,)`` builds a single 500-unit hidden layer; pass a longer
            sequence (e.g. ``(500, 250, 125)``) for a deeper network, or an
            empty sequence for a plain linear model.

    The layers live in ``self.net`` (an ``nn.Sequential``), so with the default
    arguments the parameter names are ``net.0.*`` and ``net.2.*``.
    """

    def __init__(self, input_size, out_size, hidden_sizes=(500,)):
        super().__init__()
        layers = []
        in_features = input_size
        # Each hidden layer is a Linear projection followed by a ReLU.
        for width in hidden_sizes:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        # Final projection to the logits; no activation is applied here.
        layers.append(nn.Linear(in_features, out_size))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw logits produced by the layer stack for input ``x``."""
        return self.net(x)
    

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch_size = 128

# Keyword arguments forwarded to the two DataLoaders; both shuffle their samples.
train_kwargs = dict(batch_size=batch_size, shuffle=True)
test_kwargs = dict(batch_size=batch_size, shuffle=True)
if device.type == 'cuda':
    # Extra loader options that are only applied when running on CUDA.
    cuda_kwargs = dict(num_workers=1, pin_memory=True, shuffle=True)
    train_kwargs = {**train_kwargs, **cuda_kwargs}
    test_kwargs = {**test_kwargs, **cuda_kwargs}

# Convert images to tensors, then normalize the single channel.
# NOTE(review): 0.1307 / 0.3081 are presumably the MNIST pixel mean / std - confirm.
to_tensor = transforms.ToTensor()
normalize = transforms.Normalize((0.1307,), (0.3081,))
transform = transforms.Compose([to_tensor, normalize])

# Only the training split requests a download; the test split reads from the same root.
dataset1 = datasets.MNIST(root='../data', train=True, download=True, transform=transform)
dataset2 = datasets.MNIST(root='../data', train=False, transform=transform)

train_loader = torch.utils.data.DataLoader(dataset=dataset1, **train_kwargs)
test_loader = torch.utils.data.DataLoader(dataset=dataset2, **test_kwargs)

# 28*28 flattened input features, 10 output logits.
model = Net(28 * 28, 10)
model = model.to(device)
optimizer = optim.Adadelta(params=model.parameters(), lr=1e-2)

# Last expression of the cell: display the model's layer summary.
model

In [None]:
# Number of training batches times the batch size. This is an upper bound on the
# number of training samples: if the final batch is smaller than batch_size the
# product exceeds len(train_loader.dataset).
len(train_loader)*batch_size

In [None]:
from matplotlib import pyplot as plt

# Preview the first six images of one test batch together with their labels.
examples = iter(test_loader)
# Use the built-in next(): DataLoader iterators implement __next__, and the
# .next() method is not available in current PyTorch releases.
example_data, example_targets = next(examples)
for i in range(6):
    plt.subplot(2, 3, i + 1)
    # Index 0 selects the first channel of sample i, giving a 2-D image to draw.
    plt.imshow(example_data[i][0], cmap='gray')
    plt.title(f'Label: {example_targets[i].item()}')
# Keep the subplot titles from overlapping the neighbouring axes.
plt.tight_layout()
plt.show()

In [None]:
import copy

def train(model, device, train_loader, optimizer, epoch):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Network to train; it is switched to training mode first.
        device: Device every batch is moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches. Its ``len()`` and
            its ``.dataset`` are used for the progress message.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used only in the progress message.

    Each batch is flattened to ``(batch, -1)``, scored with cross-entropy and
    back-propagated. Progress is printed every 100th batch, starting with the first.
    """
    progress_line = 'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'
    model.train()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        # Collapse every non-batch dimension so the linear layers get 2-D input.
        flat_inputs = inputs.view(inputs.size(0), -1)
        logits = model(flat_inputs)
        batch_loss = F.cross_entropy(logits, labels)
        batch_loss.backward()
        optimizer.step()

        # Only every 100th batch is reported.
        if step % 100 != 0:
            continue
        samples_seen = step * len(inputs)
        total_samples = len(train_loader.dataset)
        percent_done = 100. * step / len(train_loader)
        print(progress_line.format(
            epoch, samples_seen, total_samples, percent_done, batch_loss.item()))
            

def test():
    """Measure classification accuracy of the global ``model`` on ``test_loader``.

    Reads the module-level ``model``, ``test_loader`` and ``device``. The model
    is put into evaluation mode for the duration of the pass and restored to
    its previous training/eval mode afterwards, so calling this between
    training epochs does not change the model's mode.

    Returns:
        Accuracy as a percentage (float between 0 and 100).

    Raises:
        ZeroDivisionError: If ``test_loader`` yields no samples.
    """
    was_training = model.training
    # Evaluation mode matters for layers such as dropout or batch norm.
    model.eval()
    n_correct = 0
    n_samples = 0
    try:
        with torch.no_grad():
            for images, labels in test_loader:
                # Flatten per sample, the same way train() does, instead of
                # hard-coding the number of pixels.
                images = images.reshape(images.size(0), -1).to(device)
                labels = labels.to(device)
                outputs = model(images)
                # Predicted class = index of the largest logit in each row.
                predicted = outputs.argmax(dim=1)
                n_samples += labels.size(0)
                n_correct += (predicted == labels).sum().item()
    finally:
        # Put the model back in whatever mode the caller had it in.
        model.train(was_training)

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network on the {n_samples} test images: {acc} %')
    return acc


# Track the highest test accuracy seen so far and a snapshot of the model that
# achieved it. -1 guarantees the first evaluated epoch becomes the initial best.
best_acc, best_model = -1, None

# range(1, 20) yields epochs 1 through 19.
for epoch in range(1, 20):
    train(model, device, train_loader, optimizer, epoch)
    acc = test()
    if not (acc > best_acc):
        print(f'current acc={acc}, prev_best_acc={best_acc}')
        continue
    # Strict improvement: remember the score and keep an independent copy of
    # the model so later epochs cannot overwrite it.
    best_acc = acc
    best_model = copy.deepcopy(model)
    print(f'new best acc={best_acc}')