In [1]:
import torch
import torchvision as tv

import pandas as pd
import numpy as np
import time

import matplotlib.pyplot as plt
%matplotlib inline

In [37]:
BATCH_SIZE = 256
NUM_EPOCHS = 20

In [3]:
train_dataset = tv.datasets.MNIST('.', train=True, transform=tv.transforms.ToTensor(), download=True)
test_dataset = tv.datasets.MNIST('.', train=False, transform=tv.transforms.ToTensor(), download=True)
train = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE)
test = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:00<00:00, 15.8MB/s]


Extracting ./MNIST/raw/train-images-idx3-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 478kB/s]


Extracting ./MNIST/raw/train-labels-idx1-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:00<00:00, 4.46MB/s]


Extracting ./MNIST/raw/t10k-images-idx3-ubyte.gz to ./MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 4.55MB/s]

Extracting ./MNIST/raw/t10k-labels-idx1-ubyte.gz to ./MNIST/raw






In [27]:
def model_training(num_epochs):

  train_passed = sum([len(X) for X, _ in train])
  test_passed = sum([len(X) for X, _ in test])

  for epoch in range(num_epochs):

    train_loss, test_loss = 0.0, 0.0
    train_acc, test_acc = 0.0, 0.0

    model.train()
    for X, y in train:
      trainer.zero_grad()
      y_pred = model(X)
      l = loss(y_pred, y)
      l.backward()
      trainer.step()
      train_loss += l.item()
      train_acc += (y_pred.argmax(dim=1) == y).sum().item()

    model.eval()
    for X, y in test:
      y_pred = model(X)
      l = loss(y_pred, y)
      test_loss += l.item()
      test_acc += (y_pred.argmax(dim=1) == y).sum().item()

    train_loss = train_loss / len(train)
    test_loss = test_loss / len(test)
    train_acc = train_acc / train_passed
    test_acc = test_acc / test_passed
    print(f'epoch: {epoch}, train_loss: {train_loss}, train_acc: {train_acc}, test_loss: {test_loss}, test_acc: {test_acc}')

In [38]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 512),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 256),
    torch.nn.ReLU(),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 10))
model

Sequential(
  (0): Flatten(start_dim=1, end_dim=-1)
  (1): Linear(in_features=784, out_features=512, bias=True)
  (2): ReLU()
  (3): Linear(in_features=512, out_features=256, bias=True)
  (4): ReLU()
  (5): Linear(in_features=256, out_features=128, bias=True)
  (6): ReLU()
  (7): Linear(in_features=128, out_features=10, bias=True)
)

In [39]:
loss = torch.nn.CrossEntropyLoss()
trainer = torch.optim.Adam(model.parameters(), lr=1e-4)

model_training(NUM_EPOCHS)

epoch: 0, train_loss: 1.0958726655929647, train_acc: 0.7366666666666667, test_loss: 0.4082482496276498, test_acc: 0.8832
epoch: 1, train_loss: 0.3620964386361711, train_acc: 0.89675, test_loss: 0.296930278185755, test_acc: 0.9123
epoch: 2, train_loss: 0.2903431922672911, train_acc: 0.9171, test_loss: 0.2539081945084035, test_acc: 0.9244
epoch: 3, train_loss: 0.25049691873979063, train_acc: 0.9287666666666666, test_loss: 0.2243829749058932, test_acc: 0.9327
epoch: 4, train_loss: 0.2210037019183027, train_acc: 0.9373166666666667, test_loss: 0.20140279582701623, test_acc: 0.9404
epoch: 5, train_loss: 0.19737579399283897, train_acc: 0.9433333333333334, test_loss: 0.1826742734760046, test_acc: 0.9463
epoch: 6, train_loss: 0.17785998854706897, train_acc: 0.9490333333333333, test_loss: 0.16712641820777208, test_acc: 0.9515
epoch: 7, train_loss: 0.16130420735224765, train_acc: 0.95385, test_loss: 0.1541385515825823, test_acc: 0.9552
epoch: 8, train_loss: 0.14711268424512225, train_acc: 0.95788

В данном случае модели на первый взгляд не хватило 20 эпох. Но результат и для train, и для test отличный:

- train_acc: 0.982

- test_acc: 0.973

In [40]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 512),
    torch.nn.Dropout(p=0.2),
    torch.nn.ReLU(),
    torch.nn.Linear(512, 256),
    torch.nn.Dropout(p=0.2),
    torch.nn.ReLU(),
    torch.nn.Linear(256, 128),
    torch.nn.Dropout(p=0.2),
    torch.nn.ReLU(),
    torch.nn.Linear(128, 10))

trainer = torch.optim.Adam(model.parameters(), lr=1e-4)
model_training(NUM_EPOCHS)

epoch: 0, train_loss: 1.1840645741909108, train_acc: 0.6719833333333334, test_loss: 0.4250142477452755, test_acc: 0.875
epoch: 1, train_loss: 0.4380735270203428, train_acc: 0.8704333333333333, test_loss: 0.3044787151739001, test_acc: 0.9098
epoch: 2, train_loss: 0.3421643522825647, train_acc: 0.9001, test_loss: 0.2537342737894505, test_acc: 0.9244
epoch: 3, train_loss: 0.29032610298154204, train_acc: 0.9148166666666666, test_loss: 0.21832672748714685, test_acc: 0.9335
epoch: 4, train_loss: 0.2491679124217084, train_acc: 0.92715, test_loss: 0.19003959347028285, test_acc: 0.9433
epoch: 5, train_loss: 0.21903300489834016, train_acc: 0.9368166666666666, test_loss: 0.1676249239128083, test_acc: 0.9478
epoch: 6, train_loss: 0.19452123759274786, train_acc: 0.9436833333333333, test_loss: 0.1493986495770514, test_acc: 0.9534
epoch: 7, train_loss: 0.17472821139591804, train_acc: 0.9491833333333334, test_loss: 0.1350092537002638, test_acc: 0.959
epoch: 8, train_loss: 0.15897690894755911, train_ac

С Dropout(p=0.2) после каждого линейного слоя результат для тренировочных данных немного хуже, а вот для тестовых немного лучше:

- train_acc: 0.9789

- test_acc: 0.9777

In [42]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 512),
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.2),
    torch.nn.Linear(512, 256),
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.2),
    torch.nn.Linear(256, 128),
    torch.nn.ReLU(),
    torch.nn.Dropout(p=0.2),
    torch.nn.Linear(128, 10))

trainer = torch.optim.Adam(model.parameters(), lr=1e-4)
model_training(NUM_EPOCHS)

epoch: 0, train_loss: 1.208498548193181, train_acc: 0.6501666666666667, test_loss: 0.43007735572755335, test_acc: 0.8794
epoch: 1, train_loss: 0.43919686811401487, train_acc: 0.86975, test_loss: 0.3106439569965005, test_acc: 0.9093
epoch: 2, train_loss: 0.34807124968538894, train_acc: 0.8979666666666667, test_loss: 0.26032690913416445, test_acc: 0.9212
epoch: 3, train_loss: 0.2945139465813941, train_acc: 0.91295, test_loss: 0.22380346334539353, test_acc: 0.9328
epoch: 4, train_loss: 0.2552384901554027, train_acc: 0.9252, test_loss: 0.1958294557640329, test_acc: 0.9417
epoch: 5, train_loss: 0.22522315807799076, train_acc: 0.9335166666666667, test_loss: 0.17414361550472676, test_acc: 0.9474
epoch: 6, train_loss: 0.19804155945460847, train_acc: 0.9425666666666667, test_loss: 0.1543273250805214, test_acc: 0.9519
epoch: 7, train_loss: 0.17833438548952976, train_acc: 0.9479333333333333, test_loss: 0.13936315472237765, test_acc: 0.9561
epoch: 8, train_loss: 0.16247771079552933, train_acc: 0.9

Вариант с Dropout(p=0.2) после функции активании показывает результаты еще немного хуже:

- train_acc: 0.9787

- test_acc: 0.9767

In [43]:
model = torch.nn.Sequential(
    torch.nn.Flatten(),
    torch.nn.Linear(784, 256),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(256),
    torch.nn.Linear(256, 64),
    torch.nn.ReLU(),
    torch.nn.BatchNorm1d(64),
    torch.nn.Linear(64, 10)
)

trainer = torch.optim.Adam(model.parameters(), lr=1e-4)
model_training(NUM_EPOCHS)

epoch: 0, train_loss: 0.7508871886324375, train_acc: 0.8423833333333334, test_loss: 0.38904562406241894, test_acc: 0.9286
epoch: 1, train_loss: 0.31965995643367157, train_acc: 0.93895, test_loss: 0.24167739506810904, test_acc: 0.9477
epoch: 2, train_loss: 0.2114252802222333, train_acc: 0.9559666666666666, test_loss: 0.17849876736290754, test_acc: 0.9574
epoch: 3, train_loss: 0.1544539986455694, train_acc: 0.9666, test_loss: 0.14204424801282584, test_acc: 0.9642
epoch: 4, train_loss: 0.11833514059953233, train_acc: 0.9738, test_loss: 0.11954169850796462, test_acc: 0.9685
epoch: 5, train_loss: 0.09301967226919976, train_acc: 0.9798666666666667, test_loss: 0.1044840920716524, test_acc: 0.9717
epoch: 6, train_loss: 0.07439024738095543, train_acc: 0.9846, test_loss: 0.09407475343905389, test_acc: 0.973
epoch: 7, train_loss: 0.059899170447061674, train_acc: 0.98835, test_loss: 0.08699838276952505, test_acc: 0.9744
epoch: 8, train_loss: 0.048360444434938275, train_acc: 0.99125, test_loss: 0.0

В случае BatchNorm1d модель обучается быстрее. После 11 эпохи судя по метрикам для тестовых данных модель начала переобучаться

- train_acc: 1

- test_acc: 0.9775