# Fashion MNIST  
---  
## Training  
* calculate the loss  
* calculate the gradients  
* update the weights  

Reference video: https://www.youtube.com/watch?v=0VCOG8IeVf8&list=PLZbbT5o_s2xrfNyHZsM6ufI0iZENK9xgG&index=25

In [19]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms
# from torchvision import transforms

torch.set_printoptions(linewidth=120) # let printed tensors use up to 120 characters per line
# explicitly turn gradient tracking on (this is already PyTorch's default);
# autograd must be recording operations for loss.backward() to work later on
torch.set_grad_enabled(True)

<torch.autograd.grad_mode.set_grad_enabled at 0x205214c77b8>

In [20]:
def get_num_correct(preds, labels):
    """Count how many predictions in a batch match their labels.

    Args:
        preds: tensor of per-class scores; the predicted class of each row is
            the index of its largest value along dim 1.
        labels: tensor of target class indices, compared element-wise with
            the predicted indices.

    Returns:
        The number of matching positions, as a plain Python number.
    """
    predicted_classes = preds.argmax(dim=1)
    is_correct = predicted_classes == labels
    return is_correct.sum().item()

In [21]:
class Network(nn.Module):
    """Small CNN: two conv + max-pool stages followed by three linear layers.

    Expects input of shape (batch, 1, 28, 28) and returns raw class scores
    (logits) of shape (batch, 10).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)

        # linear layer == fully connected layer == fc == dense layer
        # 12*4*4: conv2 emits 12 channels, and a 28x28 image shrinks to 4x4
        # (28 -conv5-> 24 -pool-> 12 -conv5-> 8 -pool-> 4)
        self.fc1 = nn.Linear(in_features=12*4*4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)

    def forward(self, t):
        """Run a batch of images through the network.

        Args:
            t: image tensor of shape (batch, 1, 28, 28).

        Returns:
            Tensor of shape (batch, 10) holding unnormalised class scores.

        Raises:
            RuntimeError: if the spatial size of ``t`` does not produce
                12*4*4 features per sample (raised by ``fc1``).
        """
        # (1) input layer: the input tensor is used as-is

        # (2) hidden conv layer:
        t = self.conv1(t)
        t = F.relu(t)
        t = F.max_pool2d(t, kernel_size=2, stride=2)

        # (3) hidden conv layer:
        t = self.conv2(t)
        t = F.relu(t)
        t = F.max_pool2d(t, kernel_size=2, stride=2)

        # (4) hidden linear layer:
        # Flatten everything except the batch dimension. Unlike
        # reshape(-1, 12*4*4), this can never move elements between samples:
        # a wrongly sized input fails in fc1 instead of silently changing
        # the batch size.
        t = t.flatten(start_dim=1)
        t = self.fc1(t)
        t = F.relu(t)

        # (5) hidden linear layer:
        t = self.fc2(t)
        t = F.relu(t)

        # (6) output layer:
        # No softmax here: F.cross_entropy expects raw logits and applies
        # log-softmax itself.
        t = self.out(t)

        return t

In [22]:
# Fashion-MNIST training split, stored under ./data (downloaded if not already there)
train_set = torchvision.datasets.FashionMNIST(
    root='data',
    train=True,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor()  # turn each image into a tensor
    ])
)

In [23]:
# fresh, untrained instance of the Network class defined above
network = Network()

In [24]:
# serve the training set in batches of 100; shuffle is not requested, so the
# loader walks the dataset in order
train_loader = torch.utils.data.DataLoader(train_set, batch_size=100)
# take only the first batch: the rest of this notebook works on this one batch
batch = next(iter(train_loader))
images, labels = batch

---  
## Calculating the loss

In [25]:
# forward pass: one row of 10 raw class scores (logits) per image
preds = network(images)
# cross_entropy takes the raw logits plus the integer class labels
loss = F.cross_entropy(preds, labels)
# for reference: uniform guessing over 10 classes would give ln(10) ~= 2.30
loss.item()

2.314129590988159

---  
## Calculating the gradients

In [26]:
# backward() has not been called yet, so no gradient is stored on the weights
print(network.conv1.weight.grad)

None


In [27]:
loss.backward() # backpropagation: computes the gradients and stores them in each parameter's .grad

In [28]:
# the gradient now exists; conv1 has 6 output channels, 1 input channel and 5x5 kernels
network.conv1.weight.grad.shape

torch.Size([6, 1, 5, 5])

---  
## Updating the weights

In [29]:
optimizer = optim.Adam(network.parameters(), lr=0.01)
# another optimizer option is SGD
# lr == learning rate, a hyperparameter (has to be tuned)
# it sets how far the optimizer steps towards the loss function's minimum
# too small a value = more epochs of training required = slower learning
# too large a value = risk of overshooting the minimum, or bouncing back and forth around it

In [30]:
# loss before the weight update, kept for comparison with the value after optimizer.step()
loss.item()

2.314129590988159

In [31]:
# correct predictions before the weight update (the batch holds 100 images)
get_num_correct(preds, labels)

4

In [35]:
optimizer.step()
# updates the weights, using the gradients stored by loss.backward()
# == one step towards the loss function's minimum
# lr (learning rate) sets how far to step

In [36]:
# run the same batch through the network again, now with the updated weights
preds = network(images)
loss = F.cross_entropy(preds, labels)

In [37]:
# loss after one optimizer step, measured on the same batch as before
loss.item()

2.283449649810791

In [38]:
# correct predictions after one optimizer step, on the same 100-image batch
get_num_correct(preds, labels)

11

---  
## Short version: a single training step

In [39]:
# One complete training step on a single batch:
# forward pass -> loss -> gradients -> weight update -> loss again.
network = Network()

train_loader = torch.utils.data.DataLoader(train_set, batch_size=100)
optimizer = optim.Adam(network.parameters(), lr=0.01)

batch = next(iter(train_loader)) # first batch of 100 images
images, labels = batch

preds = network(images)
loss = F.cross_entropy(preds, labels) # calculating the loss function

# backward() adds to any gradients already stored in .grad, so clear them
# before every backward pass. It changes nothing for this freshly created
# network, but it is required as soon as this step is repeated (e.g. in a loop).
optimizer.zero_grad()
loss.backward() # calculating the gradients
optimizer.step() # update the weights

print(f'loss1: {loss.item()}')
# same batch again, now with the updated weights
preds = network(images)
loss = F.cross_entropy(preds, labels)
print(f'loss2: {loss.item()}')

loss1: 2.302074670791626
loss2: 2.287165880203247
