In [1]:
import torch
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision import transforms
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [2]:
# Preprocessing applied to every sample: convert the image to a tensor, then
# flatten it into a 1-D vector (the linear layers below take 28**2 inputs).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Lambda(torch.flatten),
])

# Build the training split first, then the test split; both are stored in the
# current directory, downloaded if missing, and share the same preprocessing.
training_data, test_data = (
    datasets.FashionMNIST(".", download=True, train=is_train, transform=transform)
    for is_train in (True, False)
)

## Improving network

If we think about it, all StupidNet™ does is linearly project images onto elements of $\mathbb{R}$. The
classification is then decided by which integer the output is closest to. This means that it has an inherent bias: some classes are treated as more similar just because their labels are closer (as integers).


So the first thing we can improve in our neural network is the number of outputs : we could assign one neuron
to each possible class of output, i.e. go from 1 to 10 output neurons.

The problem now is that the type of our output is not the same as that of our labels anymore: it now becomes a
one-dimensional vector of size $10$. How do we pick the winning class? To do this, we need to adjust our loss
function and evaluation methods, there are two ways: we could either transform the labels into vectors of length 10, or we could change the way we pick a winning neuron, in which case we need to set a winning condition.
The easiest solution is to take the max, and in a way that is what we will do.

The last layer of your network will be a LogSoftmax, i.e. a softmax (a smooth, differentiable stand-in for
picking the max) followed by a log, which makes the resulting values numerically easier to work with.

The loss function we will use is NLLLoss. Note that together with LogSoftmax they are equivalent to using
directly CrossEntropyLoss on the output neurons.

**Question 7.**  Implement this new model, which will be called model_10, and train it.

In [49]:
# One linear layer mapping each flattened image (28**2 values) to 10 class
# scores, followed by a log-softmax so that the outputs are log-probabilities
# (the form expected by NLLLoss).
model_10 = torch.nn.Sequential(
    torch.nn.Linear(28**2, 10),
    # `dim` is given explicitly: relying on the implicit dimension is
    # deprecated and emits a UserWarning during the forward pass. dim=-1
    # normalizes over the class scores for batched and unbatched inputs alike.
    torch.nn.LogSoftmax(dim=-1),
)

In [50]:
# Serve the training set in mini-batches of 64 samples.
# No shuffle argument is passed, so the loader's default ordering applies.
loader = DataLoader(dataset=training_data, batch_size=64)

In [51]:
# Negative log-likelihood loss; it consumes the log-probabilities produced by
# the final LogSoftmax layer of the model.
loss_fn = torch.nn.NLLLoss()

# Adam over every parameter of model_10, learning rate 1e-3.
optimizer = torch.optim.Adam(params=model_10.parameters(), lr=1e-3)

In [52]:
# Train model_10 for a fixed number of full passes over `loader`.
n_epochs = 10
for epoch in tqdm(range(n_epochs)):
    for X_batch, y_batch in loader:
        # Start every step from clean gradients.
        optimizer.zero_grad()

        # Forward pass, then loss against the integer class labels.
        log_probs = model_10(X_batch)
        loss = loss_fn(log_probs, y_batch)

        # Backpropagate and apply one optimizer update.
        loss.backward()
        optimizer.step()

  input = module(input)
100%|██████████| 10/10 [01:31<00:00,  9.10s/it]


In [53]:
# Per-sample correctness flags (1.0 = correct prediction, 0.0 = wrong).
# One chunk is collected per batch and everything is concatenated once at the
# end, instead of re-copying an ever-growing tensor on every iteration.
# NOTE(review): `loader` wraps the training set, so the rate derived from these
# flags is a training accuracy, not a held-out one.
correct_chunks = []

with torch.no_grad():
    for X_batch, y_batch in loader:
        # A single forward pass per batch; the predicted class is the index of
        # the largest log-probability.
        y_pred_discrete = model_10(X_batch).argmax(dim=1)
        correct_chunks.append((y_pred_discrete == y_batch).float())

# Fall back to an empty tensor if the loader yielded no batches.
success_rate = torch.cat(correct_chunks) if correct_chunks else torch.tensor([])
        

In [54]:
# Fraction of correctly classified samples: number of hits over number of samples.
success_rate.sum() / len(success_rate)

tensor(0.8656)

## Activation function

We have for the moment used only layers that apply a linear transformation, which can be efficient, but is
limited, as you saw. As you may have inferred, networks often have more than an input and an output layer, it
would not make any sense however to add intermediate layers if all they did was linear transformations. The
idea is to use non-linearities after each layer. We call these "activation functions". The most common are the
Sigmoid, ReLU and Tanh.

These activation functions all modify the output of the neuron in a nonlinear way. The idea being that they
sharpen the result at some threshold:

* The ReLU sends everything that is below zero to 0 and everything positive to itself.
* The sigmoid squashes its input into $(0, 1)$: very negative values go towards 0 and very positive values go towards 1.
* The tanh squashes its input into $(-1, 1)$: very negative values go towards -1 and very positive values go towards 1, thus "flattening" the output.

In [81]:
# Same single linear layer as before, but with a Tanh non-linearity applied to
# the 10 class scores before they are turned into log-probabilities.
model_activation = torch.nn.Sequential(
    torch.nn.Linear(28**2, 10),
    torch.nn.Tanh(),
    # `dim` is given explicitly: relying on the implicit dimension is
    # deprecated and emits a UserWarning during the forward pass. dim=-1
    # normalizes over the class scores for batched and unbatched inputs alike.
    torch.nn.LogSoftmax(dim=-1),
)

# Fresh Adam optimizer bound to this model's parameters, learning rate 1e-4.
optimizer = torch.optim.Adam(model_activation.parameters(),
                             lr = 1e-4)

In [82]:
# Train model_activation for a fixed number of full passes over `loader`.
n_epochs = 10
for epoch in tqdm(range(n_epochs)):
    for X_batch, y_batch in loader:
        # Start every step from clean gradients.
        optimizer.zero_grad()

        # Forward pass, then loss against the integer class labels.
        log_probs = model_activation(X_batch)
        loss = loss_fn(log_probs, y_batch)

        # Backpropagate and apply one optimizer update.
        loss.backward()
        optimizer.step()

  input = module(input)
100%|██████████| 10/10 [01:41<00:00, 10.13s/it]


In [83]:
# Per-sample correctness flags (1.0 = correct prediction, 0.0 = wrong).
# One chunk is collected per batch and everything is concatenated once at the
# end, instead of re-copying an ever-growing tensor on every iteration.
# NOTE(review): `loader` wraps the training set, so the rate derived from these
# flags is a training accuracy, not a held-out one.
correct_chunks = []

with torch.no_grad():
    for X_batch, y_batch in loader:
        # The predicted class is the index of the largest log-probability.
        y_pred_discrete = model_activation(X_batch).argmax(dim=1)
        correct_chunks.append((y_pred_discrete == y_batch).float())

# Fall back to an empty tensor if the loader yielded no batches.
success_rate = torch.cat(correct_chunks) if correct_chunks else torch.tensor([])
        

In [84]:
# Fraction of correctly classified samples: number of hits over number of samples.
success_rate.sum() / len(success_rate)

tensor(0.8192)

## Classes

We have only seen one (simple) way of defining models : using the Sequential construct of PyTorch, however
in later TDs we will need some flexibility that this framework cannot provide.

A more flexible way of defining models is by defining a class inheriting from torch.nn.Module, it will need
two methods (at least) :


```python
import torch
from torch import nn
class MyNetwork(nn.Module): # a class inheriting from nn.Module
    def __init__(self, ... some_arguments ...):
        super().__init__() # call the constructor of nn.Module
        # you can now define some layers
        self.layer1 = ...
        self.layer2 = ...
    def forward(self,x):
        # how do the layers compute the output ?
        # this function needs to return the output of the net
        # usually by applying the layers in the right order
        ...
        return result
```

In [85]:
class StupidNetwork(torch.nn.Module):
    """Single-layer classifier: linear map, optional activation, log-softmax.

    Args:
        activation_function: Optional callable (for example ``torch.nn.Tanh()``)
            applied to the output of the linear layer before the log-softmax.
            With ``None`` (the default) the linear output is used as is.
    """

    def __init__(self, activation_function = None) -> None:
        super().__init__()

        # 28**2 flattened input values -> 10 class scores.
        self.layer1 = torch.nn.Linear(28**2, 10)
        # `dim` is given explicitly: relying on the implicit dimension is
        # deprecated and emits a UserWarning during the forward pass. dim=-1
        # normalizes over the class scores for batched and unbatched inputs.
        self.layer2 = torch.nn.LogSoftmax(dim=-1)

        # Stored as given; None means "no activation".
        self.activation_function = activation_function

    def forward(self, x):
        """Return the log-probabilities over the 10 classes for input ``x``."""
        x = self.layer1(x)

        # Identity comparison with None (rather than `!=`) so that objects
        # with custom comparison operators cannot confuse the check.
        if self.activation_function is not None:
            x = self.activation_function(x)

        return self.layer2(x)

In [87]:
# Instantiate the class-based model with its default (no activation) setting.
model_class = StupidNetwork()

# Fresh Adam optimizer bound to this model's parameters, learning rate 1e-4.
optimizer = torch.optim.Adam(params=model_class.parameters(), lr=1e-4)

In [88]:
# Train model_class for a fixed number of full passes over `loader`.
n_epochs = 10
for epoch in tqdm(range(n_epochs)):
    for X_batch, y_batch in loader:
        # Start every step from clean gradients.
        optimizer.zero_grad()

        # Forward pass, then loss against the integer class labels.
        log_probs = model_class(X_batch)
        loss = loss_fn(log_probs, y_batch)

        # Backpropagate and apply one optimizer update.
        loss.backward()
        optimizer.step()

  x = self.layer2(x)
100%|██████████| 10/10 [01:30<00:00,  9.06s/it]


In [89]:
# Per-sample correctness flags (1.0 = correct prediction, 0.0 = wrong).
# One chunk is collected per batch and everything is concatenated once at the
# end, instead of re-copying an ever-growing tensor on every iteration.
# NOTE(review): `loader` wraps the training set, so the rate derived from these
# flags is a training accuracy, not a held-out one.
correct_chunks = []

with torch.no_grad():
    for X_batch, y_batch in loader:
        # The predicted class is the index of the largest log-probability.
        y_pred_discrete = model_class(X_batch).argmax(dim=1)
        correct_chunks.append((y_pred_discrete == y_batch).float())

# Fall back to an empty tensor if the loader yielded no batches.
success_rate = torch.cat(correct_chunks) if correct_chunks else torch.tensor([])

  x = self.layer2(x)


In [94]:
# Total number of scalar values held by the parameters of model_class that
# take part in gradient updates.
trainable_params = sum(
    param.numel()
    for param in filter(lambda p: p.requires_grad, model_class.parameters())
)

In [95]:
# Display the trainable-parameter count computed in the previous cell.
# For the model's single Linear(28**2, 10) layer this is
# 784 * 10 weights + 10 biases = 7850.
trainable_params

7850

In [90]:
# Fraction of correctly classified samples: number of hits over number of samples.
success_rate.sum() / len(success_rate)

tensor(0.8387)