# 3. Multi-layer perceptron

- Used as part of INFO8010 Deep Learning (Gilles Louppe, 2018-2019).
- Originally adapted from [PyTorch tutorial for Deep Learning researchers](https://github.com/yunjey/pytorch-tutorial) (Yunjey Choi, 2018).

---

In [2]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import torch 
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from torch.autograd import Variable

# Hyper-parameters

In [3]:
# Network dimensions.
input_size = 28 * 28    # length of one flattened image fed to the first layer
hidden_size = 500       # width of the hidden layer
num_classes = 10        # size of the output layer (one score per class)

# Optimisation settings.
num_epochs = 5          # number of passes over the training set
batch_size = 100        # samples per mini-batch
learning_rate = 1e-3    # step size handed to the optimizer

# Data

In [4]:
# MNIST dataset (images and labels).
# Training split: download=True asks torchvision to fetch the files into ./data.
train_dataset = dsets.MNIST(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
    download=True,
)

# Test split: read from the same root directory, without a download request.
test_dataset = dsets.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
)

# Dataset loaders (input pipeline).
# Training batches are reshuffled on every pass over the loader.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

# Test batches keep the dataset order.
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Model

In [5]:
# Model
class Net(nn.Module):
    """Multi-layer perceptron with a single ReLU hidden layer.

    The network maps ``input_size`` features to ``hidden_size`` units and
    then to ``num_classes`` raw scores (no softmax is applied here).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the two linear layers and the activation between them."""
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the class scores computed for the batch ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        scores = self.fc2(activated)
        return scores

# Instantiate the network with the sizes set in the hyper-parameter cell.
model = Net(input_size, hidden_size, num_classes)

# Loss and optimizer

In [14]:
# Loss and optimizer.
# nn.CrossEntropyLoss computes the softmax internally, so the model is fed to
# it as raw, unnormalised scores.
criterion = nn.CrossEntropyLoss()
# Plain SGD over every parameter of the model.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Training the model.
# No TensorBoard logging here: the summary writer is only created in the
# Tensorboard section further down, so referencing it in this cell would fail
# when the notebook is executed from top to bottom.
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Flatten every image of the batch into a row of 28*28 values.
        images = images.view(-1, 28*28)

        # Forward + backward + optimize.
        optimizer.zero_grad()   # clear the gradients left by the previous step
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Progress report every 100 mini-batches.
        if (i+1) % 100 == 0:
            # loss.item() hands a plain Python float to the format string.
            print('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f'
                  % (epoch+1, num_epochs, i+1, len(train_loader), loss.item()))
            

Epoch: [1/5], Step: [100/600], Loss: 1.2657
Epoch: [1/5], Step: [200/600], Loss: 1.3377
Epoch: [1/5], Step: [300/600], Loss: 1.3427
Epoch: [1/5], Step: [400/600], Loss: 1.2121
Epoch: [1/5], Step: [500/600], Loss: 1.2958
Epoch: [1/5], Step: [600/600], Loss: 1.1308
Epoch: [2/5], Step: [100/600], Loss: 1.1277
Epoch: [2/5], Step: [200/600], Loss: 1.0630
Epoch: [2/5], Step: [300/600], Loss: 1.1928
Epoch: [2/5], Step: [400/600], Loss: 1.0932
Epoch: [2/5], Step: [500/600], Loss: 1.0605
Epoch: [2/5], Step: [600/600], Loss: 1.0625
Epoch: [3/5], Step: [100/600], Loss: 1.1545
Epoch: [3/5], Step: [200/600], Loss: 0.8863
Epoch: [3/5], Step: [300/600], Loss: 0.9181
Epoch: [3/5], Step: [400/600], Loss: 0.9937
Epoch: [3/5], Step: [500/600], Loss: 1.0642
Epoch: [3/5], Step: [600/600], Loss: 0.9100
Epoch: [4/5], Step: [100/600], Loss: 1.0181
Epoch: [4/5], Step: [200/600], Loss: 0.8795
Epoch: [4/5], Step: [300/600], Loss: 0.9071
Epoch: [4/5], Step: [400/600], Loss: 1.0159
Epoch: [4/5], Step: [500/600], L

# Tensorboard

As you can see, evaluating how well your model is learning requires looking at how the loss evolves during training. [TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard), and more specifically [tensorboardX](https://github.com/lanpa/tensorboardX) for PyTorch, is a toolkit designed to simplify this procedure. 

In [31]:
!pip install tensorboardX



In [6]:
from tensorboardX import SummaryWriter
import time

In [7]:
# Initialize the data logger.
# `writer` is the object on which the following cells call add_scalar().
# NOTE(review): no log directory is passed, so the library default is used —
# check the tensorboardX documentation for where the event files end up.
writer = SummaryWriter() 

In [8]:
# Demo: stream the curve y = step**2 to the 'data/scalar1' tag, one point
# every half second, for 50 steps.
for step in range(50):
    squared = step ** 2
    writer.add_scalar('data/scalar1', squared, step)
    time.sleep(0.5)

<div class="alert alert-success">
<b>EXERCISE</b>:

Now that you have seen how this handy tool works, you can monitor the progress of your neural networks within it. 
Redirect what you think are the most informative logs of your training to tensorboard by creating an appropriate writer.

</div>

In [9]:
# Your code
# Loss and optimizer.
# nn.CrossEntropyLoss computes the softmax internally, so the model is fed to
# it as raw, unnormalised scores.
criterion = nn.CrossEntropyLoss()
# Plain SGD over every parameter of the model.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Number of mini-batches per epoch; used both to build a global step index
# and to average the per-batch losses.
num_batches = len(train_loader)

# Training the model, logging to TensorBoard:
#   'data/loss' -> loss of every mini-batch, indexed by the global step
#   'loss'      -> mean mini-batch loss of each epoch, indexed by the epoch
for epoch in range(num_epochs):
    loss_tot = 0.0
    for i, (images, labels) in enumerate(train_loader):
        # Flatten every image of the batch into a row of 28*28 values.
        images = images.view(-1, 28*28)

        # Forward + backward + optimize.
        optimizer.zero_grad()   # clear the gradients left by the previous step
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Plain Python float: safe to accumulate, log and format.
        batch_loss = loss.item()
        loss_tot += batch_loss

        # An explicit, monotonically increasing step is required; without it
        # every point is written at the same step and no curve is drawn.
        writer.add_scalar('data/loss', batch_loss, epoch * num_batches + i)

        # Progress report every 100 mini-batches.
        if (i+1) % 100 == 0:
            print('Epoch: [%d/%d], Step: [%d/%d], Loss: %.4f'
                  % (epoch+1, num_epochs, i+1, num_batches, batch_loss))

    # Logged once per epoch (inside the epoch loop). Each batch loss is
    # already a mean over its samples, so the epoch average divides by the
    # number of batches, not by the number of samples.
    writer.add_scalar('loss', loss_tot / num_batches, epoch)

Epoch: [1/5], Step: [100/600], Loss: 2.2766
Epoch: [1/5], Step: [200/600], Loss: 2.2587
Epoch: [1/5], Step: [300/600], Loss: 2.2602
Epoch: [1/5], Step: [400/600], Loss: 2.2145
Epoch: [1/5], Step: [500/600], Loss: 2.1932
Epoch: [1/5], Step: [600/600], Loss: 2.1668
Epoch: [2/5], Step: [100/600], Loss: 2.1463
Epoch: [2/5], Step: [200/600], Loss: 2.0933
Epoch: [2/5], Step: [300/600], Loss: 2.0724
Epoch: [2/5], Step: [400/600], Loss: 2.0611
Epoch: [2/5], Step: [500/600], Loss: 2.0643
Epoch: [2/5], Step: [600/600], Loss: 1.9820
Epoch: [3/5], Step: [100/600], Loss: 1.9856
Epoch: [3/5], Step: [200/600], Loss: 1.9195
Epoch: [3/5], Step: [300/600], Loss: 1.8944
Epoch: [3/5], Step: [400/600], Loss: 1.8905
Epoch: [3/5], Step: [500/600], Loss: 1.8609
Epoch: [3/5], Step: [600/600], Loss: 1.8209
Epoch: [4/5], Step: [100/600], Loss: 1.8037
Epoch: [4/5], Step: [200/600], Loss: 1.7698
Epoch: [4/5], Step: [300/600], Loss: 1.7343
Epoch: [4/5], Step: [400/600], Loss: 1.7219
Epoch: [4/5], Step: [500/600], L

# Test the model

In [None]:
# Test the Model
correct = 0
total = 0

# Switch to evaluation mode and disable gradient tracking: nothing is
# back-propagated during testing, so no autograd graph needs to be built.
model.eval()
with torch.no_grad():
    for images, labels in test_loader:
        # Flatten every image of the batch into a row of 28*28 values.
        images = images.view(-1, 28*28)
        outputs = model(images)
        # Predicted class = index of the largest score along dimension 1.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        # .item() keeps the running count a plain Python int.
        correct += (predicted == labels).sum().item()

print('Accuracy of the model on the 10000 test images: %d %%' % (100 * correct / total))

<div class="alert alert-success">
<b>EXERCISE</b>:

Compare the performance of an MLP with sigmoid activation units against a rectified network.

</div>

In [None]:
# Your code
class Net(nn.Module):
    """Multi-layer perceptron with one hidden layer and a pluggable activation.

    Args:
        input_size: Number of input features.
        hidden_size: Number of hidden units.
        num_classes: Number of output scores.
        activation: Module applied between the two linear layers. Defaults to
            ``nn.ReLU()``; pass ``nn.Sigmoid()`` to build the sigmoid network
            that the exercise asks to compare against the rectified one.

    The activation holds no parameters, so the ``state_dict`` keys are the
    same (``fc1.*`` and ``fc2.*``) whichever activation is chosen.
    """

    def __init__(self, input_size, hidden_size, num_classes, activation=None):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        # A fresh ReLU is created per instance when no activation is supplied.
        self.activation = nn.ReLU() if activation is None else activation
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw class scores for the batch ``x``."""
        out = self.fc1(x)
        out = self.activation(out)
        out = self.fc2(out)
        return out

# Build a new Net instance and rebind `model` to it; the previously trained
# model is no longer referenced through this name.
model = Net(input_size, hidden_size, num_classes)


## Vanishing gradient (bonus)

<div class="alert alert-success">
<b>EXERCISE</b>:

Very deep networks are known to be affected by the "Vanishing Gradient" problem.
Investigate this phenomenon by defining an architecture which would suffer from this issue and compare the gradients that you obtain with the ones of a shallower network by using the weight.grad.norm() method.
Plot the results.


</div>

In [35]:
# Deeper variant of the MLP, used below to look at gradient magnitudes.
class Net(nn.Module):
    """Fully connected network made of six linear layers with ReLU in between.

    ``fc1`` and ``fc2`` are followed by ``layers``, a sequential stack of four
    (ReLU, Linear) pairs; the last linear layer outputs ``num_classes`` scores.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        """Create the layers in order: fc1, fc2, then the sequential stack."""
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, hidden_size)

        # Three hidden-to-hidden layers followed by the output layer, each
        # preceded by its own ReLU.
        stack = []
        for out_features in (hidden_size, hidden_size, hidden_size, num_classes):
            stack.append(nn.ReLU())
            stack.append(nn.Linear(hidden_size, out_features))
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return the raw class scores for the batch ``x``."""
        first = self.relu(self.fc1(x))
        second = self.fc2(first)
        return self.layers(second)

model = Net(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Gradient probe (not a training run).
# On each of the num_epochs passes, only the first mini-batch drawn from the
# loader is used: the loss is back-propagated and the norm of the gradient
# reaching the first layer's weights is printed. No optimizer step is taken,
# so the weights stay at their initial values for every measurement.
for epoch in range(num_epochs):
    for images, labels in train_loader:
        # Flatten every image of the batch into a row of 28*28 values.
        images = images.view(-1, 28*28)

        # Forward + backward only.
        optimizer.zero_grad()   # discard the gradients of the previous probe
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()

        # Gradient magnitude at the layer farthest from the loss.
        print(model.fc1.weight.grad.norm())
        break   # one mini-batch per pass is all the probe needs

tensor(0.0120)
tensor(0.0134)
tensor(0.0128)
tensor(0.0130)
tensor(0.0122)
