$\newcommand{\xbf}{{\bf x}}
\newcommand{\ybf}{{\bf y}}
\newcommand{\wbf}{{\bf w}}
\newcommand{\Ibf}{\mathbf{I}}
\newcommand{\Xbf}{\mathbf{X}}
\newcommand{\Rbb}{\mathbb{R}}
\newcommand{\vec}[1]{\left[\begin{array}{c}#1\end{array}\right]}
$

# Introduction to PyTorch -- Part 2
Pascal Germain, 2019
Vera Shalaeva (translated to English), 2020
************

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import aidecours

## The `torch.nn` module

In [None]:
import torch
torch.__version__ # This notebook works with the pytorch version'1.5.1' 

The `nn` module of the `torch` library contains many tools for building neural network architectures.

In [None]:
from torch import nn

Let's take the least squares example from Part 1 and express it as a neural network learning problem, using the tools of the *pyTorch* library.

#### Data preparation
Let's prepare the training data in the form of *pyTorch tensors*.

In [None]:
x = np.array([(1,1),(0,-1),(2,.5)])
y = np.array([-1., 3, 2])

x_tensor = torch.tensor(x, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

In [None]:
x_tensor

In [None]:
y_tensor

In [None]:
y_tensor = y_tensor.unsqueeze(1) # Methods of torch.nn module are designed to manipulate matrices.
y_tensor

#### Hidden layer

The class `Linear` corresponds to a *hidden* layer. The least squares method requires only one output neuron.

In [None]:
nn.Linear?

In [None]:
neuron = nn.Linear(2, 1, bias=False)
neuron

In [None]:
neuron.weight

In [None]:
neuron(x_tensor)

#### Loss function

In [None]:
nn.MSELoss?

In [None]:
quadratic_loss = nn.MSELoss()

In [None]:
quadratic_loss(neuron(x_tensor), y_tensor)

## Optimization module `torch.optim`

In [None]:
torch.optim.SGD?

In [None]:
# Hyperparameters of the gradient descent
eta = 0.4    # Gradient step (passed to SGD as `lr`)
alpha = 0.1  # Momentum (passed to SGD as `momentum`)

# A single linear neuron: 2 inputs, 1 output, no bias term
neuron = nn.Linear(2, 1, bias=False)
optimizer = torch.optim.SGD(neuron.parameters(), lr=eta, momentum=alpha)

# Run 20 full-batch gradient descent iterations on (x_tensor, y_tensor)
for t in range(20):

    y_pred = neuron(x_tensor)                   # Computes the output of the neuron
    loss = quadratic_loss(y_pred, y_tensor)     # Computes the loss value
    loss.backward()                             # Computes the gradients
    optimizer.step()                           # Performs one step of the gradient descent
    optimizer.zero_grad()                      # Resets the gradients so they do not accumulate into the next iteration
    
    # The printed loss is the one measured *before* this iteration's update
    print(t, loss.item())

Let's now rewrite the `least_squares` class using *pyTorch*.

In [None]:
class least_squares:
    """Least squares regression trained by gradient descent with *pyTorch*.

    The model is a single linear neuron without bias, fitted by minimizing
    the mean squared error (`nn.MSELoss`) with `torch.optim.SGD`.

    Parameters:
        eta: Gradient step, passed to the optimizer as `lr`.
        alpha: Momentum, passed to the optimizer as `momentum`.
        nb_iter: Number of gradient descent iterations.
        seed: Seed given to `torch.manual_seed` before the neuron is
            created; `None` leaves the random generator untouched.

    Attributes:
        w_list: Weights recorded at every iteration of the last training
            (one numpy array per iteration, initial weights included).
        obj_list: Loss values recorded at every iteration of the last
            training (`nb_iter + 1` floats).
        neuron: The `nn.Linear` neuron; only exists once `training` has
            been called.
    """

    def __init__(self, eta=0.4, alpha=0.1, nb_iter=50, seed=None):
        # Initialization of the parameters of the gradient descent
        self.eta = eta         # Gradient step value
        self.alpha = alpha     # Momentum
        self.nb_iter = nb_iter # Number of iterations
        self.seed = seed       # Seed of the random number generator

        # Initialization of the path of the gradient descent
        self.w_list = list()
        self.obj_list = list()

    def _trace(self, w, obj):
        """Record the current weights `w` and objective value `obj`."""
        # np.array(...) copies the values: the optimizer later updates the
        # weight tensor in place, so a view would be silently overwritten.
        self.w_list.append(np.array(w.squeeze().detach()))
        self.obj_list.append(obj.item())

    def training(self, x, y):
        """Fit the neuron on inputs `x` (n rows, d columns) and targets `y` (n values).

        Every call creates a new neuron and restarts the recorded trajectory.
        """
        if self.seed is not None:
            torch.manual_seed(self.seed)

        # A new neuron is created below, so the trace of any previous
        # training no longer describes the model: start from a clean path.
        self.w_list = list()
        self.obj_list = list()

        x = torch.tensor(x, dtype=torch.float32)
        # nn.MSELoss compares tensors of identical shape: make y a column
        y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

        n, d = x.shape
        self.neuron = nn.Linear(d, 1, bias=False)
        quadratic_loss = nn.MSELoss()
        optimizer = torch.optim.SGD(self.neuron.parameters(), lr=self.eta, momentum=self.alpha)

        # One extra pass (t == nb_iter) records the final weights and loss
        # without performing a further update.
        for t in range(self.nb_iter + 1):
            y_pred = self.neuron(x)
            loss = quadratic_loss(y_pred, y)
            self._trace(self.neuron.weight, loss)

            if t < self.nb_iter:
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

    def prediction(self, x):
        """Return the neuron outputs for the rows of `x` as a numpy array."""
        x = torch.tensor(x, dtype=torch.float32)

        # No gradient is needed at prediction time
        with torch.no_grad():
            pred = self.neuron(x)

        return pred.squeeze().numpy()

In [None]:
eta = 0.4      # gradient step
alpha = 0.0    # momentum
nb_iter = 20   # number of iterations

algo = least_squares(eta, alpha, nb_iter, seed=None)
algo.training(x, y)

In [None]:
algo.prediction(x)

In [None]:
# Closed-form least squares solution from the normal equations: (X^T X)^{-1} X^T y
w_opt = np.linalg.inv(x.T @ x) @ x.T @ y
fig, axes = plt.subplots(1, 2, figsize=(14.5, 4))
# Mean squared error of the linear predictor `x @ w` on the training data
objective_function = lambda w: np.mean((x @ w - y) ** 2)
# Left panel: the weights recorded during training, shown against the objective
# (exact rendering is defined by the aidecours helper -- TODO confirm)
aidecours.show_2d_trajectory(algo.w_list, objective_function, ax=axes[0])
# Right panel: the recorded loss values; the objective value at w_opt is passed as `obj_opt`
aidecours.show_learning_curve(algo.obj_list, ax=axes[1], obj_opt=objective_function(w_opt))

## Adding a hidden layer

In [None]:
hidden_layer = nn.Linear(2, 4)
hidden_layer

In [None]:
hidden_layer.weight

In [None]:
hidden_layer.bias

In [None]:
for variables in hidden_layer.parameters():
    print(variables)
    print('---')

In [None]:
hidden_layer(x_tensor)

#### Activation function
Activation function *ReLU*

In [None]:
nn.ReLU?

In [None]:
activation_relu = nn.ReLU()

In [None]:
a = torch.linspace(-2, 2, 5)
a

In [None]:
activation_relu(a)

In [None]:
activation_relu(hidden_layer(x_tensor))

Activation function *tanh*

In [None]:
nn.Tanh?

In [None]:
activation_tanh = nn.Tanh()

In [None]:
activation_tanh(a)

In [None]:
activation_tanh(hidden_layer(x_tensor))

Activation function *sigmoid*

In [None]:
nn.Sigmoid?

In [None]:
activation_sigmoid = nn.Sigmoid()

In [None]:
activation_sigmoid(a)

In [None]:
activation_sigmoid(hidden_layer(x_tensor))

#### Sequence of layers and activation functions

In [None]:
nn.Sequential?

In [None]:
model = nn.Sequential(
    torch.nn.Linear(2, 4),
    torch.nn.ReLU(),
)

In [None]:
model(x_tensor)

In [None]:
model = nn.Sequential(
    torch.nn.Linear(2, 4),
    torch.nn.ReLU(),
    torch.nn.Linear(4, 1),
)

In [None]:
model(x_tensor)

In [None]:
for variables in model.parameters():
    print(variables)
    print('---')

## Neural network with one hidden layer

In [None]:
class regression_network:
    """Regression neural network with one hidden *ReLU* layer.

    The architecture is `Linear(d, nb_neurones) -> ReLU -> Linear(nb_neurones, 1)`,
    fitted by minimizing the mean squared error (`nn.MSELoss`) with
    `torch.optim.SGD`.

    Parameters:
        nb_neurones: Number of neurons of the hidden layer.
        eta: Gradient step, passed to the optimizer as `lr`.
        alpha: Momentum, passed to the optimizer as `momentum`.
        nb_iter: Number of gradient descent iterations.
        seed: Seed given to `torch.manual_seed` before the network is
            created; `None` leaves the random generator untouched.

    Attributes:
        obj_list: Loss values recorded at every iteration of the last
            training (`nb_iter + 1` floats).
        w_list: Created empty and never filled by this class.
        model: The `nn.Sequential` network; only exists once `training`
            has been called.
    """

    def __init__(self, nb_neurones=4, eta=0.4, alpha=0.1, nb_iter=50, seed=None):
        # Network architecture
        self.nb_neurones = nb_neurones # Number of neurons of the hidden layer

        # Initialization of the parameters of the gradient descent
        self.eta = eta         # Gradient step
        self.alpha = alpha     # Momentum
        self.nb_iter = nb_iter # Number of iterations
        self.seed = seed       # Seed of the random number generator

        # Initialization of the path of the gradient descent
        self.w_list = list()
        self.obj_list = list()

    def _trace(self, obj):
        """Record the current objective value `obj`."""
        self.obj_list.append(obj.item())

    def training(self, x, y):
        """Fit the network on inputs `x` (n rows, d columns) and targets `y` (n values).

        Every call creates a new network and restarts the recorded learning curve.
        """
        if self.seed is not None:
            torch.manual_seed(self.seed)

        # A new network is created below, so the losses of any previous
        # training no longer describe the model: start from a clean curve.
        self.obj_list = list()

        x = torch.tensor(x, dtype=torch.float32)
        # nn.MSELoss compares tensors of identical shape: make y a column
        y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

        n, d = x.shape
        self.model = nn.Sequential(
            torch.nn.Linear(d, self.nb_neurones),
            torch.nn.ReLU(),
            torch.nn.Linear(self.nb_neurones, 1)
        )

        quadratic_loss = nn.MSELoss()
        optimizer = torch.optim.SGD(self.model.parameters(), lr=self.eta, momentum=self.alpha)

        # One extra pass (t == nb_iter) records the final loss without
        # performing a further update.
        for t in range(self.nb_iter + 1):
            y_pred = self.model(x)
            loss = quadratic_loss(y_pred, y)
            self._trace(loss)

            if t < self.nb_iter:
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

    def prediction(self, x):
        """Return the network outputs for the rows of `x` as a numpy array."""
        x = torch.tensor(x, dtype=torch.float32)

        # No gradient is needed at prediction time
        with torch.no_grad():
            pred = self.model(x)

        return pred.squeeze().numpy()

In [None]:
nb_neurones = 4
eta = 0.1      # gradient step
alpha = 0.1    # momentum
nb_iter = 50   # number of iterations

x = np.array([(1,1),(0,-1),(2,.5)])
y = np.array([-1., 3, 2])

algo = regression_network(nb_neurones, eta, alpha, nb_iter, seed=None)
algo.training(x, y)

aidecours.show_learning_curve(algo.obj_list)
predictions = algo.prediction(x)
print('y    =', y)
print('R(x) =', predictions)

## Exercise

The goal of this exercise is to adapt the `regression_network` class above so that it can solve the classification problem below.




In [None]:
from sklearn.datasets import make_circles
xx, yy = make_circles(n_samples=100, noise=.1, factor=0.2, random_state=10)
aidecours.show_2d_dataset(xx, yy)

Complete the `fit` method of the `classification_net` class below. We recommend that you use the *sigmoid* activation function, together with the **negative log-likelihood** loss function. It is not necessary to add a regularization term to the network.

**Note**: The **negative log-likelihood** loss function corresponds to the class `nn.BCELoss`.

In [None]:
class classification_net:
    """Binary classification neural network (exercise skeleton).

    The architecture and the training loop of `fit` are intentionally left
    incomplete: they are to be filled in as part of the exercise.

    Parameters:
        nb_neurones: Number of neurons in the hidden layer.
        eta: Gradient step, passed to the optimizer as `lr`.
        alpha: Momentum, passed to the optimizer as `momentum`.
        nb_iter: Number of gradient descent iterations.
        seed: Seed given to `torch.manual_seed` before the network is
            created; `None` leaves the random generator untouched.

    Attributes:
        obj_list: Loss values recorded through `_trace`.
        w_list: Created empty and never filled by this class.
        model: The `nn.Sequential` network; only exists once `fit` has
            been called.
    """

    def __init__(self, nb_neurones=4, eta=0.4, alpha=0.1, nb_iter=50, seed=None):
        # Network architecture
        self.nb_neurones = nb_neurones # Number of neurons in the hidden layer

        # Initialization of the parameters of the gradient descent
        self.eta = eta         # Gradient step
        self.alpha = alpha     # Momentum
        self.nb_iter = nb_iter # Number of iterations
        self.seed = seed       # Seed of the random number generator

        # Initialization of the path of the gradient descent
        self.w_list = list()
        self.obj_list = list()

    def _trace(self, obj):
        """Record the current objective value `obj`."""
        self.obj_list.append(obj.item())

    def fit(self, x, y):
        """Fit the network on inputs `x` (n rows, d columns) and labels `y` (n values).

        Every call creates a new network and restarts the recorded learning curve.
        """
        if self.seed is not None:
            torch.manual_seed(self.seed)

        # A new network is created below, so the losses of any previous
        # fit no longer describe the model: start from a clean curve.
        self.obj_list = list()

        x = torch.tensor(x, dtype=torch.float32)
        # Labels are converted to a float column tensor of shape (n, 1)
        y = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

        n, d = x.shape
        self.model = nn.Sequential(
            torch.nn.Linear(d, self.nb_neurones),
            # Complete the architecture
        )

        logistic_loss = nn.BCELoss()
        optimizer = torch.optim.SGD(self.model.parameters(), lr=self.eta, momentum=self.alpha)

        for t in range(self.nb_iter + 1):
            pass # Complete

    def prediction(self, x):
        """Return the predicted class (0 or 1) for the rows of `x`.

        The network output is thresholded at 0.5.
        """
        x = torch.tensor(x, dtype=torch.float32)

        # No gradient is needed at prediction time
        with torch.no_grad():
            pred = self.model(x)

        pred = pred.squeeze()
        # The builtin `int` replaces the alias `np.int`, which was deprecated
        # in NumPy 1.20 and removed in NumPy 1.24.
        return np.array(pred > .5, dtype=int)

Run the code below to test your network. Try changing the parameters to gain insight into their influence.

In [None]:
nb_neurones = 10
eta = 0.6     # gradient step
alpha = 0.4   # momentum
nb_iter = 50  # number of iterations

algo = classification_net(nb_neurones, eta, alpha, nb_iter)
algo.fit(xx, yy)

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
aidecours.show_learning_curve(algo.obj_list, ax=axes[0])
aidecours.show_2d_predictions(xx, yy, algo.prediction, ax=axes[1]);

Finally, we suggest that you explore the network by:
1. Replacing the *ReLU* activation function with the *tanh* activation function
2. Adding one or more hidden layers