# NEURAL NETWORKS AND DEEP LEARNING

ICT FOR LIFE AND HEALTH - Department of Information Engineering
PHYSICS OF DATA - Department of Physics and Astronomy
COGNITIVE NEUROSCIENCE AND CLINICAL NEUROPSYCHOLOGY - Department of Psychology

A.A. 2019/20 (6 CFU)
Dr. Alberto Testolin, Dr. Federico Chiariotti

Author: Dr. Matteo Gadaleta

Lab. 02 - Linear regression with artificial neurons

Load the noisy training and test samples and build the cross-validation folds

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.utils import shuffle
# Set random seed: fixes NumPy's global RNG so that values drawn through
# np.random further down (e.g. the initial network weights) are reproducible
np.random.seed(3)

In [0]:
# Read both splits from disk, training file first. header=None tells pandas
# the files have no header row, so columns are addressed by position (0, 1).
training_set, test_set = (
    pd.read_csv(path, header=None)
    for path in ('training_set.csv', 'test_set.txt')
)

In [0]:
# Column 0 holds the inputs and column 1 the targets. Both splits are turned
# into plain NumPy arrays so they behave the same way downstream.
x_train = training_set[0].values
y_train = training_set[1].values
x_test = test_set[0].values
y_test = test_set[1].values
# Shuffle inputs and targets together (fixed seed) so the contiguous folds
# built below do not depend on the row order of the file.
x_train, y_train = shuffle(x_train, y_train, random_state=0)

# K-fold split. The fold size is derived from the data instead of being
# hard-coded, so the folds stay consistent if the training set changes size.
# When len(x_train) is not a multiple of n_folds, the trailing remainder is
# never used for validation and stays in every training fold.
n_folds = 3
fold_size = len(x_train) // n_folds

mask = np.ones(len(x_train), dtype=bool)
x_cv = []
x_val = []
y_cv = []
y_val = []

for j in range(n_folds):
    # False marks the j-th contiguous slice, which becomes the validation fold
    temp_mask = mask.copy()
    temp_mask[j * fold_size:(j + 1) * fold_size] = False
    x_cv.append(x_train[temp_mask])
    x_val.append(x_train[~temp_mask])
    y_cv.append(y_train[temp_mask])
    y_val.append(y_train[~temp_mask])

# All folds have the same length, so each list stacks into a 2-D array
# indexed as [fold, sample].
x_cv = np.array(x_cv)
x_val = np.array(x_val)
y_cv = np.array(y_cv)
y_val = np.array(y_val)

In [0]:
### Plot
# Scatter the training and test samples on a single figure
plt.close('all')
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(x_train, y_train, color='r', ls='', marker='.', label='Train data points')
ax.plot(x_test, y_test, color='g', ls='', marker='.', label='Test data points')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.grid()
ax.legend()
fig.tight_layout()
plt.show()

## Activation function

In [0]:
# Define activation function
from scipy.special import expit

# The logistic sigmoid is scipy's expit, used unchanged
act_sig = expit


# 1st derivative
def act_sig_der(x):
    """Derivative of the sigmoid, s(x) * (1 - s(x)) with s = act_sig."""
    return act_sig(x) * (1 - act_sig(x))


# Plot activation function
x_plot = np.linspace(-5, 5, 1000)
y_act = act_sig(x_plot)
y_act_der = act_sig_der(x_plot)

plt.close('all')
fig, ax = plt.subplots()
ax.plot(x_plot, y_act, label='Sigmoid function')
ax.plot(x_plot, y_act_der, label='Sigmoid 1st derivative')
ax.legend()
ax.grid()
plt.show()

In [0]:
def act_tanh(x):
    """Hyperbolic tangent activation."""
    return np.tanh(x)


def act_tanh_der(x):
    """Derivative of tanh, written as 1 / cosh(x)**2."""
    return 1 / np.cosh(x)**2


# Plot activation function
x_plot = np.linspace(-5, 5, 1000)
y_act = act_tanh(x_plot)
y_act_der = act_tanh_der(x_plot)

plt.close('all')
fig, ax = plt.subplots()
ax.plot(x_plot, y_act, label='Tanh function')
ax.plot(x_plot, y_act_der, label='Tanh 1st derivative')
ax.legend()
ax.grid()
plt.show()

In [0]:
# Define activation function
def act_relu(x):
    """ReLU built from boolean masks: x where x >= 0, 0 where x < 0."""
    keep = x >= 0
    drop = x < 0
    return x*keep + 0*drop


def act_relu_der(x):
    """ReLU derivative: 1 where x >= 0, 0 where x < 0."""
    keep = x >= 0
    drop = x < 0
    return 1*keep + 0*drop


# Plot activation function
x_plot = np.linspace(-5, 5, 1000)
y_act = act_relu(x_plot)
y_act_der = act_relu_der(x_plot)

plt.close('all')
fig, ax = plt.subplots()
ax.plot(x_plot, y_act, label='ReLU function')
ax.plot(x_plot, y_act_der, label='ReLU 1st derivative')
ax.legend()
ax.grid()
plt.show()

In [0]:

def act_leaky_relu(x):
    """Leaky ReLU: x where x >= 0, 0.1 * x where x < 0."""
    positive_side = x >= 0
    negative_side = x < 0
    return x*positive_side + 0.1*x*negative_side


def act_leaky_relu_der(x):
    """Leaky ReLU derivative: 1 where x >= 0, 0.1 where x < 0."""
    positive_side = x >= 0
    negative_side = x < 0
    return 1*positive_side + 0.1*negative_side


# Plot activation function
x_plot = np.linspace(-5, 5, 1000)
y_act = act_leaky_relu(x_plot)
y_act_der = act_leaky_relu_der(x_plot)

plt.close('all')
fig, ax = plt.subplots()
ax.plot(x_plot, y_act, label='Leaky ReLU function')
ax.plot(x_plot, y_act_der, label='Leaky ReLU 1st derivative')
ax.legend()
ax.grid()
plt.show()

In [0]:

def act_elu(x):
    """ELU with alpha = 0.5: x where x >= 0, 0.5 * (exp(x) - 1) where x < 0.

    The exponential is only evaluated on min(x, 0). Multiplying the
    exponential branch by a boolean mask instead gives inf * 0 = nan as soon
    as exp(x) overflows for large positive x.
    """
    return np.where(x >= 0, x, 0.5 * np.expm1(np.minimum(x, 0)))


def act_elu_der(x):
    """ELU derivative: 1 where x >= 0, 0.5 * exp(x) where x < 0 (overflow-safe)."""
    return np.where(x >= 0, 1.0, 0.5 * np.exp(np.minimum(x, 0)))


# Plot activation function
x_plot = np.linspace(-5,5,1000)
y_act = act_elu(x_plot)
y_act_der = act_elu_der(x_plot)

plt.close('all')
plt.plot(x_plot, y_act, label='ELU function')
plt.plot(x_plot, y_act_der, label='ELU 1st derivative')
plt.legend()
plt.grid()
plt.show()

##  Network class

In [0]:
class Network():
    """Fully connected network with two hidden layers and a linear output.

    Each layer keeps its weights and biases in a single matrix whose last
    column is the bias; layer inputs are extended with a constant 1 to match.
    Both hidden layers use the same activation function. Training processes
    one sample at a time, minimising ``(y - label)**2 / 2`` with Adam updates.
    The Adam moment estimates are stored on the instance, so every network
    carries its own optimiser state.
    """

    def __init__(self, Ni, Nh1, Nh2, No, act_fn):
        """Initialise weights, activation functions and optimiser state.

        Args:
            Ni: number of inputs.
            Nh1: number of neurons in hidden layer 1.
            Nh2: number of neurons in hidden layer 2.
            No: number of outputs.
            act_fn: activation used by both hidden layers; one of
                'sigmoid', 'tanh', 'ReLU', 'Leaky ReLU', 'ELU'.

        Raises:
            KeyError: if ``act_fn`` is not one of the names listed above.
        """
        ### WEIGHT INITIALIZATION (Xavier-style)
        # Layers are created in the order hidden 1, hidden 2, output so the
        # sequence of draws from np.random stays the same for a given seed.
        self.WBh1 = self._init_layer(Nh1, Ni)
        self.WBh2 = self._init_layer(Nh2, Nh1)
        self.WBo = self._init_layer(No, Nh2)

        ### ACTIVATION FUNCTIONS
        self.act_sig = expit
        self.act_sig_der = lambda x: expit(x) * (1 - expit(x))

        self.act_tanh = lambda x: np.tanh(x)
        self.act_tanh_der = lambda x: 1/(np.cosh(x))**2

        self.act_relu = lambda x: x*(x>=0)+0*(x<0)
        self.act_relu_der = lambda x: 1*(x>=0)+0*(x<0)

        # Negative branch is 0.1 * x (slope 0.1), matching the derivative below
        self.act_leaky_relu = lambda x: x*(x>=0)+0.1*x*(x<0)
        self.act_leaky_relu_der = lambda x: 1*(x>=0)+0.1*(x<0)

        # exp is evaluated on min(x, 0) only: masking exp(x) with a boolean
        # yields inf * 0 = nan once exp overflows for large positive x
        self.act_elu = lambda x: np.where(x >= 0, x, 0.5*np.expm1(np.minimum(x, 0)))
        self.act_elu_der = lambda x: np.where(x >= 0, 1.0, 0.5*np.exp(np.minimum(x, 0)))

        # Look the pair up on the instance so the class does not depend on
        # same-named functions existing at module level.
        activations = {
            'sigmoid': (self.act_sig, self.act_sig_der),
            'tanh': (self.act_tanh, self.act_tanh_der),
            'ReLU': (self.act_relu, self.act_relu_der),
            'Leaky ReLU': (self.act_leaky_relu, self.act_leaky_relu_der),
            'ELU': (self.act_elu, self.act_elu_der),
        }
        self.act, self.act_der = activations[act_fn]

        ### OPTIMISER STATE
        # First (m) and second (s) Adam moment estimates per weight matrix
        self._moments = {
            name: (np.zeros(getattr(self, name).shape),
                   np.zeros(getattr(self, name).shape))
            for name in ('WBh1', 'WBh2', 'WBo')
        }

    @staticmethod
    def _init_layer(n_out, n_in):
        """Return an (n_out, n_in + 1) matrix of random weights and zero biases.

        Weights are uniform in [-0.5, 0.5) scaled by sqrt(12 / (n_out + n_in));
        the bias column is appended last.
        """
        W = (np.random.rand(n_out, n_in) - 0.5) * np.sqrt(12 / (n_out + n_in))
        B = np.zeros([n_out, 1])
        return np.concatenate([W, B], 1)

    def _forward_pass(self, x):
        """Propagate one sample and return every intermediate value.

        Returns:
            Tuple ``(X, H1, Z1, H2, Z2, Y)``: X, Z1 and Z2 are the layer
            inputs with the bias term 1 already appended, H1 and H2 the
            pre-activations of the hidden layers, Y the linear output.
        """
        # Convert to numpy array and add bias term
        X = np.append(np.array(x), 1)
        ### Hidden layer 1
        H1 = np.matmul(self.WBh1, X)
        Z1 = np.append(self.act(H1), 1)
        ### Hidden layer 2
        H2 = np.matmul(self.WBh2, Z1)
        Z2 = np.append(self.act(H2), 1)
        ### Output layer (NO activation function)
        Y = np.matmul(self.WBo, Z2)
        return X, H1, Z1, H2, Z2, Y

    #MAKE PREDICTIONS

    def forward(self, x, additional_out=False):
        """Predict the output for a single sample.

        Args:
            x: input sample (scalar or sequence of ``Ni`` values).
            additional_out: when True, also return the activations of the
                second hidden layer, including the appended bias term.

        Returns:
            The squeezed network output, or ``(output, Z2)`` when
            ``additional_out`` is True.
        """
        *_, Z2, Y = self._forward_pass(x)

        if additional_out:
            return Y.squeeze(), Z2

        return Y.squeeze()

    #ADJUST WEIGHTS

    def ADAMS(self, W, g, m, s, η, t, β1=0.9, β2=0.999, ε=10**(-8)):
        """Apply one Adam step to a weight matrix.

        Args:
            W: current weights.
            g: gradient of the loss with respect to ``W``.
            m: running first-moment estimate.
            s: running second-moment estimate.
            η: learning rate.
            t: zero-based time index used for bias correction (``update``
                forwards its ``num_ep`` argument here).
            β1, β2: decay rates of the first and second moment.
            ε: small constant that avoids division by zero.

        Returns:
            Tuple ``(W, g, m, s)`` with the updated weights, the unchanged
            gradient and the updated moment estimates.
        """
        m = β1*m + (1-β1)*g
        s = β2*s + (1-β2)*g**2
        # Bias-corrected moment estimates
        m_hat = m/(1-β1**(t+1))
        s_hat = s/(1-β2**(t+1))
        W = W - η*m_hat/(np.sqrt(s_hat)+ε)

        return W, g, m, s

    def _adam_step(self, name, grad, η, t):
        """Update the weight matrix stored under ``name`` and its Adam moments."""
        m, s = self._moments[name]
        W, _, m, s = self.ADAMS(getattr(self, name), grad, m, s, η, t=t)
        setattr(self, name, W)
        self._moments[name] = (m, s)

    def update(self, x, label, η, num_ep, lambd=0):
        """Run backpropagation on one sample and update all three layers.

        Args:
            x: input sample.
            label: target value for ``x``.
            η: learning rate.
            num_ep: index passed to Adam as the bias-correction time step.
            lambd: L2 regularisation strength; the default 0 disables it.

        Returns:
            The loss ``(Y - label)**2 / 2`` evaluated before the update.
        """
        X, H1, Z1, H2, Z2, Y = self._forward_pass(x)

        # Layer errors, back-propagated through the weights without the bias
        # column. All of them are computed before any matrix is modified.
        Eo = Y - label
        Eh2 = np.matmul(Eo, self.WBo[:, :-1]) * self.act_der(H2)
        Eh1 = np.matmul(Eh2, self.WBh2[:, :-1]) * self.act_der(H1)

        # Derivative for weight matrices: layer error (outer) layer input
        gradients = (
            ('WBh1', np.outer(Eh1, X)),
            ('WBh2', np.outer(Eh2, Z1)),
            ('WBo', np.outer(Eo, Z2)),
        )

        # Update the weights with Adam. Going through one helper for every
        # layer guarantees that each result is written back to the instance.
        for name, dW in gradients:
            # Gradient of the penalty lambd/2 * ||W||^2 is lambd * W
            grad = dW + lambd * getattr(self, name)
            self._adam_step(name, grad, η, num_ep)

        # Evaluate loss function
        loss = (Y - label)**2/2

        return loss

    def plot_weights(self):
        """Show one histogram per layer of the weights (biases included)."""
        fig, axs = plt.subplots(3, 1, figsize=(12,6))
        layers = (
            (self.WBh1, 20, 'Hidden layer 1 weights'),
            (self.WBh2, 50, 'Hidden layer 2 weights'),
            (self.WBo, 20, 'Output layer weights'),
        )
        for ax, (W, bins, title) in zip(axs, layers):
            ax.hist(W.flatten(), bins)
            ax.set_title(title)
            ax.grid()
        plt.tight_layout()
        plt.show()

In [0]:
### PARAMETERS
Ni, No = 1, 1      # Number of inputs / outputs
Nh1 = Nh2 = 100    # Number of hidden neurons (layer 1 and layer 2)
act_fn = 'tanh'    # Activation function of the hidden layers


### Initialize network
net = Network(Ni, Nh1, Nh2, No, act_fn)

# Access the class members: each matrix has one extra column for the bias
print(f'1st hidden layer weigth matrix shape: {net.WBh1.shape}')
print(f'2nd hidden layer weigth matrix shape: {net.WBh2.shape}')
print(f'Output layer weigth matrix shape: {net.WBo.shape}')

# Plot weights
plt.close('all')
net.plot_weights()

## FORWARD PASS (before training)

In [0]:
# Define the x vector: 1000 evenly spaced points across the training range
x_highres = np.linspace(min(x_train), max(x_train), 1000)
# Evaluate the output for each input, one sample per call (no batching),
# collecting the results in a list that is converted to an array at the end
per_sample_outputs = []
for x in x_highres:
    per_sample_outputs.append(net.forward(x))
initial_net_output = np.asarray(per_sample_outputs)

In [0]:
# The same evaluation in a single line, mapping net.forward over the inputs
initial_net_output = np.array(list(map(net.forward, x_highres)))

In [0]:
### Plot
# Training samples against the output of the still untrained network
plt.close('all')
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(x_train, y_train, color='r', ls='', marker='.', label='Train data points')
ax.plot(x_highres, initial_net_output, color='g', ls='--', label='Network output (random weights)')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.grid()
ax.legend()
fig.tight_layout()
plt.show()

## TRAINING

RANDOM SEARCH

In [0]:
from random import seed
from random import choice
from numpy.random import randint
# Seed Python's `random` module, which drives `choice` below
seed(1)

# Number of random configurations to draw
N = 15

# Training schedule: exponential decay from η to η_final over num_epochs
num_epochs = 5000
η = 0.001
en_decay = True
η_final = 0.0005
η_decay = (η_final / η) ** (1 / num_epochs)

Ni = No = 1

act_fn_list = ['sigmoid', 'tanh', 'ReLU', 'Leaky ReLU', 'ELU']

# One activation name per configuration, drawn with replacement
act_selection = [choice(act_fn_list) for _ in range(N)]

# Hidden layer sizes drawn from [10, 200), one pair per configuration
Nh1_list = randint(10, 200, N)
Nh2_list = randint(10, 200, N)



In [0]:
# Show the sampled layer sizes and activation names, one collection per line
print(Nh1_list, Nh2_list, act_selection, sep='\n')

In [0]:
best_error = np.inf
MSE = []

# Random search: every sampled configuration is scored by its validation
# loss averaged over the cross-validation folds.
for act_fn, Nh1, Nh2 in zip(act_selection, Nh1_list, Nh2_list):
    print(act_fn)
    print('Nh1', Nh1)
    print('Nh2', Nh2)

    val_loss = 0
    stopping_index = 0

    # Training over k-folds
    for cont, (xt, yt, xv, yv) in enumerate(zip(x_cv, y_cv, x_val, y_val)):
        print(cont)

        # Restart the learning-rate schedule for this fold
        η = 0.001
        η_final = 0.0005
        η_decay = (η_final / η)**(1 / num_epochs)

        # Reinitialize the network for each training
        net = Network(Ni, Nh1, Nh2, No, act_fn)
        val_loss_log = []

        # Zero the module-level Adam moment buffers, sized like the weights
        # of the freshly created network
        m1 = np.zeros(net.WBh1.shape)
        s1 = np.zeros(net.WBh1.shape)
        m2 = np.zeros(net.WBh2.shape)
        s2 = np.zeros(net.WBh2.shape)
        mo = np.zeros(net.WBo.shape)
        so = np.zeros(net.WBo.shape)

        for num_ep in range(num_epochs):

            # Learning rate decay
            η *= η_decay

            # Train single epoch (sample by sample, no batch for now) on the
            # training part of the fold ONLY: including the validation
            # samples would leak them into training and bias the score.
            for x, y in zip(xt, yt):
                net.update(x, y, η, num_ep)

            # Validate on the held-out part of the fold
            y_val_est = np.array([net.forward(x) for x in xv])
            avg_val_loss = np.mean((y_val_est - yv)**2/2)

            # Log
            val_loss_log.append(avg_val_loss)

            # Early stopping (generalization loss): stop once the current
            # validation loss is more than 4% above the best one seen so far
            if val_loss_log[-1] / min(val_loss_log) - 1 > 0.04:
                break

        # Average the best validation loss over the folds and remember the
        # latest best epoch index reached by any fold
        best_fold_index = np.argmin(val_loss_log)
        val_loss += val_loss_log[best_fold_index] / len(x_cv)
        if best_fold_index > stopping_index:
            stopping_index = best_fold_index

    print('MSE:', val_loss)
    print('index:', stopping_index)
    MSE.append(val_loss)

    # Keep the configuration with the lowest cross-validated error
    if val_loss < best_error:
        best_error = val_loss
        best_Nh1 = Nh1
        best_Nh2 = Nh2
        best_act_fn = act_fn
        # stopping_index is a 0-based epoch index, so the number of epochs
        # needed to reach it is one more
        best_epochs = stopping_index + 1
 

BEST NETWORK

In [0]:
 # Configuration selected by the random search: layer sizes, activation name,
 # stopping epoch and the corresponding cross-validated error
 print(best_Nh1, best_Nh2, best_act_fn, best_epochs, best_error)

In [0]:
# Retrain from scratch with the selected configuration. At least one epoch
# is enforced: with zero epochs the network would stay untrained and the
# decay exponent 1 / num_epochs below would divide by zero.
num_epochs = max(int(best_epochs), 1)

η = 0.001
η_final = 0.0005
η_decay = (η_final / η)**(1 / num_epochs)
en_decay = True

train_loss_log = []
test_loss_log = []
net = Network(Ni, best_Nh1, best_Nh2, No, best_act_fn)
# Zero the module-level Adam moment buffers, sized like the weights of the
# freshly created network
m1 = np.zeros(net.WBh1.shape)
s1 = np.zeros(net.WBh1.shape)
m2 = np.zeros(net.WBh2.shape)
s2 = np.zeros(net.WBh2.shape)
mo = np.zeros(net.WBo.shape)
so = np.zeros(net.WBo.shape)

for num_ep in range(num_epochs):
    # Learning rate decay
    if en_decay:
        η *= η_decay
    # Train single epoch (sample by sample, no batch for now)
    train_loss_vec = [net.update(x, y, η, num_ep) for x, y in zip(x_train, y_train)]
    avg_train_loss = np.mean(train_loss_vec)
    # Test network
    y_test_est = np.array([net.forward(x) for x in x_test])
    avg_test_loss = np.mean((y_test_est - y_test)**2/2)
    # Log
    train_loss_log.append(avg_train_loss)
    test_loss_log.append(avg_test_loss)
    print('Epoch %d  - Train loss: %.5f - Test loss: %.5f' % (num_ep + 1, avg_train_loss, avg_test_loss))



In [0]:
# Plot losses: per-epoch training and test loss on a logarithmic y axis
plt.close('all')
fig, ax = plt.subplots(figsize=(12, 8))
ax.semilogy(train_loss_log, label='Train loss')
ax.semilogy(test_loss_log, label='Test loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.grid()
ax.legend()
fig.tight_layout()
plt.show()

### Plot weights after training

In [0]:
# Histogram the weight matrices of the three layers of `net`
net.plot_weights()

###  FORWARD PASS (after training)

In [0]:
# Evaluate the trained network on 1000 evenly spaced points of the training range
x_highres = np.linspace(min(x_train), max(x_train), 1000)
net_output = np.array(list(map(net.forward, x_highres)))

### Plot
# Both data splits together with the curve produced by the network
plt.close('all')
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(x_train, y_train, color='r', ls='', marker='.', label='Train data points')
ax.plot(x_test, y_test, color='b', ls='', marker='.', label='Test data points')
ax.plot(x_highres, net_output, color='g', ls='--', label='Network output (trained weights)')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.grid()
ax.legend()
fig.tight_layout()
plt.show()

###  Analyze activations

In [0]:
# Three probe inputs; for each keep the output and the last hidden layer
# activations returned by forward(..., additional_out=True)
x1, x2, x3 = 0.1, 2, -3.5
y1, z1 = net.forward(x1, additional_out=True)
y2, z2 = net.forward(x2, additional_out=True)
y3, z3 = net.forward(x3, additional_out=True)


# One stem plot per probe input, stacked vertically
fig, axs = plt.subplots(3, 1, figsize=(12,6))
for ax, probe, activations in zip(axs, (x1, x2, x3), (z1, z2, z3)):
    ax.stem(activations)
    ax.set_title('Last layer activations for input x=%.2f' % probe)
plt.tight_layout()
plt.show()

In [0]:
# Store each weight matrix of the network in a file named after its attribute
for weights_name in ('WBh1', 'WBh2', 'WBo'):
    np.save(weights_name, getattr(net, weights_name))