## Programming and Maths for AI: Task 1

#### Activation functions: sigmoid and ReLU

Sigmoid and its derivative

In [1]:
import numpy as np

def sigmoid(x):

    s = 1/(1 + np.exp(-x))

    return s

def derivative_sigmoid(z):
    s = 1 / (1 + np.exp(-z))
    ds = s*(1-s)

    return ds


ReLU and its derivative

In [2]:
def relu(x):
    return x * (x > 0)

def derivative_relu(x):
    return 1 * (x>0)

Now let's implement the softmax function

In [3]:
#def softmax(vector):
#    e = np.exp(vector)
#    return e / e.sum()

# if z is very large, exp(z) can overflow
# solution: subtract by max
def softmax(z): #stable, so we dont have exploding issues due to exp
    z = z - np.max(z, axis=1, keepdims=True)
    exp_z = np.exp(z)
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)


### Implement a fully parametrizable neural network class

In [4]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
iris = load_iris(as_frame=True)
data = pd.DataFrame(iris.data, columns=iris.feature_names)
data['species'] = iris.target
data


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [5]:
from sklearn.model_selection import train_test_split
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2,random_state=42)


X_train

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
22,4.6,3.6,1.0,0.2
15,5.7,4.4,1.5,0.4
65,6.7,3.1,4.4,1.4
11,4.8,3.4,1.6,0.2
42,4.4,3.2,1.3,0.2
...,...,...,...,...
71,6.1,2.8,4.0,1.3
106,4.9,2.5,4.5,1.7
14,5.8,4.0,1.2,0.2
92,5.8,2.6,4.0,1.2


## Create a NN class

In [8]:
from sklearn.model_selection import train_test_split
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2,random_state=42)

X_train_np = X_train.values if hasattr(X_train, 'values') else np.array(X_train)
y_train_np = y_train.values if hasattr(y_train, 'values') else np.array(y_train)

X_test_np = X_test.values if hasattr(X_test, 'values') else np.array(X_test)
y_test_np = y_test.values if hasattr(y_test, 'values') else np.array(y_test)

# just 4 samples
X = np.array(X_train)

# target values 
y = np.array(y_train).T 

class NeuralNetwork:

    #hidden layer (sigmoid, relu)
    #output layer (softmax)

    def __init__(self, activation_function, no_of_input_nodes, no_of_hidden_nodes, no_of_output_nodes, n_epochs,
                 lambda1,lambda2,lr,dropout, optimizer, momentum):
        self.hidden_layers = len(no_of_hidden_nodes)
        self.activation_function = activation_function
        self.no_of_input_nodes = no_of_input_nodes # as many as the dataset's features
        self.no_of_hidden_nodes = no_of_hidden_nodes # no fixed number, needs tuning
        self.no_of_output_nodes = no_of_output_nodes # as many as the output classes
        self.n_epochs = n_epochs #no. of epochs to run the NN
        self.lambda1 = lambda1 #lambda variable for L1 regularisation
        self.lambda2 = lambda2 #lambda variable for L2 regularisation
        self.learning_rate = lr #how much to update the weights
        self.z_values = [] # need to store for backprop
        self.a_values = []   # need to store for backprop, first value is the actual data
        self.dropout_masks = [] # save the masks for backprop
        self.dropout = dropout #probability for dropout
        self.weights, self.biases = self.weights_and_bias()
        
        #optimizer settings
        self.optimizer = optimizer # the optimizer that will be used
        self.momentum = momentum
        self.vW = [np.zeros_like(W) for W in self.weights]  # velocity for momentum
        self.vb = [np.zeros_like(b) for b in self.biases]

        

    def weights_and_bias(self):
        layers = [self.no_of_input_nodes] + self.no_of_hidden_nodes + [self.no_of_output_nodes] #e.g. [2,3,5,2]
        weights = [] #TODO: weight and bias proper initialization (xavier)
        biases = []
        for i in range(len((layers))-1):

            n_in = layers[i]
            n_out = layers[i+1]

            weights.append(2*np.random.random((n_in,n_out)) - 1)
            biases.append(np.zeros((1, n_out)))
        return weights, biases
    
    def inverted_dropout(self, x, p):
        # p = dropout probability
        mask = (np.random.rand(*x.shape) > p).astype(float)
        self.dropout_masks.append(mask)
        x_dropped = (x * mask)/(1 - p) # the actual dropout
        
        return x_dropped
    
    def forward_pass(self, X, training=True): #add training flag for dropout

        # Clear previous batch values
        self.a_values = []
        self.z_values = []
        self.dropout_masks = []

        #first hidden layer needs to have the actual data
        #all other hidden layers take the result from the previous layer
        a = X
        self.a_values.append(a) # save for backprop
        
        for layer in range(self.hidden_layers):

            W = self.weights[layer]
            b = self.biases[layer]
            z = np.dot(a, W) + b
            self.z_values.append(z) # save for backprop

            if self.activation_function =='sigmoid':
                a = sigmoid(z)
            elif self.activation_function =='relu':
                a = relu(z)
            #print(f'Shape of layer: {a.shape}')

            #implementing inverted dropout
            if training: # In testing, dropout is not applied
                a = self.inverted_dropout(a, self.dropout) 

            self.a_values.append(a) # save for backprop

        #---Output Layer---
        W = self.weights[-1]
        b = self.biases[-1]
        z = np.dot(a, W) + b
        self.z_values.append(z) # save for backprop
        a = softmax(z)
        self.a_values.append(a) # save for backprop
        #print(f'Shape of output layer: {a.shape}')
        return a
        

    def backward_pass(self, X_train, y_true):
        
        N = y_true.shape[0]

        # We convert labels to one-hot so that they match the shape of y_true
        y_one_hot = np.zeros((N, self.no_of_output_nodes))
        y_one_hot[np.arange(N), y_true] = 1

        # make empty lists for gradients
        dW = [None] * len(self.weights)
        db = [None] * len(self.biases)

        # --- Output layer ---
        delta = self.a_values[-1] - y_one_hot  # Derivative of loss: softmax + CE derivative (a(L) - y)
        
        dW[-1] = self.a_values[-2].T.dot(delta) / N

        dW[-1] += self.lambda1 * np.sign(self.weights[-1])   # L1 gradient
        dW[-1] += 2 * self.lambda2 * self.weights[-1]       # L2 gradient


        db[-1] = np.mean(delta, axis=0, keepdims=True) #gradient of bias

        #self.weights[-1] -= self.learning_rate * dW[-1] # update the weight
        #self.biases[-1] -= self.learning_rate * db[-1] # update the bias

        # --- Hidden layers ---
        for layer in range(self.hidden_layers - 1, -1, -1):

            # Backprop error
            delta = delta.dot(self.weights[layer + 1].T)

            #use dropout
            delta *= self.dropout_masks[layer]

            if self.activation_function =='sigmoid':
                delta *= derivative_sigmoid(self.z_values[layer])
            elif self.activation_function =='relu':
                delta *= derivative_relu(self.z_values[layer])
            
            a_prev = X_train if layer == 0 else self.a_values[layer]
            dW[layer] = a_prev.T.dot(delta) / N 
            db[layer] = np.mean(delta, axis=0, keepdims=True) #gradient of bias

            dW[layer] += self.lambda1 * np.sign(self.weights[layer])   # L1 gradient
            dW[layer] += 2 * self.lambda2 * self.weights[layer]       # L2 gradient

            
            #self.weights[layer] -= self.learning_rate * dW[layer] # update the weight
            #self.biases[layer] -= self.learning_rate * db[layer] # update the bias

        self.choose_optimizer(dW, db)




    def compute_loss(self, y_pred, y_true):
        '''
        Cross entropy loss for multi-class classification with L1 and L2 regularization
        '''

        # number of samples
        N = y_true.shape[0]
        
        correct_probs = y_pred[np.arange(N), y_true] #TODO: change to binary formula if dataset is binary
    
        # loss = average of -log(p) whre p is the predicted probability of the correct class
        loss = -np.sum(np.log(correct_probs)) / N

        l1_loss = self.lambda1 * sum(np.sum(np.abs(W)) for W in self.weights)
        l2_loss = self.lambda2 * sum(np.sum(W**2) for W in self.weights)


        return loss + l1_loss + l2_loss
    
    def choose_optimizer(self, dW, db): # function to call the respective optimizer method to be used
        if self.optimizer == "momentum":
            self.momentum_update(dW, db)
        elif self.optimizer == "adam":
            self.adam_update(dW, db)
        else:
            raise ValueError("Unsupported optimizer")
        
    def momentum_update(self,dW,db):
        for i in range(len(self.weights)):
            self.vW[i] = self.momentum * self.vW[i] + (1 - self.momentum) * dW[i]
            self.vb[i] = self.momentum * self.vb[i] + (1 - self.momentum) * db[i]
            self.weights[i] -= self.learning_rate * self.vW[i]
            self.biases[i]  -= self.learning_rate * self.vb[i]


    




In [9]:
#Param tuning:
#lambda1 and lambda 2 can be from 0 to 1

X_small = X_train_np[:10]
y_small = y_train_np[:10]

nn = NeuralNetwork(
    activation_function='sigmoid',
    no_of_input_nodes=X_train.shape[1], # input features
    no_of_hidden_nodes=[32,64,64],
    no_of_output_nodes=3, #TODO len(np.unique(y_train)?
    n_epochs=100,
    lambda1=0.1,
    lambda2=0.1,
    lr=0.01,
    dropout=0.5,
    optimizer="momentum",
    momentum=0.9
    )

for epoch in range(nn.n_epochs):
    # Forward pass
    y_pred = nn.forward_pass(X_train_np, training=True)

    # Loss function
    loss = nn.compute_loss(y_pred, y_train_np)

    # Backward pass
    nn.backward_pass(X_train_np, y_train_np)

    if epoch % 5 == 0:
        y_pred_labels = np.argmax(y_pred, axis=1)
        accuracy = np.mean(y_pred_labels == y_train_np)
        print(f"Epoch {epoch}, Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")


Epoch 0, Loss: 543.1824, Accuracy: 0.3417
Epoch 5, Loss: 538.6668, Accuracy: 0.3333
Epoch 10, Loss: 530.6938, Accuracy: 0.3167
Epoch 15, Loss: 520.5474, Accuracy: 0.2917
Epoch 20, Loss: 508.2607, Accuracy: 0.3083
Epoch 25, Loss: 495.0442, Accuracy: 0.3500
Epoch 30, Loss: 482.4752, Accuracy: 0.3667
Epoch 35, Loss: 469.5084, Accuracy: 0.3333
Epoch 40, Loss: 456.9961, Accuracy: 0.3083
Epoch 45, Loss: 444.4383, Accuracy: 0.3250
Epoch 50, Loss: 432.0897, Accuracy: 0.2917
Epoch 55, Loss: 420.0829, Accuracy: 0.3417
Epoch 60, Loss: 408.4005, Accuracy: 0.3667
Epoch 65, Loss: 397.0564, Accuracy: 0.3667
Epoch 70, Loss: 385.8525, Accuracy: 0.3917
Epoch 75, Loss: 375.2078, Accuracy: 0.3917
Epoch 80, Loss: 364.7857, Accuracy: 0.3000
Epoch 85, Loss: 354.2515, Accuracy: 0.2750
Epoch 90, Loss: 344.0690, Accuracy: 0.3750
Epoch 95, Loss: 333.9475, Accuracy: 0.3917


| Parameter       | Description                                        | Where it is used                                                     |
| --------------- | -------------------------------------------------- | -------------------------------------------------------------------- |
| `learning_rate` | Step size for gradient descent                     | Used in weight update: `W -= learning_rate * grad_W`                 |
| `update_rule`   | Optimization method (`sgd`, `momentum`, `adam`)    | Determines how gradients are applied to weights                      |
| `decay`         | Learning rate decay per epoch                      | `learning_rate *= (1 / (1 + decay * epoch))`                         |
| `epochs`        | Number of times to iterate over the entire dataset | Controls the main training loop                                      |
| `batch_size`    | Number of samples per mini-batch                   | Used to split data into mini-batches for stochastic gradient descent |
| `momentum`      | Momentum coefficient (if using momentum optimizer) | Used in update: `v = beta*v + (1-beta)*grad`                         |
