# Used Modules

In [21]:
import numpy as np
from typing import List, Union, Tuple
import pandas as pd

# Utility functions

In [14]:
def categoerical_cross_entropy(y_true: np.ndarray, y_pred: np.ndarray, deriative=False) -> Union[float, np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """
    Categorical cross-entropy between target and predicted distributions.

    :param y_true: target distribution(s), classes on the last axis
    :param y_pred: predicted probabilities with the same shape as ``y_true``
    :param deriative: when true, additionally return ``y_pred - y_true``
                      (spelling of the parameter name kept for existing callers)
    :return: the loss summed over the last axis (one value per sample), or the
             tuple ``(loss, grad)`` when ``deriative`` is true
    """
    # Clip the predictions away from 0 and 1 so that log(0) can never
    # produce -inf / NaN.
    y_pred = np.clip(y_pred, 1e-7, 1 - 1e-7)
    loss = -np.sum(y_true * np.log(y_pred), axis=-1)

    if not deriative:
        return loss

    # Computed from the clipped predictions. For softmax outputs and one-hot
    # targets this difference is the gradient w.r.t. the pre-softmax logits.
    grad = y_pred - y_true
    return loss, grad


# The rest of this module calls the correctly spelled name; the misspelled
# original is kept above so existing callers keep working.
categorical_cross_entropy = categoerical_cross_entropy
    
def to_one_hot(y: np.ndarray, num_classes: int):
    """
    Convert integer class labels into a one-hot encoded matrix.

    :param y: class labels, one per sample; a flat array or a column vector.
              Values are cast to ``int`` before being used as column indices.
    :param num_classes: number of columns of the encoding
    :return: float array of shape ``(number of labels, num_classes)`` holding a
             single 1 per row at the column given by the label
    """
    # Flatten so that an (n, 1) column vector does not broadcast against the
    # row index into an (n, n) selection.
    labels = np.asarray(y).reshape(-1).astype(int)
    one_hot = np.zeros((labels.shape[0], num_classes))
    # Pair every row index with its label: (0, y[0]), (1, y[1]), ...
    one_hot[np.arange(labels.shape[0]), labels] = 1
    return one_hot

                      
def softmax(x: np.ndarray, derivative=False) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """
    Softmax over the last axis, optionally returning its Jacobian instead.

    :param x: logits; classes are expected on the last axis, so a 2-D input is
              treated as one row of logits per sample
    :param derivative: when true, return the Jacobian instead of the output
    :return: probabilities with the same shape as ``x``; with ``derivative``
             the Jacobian ``diag(s) - s s^T`` of shape ``(C, C)`` for a 1-D
             input, or one such matrix per sample (shape ``(..., C, C)``) for
             inputs with more dimensions
    """
    x = np.asarray(x)
    # Subtracting the per-row maximum leaves the result unchanged but keeps
    # np.exp from overflowing on large logits.
    exps = np.exp(x - np.max(x, axis=-1, keepdims=True))
    softmax_output = exps / np.sum(exps, axis=-1, keepdims=True)

    if not derivative:
        return softmax_output

    if softmax_output.ndim == 1:
        s = softmax_output.reshape(-1, 1)
        return np.diagflat(s) - np.dot(s, s.T)

    # Batched Jacobian: J[..., i, j] = s_i * (delta_ij - s_j), kept separate
    # per sample instead of flattening the whole batch into one matrix.
    num_classes = softmax_output.shape[-1]
    return softmax_output[..., :, None] * (np.eye(num_classes) - softmax_output[..., None, :])
        
def softmax_stable(x: np.ndarray):
    """
    Numerically stable softmax over the last axis.

    :param x: logits; for inputs with more than one dimension every slice
              along the last axis is normalised independently
    :return: array of the same shape as ``x`` whose entries along the last
             axis sum to 1
    """
    x = np.asarray(x)
    # A 0-d input has no axis to reduce over; fall back to a full reduction.
    axis = -1 if x.ndim > 0 else None
    # Shift by the maximum so np.exp never overflows; compute the
    # exponentials once and reuse them for the normaliser.
    exps = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exps / np.sum(exps, axis=axis, keepdims=True)
                
                
def relu(x, deriative=False):
    """
    Rectified linear unit, or its element-wise derivative.

    :param x: input array
    :param deriative: when true, return the derivative instead of the activation
    :return: ``max(0, x)`` element-wise, or an array holding 1 where ``x >= 0``
             and 0 elsewhere
    """
    if not deriative:
        return np.maximum(0, x)

    # The derivative at exactly 0 is taken to be 1.
    return np.where(x >= 0, 1, np.zeros_like(x))

def softmax_loss(x: np.ndarray, y_true: np.ndarray) -> Union[float, np.ndarray]:
    """
    Categorical cross-entropy of softmax-activated logits.

    :param x: raw scores (logits) that are passed through ``softmax``
    :param y_true: target distribution(s) with the same shape as ``x``
    :return: the cross-entropy loss as returned by the module's cross-entropy
             helper (reduced over the last axis)
    """
    y_pred = softmax(x)
    # Call the helper under the name it is defined with in this module; the
    # differently spelled name used here before was never defined.
    return categoerical_cross_entropy(y_true, y_pred)

# Architecture of a single Layer class

In [15]:
class Layer:
    """
    Fully connected layer whose parameters are created lazily on the first
    forward pass, once the size of the input is known.
    """

    def __init__(self, hidden_units, activation_fun: str = None):
        """
        Connected layer for a neural network.

        :param hidden_units: The number of neurons in the layer
        :param activation_fun: Name of the activation function to use:
                               'relu', 'softmax', or None for no activation
        """
        self.hidden_units = hidden_units
        self.activation_fun = activation_fun
        self.weights = None
        self.biases = None

    def _init_params(self, input_size: int, hidden_units: int):
        """
        Initialize the weights and biases for the layer.

        Weights are drawn from a standard normal distribution and scaled by
        sqrt(2 / input_size); biases start at zero.

        :param input_size: The number of inputs to the layer
        :param hidden_units: The number of neurons in the layer
        """
        # NOTE: this reseeds NumPy's *global* RNG on every call, so layers of
        # equal shape start with identical weights and any other user of
        # np.random is affected. Kept as-is to preserve reproducibility.
        np.random.seed(42)

        self.weights = np.random.randn(input_size, hidden_units) * np.sqrt(2. / input_size)
        self.biases = np.zeros((1, hidden_units))

    def _apply_activation(self, weighted_input: np.ndarray, derivative: bool = False):
        """
        Apply the activation function if any.

        :param weighted_input: pre-activation values of the layer
        :param derivative: when true, return the activation's derivative
                           evaluated at ``weighted_input`` instead
        :return: the activated values (or the derivative)
        """
        if self.activation_fun == 'relu':
            return self.relu(weighted_input, derivative)
        if self.activation_fun == 'softmax':
            return self.softmax(weighted_input, derivative)
        # No (or an unrecognised) activation name: the layer is linear, so
        # the output is the input and the derivative is 1 everywhere.
        if derivative:
            return np.ones_like(weighted_input)
        return weighted_input

    def forward_pass(self, inputs: np.ndarray) -> np.ndarray:
        """
        Compute the layer output for ``inputs``.

        The inputs, the pre-activation values and the outputs are stored on
        the instance as ``inputs``, ``weighted_inputs`` and ``outputs``.

        :param inputs: input data with the features on the last axis
        :return: the activated output of the layer
        """
        self.inputs = inputs
        if self.weights is None:
            self._init_params(inputs.shape[-1], self.hidden_units)

        self.weighted_inputs = inputs @ self.weights + self.biases
        self.outputs = self._apply_activation(self.weighted_inputs)
        return self.outputs

    # Alias kept so that callers using the shorter ``forward`` name also work.
    forward = forward_pass

    @staticmethod
    def softmax(x: np.ndarray, derivative=False) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
        """
        Softmax over the last axis, optionally returning its Jacobian instead.

        :param x: logits with the classes on the last axis
        :param derivative: when true, return the Jacobian instead of the output
        :return: probabilities shaped like ``x``; with ``derivative`` the
                 Jacobian ``diag(s) - s s^T`` (one ``(C, C)`` matrix for a 1-D
                 input, one per sample otherwise)
        """
        x = np.asarray(x)
        # Shift by the row maximum so np.exp cannot overflow; normalise over
        # the class axis (the last one), not across the batch.
        exps = np.exp(x - np.max(x, axis=-1, keepdims=True))
        softmax_output = exps / np.sum(exps, axis=-1, keepdims=True)

        if not derivative:
            return softmax_output

        if softmax_output.ndim == 1:
            s = softmax_output.reshape(-1, 1)
            return np.diagflat(s) - np.dot(s, s.T)

        # J[..., i, j] = s_i * (delta_ij - s_j), one Jacobian per sample.
        num_classes = softmax_output.shape[-1]
        return softmax_output[..., :, None] * (np.eye(num_classes) - softmax_output[..., None, :])

    @staticmethod
    def softmax_stable(x: np.ndarray):
        """
        Numerically stable softmax over the last axis.

        :param x: logits with the classes on the last axis
        :return: probabilities with the same shape as ``x``
        """
        x = np.asarray(x)
        # A 0-d input has no axis to reduce over; reduce over everything.
        axis = -1 if x.ndim > 0 else None
        exps = np.exp(x - np.max(x, axis=axis, keepdims=True))
        return exps / np.sum(exps, axis=axis, keepdims=True)

    @staticmethod
    def relu(x, deriative=False):
        """
        Rectified linear unit, or its element-wise derivative.

        :param x: input array
        :param deriative: when true, return the derivative (1 where x >= 0,
                          0 elsewhere); parameter spelling kept for callers
        :return: ``max(0, x)`` element-wise, or the derivative
        """
        if deriative:
            return np.where(x >= 0, 1, np.zeros_like(x))
        return np.maximum(0, x)

    @staticmethod
    def softmax_loss(x: np.ndarray, y_true: np.ndarray) -> Union[float, np.ndarray]:
        """
        Categorical cross-entropy of softmax-activated logits.

        :param x: raw scores (logits), classes on the last axis
        :param y_true: target distribution(s) with the same shape as ``x``
        :return: the loss summed over the last axis (one value per sample)
        """
        # Clip before taking the log so a probability of exactly 0 cannot
        # produce -inf / NaN.
        y_pred = np.clip(Layer.softmax(x), 1e-7, 1 - 1e-7)
        return -np.sum(y_true * np.log(y_pred), axis=-1)

    @property
    def params(self):
        """The layer parameters as ``[weights, biases]`` (None until initialised)."""
        return [self.weights, self.biases]

# Optimizers


In [20]:
class Optimizer:
    """Common base for optimizers: stores the learning rate and declares ``update``."""

    def __init__(self, learning_rate=0.01):
        """
        :param learning_rate: step size stored on the instance for subclasses
        """
        self.learning_rate = learning_rate

    def update(self, params, grads):
        """
        Apply one optimisation step; must be overridden by subclasses.

        :param params: the parameters to update
        :param grads: the gradients matching ``params``
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError
        
class SGD(Optimizer):
    """Stochastic gradient descent with momentum."""

    def __init__(self, learning_rate=0.01, momentum=0.9):
        """
        :param learning_rate: step size applied to every gradient
        :param momentum: fraction of the previous velocity carried into the next step
        """
        super().__init__(learning_rate)
        self.momentum = momentum
        # One (weight velocity, bias velocity) pair per layer, created lazily
        # on the first update.
        self.velocity: List[Tuple[np.ndarray, np.ndarray]] = []

    def update(self, layers, grads):
        """
        Apply one momentum-SGD step to every layer, in place.

        :param layers: objects exposing ``weights`` and ``biases`` arrays
        :param grads: one ``(dw, db)`` gradient pair per layer, in layer order
        """
        if not self.velocity:
            self.velocity = [(np.zeros_like(layer.weights), np.zeros_like(layer.biases)) for layer in layers]

        for idx, (layer, (dw, db)) in enumerate(zip(layers, grads)):
            v_w, v_b = self.velocity[idx]
            v_w = self.momentum * v_w + self.learning_rate * dw
            v_b = self.momentum * v_b + self.learning_rate * db
            # Store the new velocity back; rebinding the loop variables alone
            # would discard it and momentum would never accumulate.
            self.velocity[idx] = (v_w, v_b)

            layer.weights -= v_w
            layer.biases -= v_b
            
class AdaGrad(Optimizer):
    """AdaGrad optimizer: scales each step by the root of the accumulated squared gradients."""

    def __init__(self, learning_rate=0.01, epsilon=1e-7):
        """
        :param learning_rate: base step size
        :param epsilon: small value added to the denominator to prevent division by zero
        """
        super().__init__(learning_rate)
        self.epsilon = epsilon
        # One (weights, biases) pair of running sums of squared gradients per
        # layer, created lazily on the first update.
        self.accumulated_grads: List[Tuple[np.ndarray, np.ndarray]] = []

    def update(self, layers: List[Layer], grads: List[Tuple[np.ndarray, np.ndarray]]):
        """
        Apply one AdaGrad step to every layer, in place.

        :param layers: list of layers with parameters to update
        :param grads: one ``(dw, db)`` gradient pair per layer, in layer order
        """
        if not self.accumulated_grads:
            self.accumulated_grads = [(np.zeros_like(layer.weights), np.zeros_like(layer.biases)) for layer in layers]

        for idx, (layer, (dw, db)) in enumerate(zip(layers, grads)):
            h_w, h_b = self.accumulated_grads[idx]
            h_w = h_w + dw * dw
            h_b = h_b + db * db
            # Persist the running sums so they keep growing across calls.
            self.accumulated_grads[idx] = (h_w, h_b)

            layer.weights -= self.learning_rate * dw / (np.sqrt(h_w) + self.epsilon)
            layer.biases -= self.learning_rate * db / (np.sqrt(h_b) + self.epsilon)

class Adam(Optimizer):
    """
    Adam optimizer implementation.
    https://optimization.cbe.cornell.edu/index.php?title=Adam
    """
    def __init__(self, learning_rate: float = 0.001, beta1: float = 0.9, beta2: float = 0.999, epsilon: float = 1e-7):
        """
        Initialize Adam optimizer.
        
        :param learning_rate: learning rate
        :param beta1: The exponential decay rate for the first moment estimates
        :param beta2: The exponential decay rate for the second-moment estimates
        :param epsilon: small value to prevent division by zero
        """
        super().__init__(learning_rate)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        # Per-layer (weights, biases) pairs of first / second moment
        # estimates, created lazily on the first update.
        self.m: List[Tuple[np.ndarray, np.ndarray]] = []
        self.v: List[Tuple[np.ndarray, np.ndarray]] = []
        # Number of update calls so far; drives the bias correction.
        self.t = 0

    def _moment_step(self, param: np.ndarray, grad: np.ndarray, m: np.ndarray, v: np.ndarray):
        """Return ``(new_param, new_m, new_v)`` after one Adam step for a single parameter array."""
        m = self.beta1 * m + (1.0 - self.beta1) * grad
        v = self.beta2 * v + (1.0 - self.beta2) * grad**2
        # The moments start at zero, so early estimates are biased towards
        # zero; dividing by (1 - beta**t) corrects for that.
        bias_corrected_first_moment = m / (1.0 - self.beta1**self.t)
        bias_corrected_second_moment = v / (1.0 - self.beta2**self.t)
        new_param = param - self.learning_rate * bias_corrected_first_moment / (np.sqrt(bias_corrected_second_moment) + self.epsilon)
        return new_param, m, v

    def update(self, layers: List[Layer], grads: List[Tuple[np.ndarray, np.ndarray]]) -> None:
        """
        Perform the Adam update on parameters.

        :param layers: list of layers with parameters to update
        :param grads: list of gradients for each layer's parameters
        """
        if not self.m:
            self.m = [(np.zeros_like(layer.weights), np.zeros_like(layer.biases)) for layer in layers]
            self.v = [(np.zeros_like(layer.weights), np.zeros_like(layer.biases)) for layer in layers]

        self.t += 1

        for idx, (layer, (dw, db)) in enumerate(zip(layers, grads)):
            m_w, m_b = self.m[idx]
            v_w, v_b = self.v[idx]

            layer.weights, m_w, v_w = self._moment_step(layer.weights, dw, m_w, v_w)
            layer.biases, m_b, v_b = self._moment_step(layer.biases, db, m_b, v_b)

            # The state is stored as tuples, which cannot be assigned into
            # item by item; replace the whole pair instead.
            self.m[idx] = (m_w, m_b)
            self.v[idx] = (v_w, v_b)
    

# Artificial Neural Network Architecture

In [18]:
class NeuralNetwork:
    """Sequential container that feeds data through its layers in insertion order."""

    def __init__(self, learning_rate=0.01, num_epochs=100, verbose=False):
        """
        :param learning_rate: learning rate stored on the network
        :param num_epochs: number of epochs stored on the network
        :param verbose: verbosity flag stored on the network
        """
        self.layers = []
        self.optimizer = None
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.verbose = verbose

    def add_layer(self, layer: "Layer"):
        """Append ``layer`` to the end of the network."""
        self.layers.append(layer)

    def set_optimizer(self, optimizer):
        """Store the optimizer to be used for this network."""
        self.optimizer = optimizer

    def forward(self, X: np.ndarray):
        """
        Performs a forward pass through the neural network.

        :param X: input data
        :return: the output of the last layer of the neural network, necessary to calculate backprop
        :raises ValueError: if no layer has been added yet
        """
        if not self.layers:
            raise ValueError("No layers in the neural network.")

        # Each layer consumes the previous layer's output; layers expose
        # their forward computation as ``forward_pass``.
        for layer in self.layers:
            X = layer.forward_pass(X)

        return X


            