In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from PIL import Image

In [3]:
class FFNN:
    """Fully connected feed-forward network trained with (mini-)batch gradient descent.

    Data layout is column-per-sample: an input batch has shape
    ``(layer_sizes[0], n_samples)`` and the network output has shape
    ``(layer_sizes[-1], n_samples)``.

    Activation derivatives are evaluated on the *pre-activation* values ``z``
    (see ``backward``), and must therefore be element-wise functions of ``z``.
    """

    def __init__(self, layer_sizes, 
                 activation_fn, activation_fn_deriv, 
                 output_activation_fn, output_activation_fn_deriv, 
                 loss_fn, loss_fn_deriv, task_type='classification'):
        """Store the configuration and randomly initialise the parameters.

        Args:
            layer_sizes: Sequence of layer widths, input layer first.
            activation_fn / activation_fn_deriv: Hidden-layer activation and
                its derivative (both called with pre-activations ``z``).
            output_activation_fn / output_activation_fn_deriv: Same, for the
                output layer.
            loss_fn: ``loss_fn(predictions, targets)``.
            loss_fn_deriv: ``loss_fn_deriv(predictions, targets)``; gradient
                of the loss with respect to the network output.
            task_type: Stored on the instance; not read by any method here.
        """
        self.num_layers = len(layer_sizes)
        self.layer_sizes = layer_sizes
        self.activation_fn = activation_fn
        self.activation_fn_deriv = activation_fn_deriv
        self.output_activation_fn = output_activation_fn
        self.output_activation_fn_deriv = output_activation_fn_deriv
        self.loss_fn = loss_fn
        self.loss_fn_deriv = loss_fn_deriv
        self.task_type = task_type
        
        # Weights use He initialization: N(0, 1) scaled by sqrt(2 / fan_in).
        self.W = [np.random.randn(layer_sizes[i+1], layer_sizes[i]) * np.sqrt(2./layer_sizes[i]) 
                  for i in range(self.num_layers-1)]
        # Biases are drawn from an unscaled standard normal.
        self.b = [np.random.randn(layer_sizes[i+1], 1) for i in range(self.num_layers-1)]

    def forward(self, x):
        """Run a forward pass.

        Args:
            x: Input batch of shape ``(layer_sizes[0], n_samples)``.

        Returns:
            ``(a_out, a, z)`` where ``a_out`` is the network output, ``a`` is
            the list of activations per layer (``a[0]`` is ``x``) and ``z`` is
            the list of pre-activations per weight layer.
        """
        a = [x]
        z = []
        
        # Hidden layers
        for i in range(self.num_layers-2):
            z_i = self.W[i].dot(a[i]) + self.b[i]
            a_i = self.activation_fn(z_i)
            z.append(z_i)
            a.append(a_i)
        
        # Output layer
        z_out = self.W[-1].dot(a[-1]) + self.b[-1]
        a_out = self.output_activation_fn(z_out)
        z.append(z_out)
        a.append(a_out)
        
        return a_out, a, z
    
    def compute_loss(self, predictions, t):
        """Evaluate ``loss_fn`` after aligning the targets with the predictions.

        The targets are transposed when their first dimension differs from the
        first dimension of ``predictions``.
        """
        if t.shape[0] != predictions.shape[0]:
            t = t.T
        return self.loss_fn(predictions, t)
    
    def _one_hot(self, inputs):
        """One-hot encode integer class ids as columns.

        Returns an array of shape ``(max(inputs) + 1, len(inputs))``. The
        number of rows is derived from the largest id rather than from the
        count of distinct ids, so inputs that skip a class (e.g. ``[0, 2]``)
        no longer raise ``IndexError``.
        """
        n_values = int(np.max(inputs)) + 1 if np.size(inputs) else 0
        return np.eye(n_values)[:,inputs]
    
    def backward(self, x, t, a, z):
        """Back-propagate the loss and return the parameter gradients.

        Args:
            x: Input batch (unused; ``a[0]`` already holds it). Kept so the
                call signature stays unchanged.
            t: Targets; transposed if their first dimension does not match the
                network output.
            a: Activations returned by ``forward``.
            z: Pre-activations returned by ``forward``.

        Returns:
            ``(dL_dw, dL_db)``: lists of gradients matching ``self.W`` and
            ``self.b``. Gradients are *summed* over the samples in the batch
            (they are not divided by the batch size).
        """
        dL_dw = [None] * len(self.W)
        dL_db = [None] * len(self.b)
        
        # Ensure the target shape matches the predictions
        if t.shape[0] != a[-1].shape[0]:
            t = t.T
        
        # Output layer: chain the loss gradient through the output activation.
        dL_da = self.loss_fn_deriv(a[-1], t)
        dL_dz = dL_da * self.output_activation_fn_deriv(z[-1])
        
        dL_dw[-1] = dL_dz.dot(a[-2].T)
        dL_db[-1] = np.sum(dL_dz, axis=1, keepdims=True)
        
        # Hidden layers, last to first.
        for i in range(self.num_layers-3, -1, -1):
            dL_da = self.W[i+1].T.dot(dL_dz)
            dL_dz = dL_da * self.activation_fn_deriv(z[i])
            dL_dw[i] = dL_dz.dot(a[i].T)
            dL_db[i] = np.sum(dL_dz, axis=1, keepdims=True)
        
        return dL_dw, dL_db
    
    def update_weights(self, dL_dw, dL_db, learning_rate):
        """Apply one plain gradient-descent step to all weights and biases."""
        self.W = [w - learning_rate * dw for w, dw in zip(self.W, dL_dw)]
        self.b = [b - learning_rate * db for b, db in zip(self.b, dL_db)]

    @staticmethod
    def _slice_labels(labels, start, stop, n_samples):
        """Return the labels for samples ``start:stop`` in either orientation."""
        ndim = getattr(labels, "ndim", 1)
        # Labels laid out as (n_outputs, n_samples) must be sliced by column.
        if ndim == 2 and labels.shape[0] != n_samples and labels.shape[1] == n_samples:
            return labels[:, start:stop]
        return labels[start:stop]

    def _gradient_step(self, X_batch, labels_batch, learning_rate):
        """Run forward + backward on one batch and update the parameters."""
        _, activations, pre_activations = self.forward(X_batch)
        dL_dw, dL_db = self.backward(X_batch, labels_batch, activations, pre_activations)
        self.update_weights(dL_dw, dL_db, learning_rate)
        
    def train(self, X, labels, learning_rate=0.01, epochs=1000, batch_size=None):
        """Trains the neural network using the given training data and labels.

        Args:
            X: Inputs of shape ``(layer_sizes[0], n_samples)``.
            labels: Targets, either ``(n_samples, n_outputs)`` or
                ``(n_outputs, n_samples)``.
            learning_rate: Step size for ``update_weights``.
            epochs: Number of passes over the data.
            batch_size: Mini-batch size; a falsy value (``None``/``0``) selects
                full-batch gradient descent.

        Every sample is visited once per epoch: the final batch may be smaller
        than ``batch_size``, and a ``batch_size`` larger than the dataset
        results in a single batch containing all samples.
        """
        m = X.shape[1]

        for epoch in range(epochs):
            if batch_size:  # If batch size is specified, use mini-batch gradient descent
                # Stepping by batch_size (instead of m // batch_size full
                # batches) keeps the trailing partial batch in the epoch.
                for start in range(0, m, batch_size):
                    stop = start + batch_size
                    X_batch = X[:, start:stop]
                    labels_batch = self._slice_labels(labels, start, stop, m)
                    self._gradient_step(X_batch, labels_batch, learning_rate)
            else:  # Otherwise, use batch gradient descent
                self._gradient_step(X, labels, learning_rate)

            # Print loss at the end of each 100 epochs:
            if epoch % 100 == 0:
                # Ensure activations are for the entire dataset
                predictions, _, _ = self.forward(X)
                loss = self.compute_loss(predictions, labels)
                print(f"Epoch {epoch+1}/{epochs} - Loss: {np.mean(loss)}")

In [4]:
def ReLU(inputs):
    """Rectified linear unit: element-wise maximum of ``inputs`` and 0."""
    rectified = np.maximum(inputs, 0)
    return rectified
def deriv_ReLU(Z):
    """Derivative of ReLU as a boolean mask: True where ``Z`` is strictly positive."""
    is_positive = Z > 0
    return is_positive

def sigmoid(inputs, clip_value=200):
    """Logistic sigmoid ``1 / (1 + exp(-x))``.

    The argument is first clipped to ``[-clip_value, clip_value]`` so that
    ``exp`` cannot overflow for large-magnitude inputs.
    """
    bounded = np.clip(inputs, -clip_value, clip_value)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator

def deriv_sigmoid(inputs, clip_value=200):
    """Derivative of the logistic sigmoid, evaluated at pre-activation ``inputs``.

    ``FFNN.backward`` calls every activation derivative with the
    pre-activation ``z`` (the same convention ``deriv_ReLU`` and
    ``deriv_clipped_ReLU`` follow), so the sigmoid has to be applied here
    first: ``sigma'(z) = sigma(z) * (1 - sigma(z))``. The previous
    ``inputs * (1 - inputs)`` was only correct when given the sigmoid
    *output*.

    Args:
        inputs: Pre-activation values (scalar or array).
        clip_value: Same clipping bound ``sigmoid`` uses to avoid overflow
            in ``exp``.

    Returns:
        Element-wise derivative, same shape as ``inputs``.
    """
    bounded = np.clip(inputs, -clip_value, clip_value)
    activated = 1 / (1 + np.exp(-bounded))
    return activated * (1 - activated)

def Linear(inputs):
    """Scaled linear activation: returns ``inputs`` multiplied by 1/10."""
    slope = 1/10
    return slope*inputs

def deriv_Linear(inputs):
    """Derivative of ``Linear``: the constant 1/10, shaped like ``inputs``."""
    # Multiplying by 0 keeps the shape (and dtype promotion) of the input.
    shaped_zeros = inputs*0
    return shaped_zeros + (1/10)

def clipped_ReLU(x, c=1):
    """ReLU capped from above: element-wise ``min(max(0, x), c)``."""
    floored = np.maximum(0, x)
    capped = np.minimum(floored, c)
    return capped

def deriv_clipped_ReLU(inputs, c=1):
    """Derivative of ``clipped_ReLU`` as a boolean mask.

    The mask is True where ``0 < inputs < c`` (the region in which the
    clipped ReLU has slope 1) and False elsewhere.

    Args:
        inputs: Pre-activation values (scalar or array).
        c: Upper clipping bound; must match the ``c`` passed to
            ``clipped_ReLU``. Defaults to 1, the previously hard-coded value.
    """
    return (inputs>0) & (inputs<c)

def L2(outputs, targets):
    """Half squared error, reduced over the first axis.

    The builtin ``sum`` iterates over the first axis, so a 2-D input of
    shape ``(n_outputs, n_samples)`` yields one loss value per sample and a
    1-D input yields a scalar.
    """
    squared_error = (outputs - targets)**2
    return 0.5*sum(squared_error)

def deriv_L2(outputs, targets):
    """Gradient of the half squared error with respect to ``outputs``."""
    residual = outputs - targets
    return residual

def one_hot(inputs):
    """One-hot encode integer class ids as columns.

    Returns an array of shape ``(max(inputs) + 1, len(inputs))`` whose
    column ``j`` has a single 1 in row ``inputs[j]``.
    """
    identity = np.eye(np.max(inputs) + 1)
    return identity[:,inputs]

def softmax(Z):
    """Softmax along axis 0 (each column is normalised to sum to 1).

    The per-column maximum is subtracted before exponentiating so that
    ``exp`` never receives a positive argument.
    """
    column_max = np.max(Z, axis=0)
    exponentials = np.exp(Z - column_max)
    normaliser = np.sum(exponentials, axis=0)
    return exponentials / normaliser

def dummy_deriv_softmax(Z):
    """Placeholder softmax derivative: an array of ones shaped like ``Z``.

    ``deriv_cat_cross_entropy`` returns ``predictions - labels``, and
    ``FFNN.backward`` multiplies the loss derivative element-wise by the
    output-activation derivative. For that product to stay
    ``predictions - labels``, this factor has to be 1 everywhere; the
    previous ``return Z`` scaled every gradient entry by its logit.
    """
    return np.ones_like(Z)

def categorical_cross_entropy(predictions, labels, eps=1e-12):
    """Mean categorical cross-entropy over the samples (columns).

    Args:
        predictions: Predicted probabilities, shape ``(n_classes, n_samples)``.
        labels: One-hot targets with the same shape; ``labels.shape[1]`` is
            used as the number of samples.
        eps: Lower bound applied to ``predictions`` before the logarithm.
            Without it a probability that underflows to exactly 0 produces
            ``log(0) * 0 = nan`` and poisons the whole loss.

    Returns:
        Scalar loss ``-sum(log(predictions) * labels) / n_samples``.
    """
    m = labels.shape[1]
    safe_predictions = np.maximum(predictions, eps)
    return -np.sum(np.log(safe_predictions) * labels) / m

def deriv_cat_cross_entropy(predictions, labels):
    """Return ``predictions - labels``.

    This is the form the cross-entropy gradient takes with respect to the
    logits when the predictions come from a softmax; it is not divided by
    the number of samples.
    """
    difference = predictions - labels
    return difference




In [5]:
# Network shape: 3 inputs -> 30 hidden units -> 3 outputs.
layer_sizes = [3, 30, 3]

# Regression setup: ReLU hidden layer, scaled-linear output, half squared error.
ffnn = FFNN(
    layer_sizes=layer_sizes,
    activation_fn=ReLU,
    activation_fn_deriv=deriv_ReLU,
    output_activation_fn=Linear,
    output_activation_fn_deriv=deriv_Linear,
    loss_fn=L2,
    loss_fn_deriv=deriv_L2,
    task_type='regression',
)

# NOTE(review): X and Y are not defined in the cells visible here; presumably
# they come from an earlier cell with X shaped (n_samples, 3) - confirm.
# X is transposed because FFNN expects one sample per column.
ffnn.train(X.T, Y, learning_rate=0.001, epochs=3000, batch_size=2000)