In [51]:
from abc import abstractmethod, ABC
from typing import List
import numpy as np
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from icecream import ic




In [52]:
def sigmoid(x):
    clipped_x = np.clip(x, -500, 500)  # Prevent overflow
    return 1 / (1 + np.exp(-clipped_x))

def sigmoid_derivative(x):
    clipped_x = np.clip(x, -500, 500)  # Prevent overflow
    return clipped_x * (1 - clipped_x)

In [53]:
# def sigmoid(x):
#     return 1 / (1 + np.exp(-x))

# def sigmoid_derivative(x):
#     return x * (1 - x)


In [73]:

class Layer(ABC):
    """Basic building block of the Neural Network"""

    def __init__(self) -> None:
        self._learning_rate = 0.01

    @abstractmethod
    def forward(self, x: np.ndarray) -> np.ndarray:
        """Forward propagation of x through layer"""
        pass

    @abstractmethod
    def backward(self, output_error_derivative) -> np.ndarray:
        """Backward propagation of output_error_derivative through layer"""
        pass

    @property
    def learning_rate(self):
        return self._learning_rate

    @learning_rate.setter
    def learning_rate(self, learning_rate):
        assert learning_rate < 1, f"Given learning_rate={learning_rate} is larger than 1"
        assert learning_rate > 0, f"Given learning_rate={learning_rate} is smaller than 0"
        self._learning_rate = learning_rate

class FullyConnected(Layer):
    def __init__(self, input_size: int, output_size: int) -> None:
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.weights = np.random.randn(input_size, output_size)
        self.bias = np.zeros((1, output_size))
        self.inputs = None
        self.outputs = None

    def forward(self, x: np.ndarray) -> np.ndarray:
        self.inputs = x
        self.outputs = np.dot(x, self.weights) + self.bias
        return sigmoid(self.outputs)

    def backward(self, output_error_derivative) -> np.ndarray:
        sigmoid_derivative_output = sigmoid_derivative(self.outputs)
        error_derivative = output_error_derivative * sigmoid_derivative_output
        weight_gradients = np.dot(self.inputs.T.reshape(-1, 1), error_derivative.reshape(1, -1))
        self.weights -= self.learning_rate * weight_gradients
        self.bias -= self.learning_rate * np.sum(error_derivative, axis=0, keepdims=True)
        return np.dot(error_derivative, self.weights.T).reshape(self.inputs.shape)

class Tanh(Layer):
    def __init__(self) -> None:
        super().__init__()

    def forward(self, x: np.ndarray) -> np.ndarray:
        self.inputs = x
        self.outputs = np.tanh(x)
        return self.outputs

    def backward(self, output_error_derivative) -> np.ndarray:
        tanh_derivative_output = 1 - np.tanh(self.inputs)**2
        return output_error_derivative * tanh_derivative_output

class Loss:
    def __init__(self, loss_function: callable, loss_function_derivative: callable) -> None:
        self.loss_function = loss_function
        self.loss_function_derivative = loss_function_derivative

    def loss(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Loss function for a particular x and y"""
        return self.loss_function(x, y)

    def loss_derivative(self, x: np.ndarray, y: np.ndarray) -> np.ndarray:
        """Loss function derivative for a particular x and y"""
        return self.loss_function_derivative(x, y)

class MeanSquaredErrorLoss:
    @staticmethod
    def loss(x: np.ndarray, y: np.ndarray) -> np.ndarray:
        return np.mean((x - y)**2)

    @staticmethod
    def loss_derivative(x: np.ndarray, y: np.ndarray) -> np.ndarray:
        return 2 * (x - y) / x.size

class CrossEntropyLoss:
    @staticmethod
    def loss(x: np.ndarray, y: np.ndarray) -> np.ndarray:
        epsilon = 1e-15
        x_clipped = np.clip(x, epsilon, 1 - epsilon)
        return -np.mean(y * np.log(x_clipped) + (1 - y) * np.log(1 - x_clipped))

    @staticmethod
    def loss_derivative(x: np.ndarray, y: np.ndarray) -> np.ndarray:
        epsilon = 1e-15
        x_clipped = np.clip(x, epsilon, 1 - epsilon)
        return -(y / x_clipped - (1 - y) / (1 - x_clipped)) / x.size

class Network:
    def __init__(self, layers: List[Layer], learning_rate: float) -> None:
        self.layers = layers
        self.learning_rate = learning_rate
        self.loss = None

    def compile(self, loss: Loss) -> None:
        """Define the loss function and loss function derivative"""
        self.loss = loss

    def __call__(self, x: np.ndarray) -> np.ndarray:
        """Forward propagation of x through all layers"""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def fit(self,
            x_train: np.ndarray,
            y_train: np.ndarray,
            epochs: int,
            verbose: int = 0,
            loss: Loss = MeanSquaredErrorLoss) -> None:
        """Fit the network to the training data"""
        for layer in self.layers:
            layer.learning_rate = self.learning_rate

        self.compile(loss)
        for epoch in range(epochs):
            total_loss = 0
            for i in range(len(x_train)):
                x = x_train[i]
                y = y_train[i]
                # Forward propagation
                output = self(x)

                # Compute loss
                total_loss += self.loss.loss(output, y)

                # Backward propagation
                error_derivative = self.loss.loss_derivative(output, y)
                for layer in reversed(self.layers):
                    error_derivative = layer.backward(error_derivative)

            if verbose and epoch % verbose == 0:
                print(f"Epoch {epoch}, Loss: {total_loss / len(x_train)}")



In [78]:
digits = load_digits()
X, y = digits.data, digits.target

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert target labels to one-hot encoding
y_train_one_hot = np.eye(10)[y_train]


learning_rate = 0.00001
# Build and train the network
network = Network([FullyConnected(64, 256), Tanh(), Tanh(), FullyConnected(256, 10)], learning_rate=learning_rate)
network.compile(loss=MeanSquaredErrorLoss)
network.fit(X_train, y_train_one_hot, epochs=20, verbose=1)

# Evaluate the network
predictions = np.argmax(network(X_test), axis=1)
accuracy = np.mean(predictions == y_test)
print(f"Accuracy: {accuracy}")

Epoch 0, Loss: 0.2470171336770237
Epoch 1, Loss: 0.15228227897442678
Epoch 2, Loss: 0.113334038628934
Epoch 3, Loss: 0.08997283981368895
Epoch 4, Loss: 0.08032414666310927
Epoch 5, Loss: 0.07320191474199839
Epoch 6, Loss: 0.06829085313650542
Epoch 7, Loss: 0.06512031876869273
Epoch 8, Loss: 0.061627331845212296
Epoch 9, Loss: 0.05904390901392411
Epoch 10, Loss: 0.05756454920133255
Epoch 11, Loss: 0.05603329113245437
Epoch 12, Loss: 0.0549676256424551
Epoch 13, Loss: 0.054039644483366504
Epoch 14, Loss: 0.05286067229232598
Epoch 15, Loss: 0.05258125244154933
Epoch 16, Loss: 0.05245609366052751
Epoch 17, Loss: 0.052368052951380493
Epoch 18, Loss: 0.05164031942808724
Epoch 19, Loss: 0.05099047317651817
Accuracy: 0.8055555555555556


In [56]:
# import numpy as np
# from sklearn.datasets import load_digits
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from typing import List

# def sigmoid(x):
#     clipped_x = np.clip(x, -500, 500)  # Prevent overflow
#     return 1 / (1 + np.exp(-clipped_x))

# def sigmoid_derivative(x):
#     clipped_x = np.clip(x, -500, 500)  # Prevent overflow
#     return clipped_x * (1 - clipped_x)

# class Layer:
#     def __init__(self):
#         self._learning_rate = 0.00001

#     @property
#     def learning_rate(self):
#         return self._learning_rate

#     @learning_rate.setter
#     def learning_rate(self, learning_rate):
#         assert 0 < learning_rate < 1, f"Invalid learning rate: {learning_rate}"
#         self._learning_rate = learning_rate

# class FullyConnected(Layer):
#     def __init__(self, input_size: int, output_size: int):
#         super().__init__()
#         self.input_size = input_size
#         self.output_size = output_size
#         self.weights = np.random.randn(input_size, output_size)
#         self.bias = np.zeros((1, output_size))
#         self.inputs = None
#         self.outputs = None

#     def forward(self, x: np.ndarray) -> np.ndarray:
#         self.inputs = x.reshape(-1, self.input_size)
#         self.outputs = np.dot(self.inputs, self.weights) + self.bias
#         return sigmoid(self.outputs)

#     def backward(self, output_error_derivative) -> np.ndarray:
#         sigmoid_derivative_output = sigmoid_derivative(self.outputs)
#         error_derivative = output_error_derivative * sigmoid_derivative_output
#         weight_gradients = np.dot(self.inputs.T, error_derivative)
#         self.weights -= self.learning_rate * weight_gradients
#         self.bias -= self.learning_rate * np.sum(error_derivative, axis=0, keepdims=True)
#         return np.dot(error_derivative, self.weights.T).reshape(self.inputs.shape)

# class Network:
#     def __init__(self, layers: List[Layer]) -> None:
#         self.layers = layers

#     def fit(self, x_train: np.ndarray, y_train: np.ndarray, epochs: int, verbose: int = 0) -> None:
#         for epoch in range(epochs):
#             total_loss = 0
#             for i in range(len(x_train)):
#                 x = x_train[i]
#                 y = y_train[i]

#                 # Forward propagation
#                 output = x
#                 for layer in self.layers:
#                     output = layer.forward(output)

#                 # Compute loss
#                 total_loss += np.mean((output - y) ** 2)

#                 # Backward propagation
#                 error_derivative = 2 * (output - y) / len(x_train)
#                 for layer in reversed(self.layers):
#                     error_derivative, = layer.backward(error_derivative)

#             if verbose and epoch % verbose == 0:
#                 print(f"Epoch {epoch}, Loss: {total_loss / len(x_train)}")

#     def __call__(self, x: np.ndarray) -> np.ndarray:
#         for layer in self.layers:
#             x = layer.forward(x)
#         return x

# # Load and preprocess the data
# digits = load_digits()
# X, y = digits.data, digits.target
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)
# y_train_one_hot = np.eye(10)[y_train]

# # Build and train the network
# network = Network([FullyConnected(64, 256), FullyConnected(256, 10)])
# network.fit(X_train, y_train_one_hot, epochs=30, verbose=1)

# # Evaluate the network
# predictions = np.argmax(network(X_test), axis=1)
# accuracy = np.mean(predictions == y_test)
# print(f"Accuracy: {accuracy}")

Epoch 0, Loss: 0.3807006511789819
Epoch 1, Loss: 0.3873839163949159
Epoch 2, Loss: 0.38936492112868215
Epoch 3, Loss: 0.3859346748924096
Epoch 4, Loss: 0.37236478374148935
Epoch 5, Loss: 0.36537073706029827
Epoch 6, Loss: 0.3427777715574182
Epoch 7, Loss: 0.3078890884725175
Epoch 8, Loss: 0.2715709639961782
Epoch 9, Loss: 0.24053367374991771
Epoch 10, Loss: 0.21651958681639366
Epoch 11, Loss: 0.20935913754538696
Epoch 12, Loss: 0.2028029120026361
Epoch 13, Loss: 0.19393066765716782
Epoch 14, Loss: 0.18737875489341596
Epoch 15, Loss: 0.1821547078042899
Epoch 16, Loss: 0.17896430621968115
Epoch 17, Loss: 0.17592424915991006
Epoch 18, Loss: 0.16953376764118225
Epoch 19, Loss: 0.1600407398566393
Epoch 20, Loss: 0.16067578911936778
Epoch 21, Loss: 0.15917245007076197
Epoch 22, Loss: 0.15703544981388204
Epoch 23, Loss: 0.15821843706198868
Epoch 24, Loss: 0.16429967366726853
Epoch 25, Loss: 0.16375070034973702
Epoch 26, Loss: 0.16008233453311602
Epoch 27, Loss: 0.16000320345736097
Epoch 28, L