In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
class Sequential:
    """
    A sequential container in Minima.

    Modules will be added to it in the order they are passed in the constructor.
    A `Sequential` module contains a sequence of child modules stored in the order they were added.
    Each module is applied in order to the input to produce the output.

    The `Sequential` class makes it easy to build networks where the output of one layer is the input to the next.

    Attributes:
    - `modules` (tuple): The sequence of child callables to apply.
    - `module_<i>`: Each child is also exposed as an attribute named after its position.

    Methods:
    - `forward(x)`: Passes the input through all the child modules in sequential order.
    - `__call__(x)`: Shorthand for `forward(x)`, so the container can be used like its children.
    """
    def __init__(
        self,
        *modules # The sequence of child modules to apply. Each argument must be callable.
    ):
        """
        Initializes a new `Sequential` instance.

        Args:
            *modules: The sequence of child modules to apply, in application order.
        """
        for i, module in enumerate(modules):
            setattr(self, f'module_{i}', module)
        self.modules = modules

    def forward(self, x):
        """
        Defines the forward pass for the sequential module.

        Passes the input through all the child modules in the order they were added.

        Args:
            x: The input to the first child module.

        Returns:
            The output of the last child module (or `x` unchanged if there are no children).
        """
        for module in self.modules:
            x = module(x)
        return x

    def __call__(self, x):
        """Apply the container to `x`; equivalent to `forward(x)`."""
        return self.forward(x)

    def __iter__(self):
        """
        Return a fresh, independent iterator over the child modules.

        Handing out a new iterator each time (instead of `self` with a shared
        cursor) makes nested or overlapping iteration over the same container
        behave correctly.
        """
        # Still reset the legacy cursor so code that calls `iter(seq)` and then
        # `next(seq)` directly keeps working.
        self._iter_idx = 0
        return iter(self.modules)

    def __next__(self):
        """Legacy cursor-based stepping; prefer `for module in seq`."""
        if self._iter_idx < len(self.modules):
            res = self.modules[self._iter_idx]
            self._iter_idx += 1
            return res
        raise StopIteration()

In [None]:
class Parameter(np.ndarray):
    """
    Marker subclass of `np.ndarray` used to tag trainable arrays.

    The class adds no behaviour of its own; `_unpack_params` recognises
    trainable values by checking `isinstance(value, Parameter)`.
    """

In [None]:
def _unpack_params(value: object) -> list:
    """
    Recursively collect every `Parameter` reachable from `value`.

    Args:
        value: A `Parameter`, a `Module`, or a dict/list/tuple nesting any of
            these. Anything else contributes no parameters.

    Returns:
        A flat list of the `Parameter` instances found, in traversal order.
    """
    if isinstance(value, Parameter):
        return [value]
    if isinstance(value, Module):
        return list(value.parameters())
    if isinstance(value, dict):
        # `Module.parameters()` passes `self.__dict__`, so dict values must be
        # walked; without this branch every module reports no parameters.
        return [item for v in value.values() for item in _unpack_params(v)]
    if isinstance(value, (list, tuple)):
        return [item for v in value for item in _unpack_params(v)]
    return []

In [None]:
class Module:
    """
    Base class for layers and losses.

    Subclasses implement `forward`; calling the instance runs `forward` and
    records the call's positional inputs and its output on the instance.
    """

    def parameters(self):
        """
        Returns a list of all `Parameter` instances in the module.
        This is done by unpacking the parameters from the module's dictionary.
        """
        return _unpack_params(self.__dict__)

    def _children(self):
        """
        Returns a list of all child `Module` instances in the module.
        This is done by unpacking the modules from the module's dictionary.
        """
        # NOTE(review): `_child_modules` is not defined in the visible part of
        # this file -- confirm it exists before relying on this method.
        return _child_modules(self.__dict__)

    def forward(self, *args, **kwargs):
        """
        Compute the module's output. Must be overridden by every subclass.

        Raises:
            NotImplementedError: Always, in the base class.
        """
        raise NotImplementedError(
            f'{type(self).__name__} must implement forward()'
        )

    def __call__(self, *args, **kwargs):
        """
        Run the module: delegates to `forward` with the same arguments.

        The positional arguments are stored in `self.input` and the result in
        `self.output` before the result is returned.
        """
        self.input = args
        self.output = self.forward(*args, **kwargs)

        return self.output

In [None]:
class Linear(Module):
    """
    Fully connected (affine) layer: `out = X @ weight + bias`.

    Attributes:
        in_features (int): The number of input features.
        out_features (int): The number of output features.
        weight (np.ndarray): Weights of shape `(in_features, out_features)`,
            drawn from a standard normal and scaled by 0.01.
        bias (np.ndarray): Bias row of shape `(1, out_features)`, initialised to zeros.
        inputs / outputs: Input and result of the most recent `forward` call.
        weight_grad / bias_grad / inputs_grad: Set by `backward`.

    NOTE: `weight` and `bias` are plain arrays, not `Parameter` instances.
    """

    def __init__(
        self,
        in_features, # The number of input features.
        out_features,# The number of output features.
    ):
        """
        Create the layer and initialise its weights and bias.

        Args:
            in_features (int): The number of input features.
            out_features (int): The number of output features.
        """
        self.in_features, self.out_features = in_features, out_features

        gaussian = np.random.randn(in_features, out_features)
        self.weight = 0.01 * gaussian
        self.bias = np.zeros((1, out_features))

    def __repr__(self) -> str:
        has_bias = self.bias is not None
        return f'Linear(in_features={self.in_features}, out_features={self.out_features}, bias={has_bias})'

    def forward(self, X):
        """
        Apply the affine transformation to `X`.

        The input and the result are cached on the instance because
        `backward` needs the input to compute the weight gradient.

        Args:
            X: Input batch; its last dimension must match `in_features`.

        Returns:
            `X @ weight + bias`.
        """
        self.inputs = X
        result = X @ self.weight + self.bias
        self.outputs = result
        return result

    def backward(self, grad):
        """
        Compute gradients from the upstream gradient `grad`.

        Stores `weight_grad`, `bias_grad` and `inputs_grad` on the instance;
        nothing is returned.

        Args:
            grad: Gradient of the loss with respect to this layer's output.
        """
        # Gradient flowing to the previous layer
        self.inputs_grad = np.dot(grad, self.weight.T)

        # Gradients on the layer's own parameters
        self.bias_grad = np.sum(grad, axis=0, keepdims=True)
        self.weight_grad = np.dot(self.inputs.T, grad)

In [None]:
class ReLU(Module):
    """Rectified linear unit activation: `max(X, 0)` element-wise."""

    def forward(self, X):
        """
        Apply ReLU to `X`, caching the input and output on the instance.

        Args:
            X: Input array.

        Returns:
            Array of the same shape with negative entries replaced by 0.
        """
        self.inputs = X
        # The parameter is `X`; the previous lower-case `x` raised NameError.
        self.outputs = np.maximum(X, 0)
        return self.outputs

    def backward(self, grad):
        """
        Store the gradient with respect to the input in `inputs_grad`.

        The upstream gradient passes through where the cached input was
        positive and is zeroed elsewhere. `grad` itself is not modified.
        """
        self.inputs_grad = grad.copy()
        self.inputs_grad[self.inputs <= 0] = 0

In [None]:
class Softmax(Module):
    """Row-wise softmax activation (normalises along axis 1)."""

    def forward(self, X):
        """
        Convert each row of `X` into a probability distribution.

        The row maximum is subtracted before exponentiating so that large
        scores do not overflow. The result is cached in `self.outputs`.
        """
        row_max = np.max(X, axis=1, keepdims=True)
        exp_vals = np.exp(X - row_max)
        row_sums = np.sum(exp_vals, axis=1, keepdims=True)
        probs = exp_vals / row_sums
        self.outputs = probs
        return probs
        

In [None]:
class Sigmoid(Module):
    """Logistic sigmoid activation: `1 / (1 + exp(-X))` element-wise."""

    def forward(self, X):
        """Apply the sigmoid to `X`, caching input and output on the instance."""
        self.inputs = X
        denominator = 1 + np.exp(-X)
        self.outputs = 1 / denominator
        return self.outputs

    def backward(self, grad):
        """
        Store the gradient with respect to the input in `inputs_grad`.

        Uses the cached output `s` from `forward`: d(sigmoid)/dX = (1 - s) * s.
        """
        s = self.outputs
        self.inputs_grad = grad * (1 - s) * s

In [None]:
def logsumexp(x):
    """
    Row-wise log-sum-exp of a 2-D tensor, computed stably.

    Written against the `torch.Tensor` API: `x.max(-1)` is indexed with `[0]`
    to pick the values out of the (values, indices) pair, and `.exp()` /
    `.log()` are called as tensor methods.

    Returns:
        A 1-D tensor holding one value per row of `x`.
    """
    row_max = x.max(-1)[0]
    # Shift by the row maximum so the exponentials cannot overflow.
    shifted = x - row_max[:, None]
    return row_max + shifted.exp().sum(-1).log()

def log_softmax(x):
    """
    Row-wise log-softmax of a 2-D input: `x - logsumexp(x)` per row.

    `logsumexp` returns one value per row, so it is expanded to a column
    before subtracting; subtracting the bare 1-D result would broadcast along
    the wrong axis (or fail when rows != columns).
    """
    return x - logsumexp(x)[:, None]
def nll(input, target):
    """
    Negative mean of the entries of `input` selected by `target`.

    For row `i`, the entry in column `target[i]` is picked; the result is the
    negated mean of those entries. This is the negative log-likelihood when
    `input` holds log-probabilities.
    """
    row_ids = range(target.shape[0])
    picked = input[row_ids, target]
    return -picked.mean()

In [None]:
class CrossEntropyLoss(Module):
    """
    Categorical cross-entropy loss.

    NOTE(review): `forward` indexes `y_hat` directly (via `nll`), which is a
    cross-entropy only if `y_hat` holds log-probabilities, whereas `backward`
    differentiates `-log(p)` and so expects probabilities. Confirm which
    convention callers use.
    """

    def forward(self, y_hat, y):
        """
        Return the mean negative value of `y_hat` at the target classes.

        Args:
            y_hat: Per-class scores, one row per sample.
            y: Integer class index for each sample.
        """
        return nll(y_hat, y)

    def backward(self, grad, y):
        """
        Store the gradient of the loss with respect to the predictions.

        Args:
            grad: The predicted values, one row per sample (the quantity the
                loss is differentiated with respect to).
            y: Targets, either integer class indices (1-D) or one-hot rows.

        The result is written to `self.inputs_grad`; nothing is returned.
        """
        num_samples = len(grad)

        # Number of labels in every sample
        num_labels = len(grad[0])

        # If labels are sparse, turn them into one-hot vectors.
        # (The previous check used an undefined name `y_true`.)
        if len(y.shape) == 1:
            y = np.eye(num_labels)[y]

        # The loss is a mean over samples, so the gradient is normalised by
        # the number of samples, not by the number of labels.
        self.inputs_grad = (-y / grad) / num_samples

In [None]:
class BinaryCrossEntropy(Module):
    """
    Binary cross-entropy loss on predicted probabilities.

    Attributes:
        inputs: Predictions passed to the most recent `forward` (unclipped).
        targets: Targets passed to the most recent `forward`.
        dinputs: Gradient computed by the most recent `backward`.
    """
    def __init__(self):
        self.inputs = self.targets = self.dinputs = None

    def forward(self, inputs, targets):
        """
        Return the mean binary cross-entropy between `inputs` and `targets`.

        Predictions are clipped to `[1e-7, 1 - 1e-7]` so neither logarithm
        receives 0.
        """
        self.inputs, self.targets = inputs, targets

        eps = 1e-7
        probs = np.clip(inputs, eps, 1.0 - eps)

        positive_term = targets * np.log(probs)
        negative_term = (1 - targets) * np.log(1 - probs)
        return -np.mean(positive_term + negative_term)

    def backward(self, dvalues, y_true):
        """
        Store the gradient of the loss with respect to the predictions.

        Args:
            dvalues: The predicted values, one row per sample.
            y_true: Targets with the same layout as `dvalues`.

        The gradient is divided by the number of outputs per sample and by the
        number of samples, then written to `self.dinputs`.
        """
        n_samples, n_outputs = len(dvalues), len(dvalues[0])

        # Keep predictions away from 0 and 1 so the divisions below are safe.
        probs = np.clip(dvalues, 1e-7, 1 - 1e-7)

        per_output = -((y_true / probs) - ((1 - y_true) / (1 - probs))) / n_outputs
        self.dinputs = per_output / n_samples

# Optim

In [None]:
class Optimizer:
    """
    Abstract base for optimizers; concrete subclasses supply `step`.

    The base class only remembers the collection of layers it was given.

    Parameters
    ----------
    layers : Iterable
        The layers whose parameters are to be optimized.

    Raises
    ------
    NotImplementedError
        From `step`, unless a subclass overrides it.
    """
    def __init__(
        self,
        layers # The layers whose parameters are to be optimized.
    ):
        self.layers = layers

    def step(self):
        """
        Perform one optimization step.

        Subclasses override this with their update rule.

        Raises
        ------
        NotImplementedError
            Always, in the base class.
        """
        raise NotImplementedError

In [None]:
class SGD(Optimizer):
    """
    Plain stochastic gradient descent (no momentum, no weight decay).

    Each step updates every layer in place:
    `weight -= lr * weight_grad` and `bias -= lr * bias_grad`.

    Parameters
    ----------
    layers : Iterable
        The layers to optimize. Layers without a `weight` attribute
        (e.g. activations) are skipped.
    lr : float, optional
        The learning rate.
    """
    def __init__(
        self,
        layers, # The layers whose parameters are to be optimized.
        lr=0.01, # The learning rate.
    ):
        super().__init__(layers)
        self.lr = lr

    def step(self):
        """
        Performs a single optimization step over all layers.

        `self.idx` is set to the index of the layer currently being updated.
        """
        for idx, layer in enumerate(self.layers):
            self.idx = idx
            # Parameter-free layers (activations etc.) have nothing to update;
            # skipping them lets a whole network be passed as `layers`.
            if not hasattr(layer, 'weight'):
                continue
            self._opt_step(layer)

    def _opt_step(self, layer):
        """
        Apply the gradient-descent update to one layer, in place.

        Expects `layer` to expose `weight`, `bias`, `weight_grad` and
        `bias_grad` (the gradients are set by the layer's `backward`).
        """
        layer.weight -= self.lr * layer.weight_grad
        layer.bias -= self.lr * layer.bias_grad

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NeuralNetwork(nn.Module):
    """
    PyTorch multilayer perceptron: three sigmoid hidden layers of width 24
    followed by a softmax output layer.

    `forward` returns class probabilities (softmax over dim 1), not logits.
    """
    def __init__(self, input_shape, output_shape):
        super().__init__()
        hidden_units = 24
        self.dense1 = nn.Linear(in_features=input_shape, out_features=hidden_units)
        self.dense2 = nn.Linear(hidden_units, hidden_units)
        self.dense3 = nn.Linear(hidden_units, hidden_units)
        self.dense4 = nn.Linear(hidden_units, output_shape)

        self.initialize_weights()

    def forward(self, x):
        """Run the network on a batch `x` and return per-class probabilities."""
        for hidden_layer in (self.dense1, self.dense2, self.dense3):
            x = F.sigmoid(hidden_layer(x))
        scores = self.dense4(x)
        return F.softmax(scores, dim=1)

    def initialize_weights(self):
        """Re-initialise the weight of every `nn.Linear` submodule with Kaiming-uniform."""
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for layer in linear_layers:
            nn.init.kaiming_uniform_(layer.weight)

# Build the PyTorch reference network.
input_shape = 30  # Number of input features per sample (in_features of the first layer)
output_shape = 2  # Number of output classes (out_features of the last layer, i.e. softmax width)

network = NeuralNetwork(input_shape, output_shape)

In [None]:
network

NeuralNetwork(
  (dense1): Linear(in_features=30, out_features=24, bias=True)
  (dense2): Linear(in_features=24, out_features=24, bias=True)
  (dense3): Linear(in_features=24, out_features=24, bias=True)
  (dense4): Linear(in_features=24, out_features=2, bias=True)
)

In [None]:
def normalize_dataset(dataset):
    """
    Standardise each column of `dataset` to zero mean and unit variance.

    Args:
        dataset: Array-like with samples along axis 0.

    Returns:
        `(dataset - mean) / std`, with statistics taken along axis 0.
        Constant columns (std == 0) come back as zeros instead of NaN/inf.
    """
    # Compute the mean and standard deviation along the axis 0 (columns)
    mean = np.mean(dataset, axis=0)
    std = np.std(dataset, axis=0)

    # A constant column has zero spread; dividing by 1 leaves its centred
    # values (all zeros) untouched rather than producing 0/0.
    safe_std = np.where(std == 0, 1.0, std)

    return (dataset - mean) / safe_std

In [None]:
import pandas as pd
import numpy as np
def data_split(file_path, split_ratio=0.8):
    """
    Load a CSV and split it into normalized training and validation sets.

    Column 1 is read as the label (`'M'` -> 1, anything else -> 0) and every
    column from index 2 onward as a feature. The split is positional: the
    first `split_ratio` share of the rows is the training set and the rest is
    the validation set (no shuffling).

    Args:
        file_path: Path of the CSV file to read.
        split_ratio (float): Fraction of rows used for training; must lie
            strictly between 0 and 1.

    Returns:
        Tuple `(X_tr, y_tr, X_val, y_val)`.

    Raises:
        ValueError: If `split_ratio` is not strictly between 0 and 1.
    """
    if not 0.0 < split_ratio < 1.0:
        raise ValueError(f'split_ratio must be in (0, 1), got {split_ratio!r}')

    df = pd.read_csv(file_path)
    X = df.iloc[:, 2:].values
    y = np.where(df.iloc[:, 1].values == 'M', 1, 0)

    # NOTE(review): normalization statistics are computed over all rows,
    # including those that end up in the validation set.
    X = normalize_dataset(X)

    split_index = int(split_ratio * len(X))
    X_tr, y_tr = X[:split_index], y[:split_index]
    X_val, y_val = X[split_index:], y[split_index:]
    return X_tr, y_tr, X_val, y_val

In [None]:
# Load the CSV and unpack the normalized train/validation features and labels.
X_tr, y_tr, X_val, y_val = data_split('./data/data.csv')

In [None]:
# Sanity check: (number of training rows, number of feature columns).
X_tr.shape

(454, 30)

In [None]:
# from functools import partial
# t = partial(torch.tensor, dtype=torch.float32)
# X_tr, y_tr, X_val, y_val = map(t, (X_tr, y_tr, X_val, y_val))

In [None]:
# Inspect shape and dtype of the training features as returned by `data_split`.
X_tr.shape, X_tr.dtype

((454, 30), dtype('float64'))

In [None]:
# Preview the untrained network's predictions for the first 10 training rows.
# `nn.Linear` needs a float32 tensor, so convert explicitly; `as_tensor` also
# accepts an input that is already a tensor.
network(torch.as_tensor(X_tr, dtype=torch.float32))[:10]

tensor([[0.6764, 0.3236],
        [0.6763, 0.3237],
        [0.6864, 0.3136],
        [0.6690, 0.3310],
        [0.6832, 0.3168],
        [0.6814, 0.3186],
        [0.6782, 0.3218],
        [0.6815, 0.3185],
        [0.6852, 0.3148],
        [0.6824, 0.3176]], grad_fn=<SliceBackward0>)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Custom Dataset class
class MyDataset(Dataset):
    """
    In-memory dataset pairing feature rows with integer labels.

    Features are stored as a float32 tensor and labels as an int64 tensor.
    """
    def __init__(self, X, y):
        features = torch.tensor(X, dtype=torch.float32)
        labels = torch.tensor(y, dtype=torch.long)
        self.X, self.y = features, labels

    def __len__(self):
        """Number of samples (length of the feature tensor)."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the `(features, label)` pair stored at `index`."""
        sample = self.X[index]
        label = self.y[index]
        return sample, label

# Wrap the train/validation arrays as datasets
tr_ds, val_ds = MyDataset(X_tr, y_tr), MyDataset(X_val, y_val)

# Mini-batch loaders: small shuffled batches for training, larger ones for validation
batch_size = 4
tr_dl = DataLoader(tr_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=64, shuffle=True)

# Peek at the first training batch only (nothing is printed if the loader is empty)
first_batch = next(iter(tr_dl), None)
if first_batch is not None:
    batch_X, batch_y = first_batch
    print("Batch X:", batch_X)
    print("Batch y:", batch_y)

Batch X: tensor([[-0.0996, -1.4086, -0.1589, -0.2036, -0.3094, -0.7996, -0.9820, -0.7666,
         -0.8007, -0.5192, -0.6703, -0.9480, -0.6859, -0.4951, -0.8973, -0.9666,
         -0.9142, -0.8042,  0.1229, -0.7521, -0.3286, -1.4279, -0.3874, -0.3833,
         -0.6718, -0.9358, -1.1265, -0.8608, -0.1217, -0.8857],
        [-1.0966, -1.0725, -1.0598, -0.9462,  0.1780, -0.2379, -0.6642, -0.7355,
         -0.6505,  1.0358, -0.2392,  2.3420, -0.2301, -0.4599,  3.4366,  1.1897,
         -0.1904,  0.1899,  1.9650,  1.1222, -1.1152, -1.0123, -1.0834, -0.9192,
          0.1616, -0.5751, -0.9614, -1.1249, -0.7546,  0.0553],
        [-1.0455, -0.8975, -1.0425, -0.9234,  0.6398, -0.5126, -1.0323, -0.9482,
         -0.0826,  0.2462, -0.7711, -0.5706, -0.8209, -0.6250, -0.1762, -0.9717,
         -0.9624, -1.1052, -0.6668, -0.3667, -0.9824, -0.9602, -1.0063, -0.8516,
          0.0782, -0.8840, -1.1791, -1.0653, -0.4601, -0.0663],
        [ 1.5423,  2.2089,  1.7180,  1.5720, -0.2652,  1.9550,  1.1357

In [None]:
from tqdm import tqdm

network = NeuralNetwork(input_shape, output_shape)
opt = torch.optim.SGD(network.parameters(), lr=0.01)
bce = torch.nn.CrossEntropyLoss()

# `NeuralNetwork.forward` already returns softmax probabilities, while
# `CrossEntropyLoss` applies log-softmax to whatever it receives. Passing the
# probabilities directly would apply softmax twice, which flattens the loss
# and its gradients. Feeding log-probabilities is equivalent to feeding logits
# (softmax is shift-invariant), so the loss becomes the true cross-entropy.
# The clamp keeps log() finite if a probability underflows to 0.
min_prob = 1e-12

num_epochs = 70

for epoch in range(num_epochs):
    train_losses = []
    val_losses = []
    train_accs = []
    val_accs = []

    # Training phase
    network.train()
    for xb, yb in tr_dl:
        preds = network(xb)
        loss = bce(torch.log(preds.clamp_min(min_prob)), yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
        train_losses.append(loss.item())

        # Calculate accuracy (argmax over class probabilities)
        _, predicted_labels = torch.max(preds, dim=1)
        accuracy = (predicted_labels == yb).sum().item() / yb.size(0)
        train_accs.append(accuracy)

    # Validation phase
    network.eval()
    with torch.no_grad():
        for xb_val, yb_val in val_dl:
            preds_val = network(xb_val)
            val_loss = bce(torch.log(preds_val.clamp_min(min_prob)), yb_val)
            val_losses.append(val_loss.item())

            # Calculate accuracy
            _, predicted_labels_val = torch.max(preds_val, dim=1)
            accuracy_val = (predicted_labels_val == yb_val).sum().item() / yb_val.size(0)
            val_accs.append(accuracy_val)

    # Per-epoch averages are taken over batches
    avg_train_loss = sum(train_losses) / len(train_losses)
    avg_val_loss = sum(val_losses) / len(val_losses)
    avg_train_acc = sum(train_accs) / len(train_accs)
    avg_val_acc = sum(val_accs) / len(val_accs)

    # Print epoch-wise loss and accuracy
    print(f"epoch {epoch + 1:02d}/{num_epochs:02d} - loss: {avg_train_loss:.4f} - acc: {avg_train_acc:.4f} - val_loss: {avg_val_loss:.4f} - val_acc: {avg_val_acc:.4f}")


epoch 01/70 - loss: 0.6857 - acc: 0.5921 - val_loss: 0.5925 - val_acc: 0.7684
epoch 02/70 - loss: 0.6837 - acc: 0.5921 - val_loss: 0.5960 - val_acc: 0.7750
epoch 03/70 - loss: 0.6833 - acc: 0.5899 - val_loss: 0.5980 - val_acc: 0.7772
epoch 04/70 - loss: 0.6813 - acc: 0.5921 - val_loss: 0.6003 - val_acc: 0.7772
epoch 05/70 - loss: 0.6800 - acc: 0.5921 - val_loss: 0.6062 - val_acc: 0.7706
epoch 06/70 - loss: 0.6782 - acc: 0.5943 - val_loss: 0.6065 - val_acc: 0.7728
epoch 07/70 - loss: 0.6782 - acc: 0.5921 - val_loss: 0.6067 - val_acc: 0.7728
epoch 08/70 - loss: 0.6773 - acc: 0.5921 - val_loss: 0.6068 - val_acc: 0.7706
epoch 09/70 - loss: 0.6763 - acc: 0.5921 - val_loss: 0.6046 - val_acc: 0.7794
epoch 10/70 - loss: 0.6745 - acc: 0.5943 - val_loss: 0.6048 - val_acc: 0.7706
epoch 11/70 - loss: 0.6741 - acc: 0.5921 - val_loss: 0.6071 - val_acc: 0.7706
epoch 12/70 - loss: 0.6741 - acc: 0.5899 - val_loss: 0.6049 - val_acc: 0.7750
epoch 13/70 - loss: 0.6721 - acc: 0.5921 - val_loss: 0.6033 - va