# A Numpy Neural Net

In [1]:
! pip install -qU numpy sklearn

In [2]:
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [3]:
seed = 9
np.random.seed(seed)

# Objective 

Our goal is to write a simple two layer neural network in numpy, starting from a single layer logistic network. First, we'll define our loss function, activations, linear layer, and SGD. Then we'll tie it all together in a single Sequential model.

# Binary Cross Entropy


In [4]:
class BinaryCrossEntropy:
    """Container for the forward and backward pass of BCE."""
    
    def __call__(self, y_hat, y):
        return self.forward(y_hat, y)
    
    def forward(self, y_hat, y):
        """Return binary cross entropy given predictions and targets."""
        self.y_hat, self.y = y_hat.clip(min=1e-8, max=1-1e-8), y
        return -np.where(y==1, np.log(self.y_hat), np.log(1 - self.y_hat))
    
    def backward(self):
        """Backpropagate the gradient with respect to predictions."""
        return (self.y_hat - self.y) / (self.y_hat * (1 - self.y_hat))

# Sigmoid Activation

In [5]:
class Sigmoid:
    """Container for the forward and backward pass of sigmoid."""
    
    def __call__(self, x):
        return self.forward(x)
    
    def forward(self, x):
        """Pass a mini-batch through a sigmoid layer."""
        self.y_hat = np.where(x > 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
        return self.y_hat
        
    def backward(self, grad):
        """Backpropagate the gradient given the preceding gradient."""
        return self.y_hat * (1 - self.y_hat) * grad

# Linear Layer

In [6]:
class Linear:
    """Container for the forward and backward pass of a linear layer."""
    
    def __init__(self, n_inp, n_out):
        """Initialise layer with random weights and zero bias."""
        k = 1 / np.sqrt(n_inp)
        self.weights = np.random.uniform(-k, k, (n_inp, n_out))
        self.bias = np.zeros(n_out)
        
    def __call__(self, x):
        return self.forward(x)
        
    def forward(self, x):
        """Pass a mini-batch through a linear layer."""
        self.x = x
        return x @ self.weights + self.bias
    
    def backward(self, grad):
        """Backpropagate the gradient given the preceding gradient."""
        self.grad_w = (self.x[:,:,None] @ grad[:,None,:]).mean(axis=0)
        self.grad_b = grad.mean(axis=0)
        return grad @ self.weights.T

# Putting It All Together

It's finally time to string together all of the work we've done so far into a complete network. Then we'll put it to the test on the [breast cancer dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html#sklearn.datasets.load_breast_cancer) and see how we compare to sklearn's built-in logistic model. 

In [7]:
class Sequential:
    """Container for a feedforward neural net."""
    
    def __init__(self, layers, criterion):
        """Initialise layers and loss criterion."""
        self.layers = layers
        self.criterion = criterion
        
    def __call__(self, x):
        return self.forward(x)
        
    def forward(self, x):
        """Pass a mini-batch through the network."""
        for layer in self.layers:
            x = layer.forward(x)
        return x
    
    def backward(self):
        """Backpropagate gradients to the start of the network."""
        grad = self.criterion.backward()
        for layer in self.layers[::-1]:
            grad = layer.backward(grad)

In [8]:
class SGD:
    """Container for updating a model's weights via SGD."""
    
    def __init__(self, model, lr):
        """Initialise model parameters and learning rate."""
        self.model = model
        self.lr = lr
                  
    def step(self):
        """Update weights and biases of all linear layers."""
        for layer in self.model.layers:
            if isinstance(layer, Linear):
                layer.weights -= self.lr * layer.grad_w
                layer.bias -= self.lr * layer.grad_b

# Our Evaluation Metric

For simplicity, we'll just consider accuracy as our evaluation metric for the time being.

In [9]:
def accuracy(y_hat, y):
    """Compute accuracy given soft binary predictions."""
    y_pred = y_hat > 0.5
    return (y_pred == y).mean()

# Trainer

To make life easier, let's wrap all of the functionality we'll need to train a network in a single class.

In [10]:
class Trainer:
    """Container for training a feedforward neural net."""
    
    def __init__(self, model, optimizer, train_dl, val_dl, metric):
        self.model = model
        self.optimizer = optimizer
        self.train_dl = train_dl
        self.val_dl = val_dl
        self.metric = metric
        
    def train_one_epoch(self):
        """Train for one epoch and return the loss."""
        loss, n = 0, 0
        for x, y in self.train_dl:
            y_hat = self.model(x)
            batch_loss = self.model.criterion(y_hat, y).sum()
            self.model.backward()
            self.optimizer.step()
            loss += batch_loss
            n += len(y)
        return loss / n
            
    def train(self, n_epochs, log_level=1):
        """Train for several epochs."""
        for epoch in range(n_epochs):
            loss = self.train_one_epoch()
            val_loss, val_metric = self.evaluate(self.val_dl)
            if (epoch + 1) % log_level == 0:
                print(f"epoch= {epoch:2d} | loss= {loss:.3f} | "
                      f"val_loss= {val_loss:.3f} | val_metric= {val_metric:.3f}")
    
    def evaluate(self, dl):
        """Return loss and metric on validation or test set."""
        loss, n, metric = 0, 0, 0
        for x, y in dl:
            y_hat = self.model(x)
            batch_loss = self.model.criterion(y_hat, y).sum()
            batch_metric = self.metric(y_hat, y)
            metric += len(y) * batch_metric
            loss += batch_loss
            n += len(y)
        return loss / n, metric / n

# Pre-process Data

We'll use sklearn's [breast cancer dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html#sklearn.datasets.load_breast_cancer) for our binary classification task. 

In [11]:
# Load data
X, y = load_breast_cancer(return_X_y=True)

# Train-test-split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train.shape, X_val.shape

((455, 30), (114, 30))

In [12]:
def normalize(X_train, X_val):
    """Normalize training and validation data using training stats."""
    for j in range(X_train.shape[1]):
        mu, sigma = X_train[:,j].mean(), X_train[:,j].std()
        X_train[:,j] = (X_train[:,j] - mu) / sigma
        X_val[:,j] = (X_val[:,j] - mu) / sigma
    return X_train, X_val

In [13]:
# Normalize with training stats
X_train, X_val = normalize(X_train, X_val)

# Datasets & DataLoaders

In order to train in batches, we'll need to implement our own version of pytorch's datasets and dataloaders, since we're doing everything in numpy.

In [14]:
class Dataset:
    """Container for returning inputs and targets."""
    
    def __init__(self, X, y):
        """Initialise inputs and re-shape targets as a column vector."""
        self.X = X
        self.y = y.reshape(-1, 1)
        
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
    def __setitem__(self, idx, val):
        self.X[idx], self.y[idx] = val
                
    def __len__(self):
        return len(self.y)

In [15]:
class DataLoader:
    """Container for returning a mini-batch of inputs and targets."""
    
    def __init__(self, ds, batch_size, shuffle=False):
        """Initialise dataset and batch size."""
        self.ds = ds
        self.batch_size = batch_size
        self.shuffle = shuffle
        
    def __iter__(self):
        """Yield a mini-batch of inputs and targets."""
        if self.shuffle: self.shuffle_data()
        n_batches = len(self.ds) // self.batch_size
        for i in range(n_batches):
            yield self.ds[i * self.batch_size: (i + 1) * self.batch_size]
            
    def shuffle_data(self):
        """Shuffle inputs and targets."""
        idxs = np.random.permutation(len(self.ds))
        self.ds = Dataset(*self.ds[idxs])

In [16]:
# Load training and validation data
train_ds = Dataset(X_train, y_train)
val_ds = Dataset(X_val, y_val)

batch_size = 64
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=len(X_val), shuffle=False)

# Train

Now we're ready to put our model to the test.

In [17]:
# Input and final output dims
n_inp = X_train.shape[1]

# Initialise layers and criterion
metric = accuracy
criterion = BinaryCrossEntropy()
layers = [Linear(n_inp, 1), Sigmoid()]
model = Sequential(layers, criterion)

# Initialise optimizer and trainer
optimizer = SGD(model, lr=0.1)
trainer = Trainer(model, optimizer, train_dl, val_dl, metric)
trainer.train(5)

epoch=  0 | loss= 0.421 | val_loss= 0.263 | val_metric= 0.939
epoch=  1 | loss= 0.246 | val_loss= 0.198 | val_metric= 0.956
epoch=  2 | loss= 0.199 | val_loss= 0.169 | val_metric= 0.965
epoch=  3 | loss= 0.170 | val_loss= 0.152 | val_metric= 0.965
epoch=  4 | loss= 0.158 | val_loss= 0.140 | val_metric= 0.965


# Comparison with sklearn

Let's see how our logistic network stacks up against sklearn's logistic classifier.

In [18]:
# We're close!
sklearn_model = LogisticRegression(random_state=seed)
sklearn_model.fit(X_train, y_train)
sklearn_model.score(X_val, y_val)

0.9824561403508771

# Two Layer Network

Now let's see if we can do a bit better by training a deeper network. First, we'll need to implement $\text{ReLU}$ for the activations in between our linear layers.

In [19]:
class ReLU:
    """Container for the forward and backward pass of ReLU."""
    
    def __call__(self, x):
        return self.forward(x)
    
    def forward(self, x):
        """Pass a mini-batch through ReLU."""
        self.x = x
        return np.where(x > 0, x, 0)
    
    def backward(self, grad):
        """Return the gradient where x is positive, otherwise zero."""
        return np.where(self.x > 0, grad, 0)

In [20]:
# Input and final output dims
n_inp = X_train.shape[1]

# Initialise layers and criterion
metric = accuracy
criterion = BinaryCrossEntropy()
layers = [Linear(n_inp, 20), ReLU(), Linear(20, 1), Sigmoid()]
model = Sequential(layers, criterion)

# Initialise optimizer and trainer
optimizer = SGD(model, lr=0.10)
trainer = Trainer(model, optimizer, train_dl, val_dl, metric)
trainer.train(10)

epoch=  0 | loss= 0.567 | val_loss= 0.485 | val_metric= 0.921
epoch=  1 | loss= 0.443 | val_loss= 0.361 | val_metric= 0.921
epoch=  2 | loss= 0.338 | val_loss= 0.273 | val_metric= 0.930
epoch=  3 | loss= 0.268 | val_loss= 0.220 | val_metric= 0.939
epoch=  4 | loss= 0.225 | val_loss= 0.187 | val_metric= 0.939
epoch=  5 | loss= 0.192 | val_loss= 0.165 | val_metric= 0.939
epoch=  6 | loss= 0.172 | val_loss= 0.150 | val_metric= 0.939
epoch=  7 | loss= 0.156 | val_loss= 0.138 | val_metric= 0.939
epoch=  8 | loss= 0.144 | val_loss= 0.130 | val_metric= 0.947
epoch=  9 | loss= 0.135 | val_loss= 0.123 | val_metric= 0.956


# Conclusion

That's all for today. Thanks for reading!