In [49]:
import numpy as np

In [50]:
class BinaryCrossEntropy:
    """Summed binary cross-entropy loss with an epsilon guard.

    The value computed is ``-(sum(log(1 - p + eps)) + sum(log(p + eps)))``,
    where the first sum runs over samples labelled 0 and the second over
    samples labelled 1.
    """

    def __init__(self, eps=1e-10):
        """
        Args:
            eps: Small constant added inside every logarithm and to every
                gradient denominator so that predictions of exactly 0 or 1
                do not yield ``inf``/``nan``.
        """
        self.eps = eps

    def __call__(self, y_pred, y_true):
        """Return the total (not averaged) loss.

        Args:
            y_pred: Predicted probabilities, indexed by sample along axis 0.
            y_true: Labels; flattened before being compared with 0 and 1.
                Entries that are neither 0 nor 1 contribute nothing.

        Returns:
            The summed negative log-likelihood as a scalar.
        """
        labels = y_true.reshape(-1)
        positions = np.arange(0, y_true.shape[0])
        ix_zeros = positions[labels == 0]
        ix_ones = positions[labels == 1]

        y_zero = np.log(1 - y_pred[ix_zeros] + self.eps).sum()
        y_one = np.log(y_pred[ix_ones] + self.eps).sum()

        return -1 * (y_zero + y_one)

    def grad_input(self, X, y_true):
        """Return the derivative of the loss with respect to the prediction.

        This is the exact derivative of the expression evaluated by
        ``__call__`` (epsilon included), so it stays finite when ``X`` is
        exactly 0 or 1.

        Args:
            X: Predicted probability (scalar or array).
            y_true: Label(s) broadcastable against ``X``; 0 selects the
                negative-class branch, anything else the positive-class one.

        Returns:
            ``1 / (1 - X + eps)`` where the label is 0, ``-1 / (X + eps)``
            elsewhere, with the broadcast shape of ``X`` and ``y_true``.
        """
        X = np.asarray(X, dtype=float)
        is_negative = np.asarray(y_true) == 0
        # Both branches are evaluated by np.where; eps keeps each one finite.
        return np.where(is_negative, 1 / (1 - X + self.eps), -1 / (X + self.eps))
        


In [51]:
class Sigmoid:
    """Logistic activation together with its derivative."""

    def __call__(self, X):
        """Apply the activation; shorthand for ``eval``."""
        return self.eval(X)

    def eval(self, X):
        """Return ``1 / (1 + e**(-X))`` element-wise."""
        decay = np.e ** (-1 * X)
        return 1 / (1 + decay)

    def grad_input(self, X):
        """Return the derivative of the activation with respect to its input.

        An identity matrix of size ``X.shape[0]`` is scaled by
        ``s * (1 - s)`` with ``s = eval(X)``. For a column input of shape
        ``(n, 1)`` this yields the ``(n, n)`` diagonal Jacobian.
        """
        activated = self.eval(X)
        eye = np.identity(X.shape[0])
        return eye * activated * (1 - activated)

class Dot:
    """Affine map ``W.T @ X + b`` with randomly initialised parameters."""

    def __init__(self, input_size, units):
        """
        Args:
            input_size: Number of input features (rows of ``W``).
            units: Number of outputs (columns of ``W``, rows of ``b``).
        """
        # Both parameters are drawn from NumPy's global standard-normal RNG.
        self.W = np.random.randn(input_size, units)
        self.b = np.random.randn(units, 1)

    def __call__(self, X):
        """Return ``W.T.dot(X) + b``; ``b`` broadcasts across columns of ``X``."""
        return self.W.T.dot(X) + self.b

    def grad_w(self, X):
        """Return the derivative of the output with respect to ``W``.

        For a column input ``X`` of shape ``(input_size, 1)`` the result has
        shape ``(input_size, units, units)`` with
        ``result[i, k, j] = X[i] if k == j else 0``.
        """
        I = np.identity(self.b.shape[0])
        # (units, input_size, units) after stacking, scaled row-wise by X.
        grad = np.stack([I]*self.W.shape[0], axis=1)*X
        return np.transpose(grad, [1, 0, 2])

    def grad_b(self):
        """Return the derivative of the output with respect to ``b`` (identity)."""
        return np.identity(self.b.shape[0])

    def grad_input(self):
        """Return the derivative of the output with respect to the input (``W.T``)."""
        return self.W.T

    def get_output_size(self):
        """Return the shape of ``b``, i.e. ``(units, 1)``."""
        return self.b.shape

    def get_no_of_params(self):
        """Return the total number of scalar entries in ``W`` and ``b``."""
        return np.prod(self.W.shape) + np.prod(self.b.shape)

    def update(self, gradW, gradb, optimizer, method):
        """Replace ``W`` and ``b`` with the optimizer's updated values.

        Args:
            gradW: Gradient for ``W``.
            gradb: Gradient for ``b``.
            optimizer: Object exposing ``minimize(param, grad)`` and
                ``maximize(param, grad)``, each returning the new parameter.
            method: ``"minimize"`` or ``"maximize"``.

        Raises:
            ValueError: If ``method`` is neither of the two supported names.
                Previously such a value was ignored and the parameters were
                silently left unchanged.
        """
        if method == "minimize":
            step = optimizer.minimize
        elif method == "maximize":
            step = optimizer.maximize
        else:
            raise ValueError(
                f"Unknown update method {method!r}; expected 'minimize' or 'maximize'"
            )
        self.W = step(self.W, gradW)
        self.b = step(self.b, gradb)

In [52]:
class Dense:
    """Fully connected layer: a ``Dot`` affine map followed by an activation."""

    def __init__(self, units, activation, input_size):
        """
        Args:
            units: Number of outputs of the layer.
            activation: Callable exposing ``grad_input(pre_activation)``.
            input_size: Number of inputs of the layer.
        """
        self.input_size = input_size
        self.units = units
        self.activation = activation
        self.dot = Dot(input_size, units)

    def get_output_size(self):
        """Return the output shape reported by the underlying ``Dot``."""
        return self.dot.get_output_size()

    def get_no_of_params(self):
        """Return the parameter count reported by the underlying ``Dot``."""
        return self.dot.get_no_of_params()

    def eval(self, X):
        """Return ``activation(dot(X))``."""
        pre_activation = self.dot(X)
        return self.activation(pre_activation)

    def grad_parameters(self, X):
        """Return ``(d_out/d_W, d_out/d_b)`` via the chain rule.

        The activation derivative at ``dot(X)`` is multiplied with the affine
        map's derivatives; the first two axes of the weight term are swapped
        before returning.
        """
        act_jacobian = self.activation.grad_input(self.dot(X))
        wrt_weights = act_jacobian.dot(self.dot.grad_w(X))
        wrt_bias = act_jacobian.dot(self.dot.grad_b())
        return (wrt_weights.transpose(1, 0, 2), wrt_bias)

    def grad_input(self, X):
        """Return the derivative of the layer output with respect to its input."""
        act_jacobian = self.activation.grad_input(self.dot(X))
        return act_jacobian.dot(self.dot.grad_input())

    def update(self, grad_w, grad_b, optimizer, method="minimize"):
        """Forward the parameter update to the underlying ``Dot``."""
        self.dot.update(grad_w, grad_b, optimizer, method)
        

In [53]:
class GradientDescentOptimizer:
    """Fixed-step gradient descent/ascent; the step size is set by ``set_lr``."""

    def __init__(self):
        """Create the optimizer; ``learning_rate`` is not set until ``set_lr``."""

    def set_lr(self, learning_rate):
        """Store the learning rate and return the optimizer for chaining."""
        self.learning_rate = learning_rate
        return self

    def _scaled_gradient(self, X, grad_X):
        """Check that shapes agree, then return ``learning_rate * grad_X``."""
        assert X.shape == grad_X.shape, f"Shape mismatch, Input shape {X.shape} != Gradient shape {grad_X.shape}"
        return self.learning_rate*grad_X

    def minimize(self, X, grad_X):
        """Return ``X`` moved against the gradient by one scaled step."""
        step = self._scaled_gradient(X, grad_X)
        return X - step

    def maximize(self, X, grad_X):
        """Return ``X`` moved along the gradient by one scaled step."""
        step = self._scaled_gradient(X, grad_X)
        return X + step

        

In [111]:
class Sequential:
    """A linear stack of layers trained by per-sample backpropagation.

    Each layer must expose ``eval``, ``grad_input``, ``grad_parameters``,
    ``update``, ``get_output_size`` and ``get_no_of_params``.
    """

    def __init__(self):
        self.layers = []
        self.loss = None
        self.outputs = []

    def add(self, layer):
        """Append ``layer`` to the stack and return the model for chaining."""
        self.layers.append(layer)
        return self

    def summary(self):
        """Print a table of layer type, output shape and parameter count."""
        from tabulate import tabulate

        headers = ["Layer Type", "Output Shape", "No. of parameters"]
        summary_ = []
        params = 0
        for layer in self.layers:
            p = layer.get_no_of_params()
            params += p
            summary_.append([layer.__class__.__name__, layer.get_output_size(), p])

        print(tabulate(summary_, headers=headers))
        print("Total No. of parameters:", params)

    def fit(self, X, y, n_epochs, learning_rate, optimizer, batch_size=1, verbose=1):
        """Train the model, one forward/backward pass per sample.

        Per-sample gradients are accumulated and applied every ``batch_size``
        samples. Gradients left over at the end of an epoch (when the number
        of samples is not a multiple of ``batch_size``) are applied as a
        final, smaller batch instead of being discarded.

        Args:
            X: Samples indexed along axis 0; each ``X[j]`` is reshaped to a row.
            y: Targets indexed along axis 0.
            n_epochs: Number of passes over ``X``.
            learning_rate: Passed to ``optimizer.set_lr``.
            optimizer: Object whose ``set_lr`` returns the optimizer to use.
            batch_size: Number of samples whose gradients are summed per update.
            verbose: 1 prints the epoch number and the full-data loss after
                every update; 0 prints one loss line per epoch; any other
                value prints only the trailing newline.
        """
        self.optimizer = optimizer.set_lr(learning_rate)
        for i in range(n_epochs):
            if verbose == 1:
                print(f"Epoch: {i+1}")
            gradients = []
            for j in range(X.shape[0]):
                _, outputs, _gradients_ = self.forward_propagation(X[j].reshape(1,-1))
                grads = self.backward_propagation(outputs, _gradients_, y[j].reshape(1,-1))
                gradients.append(grads)
                if (j+1) % batch_size == 0:
                    self._apply_batch(gradients, X, y, verbose)
                    gradients = []
            if gradients:
                # Trailing partial batch: without this the last samples of the
                # epoch would never influence the parameters.
                self._apply_batch(gradients, X, y, verbose)
            if verbose == 1:
                print("")
            if verbose == 0:
                print(f"\rEpoch: {i+1} Loss:{self._eval_loss(X, y)}", end="")

        print("")

    def _apply_batch(self, gradients, X, y, verbose):
        """Apply one batch of accumulated gradients and optionally report the loss."""
        self._update_params(gradients)
        if verbose == 1:
            print(f"\rLoss:{self._eval_loss(X, y)}", end="")

    def forward_propagation(self, X, eval=False):
        """Run ``X`` through every layer.

        Args:
            X: Input with samples as rows; it is transposed before the first layer.
            eval: When true, skip computing per-layer gradients.

        Returns:
            ``(final_output.T, outputs, gradients)`` where ``outputs`` holds the
            transposed input followed by each layer's output, and ``gradients``
            holds one dict per layer with keys ``"input"``, ``"w"`` and ``"b"``
            (empty when ``eval`` is true).
        """
        output = X.T
        outputs = [output]
        gradients = []
        for layer in self.layers:
            if not eval:
                grad_ = {}
                grad_["input"] = layer.grad_input(output)
                grad_["w"], grad_["b"] = layer.grad_parameters(output)
                gradients.append(grad_)
            output = layer.eval(output)
            outputs.append(output)

        return output.T, outputs, gradients

    def backward_propagation(self, outputs, gradients, y):
        """Chain the loss gradient back through the layers.

        Args:
            outputs: Layer outputs from ``forward_propagation``; only the last
                entry (the model output) is used.
            gradients: Per-layer gradient dicts from ``forward_propagation``.
            y: Target for the sample.

        Returns:
            A list of ``(grad_w, grad_b)`` tuples ordered from the LAST layer
            to the first.
        """
        grad_loss = self.loss.grad_input(outputs[-1], y)
        grads = []
        for grad in reversed(gradients):
            grad_w, grad_b = grad_loss.dot(grad["w"])[0], grad_loss.dot(grad["b"]).T
            grads.append((grad_w, grad_b))
            # Propagate to the previous layer's output.
            grad_loss = grad_loss.dot(grad["input"])

        return grads

    def _update_params(self, gradients):
        """Sum per-sample gradients and apply them to each layer.

        ``gradients`` is a non-empty list of ``backward_propagation`` results,
        each ordered last layer first, hence the reversed layer list below.
        """
        grads = [[0, 0] for _ in gradients[0]]
        for grads_ in gradients:
            for i in range(len(grads_)):
                grads[i][0] += grads_[i][0]
                grads[i][1] += grads_[i][1]

        for ((grad_w, grad_b), layer) in zip(grads, self.layers[::-1]):
            layer.update(grad_w, grad_b, self.optimizer)

    def _eval(self, X):
        """Return the model output for ``X`` without computing gradients."""
        return self.forward_propagation(X, eval=True)[0]

    def compile(self, loss):
        """Attach the loss object used for training and evaluation."""
        self.loss = loss

    def _eval_loss(self, X, y_true):
        """Return the loss of the model's predictions for ``X`` against ``y_true``.

        Raises:
            RuntimeError: If no loss has been set via ``compile``.
        """
        if self.loss is None:
            raise RuntimeError("Model not compiled")

        return self.loss(self._eval(X), y_true)

In [134]:
# Seed NumPy's global RNG so the randomly initialised layer weights
# (np.random.randn) are identical on every Restart & Run All.
np.random.seed(42)

# 2 -> 3 -> 2 -> 1 sigmoid network for binary classification.
model = Sequential()
model.add(Dense(units=3, activation=Sigmoid(), input_size=2))
model.add(Dense(units=2, activation=Sigmoid(), input_size=3))
model.add(Dense(units=1, activation=Sigmoid(), input_size=2))
model.compile(BinaryCrossEntropy())

In [135]:
# Print a per-layer table (type, output shape, parameter count) and the total.
model.summary()

Layer Type    Output Shape      No. of parameters
------------  --------------  -------------------
Dense         (3, 1)                            9
Dense         (2, 1)                            8
Dense         (1, 1)                            3
Total No. of parameters: 20


In [136]:
from sklearn.datasets import make_gaussian_quantiles

In [137]:
# Two-class toy dataset; random_state pins the draw so re-runs see the same data.
X, y = make_gaussian_quantiles(n_samples=200, n_classes=2, random_state=42)
# Loss of the untrained network, as a baseline for the training run.
print("Loss", model._eval_loss(X, y))

Loss 209.69092719233262


In [138]:
# model._eval(X)

In [140]:
# Per-sample gradient descent (default batch_size=1); verbose=0 prints one
# loss line per epoch, overwritten in place.
model.fit(
    X,
    y,
    n_epochs=1000,
    learning_rate=0.05,
    optimizer=GradientDescentOptimizer(),
    verbose=0,
)

Epoch: 1000 Loss:8.437956608444045


In [141]:
# Training accuracy: fraction of samples whose thresholded output matches the label.
np.mean((model._eval(X) > 0.5) == y.reshape(-1, 1))

0.98