In [46]:
import numpy as np

In [47]:
class BinaryCrossEntropy:
    """Summed binary cross-entropy loss for labels 0 and 1.

    Calling the instance returns the loss value; ``grad_input`` returns the
    derivative of the loss with respect to each predicted probability.
    """

    # Added inside the logarithms so a prediction of exactly 0 or 1 never
    # produces log(0).
    _EPS = 1e-10

    def __init__(self):
        pass

    def __call__(self, y_pred, y_true):
        """Return the total (summed, not averaged) cross-entropy.

        Parameters
        ----------
        y_pred : np.ndarray
            Predicted probabilities; rows along axis 0 are selected using the
            positions of the labels in ``y_true``.
        y_true : np.ndarray
            Labels, flattened before use. Entries equal to 0 contribute
            ``log(1 - p)`` and entries equal to 1 contribute ``log(p)``;
            entries with any other value are not counted.

        Returns
        -------
        The negated sum of the selected log terms.
        """
        ix_zeros = np.arange(0, y_true.shape[0])[y_true.reshape(-1) == 0]
        ix_ones = np.arange(0, y_true.shape[0])[y_true.reshape(-1) == 1]

        y_zero = np.log(1 - y_pred[ix_zeros] + self._EPS).sum()
        y_one = np.log(y_pred[ix_ones] + self._EPS).sum()

        return -1 * (y_zero + y_one)

    def grad_input(self, X, y_true):
        """Return d(loss)/d(prediction), element-wise.

        The derivative is ``1 / (1 - p)`` where the label is 0 and ``-1 / p``
        for every other label value. No epsilon is applied here, so a
        prediction of exactly 0 or 1 yields an infinite gradient.

        Parameters
        ----------
        X : array-like
            Predicted probabilities.
        y_true : array-like or scalar
            Labels. When it holds as many elements as ``X`` it is reshaped to
            ``X``'s shape so labels pair up with predictions one-to-one;
            otherwise normal NumPy broadcasting applies.

        Returns
        -------
        np.ndarray
            Gradient with the broadcast shape of ``X`` and the labels.
        """
        X = np.asarray(X)
        labels = np.asarray(y_true)
        if labels.size == X.size:
            # Align e.g. labels of shape (n,) with predictions of shape (1, n)
            # or (n, 1) instead of broadcasting them into an (n, n) grid.
            labels = labels.reshape(X.shape)

        # Pick the denominator first so only the relevant branch is divided;
        # 1 / (-p) is exactly -1 / p in floating point.
        denominator = np.where(labels == 0, 1 - X, -X)
        return 1 / denominator
        


In [105]:
class Sigmoid:
    """Logistic activation ``1 / (1 + e^-x)`` and its Jacobian."""

    def __call__(self, X):
        """Alias for :meth:`eval` so the instance can be used as a function."""
        return self.eval(X)

    def eval(self, X):
        """Apply the logistic function element-wise to ``X``."""
        decay = np.e ** (-1 * X)
        return 1 / (1 + decay)

    def grad_input(self, X):
        """Return the Jacobian of the activation with respect to its input.

        An identity matrix of size ``X.shape[0]`` is scaled by ``s * (1 - s)``
        where ``s`` is the sigmoid of ``X``. For a column vector ``X`` this is
        the diagonal matrix of element-wise derivatives.
        """
        s = self.eval(X)
        eye = np.identity(X.shape[0])
        return eye * s * (1 - s)

class Dot:
    """Affine transform ``W.T @ X + b`` with randomly initialised parameters.

    ``W`` has shape ``(input_size, units)`` and ``b`` has shape ``(units, 1)``.
    The gradient helpers return Jacobians laid out for a single column-vector
    input of shape ``(input_size, 1)``.
    """

    def __init__(self, input_size, units):
        """Draw ``W`` and ``b`` from a standard normal distribution."""
        self.W = np.random.randn(input_size, units)
        self.b = np.random.randn(units, 1)

    def __call__(self, X):
        """Return ``W.T @ X + b``."""
        return self.W.T.dot(X) + self.b

    def grad_w(self, X):
        """Return the Jacobian of the output with respect to ``W``.

        For ``X`` of shape ``(input_size, 1)`` the result has shape
        ``(input_size, units, units)``; entry ``[j, i, k]`` equals ``X[j, 0]``
        when ``i == k`` and 0 otherwise (output ``i`` depends only on column
        ``i`` of ``W``).
        """
        I = np.identity(self.b.shape[0])
        # Stack one identity per input feature, scale each by that feature's
        # value, then move the feature axis to the front.
        grad = np.stack([I] * self.W.shape[0], axis=1) * X
        return np.transpose(grad, [1, 0, 2])

    def grad_b(self):
        """Return the Jacobian of the output with respect to ``b`` (identity)."""
        return np.identity(self.b.shape[0])

    def grad_input(self):
        """Return the Jacobian of the output with respect to the input (``W.T``)."""
        return self.W.T

    def get_output_size(self):
        """Return the shape of ``b``, i.e. ``(units, 1)``."""
        return self.b.shape

    def get_no_of_params(self):
        """Return the total number of scalar entries in ``W`` and ``b``."""
        return self.W.size + self.b.size

    def update(self, gradW, gradb, optimizer, method):
        """Replace ``W`` and ``b`` with the optimizer's updated values.

        Parameters
        ----------
        gradW, gradb
            Gradients for ``W`` and ``b``.
        optimizer
            Object exposing ``minimize(param, grad)`` and/or
            ``maximize(param, grad)``.
        method : str
            ``"minimize"`` or ``"maximize"``.

        Raises
        ------
        ValueError
            If ``method`` is neither ``"minimize"`` nor ``"maximize"``.
            An unrecognised value previously skipped the update silently.
        """
        if method == "minimize":
            step = optimizer.minimize
        elif method == "maximize":
            step = optimizer.maximize
        else:
            raise ValueError(
                f"Unknown update method {method!r}; expected 'minimize' or 'maximize'"
            )

        self.W = step(self.W, gradW)
        self.b = step(self.b, gradb)

In [106]:
class Dense:
    """Fully connected layer: a ``Dot`` affine transform followed by an activation."""

    def __init__(self, units, activation, input_size):
        """Create the affine part and remember the layer configuration."""
        self.input_size = input_size
        self.units = units
        self.activation = activation
        self.dot = Dot(input_size, units)

    def get_output_size(self):
        """Return the output shape reported by the affine part."""
        return self.dot.get_output_size()

    def get_no_of_params(self):
        """Return the parameter count reported by the affine part."""
        return self.dot.get_no_of_params()

    def eval(self, X):
        """Return ``activation(dot(X))``."""
        pre_activation = self.dot(X)
        return self.activation(pre_activation)

    def grad_parameters(self, X):
        """Return ``(d_out/d_W, d_out/d_b)`` for input ``X``.

        Each is the activation Jacobian multiplied element-wise with the
        corresponding Jacobian of the affine part.
        """
        act_jacobian = self.activation.grad_input(self.dot(X))
        wrt_weights = act_jacobian * self.dot.grad_w(X)
        wrt_bias = act_jacobian * self.dot.grad_b()
        return (wrt_weights, wrt_bias)

    def grad_input(self, X):
        """Return d_out/d_X: activation Jacobian times the affine input Jacobian."""
        act_jacobian = self.activation.grad_input(self.dot(X))
        affine_jacobian = self.dot.grad_input()
        return act_jacobian.dot(affine_jacobian)

    def update(self, grad_w, grad_b, optimizer, method="minimize"):
        """Forward the parameter update to the affine part."""
        self.dot.update(grad_w, grad_b, optimizer, method)
        

In [107]:
class GradientDescentOptimizer:
    """Fixed-step gradient descent / ascent; the step size is supplied via ``set_lr``."""

    def __init__(self):
        """Create the optimizer; ``learning_rate`` only exists after ``set_lr``."""

    def set_lr(self, learning_rate):
        """Store the learning rate and return ``self`` so calls can be chained."""
        self.learning_rate = learning_rate
        return self

    @staticmethod
    def _require_same_shape(X, grad_X):
        """Assert that a parameter and its gradient have identical shapes."""
        assert X.shape == grad_X.shape, f"Shape mismatch, Input shape {X.shape} != Gradient shape {grad_X.shape}"

    def minimize(self, X, grad_X):
        """Return ``X`` moved against the gradient by ``learning_rate``."""
        self._require_same_shape(X, grad_X)
        step = self.learning_rate * grad_X
        return X - step

    def maximize(self, X, grad_X):
        """Return ``X`` moved along the gradient by ``learning_rate``."""
        self._require_same_shape(X, grad_X)
        step = self.learning_rate * grad_X
        return X + step

        

In [121]:
class Sequential:
    """Linear stack of layers trained by chaining explicit Jacobians.

    Layers must provide ``eval(X)``, ``grad_input(X)``, ``grad_parameters(X)``
    and ``update(grad_w, grad_b, optimizer)``; ``summary`` additionally uses
    ``get_no_of_params()`` and ``get_output_size()``. The loss passed to
    ``compile`` must be callable as ``loss(y_pred, y_true)`` and provide
    ``grad_input(output, y_true)``.
    """

    def __init__(self):
        self.layers = []
        self.loss = None
        self.outputs = []

    def add(self, layer):
        """Append ``layer`` to the stack and return ``self`` for chaining."""
        self.layers.append(layer)
        return self

    def summary(self):
        """Print a table of layer type, output shape and parameter count."""
        # Imported here so the optional dependency is only needed for summaries.
        from tabulate import tabulate

        headers = ["Layer Type", "Output Shape", "No. of parameters"]
        summary_ = []
        params = 0
        for layer in self.layers:
            p = layer.get_no_of_params()
            params += p
            summary_.append([layer.__class__.__name__, layer.get_output_size(), p])

        print(tabulate(summary_, headers=headers))
        print("Total No. of parameters:", params)

    def fit(self, X, y, n_epochs, learning_rate, optimizer, verbose=1):
        """Run ``n_epochs`` forward/backward passes over ``X``.

        Parameters
        ----------
        X, y
            Training input and target, handed to ``forward_propagation`` and
            the loss unchanged.
        n_epochs : int
            Number of update steps.
        learning_rate
            Passed to ``optimizer.set_lr``.
        optimizer
            Object whose ``set_lr`` returns the optimizer to use.
        verbose : int
            1 prints the loss after every epoch; 0 prints it once at the end.

        Raises
        ------
        RuntimeError
            If no loss has been set with ``compile``.
        """
        if self.loss is None:
            # Fail up front with the same error _eval_loss uses instead of an
            # AttributeError on None deep inside backward_propagation.
            raise RuntimeError("Model not compiled")

        self.optimizer = optimizer.set_lr(learning_rate)

        # Tracked separately from the loop variable so the final report also
        # works when n_epochs is 0 and the loop body never runs.
        epochs_done = 0
        for i in range(n_epochs):
            _, outputs, gradients = self.forward_propagation(X)
            self.backward_propagation(outputs, gradients, y)
            epochs_done = i + 1
            if verbose == 1:
                print(f"\rEpoch: {epochs_done} Loss:{self._eval_loss(X, y)}", end="")

        if verbose == 0:
            print(f"\rEpoch: {epochs_done} Loss:{self._eval_loss(X, y)}", end="")

        print("")

    def forward_propagation(self, X):
        """Evaluate every layer and collect the Jacobians needed for training.

        Returns
        -------
        tuple
            ``(final_output.T, outputs, gradients)`` where ``outputs`` starts
            with ``X.T`` followed by each layer's output, and ``gradients``
            holds one dict per layer with keys ``"input"``, ``"w"`` and ``"b"``.
        """
        output = X.T
        outputs = [output]
        gradients = []
        for layer in self.layers:
            grad_ = {}
            # Jacobians are taken at the layer's input, before it is replaced
            # by the layer's output.
            grad_["input"] = layer.grad_input(output)
            grad_["w"], grad_["b"] = layer.grad_parameters(output)
            output = layer.eval(output)
            outputs.append(output)
            gradients.append(grad_)

        return output.T, outputs, gradients

    def backward_propagation(self, outputs, gradients, y):
        """Update every layer, walking from the last layer to the first.

        ``grad_loss`` starts as the loss gradient at the final output and is
        multiplied by each layer's input Jacobian on the way back.
        """
        grad_loss = self.loss.grad_input(outputs[-1], y)
        for grad, layer in reversed(list(zip(gradients, self.layers))):
            layer.update(
                grad_loss.dot(grad["w"])[0],
                grad_loss.dot(grad["b"]).T,
                self.optimizer,
            )
            grad_loss = grad_loss.dot(grad["input"])

    def _eval(self, X):
        """Return the model output for ``X`` using a forward-only pass.

        Only ``layer.eval`` is called, so no Jacobians are built just to be
        thrown away; the value equals ``forward_propagation(X)[0]``.
        """
        output = X.T
        for layer in self.layers:
            output = layer.eval(output)
        return output.T

    def compile(self, loss):
        """Set the loss used by ``fit`` and ``_eval_loss``."""
        self.loss = loss

    def _eval_loss(self, X, y_true):
        """Return ``loss(model(X), y_true)``; raises RuntimeError if not compiled."""
        if self.loss is None:
            raise RuntimeError("Model not compiled")

        return self.loss(self._eval(X), y_true)

In [125]:
# Four identical 10-unit sigmoid hidden layers followed by a single sigmoid
# output unit, trained with binary cross-entropy.
model = Sequential()
for _ in range(4):
    model.add(Dense(units=10, activation=Sigmoid(), input_size=10))
model.add(Dense(units=1, activation=Sigmoid(), input_size=10))
model.compile(BinaryCrossEntropy())

In [126]:
# Print the per-layer table (type, output shape, parameter count) and the total parameter count.
model.summary()

Layer Type    Output Shape      No. of parameters
------------  --------------  -------------------
Dense         (10, 1)                         110
Dense         (10, 1)                         110
Dense         (10, 1)                         110
Dense         (10, 1)                         110
Dense         (1, 1)                           11
Total No. of parameters: 451


In [127]:
# The layers build per-sample Jacobians (Sigmoid.grad_input multiplies an
# identity matrix by a single column of activations), so the model is fed one
# sample at a time: X holds a single row of 10 features.
X = np.random.randn(1, 10)
print("Y_pred", model._eval(X))
print("Loss", model._eval_loss(X, np.array([1])))

Y_pred [[0.97218133]]
Loss 0.028212933625132663


In [129]:
# Fit the model to X with target label 0 using plain gradient descent.
model.fit(
    X,
    np.array([0]),
    n_epochs=2000,
    learning_rate=0.01,
    optimizer=GradientDescentOptimizer(),
)

Epoch: 2000 Loss:0.010992775846750647


In [130]:
# Show the model's current prediction for X; as the cell's last expression the value is displayed.
model._eval(X)

array([[0.01093258]])