In [1]:
import numpy as np

In [2]:
X = np.random.normal(0, 1, size=(1000, 20))
y = np.random.randint(4, size=1000)

In [3]:
class Layer(object):
    
    def __init__(self, input_size, output_size, activation):
        
        self.input_size = input_size
        self.output_size = output_size
        
        activations = {
            "none": lambda z: z,
            "relu": lambda z: max(0, z),
            "sigmoid": lambda z: 1/(1 + np.exp(-z)),
            "softmax": lambda z: np.exp(z)/np.exp(z).sum()
        }
        
        activation_derivs = {
            "none": lambda z: 1,
            "relu": lambda z: 1 if z > 0 else 1,
            "sigmoid": lambda z: 1/(1 + np.exp(-z)) * (1 - 1/(1 + np.exp(-z))),
            "softmax": lambda z: z # TODO
        }
        
        self.activation_name = activation
        self.activation_fun = activations.get(activation)
        self.activation_deriv_fun = activation_derivs.get(activation)
    

In [4]:
class Input(Layer):
    
    def __init__(self, output_size, activation="none"):
        super().__init__(output_size, output_size, activation)
        self.weights = np.empty(0)
    
    def process(self, inputs):
        self.raw_output = inputs
        self.output = self.activation_fun(self.raw_output)
        return self.output

In [419]:
class Dense(Layer):
    
    def __init__(self, input_size, output_size, activation="sigmoid", add_bias=True):
        super().__init__(input_size, output_size, activation)
        self.add_bias = add_bias
        self.weights = np.random.uniform(
            -0.1, 0.1,
            size=(
                self.output_size,
                self.input_size + 1 if self.add_bias else self.input_size
            )
        )
    

    def process(self, inputs):
        m = inputs.shape[0]
        self.raw_output = np.matmul(
            np.hstack((np.ones((m, 1)), inputs)) if self.add_bias else inputs,
            self.weights.T
        )
        self.output = self.activation_fun(self.raw_output)
        return self.output
    
    
    def calculate_error(self, next_error, prev_raw_output):
        # Omit bias weights
        self.error = np.matmul(
            next_error,
            self.weights[:, 1:] if self.add_bias else self.weights
        ) * self.activation_deriv_fun(prev_raw_output)
        return self.error
    
    
    def update_weights(self, meh):
        pass

In [420]:
X = np.random.normal(0, 1, size=(1000, 20))

In [421]:
layer1 = Dense(20, 10, "sigmoid")

In [422]:
layer1.process(X)
layer1.raw_output.shape

(1000, 10)

In [423]:
class Network(object):
    
    def __init__(self, layers):
        self.layers = layers
        self.output_size = layers[-1].output_size
    
    
    def show(self):
        k = 0
        for layer in self.layers:
            print("Layer {0}: {1}, {2} -> {3}, {4}".format(
                k, 
                type(layer).__name__,
                layer.input_size,
                layer.output_size,
                layer.activation_name
            ))
            k = k+1
    
    
    def predict(self, data, classes=True):
        outputs = data
        for k in range(len(self.layers)):
            outputs = self.layers[k].process(outputs)
        return np.argmax(outputs, axis=1) if classes else outputs
    
    
    def get_cost(self, data, labels, penalty=0):
        m = data.shape[0]
        preds = self.predict(data, classes=False)
        
        cost = (-1/m) * (np.log(preds) * np.eye(self.output_size)[:, labels].T +
                         np.log(1-preds) * (1 - np.eye(self.output_size)[:, labels]).T).sum()
        
        # Add regularisation
        cost = cost + (penalty/m) * sum([(layer.weights**2).sum() for layer in self.layers])
        return cost
    
    
    def get_weights(self):
        return np.concatenate([layer.weights.flatten() for layer in self.layers])
    
    
    def get_weights_grads(self, data, labels, penalty=0):
        m = data.shape[0]
        preds = self.predict(data, classes=False)
        
        # The "k+1st" error actually gets stored in Input layer - all fine!
        error = preds - np.eye(self.output_size)[:, labels].T
        self.layers[0].error = error
        all_grads = np.empty(0)
        
        for k in range(len(self.layers)-1):
            
            # Calculate gradients using error from next layer
            grads = 1/m * np.matmul(
                np.hstack((np.ones((m, 1)), self.layers[-k-2].output)).T if self.layers[-k-1].add_bias else self.layers[-k-2].output.T,
                error
            )
            # Compute regularisation term
            reg = (penalty/m) * self.layers[-k-1].weights.T
            if self.layers[-k-1].add_bias:
                reg[:, 0] = 0
            
            all_grads = np.concatenate(((grads + reg).flatten(), all_grads))
            
            # Calculate error of next layer
            error = self.layers[-k-1].calculate_error(self.layers[-k].error, self.layers[-k-2].raw_output)
        
        return all_grads
    

In [424]:
model = Network(
    layers=[
        Input(20),
        Dense(20, 10, "sigmoid"),
        Dense(10, 10, "sigmoid"),
        Dense(10, 4, "sigmoid")
    ]
)

In [425]:
model.show()

Layer 0: Input, 20 -> 20, none
Layer 1: Dense, 20 -> 10, sigmoid
Layer 2: Dense, 10 -> 10, sigmoid
Layer 3: Dense, 10 -> 4, sigmoid


In [426]:
model.predict(X, classes=False).shape

(1000, 4)

In [427]:
model.get_cost(X, y, penalty=1)

2.7892216476151734

In [428]:
model.get_weights().shape

(364,)

In [431]:
model.get_weights_grads(X, y).shape

(364,)