In [1]:
import numpy as np

In [207]:
X = np.random.normal(0, 1, size=(1000, 20))
y = np.random.randint(4, size=1000)

In [282]:
class Layer(object):
    
    def __init__(self, input_size, output_size, activation):
        
        self.input_size = input_size
        self.output_size = output_size
        
        activations = {
            "none": lambda z: z,
            "relu": lambda z: max(0, z),
            "sigmoid": lambda z: 1/(1 + np.exp(-z)),
            "softmax": lambda z: np.exp(z)/np.exp(z).sum()
        }
        
        activation_derivs = {
            "none": lambda z: 1,
            "relu": lambda z: 1 if z > 0 else 1,
            "sigmoid": lambda z: 1/(1 + np.exp(-z)) * (1 - 1/(1 + np.exp(-z))),
            "softmax": lambda z: z # TODO
        }
        
        self.activation_name = activation
        self.activation_fun = activations.get(activation)
        self.activation_deriv_fun = activation_derivs.get(activation)
    

In [283]:
class Input(Layer):
    
    def __init__(self, output_size, activation="none"):
        super().__init__(output_size, output_size, activation)
        self.weights = np.empty(0)
    
    def process(self, inputs):
        self.raw_output = inputs
        self.output = self.activation_fun(self.raw_output)
        return self.output

In [284]:
class Dense(Layer):
    
    def __init__(self, input_size, output_size, activation="sigmoid", add_bias=True):
        super().__init__(input_size, output_size, activation)
        self.add_bias = add_bias
        self.weights = np.random.uniform(
            -0.1, 0.1,
            size=(self.input_size + 1 if self.add_bias else self.input_size,
                  self.output_size)
        )
    

    def process(self, inputs):
        m = inputs.shape[0]
        self.raw_output = np.matmul(
            np.hstack((np.ones((m, 1)), inputs)) if self.add_bias else inputs,
            self.weights
        )
        self.output = self.activation_fun(self.raw_output)
        return self.output
    
    
    def calculate_error(self, next_error):
        # Omit bias weights
        self.error = np.matmul(
            next_error, 
            self.weights[:, 1:] if self.add_bias else self.weights
        ) * self.activation_deriv_fun(self.raw_output)
        return self.error
    
    def update_weights(self, meh):
        pass

In [285]:
X = np.random.normal(0, 1, size=(1000, 20))

In [286]:
layer1 = Dense(20, 10, "sigmoid")

In [287]:
class Network(object):
    
    def __init__(self, layers):
        self.layers = layers
        self.output_size = layers[-1].output_size
    
    
    def show(self):
        k = 0
        for layer in self.layers:
            print("Layer {0}: {1}, {2} -> {3}, {4}".format(
                k, 
                type(layer).__name__,
                layer.input_size,
                layer.output_size,
                layer.activation_name
            ))
            k = k+1
    
    
    def predict(self, data, classes=True):
        outputs = data
        for k in range(len(self.layers)):
            outputs = self.layers[k].process(outputs)
        return np.argmax(outputs, axis=1) if classes else outputs
    
    
    def get_cost(self, data, labels, penalty=0):
        m = data.shape[0]
        preds = self.predict(data, classes=False)
        
        cost = (-1/m) * (np.log(preds) * np.eye(self.output_size)[:, labels].T +
                         np.log(1-preds) * (1 - np.eye(self.output_size)[:, labels]).T).sum()
        
        # Add regularisation
        cost = cost + (penalty/m) * sum([(layer.weights**2).sum() for layer in self.layers])
        return cost
    
    
    def get_weights(self):
        return np.concatenate([layer.weights.flatten() for layer in self.layers])
    
    
    def get_weights_grads(self, data, labels, penalty=0):
        m = data.shape[0]
        preds = self.predict(data, classes=False)
        
        # The "k+1st" error actually gets stored in Input layer - all fine!
        self.layers[0].error = preds - np.eye(self.output_size)[:, labels].T
        all_grads = np.empty(0)
        
        for k in range(len(self.layers)):
            error = self.layers[-k-1].calculate_error(self.layers[-k].error)
            accumulator = 1/m * np.matmul(error.T, self.layers[-k-2].output)
            # Compute regularisation term
            reg = (penalty/m) * layer.weights
            reg[:, 0] = 0
            
            grads = (accumulator + reg).flatten()
            all_grads = np.concatenate(all_grads, grads)
        
        return all_grads
    

In [288]:
model = Network(
    layers=[
        Input(20),
        Dense(20, 10, "sigmoid"),
        Dense(10, 4, "softmax")
    ]
)

In [289]:
model.show()

Layer 0: Input, 20 -> 20, none
Layer 1: Dense, 20 -> 10, sigmoid
Layer 2: Dense, 10 -> 4, softmax


In [290]:
model.predict(X, classes=False).shape

(1000, 4)

In [291]:
model.get_cost(X, y, penalty=1)

8.30120814335824

In [292]:
model.get_weights().shape

(254,)

In [293]:
model.get_weights_grads(X, y)

ValueError: shapes (1000,4) and (11,3) not aligned: 4 (dim 1) != 11 (dim 0)