In [7]:
import numpy as np
from random import random


class MLP(object):
    """A Multilayer Perceptron trained with per-sample gradient descent.

    The network is fully connected and has no bias terms. Each layer has its
    own activation function, selected by name ("sigmoid", "linear", "relu"
    or "tanh").
    """

    # Names accepted in ``act_f``; each maps to the methods ``_<name>`` and
    # ``_<name>_derivative``.
    _ACTIVATION_NAMES = ("sigmoid", "linear", "relu", "tanh")

    def __init__(self, num_inputs=3, hidden_layers=None, num_outputs=2, act_f=None):
        """Constructor for the MLP. Takes the number of inputs,
            a variable number of hidden layers, and number of outputs

        Args:
            num_inputs (int): Number of inputs
            hidden_layers (list): A list of ints for the hidden layers.
                Defaults to two hidden layers of 3 units each.
            num_outputs (int): Number of outputs
            act_f (list): One activation name per weight matrix, i.e.
                ``len(hidden_layers) + 1`` entries. Defaults to "sigmoid"
                for every layer.
        """
        # Copy the list so neither a shared default nor the caller's list is
        # aliased by the instance.
        hidden_layers = [3, 3] if hidden_layers is None else list(hidden_layers)

        self.num_inputs = num_inputs
        self.hidden_layers = hidden_layers
        self.num_outputs = num_outputs

        # create a generic representation of the layers
        layers = [num_inputs] + hidden_layers + [num_outputs]
        layer_pairs = list(zip(layers[:-1], layers[1:]))

        # Activation functions for each layer
        if act_f is None:
            self.act_f = ["sigmoid"] * len(layer_pairs)
        else:
            self.act_f = act_f

        # create random connection weights for the layers (uniform in [0, 1))
        self.weights = [np.random.rand(n_in, n_out) for n_in, n_out in layer_pairs]

        # save derivatives per layer (same shapes as the weights)
        self.derivatives = [np.zeros((n_in, n_out)) for n_in, n_out in layer_pairs]

        # save activations per layer (including the input layer)
        self.activations = [np.zeros(size) for size in layers]

    def forward_propagate(self, inputs):
        """Computes forward propagation of the network based on input signals.

        Args:
            inputs (ndarray): Input signals
        Returns:
            activations (ndarray): Output values
        """
        # the input layer activation is just the input itself; coerce to an
        # array because back_propagate reshapes the stored activations
        activations = np.asarray(inputs)

        # save the activations for backpropagation
        self.activations[0] = activations

        # iterate through the network layers
        for i, w in enumerate(self.weights):
            # matrix multiplication between previous activation and weights
            net_inputs = np.dot(activations, w)

            # apply this layer's activation function
            activations = self.map_to_act(self.act_f[i])(net_inputs)

            # save the activations for backpropagation
            self.activations[i + 1] = activations

        # return output layer activation
        return activations

    def back_propagate(self, error):
        """Backpropagates an error signal and stores the per-layer derivatives.

        Must be called after ``forward_propagate``, since it reads the
        activations saved there.

        Args:
            error (ndarray): The error to backprop.
        Returns:
            error (ndarray): The error propagated back to the input layer
        """
        # iterate backwards through the network layers
        for i in reversed(range(len(self.derivatives))):
            # output activation of layer i (the activation derivatives are
            # expressed in terms of the activation value itself)
            layer_output = self.activations[i + 1]

            # scale the error by the derivative of this layer's activation
            act_derivative = self.map_to_act_derivative(self.act_f[i])
            delta = error * act_derivative(layer_output)

            # reshape delta as to have it as a 2d row matrix
            delta_row = delta.reshape(delta.shape[0], -1).T

            # input activation of layer i as a 2d column matrix
            layer_input = self.activations[i]
            layer_input = layer_input.reshape(layer_input.shape[0], -1)

            # outer product: one derivative per weight of this layer
            self.derivatives[i] = np.dot(layer_input, delta_row)

            # backpropagate the next error
            error = np.dot(delta, self.weights[i].T)

        return error

    def train(self, inputs, targets, epochs, learning_rate):
        """Trains model running forward prop and backprop

        The weights are updated after every single sample.

        Args:
            inputs (ndarray): X
            targets (ndarray): Y
            epochs (int): Num. epochs we want to train the network for
            learning_rate (float): Step to apply to gradient descent
        """
        num_samples = len(inputs)

        # now enter the training loop
        for epoch in range(epochs):
            sum_errors = 0

            # iterate through all the training data
            for j, sample in enumerate(inputs):
                target = np.asarray(targets[j])

                # activate the network!
                output = self.forward_propagate(sample)

                error = target - output

                self.back_propagate(error)

                # now perform gradient descent on the derivatives
                # (this will update the weights)
                self.gradient_descent(learning_rate)

                # keep track of the MSE for reporting later
                sum_errors += self._mse(target, output)

            # Epoch complete, report the mean training error over the samples
            # actually passed in (not over a notebook-level global).
            print("Error: {} at epoch {}".format(sum_errors / num_samples, epoch + 1))

        print("Training complete!")
        print("=====")

    def gradient_descent(self, learningRate=1):
        """Learns by descending the gradient

        Args:
            learningRate (float): How fast to learn.
        """
        # The stored derivatives come from an error defined as
        # ``target - output``, so adding them moves the weights downhill.
        for weights, derivatives in zip(self.weights, self.derivatives):
            weights += derivatives * learningRate

    def map_to_act(self, name):
        """Returns the activation function registered under ``name``.

        Raises:
            ValueError: If ``name`` is not a known activation.
        """
        if name not in self._ACTIVATION_NAMES:
            raise ValueError("No such activation function found")
        return getattr(self, "_" + name)

    def map_to_act_derivative(self, name):
        """Returns the derivative of the activation registered under ``name``.

        The returned callable expects the activation's *output*, which is
        what ``back_propagate`` passes to it.

        Raises:
            ValueError: If ``name`` is not a known activation.
        """
        if name not in self._ACTIVATION_NAMES:
            raise ValueError("No such activation function found")
        return getattr(self, "_" + name + "_derivative")

    def _sigmoid(self, x):
        """Logistic sigmoid of the net input ``x``."""
        return 1.0 / (1 + np.exp(-x))

    def _sigmoid_derivative(self, x):
        """Sigmoid derivative, given the sigmoid output ``x``."""
        return x * (1.0 - x)

    def _relu(self, x):
        """Rectified linear unit of the net input ``x``."""
        return np.maximum(0, x)

    def _relu_derivative(self, x):
        """ReLU derivative; 1 where ``x`` is positive, 0 elsewhere."""
        return np.where(x > 0, 1, 0)

    def _tanh(self, x):
        """Hyperbolic tangent of the net input ``x``."""
        return np.tanh(x)

    def _tanh_derivative(self, x):
        """Tanh derivative, given the tanh output ``x``.

        ``x`` is already ``tanh(net)``, so the derivative is ``1 - x**2``;
        applying ``tanh`` a second time would give the wrong slope.
        """
        return 1 - x ** 2

    def _linear(self, x):
        """Identity activation."""
        return x

    def _linear_derivative(self, x):
        """Derivative of the identity activation (constant 1)."""
        return 1

    def _mse(self, target, output):
        """Mean Squared Error loss function

        Args:
            target (ndarray): The ground truth
            output (ndarray): The predicted values
        Returns:
            (float): Output
        """
        return np.average((target - output) ** 2)



In [2]:
# Toy dataset: 1000 pairs of addends, each drawn from [0, 0.5), with the sum
# of each pair as the regression target (shape (1000, 1)).
items = np.array([random() / 2 for _ in range(1000 * 2)]).reshape(1000, 2)
targets = items[:, :1] + items[:, 1:]

# One hidden layer of 5 units; both weight layers use the linear activation.
mlp = MLP(num_inputs=2, hidden_layers=[5], num_outputs=1, act_f=["linear", "linear"])

mlp.train(inputs=items, targets=targets, epochs=50, learning_rate=0.1)

# Query the trained network on a single hand-picked pair.
input = np.array([3, 0.1])
target = np.array([0.4])

output = mlp.forward_propagate(input)

print()
print("Our network believes that {} + {} is equal to {}".format(*input, output[0]))

Error: 0.0004678107787784005 at epoch 1
Error: 6.095544072771768e-10 at epoch 2
Error: 1.351024188843233e-12 at epoch 3
Error: 2.9945283699442304e-15 at epoch 4
Error: 6.6373456605878654e-18 at epoch 5
Error: 1.4711620467724028e-20 at epoch 6
Error: 3.2608587950938983e-23 at epoch 7
Error: 7.229003979558104e-26 at epoch 8
Error: 1.5781609711291676e-28 at epoch 9
Error: 4.180766954670143e-31 at epoch 10
Error: 7.943251596777583e-32 at epoch 11
Error: 4.967677494710597e-32 at epoch 12
Error: 3.9201304941843536e-32 at epoch 13
Error: 3.6600303450027655e-32 at epoch 14
Error: 3.434946912402229e-32 at epoch 15
Error: 3.168605245568057e-32 at epoch 16
Error: 3.0013382298980256e-32 at epoch 17
Error: 2.8928876100968116e-32 at epoch 18
Error: 2.748707679635122e-32 at epoch 19
Error: 2.764650768456003e-32 at epoch 20
Error: 2.775147086652913e-32 at epoch 21
Error: 2.794483423294561e-32 at epoch 22
Error: 2.7638996557776916e-32 at epoch 23
Error: 2.754115931660205e-32 at epoch 24
Error: 2.735781

## Complex Weight Initialization

In [65]:
import numpy as np
from random import random


class MLP(object):
    """A Multilayer Perceptron with complex-valued initial weights.

    The network is fully connected and has no bias terms. Each layer has its
    own activation function, selected by name ("sigmoid", "linear", "relu"
    or "tanh"). Weights start as random magnitudes rotated by a phase of
    pi/2, so activations, errors and derivatives become complex as well.
    """

    # Names accepted in ``act_f``; each maps to the methods ``_<name>`` and
    # ``_<name>_derivative``.
    _ACTIVATION_NAMES = ("sigmoid", "linear", "relu", "tanh")

    def __init__(self, num_inputs=3, hidden_layers=None, num_outputs=2, act_f=None):
        """Constructor for the MLP. Takes the number of inputs,
            a variable number of hidden layers, and number of outputs

        Args:
            num_inputs (int): Number of inputs
            hidden_layers (list): A list of ints for the hidden layers.
                Defaults to two hidden layers of 3 units each.
            num_outputs (int): Number of outputs
            act_f (list): One activation name per weight matrix, i.e.
                ``len(hidden_layers) + 1`` entries. Defaults to "sigmoid"
                for every layer.
        """
        # Copy the list so neither a shared default nor the caller's list is
        # aliased by the instance.
        hidden_layers = [3, 3] if hidden_layers is None else list(hidden_layers)

        self.num_inputs = num_inputs
        self.hidden_layers = hidden_layers
        self.num_outputs = num_outputs

        # create a generic representation of the layers
        layers = [num_inputs] + hidden_layers + [num_outputs]
        layer_pairs = list(zip(layers[:-1], layers[1:]))

        # Activation functions for each layer
        if act_f is None:
            self.act_f = ["sigmoid"] * len(layer_pairs)
        else:
            self.act_f = act_f

        # create random connection weights for the layers: magnitudes uniform
        # in [0, 1), all rotated by the same phase of pi/2.
        # (An alternative that was tried: independent random real and
        # imaginary parts.)
        phase = np.exp(1j * np.pi / 2)
        self.weights = [
            np.random.rand(n_in, n_out) * phase for n_in, n_out in layer_pairs
        ]

        # save derivatives per layer (same shapes as the weights); these
        # placeholders are replaced by back_propagate
        self.derivatives = [np.zeros((n_in, n_out)) for n_in, n_out in layer_pairs]

        # save activations per layer (including the input layer)
        self.activations = [np.zeros(size) for size in layers]

    def forward_propagate(self, inputs):
        """Computes forward propagation of the network based on input signals.

        Args:
            inputs (ndarray): Input signals
        Returns:
            activations (ndarray): Output values
        """
        # the input layer activation is just the input itself; coerce to an
        # array because back_propagate reshapes the stored activations
        activations = np.asarray(inputs)

        # save the activations for backpropagation
        self.activations[0] = activations

        # iterate through the network layers
        for i, w in enumerate(self.weights):
            # matrix multiplication between previous activation and weights
            net_inputs = np.dot(activations, w)

            # apply this layer's activation function
            activations = self.map_to_act(self.act_f[i])(net_inputs)

            # save the activations for backpropagation
            self.activations[i + 1] = activations

        # return output layer activation
        return activations

    def back_propagate(self, error):
        """Backpropagates an error signal and stores the per-layer derivatives.

        Must be called after ``forward_propagate``, since it reads the
        activations saved there.

        NOTE(review): no complex conjugation is applied anywhere in this
        pass; confirm this is the intended update rule for complex weights.

        Args:
            error (ndarray): The error to backprop.
        Returns:
            error (ndarray): The error propagated back to the input layer
        """
        # iterate backwards through the network layers
        for i in reversed(range(len(self.derivatives))):
            # output activation of layer i (the activation derivatives are
            # expressed in terms of the activation value itself)
            layer_output = self.activations[i + 1]

            # scale the error by the derivative of this layer's activation
            act_derivative = self.map_to_act_derivative(self.act_f[i])
            delta = error * act_derivative(layer_output)

            # reshape delta as to have it as a 2d row matrix
            delta_row = delta.reshape(delta.shape[0], -1).T

            # input activation of layer i as a 2d column matrix
            layer_input = self.activations[i]
            layer_input = layer_input.reshape(layer_input.shape[0], -1)

            # outer product: one derivative per weight of this layer
            self.derivatives[i] = np.dot(layer_input, delta_row)

            # backpropagate the next error
            error = np.dot(delta, self.weights[i].T)

        return error

    def train(self, inputs, targets, epochs, learning_rate):
        """Trains model running forward prop and backprop

        The weights are updated after every single sample.

        Args:
            inputs (ndarray): X
            targets (ndarray): Y
            epochs (int): Num. epochs we want to train the network for
            learning_rate (float): Step to apply to gradient descent
        """
        num_samples = len(inputs)

        # now enter the training loop
        for epoch in range(epochs):
            sum_errors = 0

            # iterate through all the training data
            for j, sample in enumerate(inputs):
                target = np.asarray(targets[j])

                # activate the network!
                output = self.forward_propagate(sample)

                error = target - output

                self.back_propagate(error)

                # now perform gradient descent on the derivatives
                # (this will update the weights)
                self.gradient_descent(learning_rate)

                # keep track of the MSE for reporting later
                sum_errors += self._mse(target, output)

            # Epoch complete, report the mean training error over the samples
            # actually passed in (not over a notebook-level global).
            print("Error: {} at epoch {}".format(sum_errors / num_samples, epoch + 1))

        print("Training complete!")
        print("=====")

    def gradient_descent(self, learningRate=1):
        """Learns by descending the gradient

        Args:
            learningRate (float): How fast to learn.
        """
        # The stored derivatives come from an error defined as
        # ``target - output``, which is why they are added to the weights.
        for weights, derivatives in zip(self.weights, self.derivatives):
            weights += derivatives * learningRate

    def map_to_act(self, name):
        """Returns the activation function registered under ``name``.

        Raises:
            ValueError: If ``name`` is not a known activation.
        """
        if name not in self._ACTIVATION_NAMES:
            raise ValueError("No such activation function found")
        return getattr(self, "_" + name)

    def map_to_act_derivative(self, name):
        """Returns the derivative of the activation registered under ``name``.

        The returned callable expects the activation's *output*, which is
        what ``back_propagate`` passes to it.

        Raises:
            ValueError: If ``name`` is not a known activation.
        """
        if name not in self._ACTIVATION_NAMES:
            raise ValueError("No such activation function found")
        return getattr(self, "_" + name + "_derivative")

    def _sigmoid(self, x):
        """Logistic sigmoid of the net input ``x``."""
        return 1.0 / (1 + np.exp(-x))

    def _sigmoid_derivative(self, x):
        """Sigmoid derivative, given the sigmoid output ``x``."""
        return x * (1.0 - x)

    def _relu(self, x):
        """Rectified linear unit of the net input ``x``."""
        return np.maximum(0, x)

    def _relu_derivative(self, x):
        """ReLU derivative; 1 where ``x`` is positive, 0 elsewhere."""
        return np.where(x > 0, 1, 0)

    def _tanh(self, x):
        """Hyperbolic tangent of the net input ``x``."""
        return np.tanh(x)

    def _tanh_derivative(self, x):
        """Tanh derivative, given the tanh output ``x``.

        ``x`` is already ``tanh(net)``, so the derivative is ``1 - x**2``;
        applying ``tanh`` a second time would give the wrong slope.
        """
        return 1 - x ** 2

    def _linear(self, x):
        """Identity activation."""
        return x

    def _linear_derivative(self, x):
        """Derivative of the identity activation: ones shaped like ``x``.

        Built directly instead of through a comparison, because ordering
        comparisons are not meaningful for complex values.
        """
        return np.ones_like(x)

    def _mse(self, target, output):
        """Mean Squared Error loss function

        The magnitude is taken because the squared difference is complex
        when the output is complex.

        Args:
            target (ndarray): The ground truth
            output (ndarray): The predicted values
        Returns:
            (float): Output
        """
        return np.abs(np.average((target - output) ** 2))



In [62]:
# Rotating by a phase of pi/2 leaves the magnitude untouched (value unused).
np.abs(2 * np.exp(1j * np.pi / 2))
# Random magnitudes in [0, 1), rotated by pi/2 in the complex plane.
res = np.exp(1j * np.pi / 2) * np.random.rand(10, 5)
res

array([[1.26425343e-17+0.20646825j, 2.53538736e-17+0.41406018j,
        5.17136241e-17+0.84454757j, 4.70742204e-17+0.76878036j,
        1.34806250e-17+0.22015531j],
       [5.97517210e-17+0.97581966j, 4.45270098e-17+0.72718125j,
        3.85220473e-18+0.06291128j, 2.92420291e-17+0.47755858j,
        3.69145269e-18+0.060286j  ],
       [3.78665571e-17+0.61840781j, 5.00101128e-17+0.81672712j,
        2.65342407e-17+0.43333704j, 1.36885519e-17+0.22355102j,
        1.50939264e-17+0.24650252j],
       [5.49037087e-17+0.89664561j, 1.07507449e-17+0.17557299j,
        2.78705517e-17+0.45516065j, 3.32914645e-17+0.54369087j,
        4.34645428e-17+0.70982985j],
       [1.12621893e-17+0.18392551j, 4.51077465e-17+0.73666541j,
        1.73444178e-17+0.28325584j, 3.77989413e-17+0.61730356j,
        2.19195316e-17+0.35797312j],
       [5.57210350e-17+0.90999356j, 2.95346112e-17+0.48233681j,
        4.34562373e-17+0.70969421j, 3.62390404e-17+0.59182844j,
        9.21318667e-18+0.15046276j],
       [6.

In [84]:
# Toy dataset: 1000 pairs of addends, each drawn from [0, 1/1.9), with the
# sum of each pair as the regression target (shape (1000, 1)).
items = np.array([random() / 1.9 for _ in range(1000 * 2)]).reshape(1000, 2)
targets = items[:, :1] + items[:, 1:]

# Two hidden layers (10 and 5 units); every weight layer is linear.
mlp = MLP(
    num_inputs=2,
    hidden_layers=[10, 5],
    num_outputs=1,
    act_f=["linear", "linear", "linear"],
)

mlp.train(inputs=items, targets=targets, epochs=50, learning_rate=0.1)

# Query the trained network on a single hand-picked pair.
input = np.array([3, 0.1])
target = np.array([0.3])

output = mlp.forward_propagate(input)

print()
print("Our network believes that {} + {} is equal to {}".format(*input, output[0]))

Error: 0.016113556066521102 at epoch 1
Error: 0.0017091393552608365 at epoch 2
Error: 1.6883297175821061e-06 at epoch 3
Error: 1.6212053120294535e-09 at epoch 4
Error: 1.5005733251427388e-12 at epoch 5
Error: 1.3854451022504825e-15 at epoch 6
Error: 1.2791029580993921e-18 at epoch 7
Error: 1.180929363703168e-21 at epoch 8
Error: 1.0902985048689648e-24 at epoch 9
Error: 1.0088334357685943e-27 at epoch 10
Error: 1.0604484261338763e-30 at epoch 11
Error: 3.0918723615793123e-32 at epoch 12
Error: 2.595034984076604e-32 at epoch 13
Error: 2.7007338249432697e-32 at epoch 14
Error: 2.2114197444764203e-32 at epoch 15
Error: 2.2290068353711644e-32 at epoch 16
Error: 2.2139213840263765e-32 at epoch 17
Error: 2.2108927728417667e-32 at epoch 18
Error: 2.3248897483453608e-32 at epoch 19
Error: 2.1297909111056844e-32 at epoch 20
Error: 1.9612609381500867e-32 at epoch 21
Error: 1.9806098018980565e-32 at epoch 22
Error: 2.12487121470975e-32 at epoch 23
Error: 1.945691706072306e-32 at epoch 24
Error: 2.

In [85]:
# Toy dataset: 1000 pairs of addends, each drawn from [0, 1/1.8), with the
# sum of each pair as the regression target (shape (1000, 1)).
items = np.array([random() / 1.8 for _ in range(1000 * 2)]).reshape(1000, 2)
targets = items[:, :1] + items[:, 1:]

# Two hidden layers (10 and 5 units); every weight layer is linear.
mlp = MLP(
    num_inputs=2,
    hidden_layers=[10, 5],
    num_outputs=1,
    act_f=["linear", "linear", "linear"],
)

mlp.train(inputs=items, targets=targets, epochs=50, learning_rate=0.1)

# Query the trained network on a single hand-picked pair.
input = np.array([3, 0.1])
target = np.array([0.3])

output = mlp.forward_propagate(input)

print()
print("Our network believes that {} + {} is equal to {}".format(*input, output[0]))

Error: nan at epoch 1
Error: nan at epoch 2
Error: nan at epoch 3
Error: nan at epoch 4
Error: nan at epoch 5
Error: nan at epoch 6
Error: nan at epoch 7
Error: nan at epoch 8
Error: nan at epoch 9
Error: nan at epoch 10
Error: nan at epoch 11
Error: nan at epoch 12
Error: nan at epoch 13
Error: nan at epoch 14
Error: nan at epoch 15
Error: nan at epoch 16
Error: nan at epoch 17
Error: nan at epoch 18
Error: nan at epoch 19
Error: nan at epoch 20
Error: nan at epoch 21
Error: nan at epoch 22
Error: nan at epoch 23
Error: nan at epoch 24
Error: nan at epoch 25
Error: nan at epoch 26
Error: nan at epoch 27
Error: nan at epoch 28
Error: nan at epoch 29
Error: nan at epoch 30
Error: nan at epoch 31
Error: nan at epoch 32
Error: nan at epoch 33
Error: nan at epoch 34
Error: nan at epoch 35
Error: nan at epoch 36
Error: nan at epoch 37
Error: nan at epoch 38
Error: nan at epoch 39
Error: nan at epoch 40
Error: nan at epoch 41
Error: nan at epoch 42
Error: nan at epoch 43
Error: nan at epoch 

NaNs are observed because complex values easily explode under repeated multiplication unless the weights keep a magnitude strictly less than one. Training is very sensitive: even a change of 0.1 in the input scaling (dividing by 1.8 instead of 1.9) drastically alters the results.
$$a \cdot c = ac$$
$$(a + bi) \cdot (c + di) = (ac - bd) + (bc + ad)i \implies |(a + bi) \cdot (c + di)| = \sqrt{(ac - bd)^2 + (bc + ad)^2}$$
The weight updates can drastically alter the magnitude of the weights, driving the forward output to extremely large values.

Solutions: 
1. A bi-segmented network architecture that treats magnitude and phase independently, as well as their interaction (similar to a graph model?).
2. Keeping the weights always real, although this is difficult to handle.