# Neural network
To predict whether or not a patient has Alzheimer's disease, a neural network is used here.

In [None]:
import numpy as np
from data_reader import get_data_dict, split_data
# Load the Alzheimer's dataset. Later cells index `data` by column name
# (e.g. data['PatientID'], data['Diagnosis']) and slice the result, so it is
# presumably a dict mapping column names to per-row sequences.
data = get_data_dict('./data/alzheimers_disease_data.csv')
# NOTE(review): split_data() is called without arguments and none of its three
# results are used in the cells visible here -- confirm that it does not need
# `data` passed in and whether this call is still required.
metadata, internal_factors, external_factors = split_data()

### Values used
From our original exploration of the data, we found that there were six variables that showed some sort of relationship to the diagnosis of a patient on their own:
1. `Hypertension`
2. `MemoryComplaints`
3. `BehavioralProblems`
4. `MMSE`
5. `FunctionalAssessment`
6. `ADL`

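To create the neural network, our data set is split into a training set and a test set. The test set contains the last 500 rows of the data set, so there are 1650 data points left for training. Note that, in addition to the six variables listed above, the code below also includes `Age` as a feature, so each feature vector has seven entries.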

In [None]:
# Features fed to the network, in the order they appear in each feature vector
columns = [
    'Age',
    'Hypertension',
    'MemoryComplaints',
    'BehavioralProblems',
    'MMSE',
    'FunctionalAssessment',
    'ADL',
]
# The last `test_set_size` rows are held out; everything before is for training
test_set_size = 500
learning_set_size = len(data['PatientID']) - test_set_size

# Stack one row per column, then transpose so that every row is one patient
feature_matrix = np.vstack([data[name][:learning_set_size] for name in columns]).T
target = np.array(data['Diagnosis'][:learning_set_size])
test_matrix = np.vstack([data[name][learning_set_size:] for name in columns]).T
test_target = np.array(data['Diagnosis'][learning_set_size:])


### Structure of the NN
Deciding the structure of an NN is more of an art than a science (_Rein van den Boogaard_), so to start with we use three layers, starting with vectors of size 6, then 6, then 3, which finally reduces to 1 to end up at one final value. The feature vectors are of size 6, so the weight matrices should match that size. (Note that the implementation below differs from this initial design: it takes seven input features, including `Age`, and by default uses layer sizes 7, 6, 5, 5, 3 and 2 before the single output value.) For the activation function, the sigmoid function is used. Using the ReLU function does not do much here, as the values in the data are not extremely small or big.

In [None]:
class NN():
    """Fully connected feed-forward network for binary classification.

    Every layer, including the single output unit, uses the sigmoid
    activation. The network is trained with mini-batch gradient descent on
    the binary cross-entropy loss.

    Attributes:
        layers: Sizes of all layers, i.e. ``layerSizes`` followed by the one
            output unit.
        n_layers: Number of entries in ``layerSizes`` (equal to the number of
            weight matrices).
        weights: One ``(fan_out, fan_in)`` matrix per layer transition.
        biases: One ``(fan_out,)`` vector per layer transition.
        z: Pre-activations cached by the most recent ``predict`` call.
        a: Activations cached by the most recent ``predict`` call; ``a[0]``
            is the input itself.
    """

    def __init__(self, layerSizes=(7, 6, 5, 5, 3, 2)):
        """Create the network and randomly initialise its parameters.

        Args:
            layerSizes: Input size followed by the hidden layer sizes. A
                single output unit is appended automatically.
        """
        # Copy into a fresh list so that neither the (immutable) default nor
        # a caller-supplied sequence is shared with this instance.
        self.layers = list(layerSizes) + [1]
        self.n_layers = len(layerSizes)

        self.weights = []
        self.biases = []
        # Forward-pass caches, filled by predict() and read by
        # back_propagation().
        self.z = []
        self.a = []

        # First, the weights and biases should be randomly initialized
        for fan_in, fan_out in zip(self.layers[:-1], self.layers[1:]):
            # Since sigmoid is used as an activation function, Xavier init is
            # used here
            limit = np.sqrt(6 / (fan_in + fan_out))
            W = np.random.uniform(-limit, limit, size=(fan_out, fan_in))
            b = np.random.uniform(-limit, limit, size=(fan_out,))
            self.weights.append(W)
            self.biases.append(b)

    def loss(self, y_hat, y_true):
        """Return the mean binary cross-entropy between y_hat and y_true.

        Args:
            y_hat: Predicted probabilities.
            y_true: Target labels (0 or 1).
        """
        # Clip to keep the logarithms finite for saturated predictions
        y_hat = np.clip(y_hat, 1e-6, 1 - 1e-6)
        y_true = np.asarray(y_true)
        # A 1-D target against a (batch, 1) prediction would broadcast to a
        # (batch, batch) matrix and silently give a wrong loss, so align the
        # shapes whenever both hold the same number of elements.
        if y_true.shape != y_hat.shape and y_true.size == y_hat.size:
            y_true = y_true.reshape(y_hat.shape)
        return -np.mean(y_true * np.log(y_hat) + (1 - y_true) * np.log(1 - y_hat))

    def activate(self, v):
        """Apply the sigmoid activation function element-wise."""
        # Beyond +-500 the sigmoid is indistinguishable from 0 or 1 in double
        # precision; clipping keeps np.exp from overflowing.
        v = np.clip(v, -500, 500)
        return 1/(1 + np.exp(-v))

    def d_activate(self, v):
        """Return the derivative of the sigmoid evaluated at v."""
        s = self.activate(v)
        return s * (1 - s)

    def predict(self, x):
        """Run a forward pass and return the output of the last layer.

        The pre-activations and activations of every layer are cached in
        ``self.z`` and ``self.a`` so that ``back_propagation`` can use them.

        Args:
            x: A feature vector, or a matrix with one feature vector per row.
        """
        self.z = []
        self.a = [x]
        for W, b in zip(self.weights, self.biases):
            z = x @ W.T + b
            self.z.append(z)
            x = self.activate(z)
            self.a.append(x)
        return x

    def back_propagation(self, y):
        """Compute the loss gradients for the most recent forward pass.

        Args:
            y: Target labels belonging to the inputs of the last ``predict``
                call; must contain one value per predicted row.

        Returns:
            A tuple ``(grad_w, grad_b)`` of lists holding, per layer, the
            gradient of the mean binary cross-entropy loss with respect to
            the weights and the biases.

        Raises:
            RuntimeError: If ``predict`` has not been called yet.
            ValueError: If ``y`` does not match the size of the prediction.
        """
        if not self.z:
            raise RuntimeError("predict() must be called before back_propagation()")

        # Treat a single feature vector as a batch of one
        activations = [np.atleast_2d(a) for a in self.a]
        pre_activations = [np.atleast_2d(z) for z in self.z]
        output = activations[-1]
        y = np.asarray(y, dtype=float).reshape(output.shape)

        # For a sigmoid output combined with binary cross-entropy, the
        # gradient with respect to the output pre-activation simplifies to
        # (y_hat - y); dividing by the batch size accounts for the mean.
        delta = (output - y) / output.shape[0]

        grad_w = [None] * len(self.weights)
        grad_b = [None] * len(self.biases)
        for i in reversed(range(len(self.weights))):
            grad_w[i] = delta.T @ activations[i]
            grad_b[i] = delta.sum(axis=0)
            if i > 0:
                # Push the error back through the weights of layer i and the
                # activation function of the layer before it
                delta = (delta @ self.weights[i]) * self.d_activate(pre_activations[i - 1])
        return grad_w, grad_b

    def train(self, x, y, alpha=0.1, batch_size=32, epochs=1):
        """Train the network with mini-batch gradient descent.

        Args:
            x: Matrix with one feature vector per row.
            y: Target labels, one per row of ``x``.
            alpha: Learning rate.
            batch_size: Number of rows per gradient step. The final batch may
                be smaller and is still used.
            epochs: Number of passes over the whole data set.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        x = np.asarray(x)
        y = np.asarray(y)
        for _ in range(epochs):
            # Stepping by batch_size also covers the trailing partial batch
            for start in range(0, len(x), batch_size):
                stop = start + batch_size
                self.predict(x[start:stop])
                grad_w, grad_b = self.back_propagation(y[start:stop])
                for j in range(len(self.weights)):
                    self.weights[j] -= alpha * grad_w[j]
                    self.biases[j] -= alpha * grad_b[j]


# Compare the predictions for the first ten test rows with their true labels,
# once for the freshly initialised network and once after training it.
nn = NN()
for needs_training in (False, True):
    if needs_training:
        nn.train(feature_matrix, target)
    prediction = nn.predict(test_matrix[:10])
    # Threshold the predicted probabilities at 0.5 to obtain class labels
    rounded = np.where(prediction < 0.5, 0, 1)
    print(rounded.T[0])
    print(test_target[:10])

In [None]:
# Hold out the final rows for testing, as in the previous experiment
test_set_size = 500
learning_set_size = len(data['PatientID']) - test_set_size

# Use every column as a feature, except 'DoctorInCharge' and the 'Diagnosis'
# column that serves as the target
columns = list(data.keys())
for excluded in ('DoctorInCharge', 'Diagnosis'):
    columns.remove(excluded)

# Stack one row per column, then transpose so that every row is one patient
feature_matrix = np.vstack([data[name][:learning_set_size] for name in columns]).T
target = np.array(data['Diagnosis'][:learning_set_size])
test_matrix = np.vstack([data[name][learning_set_size:] for name in columns]).T
test_target = np.array(data['Diagnosis'][learning_set_size:])

# Compare the predictions for the first ten test rows with their true labels,
# once for the freshly initialised network and once after training it.
nn = NN(layerSizes=[33, 16, 8, 4, 2])
for needs_training in (False, True):
    if needs_training:
        nn.train(feature_matrix, target)
    prediction = nn.predict(test_matrix[:10])
    # Threshold the predicted probabilities at 0.5 to obtain class labels
    rounded = np.where(prediction < 0.5, 0, 1)
    print(rounded.T[0])
    print(test_target[:10])

Sadly, the neural network did not produce good results. Even other implementations that definitely work did not yield good predictions at all, even when they were tested with the full feature vector instead of just the six fields mentioned above. Therefore, we decided to scrap this idea and instead use a different data set to test the accuracy of our regression model.