# Deep Neural Networks (L2)

## 1. Perceptrons as Logical Operators

In [1]:
import numpy as np

### AND Perceptron
<img src='//aind-notes.s3-website-eu-west-1.amazonaws.com/img/and-perceptron.png' />

In [14]:
# Weights and bias of the AND perceptron.
weight1, weight2 = 1.0, 1.0
bias = -1.0

# Every combination of two binary inputs, in truth-table order.
x = np.array([[a, b] for a in (0, 1) for b in (0, 1)])

# Expected AND result for each row of x.
correct_outputs = [False, False, False, True]

In [15]:
# Linear score of every input row: weighted sum plus bias.
output = np.dot(x, [weight1, weight2]) + bias
# The perceptron answers True only for strictly positive scores.
[score > 0 for score in output]

[False, False, False, True]

### OR Perceptron
<img src='//aind-notes.s3-website-eu-west-1.amazonaws.com/img/or-perceptron.png'>

In [20]:
# Weights and bias of the OR perceptron.
weight1, weight2 = 2.0, 1.0
bias = 0.

# Every combination of two binary inputs, in truth-table order.
x = np.array([[a, b] for a in (0, 1) for b in (0, 1)])

# Expected OR result for each row of x.
correct_outputs = [False, True, True, True]

In [21]:
# Linear score of every input row: weighted sum plus bias.
output = np.dot(x, [weight1, weight2]) + bias
# The perceptron answers True only for strictly positive scores.
[score > 0 for score in output]

[False, True, True, True]

### NOT Perceptron

In [24]:
# Weights and bias of the NOT perceptron.
weight1, weight2 = 1.0, -4.0
bias = 2.0

# Every combination of two binary inputs, in truth-table order.
x = np.array([[a, b] for a in (0, 1) for b in (0, 1)])

# Expected result for each row of x: the negation of the second input.
correct_outputs = [True, False, True, False]

In [25]:
# Linear score of every input row: weighted sum plus bias.
output = np.dot(x, [weight1, weight2]) + bias
# The perceptron answers True only for strictly positive scores.
[score > 0 for score in output]

[True, False, True, False]

### XOR Perceptron
<img src='//aind-notes.s3-website-eu-west-1.amazonaws.com/img/xor-perceptron.png'>
<img src='//aind-notes.s3-website-eu-west-1.amazonaws.com/img/xor-nn.png'>

### Perceptron Algorithm
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/pa-pseudo.png" />

In [193]:
import numpy as np
# Seed NumPy's global RNG so the random initial weights and bias drawn in
# trainPerceptronAlgorithm are reproducible between runs.
np.random.seed(42)

def stepFunction(t):
    """Step activation: return 1 when t is non-negative, otherwise 0."""
    return 1 if t >= 0 else 0

def prediction(X, W, b):
    """Return the 0/1 class of one point: step function of X.W + b."""
    # Only the first entry of the score array is used; with a 1-D point and a
    # column-vector W (as passed by perceptronStep) that is its only entry.
    score = np.matmul(X, W) + b
    return stepFunction(score[0])

def perceptronStep(X, y, W, b, learn_rate=0.01):
    """Apply one pass of the perceptron update rule over all points.

    Each point is classified with the current W and b. A misclassified point
    moves the boundary towards itself: a positive point predicted negative
    adds learn_rate * point to W and learn_rate to b; a negative point
    predicted positive subtracts the same amounts.

    Args:
        X: 2-D array of points, one row per sample.
        y: labels (0 or 1), indexable in step with the rows of X.
        W: weights as a column vector with one row per feature. It is
            updated in place.
        b: bias as a scalar.
        learn_rate: step size of each correction.

    Returns:
        The tuple (W, b) after the pass.
    """
    for i, point in enumerate(X):
        y_hat = prediction(point, W, b)
        # Column-shaped step so it lines up with W for any number of
        # features (the original hard-coded exactly two).
        step = point.reshape(-1, 1) * learn_rate
        mistake = y[i] - y_hat
        if mistake == 1:
            # Predicted negative, labelled positive.
            W += step
            b += learn_rate
        elif mistake == -1:
            # Predicted positive, labelled negative.
            W -= step
            b -= learn_rate
    return W, b
    
def trainPerceptronAlgorithm(X, y, learn_rate=0.01, num_epochs=25):
    """Run the perceptron algorithm repeatedly and record each boundary line.

    Feel free to play with learn_rate and num_epochs and compare the
    resulting lines.

    Args:
        X: 2-D array of points with two feature columns.
        y: labels (0 or 1), one per row of X.
        learn_rate: step size passed to perceptronStep.
        num_epochs: number of full passes over the data.

    Returns:
        A list with one (slope, intercept) pair per epoch, computed as
        (-W[0]/W[1], -b/W[1]), for plotting purposes.
    """
    x_max = max(X.T[0])
    # Draw order (weights first, then bias) is kept so that seeded runs
    # reproduce the same starting point.
    W = np.random.rand(2, 1)
    # The initial bias is offset by the largest first-feature value.
    b = np.random.rand(1)[0] + x_max
    # These are the solution lines that get plotted.
    boundary_lines = []
    for _ in range(num_epochs):
        # In each epoch, we apply the perceptron step.
        W, b = perceptronStep(X, y, W, b, learn_rate)
        boundary_lines.append((-W[0]/W[1], -b/W[1]))
    return boundary_lines

In [194]:
import pandas as pd
# Load the dataset as a plain NumPy array (the CSV has no header row).
data = pd.read_csv('data/perceptron.csv', header=None).values
# Every column except the last is a feature ...
X = data[:, :-1]
# ... and the last column is the label, mirroring how X drops it
# (presumably a three-column file, where this is column 2).
y = data[:, -1]

In [196]:
# Train with the default learning rate and number of epochs.
boundary_lines = trainPerceptronAlgorithm(X, y)
# Display the (slope, intercept) pair recorded after the final epoch.
boundary_lines[-1]

(array([-0.51307359]), array([ 0.70672918]))

## 2. Error Functions

### Discrete vs. Continuous
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/disc-vs-con.png">
Since the steps that minimize our error (loss, cost) function are calculated by taking derivatives, the function has to be continuous and differentiable, so that small variations in position translate into small variations in height (think of walking down a mountain).
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/logloss1.png" style="height: 150px; display: inline-block;"/>
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/logloss2.png" style="height: 150px; display: inline-block" />

We use **Gradient Descent** to gradually reduce the error.


#### Predictions
Continuous error functions are better when it comes to optimizing. For this we also need to make continuous predictions.

* A discrete prediction is Yes or No
* A continuous prediction is a number (e.g. a probability)
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/disc-vs-con1.png" />

The way to move from discrete to continuous predictions is to change our **activation function** from our previous **step function** (perceptron) to e.g. a **sigmoid function**.

We simply combine our linear function `Wx+b` with our sigmoid function:
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/activation.png" />
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/preds.png" style="height: 200px; display: inline-block;"/>
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/perceptron-sigmoid.png" style="height: 200px; display: inline-block" />

## 3. Softmax

* Multi-Class Classification
* Equivalent to an activation function
* Probabilities `sum` to 1
* Softmax uses `exp()` to deal with negative numbers

In [215]:
import numpy as np

def softmax(L):
    """Return the softmax probabilities of the scores in L as a list.

    Each score is exponentiated and divided by the sum of all exponentials,
    so the results are positive and sum to 1.

    Args:
        L: a list (or array) of numbers.

    Returns:
        A list with one probability per input score; an empty list for empty
        input.
    """
    scores = np.asarray(L, dtype=float)
    if scores.size == 0:
        return []
    # Shifting every score by the maximum does not change the ratios, but it
    # keeps np.exp from overflowing to inf (and the result to nan) on large
    # inputs.
    exps = np.exp(scores - np.max(scores))
    return list(exps / np.sum(exps))

### One-Hot Encoding
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/onehotenc.png" />

## 4. Maximum Likelihood
Pick the model that gives the existing labels the highest probabilities, thus by maximizing the probability we can pick the best model.
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/maxlike.png"/>
The better model gives better probabilities. How do we maximize these probabilities?
With the right **error function**, minimizing the error is equivalent to maximizing the correct probabilities.

But, calculating the best model by using products `(0.7*0.9*...)` isn't optimal for different reasons:
* hard to calculate for large amount of probabilities
* the product will be very tiny
* a change in one number might change the product drastically

Instead we want to `sum` over probabilities and for that we use `ln()` (natural log):
`ln(ab) = ln(a) + ln(b)`

To get positive numbers we use `-ln()`. The sum over all `-ln(p1), ...-ln(pn)` is called **Cross-Entropy**


A good model will result in a low cross-entropy:
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/cross-ent.png" />
The negative ln of each point's probability can be read as the error of that point. The points that are misclassified have larger values (errors).
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/cross-ent1.png" />
Therefore, our goal changes from maximizing probability to minimizing error in order to obtain the best model! And the error function we'll use is **Cross-Entropy**.

## 5. Cross-Entropy
Suppose we have a number of events and their probabilities. How likely is it for the events to happen based on the probabilities? If it's very likely, the cross-entropy is small; if it's unlikely, the cross-entropy is large.

<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/cross-ent2.png" />
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/cross-ent3.png" />

In [310]:
import numpy as np

def cross_entropy(Y, P, eps=1e-15):
    """Return the binary cross-entropy between labels Y and probabilities P.

    Computes -sum(Y * ln(P) + (1 - Y) * ln(1 - P)).

    Args:
        Y: list of labels (0 or 1).
        P: list of predicted probabilities, same length as Y.
        eps: probabilities are clipped into [eps, 1 - eps] before taking the
            logarithm, so a prediction of exactly 0 or 1 yields a large
            finite error instead of inf/nan from ln(0).

    Returns:
        The cross-entropy as a float.
    """
    Y = np.asarray(Y, dtype=float)
    P = np.clip(np.asarray(P, dtype=float), eps, 1 - eps)
    errors = Y * np.log(P) + (1 - Y) * np.log(1 - P)
    return -np.sum(errors)

In [315]:
# Y holds the binary labels and P the predicted probabilities. The two were
# previously assigned the other way round, which put exact 0/1 values into
# the logarithms and caused the ln(0) problem.
Y = [1, 1, 0]
P = [0.8, 0.7, 0.1]
# With these values, cross_entropy(Y, P) is -(ln 0.8 + ln 0.7 + ln 0.9),
# about 0.685.

### Multi-Class Cross-Entropy
<img src="img/cross-ent-m.png"/>

## 6. Logistic Regression
1. Take your data
2. Pick a random model
3. Calculate the error
4. Minimize the error and obtain a better model

### Calculating the Error Function
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/errfn.png" />
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/errfn1.png" style="display: inline-block; height: 185px" />
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/errfn2.png" style="display: inline-block; height: 185px" />
### Minimizing the Error Function
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/minerrfn.png" />
We'll gradually minimize the error function using gradient descent.

## 7. Gradient Descent

<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/gd.png" />
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/gd1.png" />

### Gradient Calculation

#### Sigmoid
The sigmoid function has a really nice derivative:

\begin{equation}
\sigma'(x) = \sigma(x)(1-\sigma(x))
\end{equation}

The reason for this is the following; we can calculate it using the quotient rule:

\begin{equation}
\sigma'(x) = \frac{\partial}{\partial x} \frac{1}{1+e^{-x}} \\
           = \frac{e^{-x}}{(1+e^{-x})^2} \\
           = \frac{1}{1+e^{-x}} * \frac{e^{-x}}{1+e^{-x}} \\
           = \sigma(x)(1-\sigma(x))
\end{equation}

#### Error
If we have `m` points labelled `x1, x2, ... xm`, the error formula is:

\begin{equation}
E = -\frac{1}{m} \sum_{i=1}^m \left(y_i\ln(\hat y_i) + (1-y_i)\ln(1-\hat y_i)\right)
\end{equation}

Where the prediction is given by:

\begin{equation}
\hat y_i = \sigma(Wx^{(i)} + b)
\end{equation}

Our **goal** is to calculate the gradient of `E`, at a point `x=(x1,...,xn)`, given by the partial derivatives:

\begin{equation}
\nabla E = (\frac{\partial}{\partial w_1}E, ... , \frac{\partial}{\partial w_n}E, \frac{\partial}{\partial b}E)
\end{equation}


1. We'll calculate the derivative of the prediction:
\begin{equation}
\frac{\partial}{\partial w_j}\hat y = \frac{\partial}{\partial w_j}{\sigma(Wx + b)} \\
= \sigma(Wx + b)(1 - \sigma(Wx + b)) * \frac{\partial}{\partial w_j}(Wx + b) \\
= \hat y(1-\hat y) * \frac{\partial}{\partial w_j}(Wx + b) \\
= \hat y(1-\hat y) * \frac{\partial}{\partial w_j}(w_1x_1 + ... + w_jx_j + ... + w_nx_n + b) \\
= \hat y(1-\hat y) * x_j
\end{equation}

2. Now we can calculate the derivative of the Error:

\begin{equation}
\frac{\partial}{\partial w_j}E = \frac{\partial}{\partial w_j} \left(-\frac{1}{m} \sum_{i=1}^m \left(y_i\ln(\hat y_i) + (1-y_i)\ln(1-\hat y_i)\right)\right) \\
= -\frac{1}{m}\sum_{i=1}^m y_i\frac{\partial}{\partial w_j}ln(\hat y_i) + (1-y_i)\frac{\partial}{\partial w_j}ln(1-\hat y_i) \\
= -\frac{1}{m}\sum_{i=1}^m y_i\frac{1}{\hat y_i}\frac{\partial}{\partial w_j}\hat y_i + (1-y_i)\frac{-1}{1-\hat y_i}\frac{\partial}{\partial w_j}\hat y_i \\
= -\frac{1}{m}\sum_{i=1}^m y_i\frac{1}{\hat y_i}\hat y_i(1-\hat y_i)x^{(i)}_j + (1-y_i)\frac{-1}{1-\hat y_i}\hat y_i(1-\hat y_i)x^{(i)}_j\\
= -\frac{1}{m}\sum_{i=1}^m y_i(1-\hat y_i)x^{(i)}_j - (1-y_i)\hat y_i x^{(i)}_j\\
= -\frac{1}{m}\sum_{i=1}^m (y_i - \hat y_i)x^{(i)}_j
\end{equation}

A similar calculation will show that:
\begin{equation}
\frac{\partial}{\partial b}E = -\frac{1}{m} \sum_{i=1}^m (y_i - \hat y_i)
\end{equation}

For a point with coordinates `(x1,...,xn)`, label `y` and prediction `y_hat`, the gradient of the error function at that point is `(-(y - y_hat)x1,...,-(y - y_hat)xn,-(y - y_hat))`.
In summary:

\begin{equation}
\nabla E(W,b) = -(y - \hat y)(x_1,...,x_n, 1)
\end{equation}


    The gradient is actually a scalar times the coordinates of the point.
    This scalar is the difference between the label and the prediction. If the label is close to the prediction (well classified) the gradient is small, else it is big.
    

### Gradient Descent Algorithm
<img src="//aind-notes.s3-website-eu-west-1.amazonaws.com/img/gd-alg.png"/>

In [1]:
import numpy as np
# Setting the random seed, feel free to change it and see different solutions.
# (trainLR draws its initial weights and bias from NumPy's global RNG.)
np.random.seed(42)

def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)); works on scalars and arrays."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def sigmoid_prime(x):
    """Derivative of the sigmoid: sigmoid(x) * (1 - sigmoid(x))."""
    activation = sigmoid(x)
    return activation * (1 - activation)
def prediction(X, W, b):
    """Continuous prediction: the sigmoid of the linear score X.W + b."""
    linear_score = np.matmul(X, W) + b
    return sigmoid(linear_score)
def error_vector(y, y_hat):
    """Return the list of per-point cross-entropy errors.

    For label y[i] and prediction y_hat[i] the error is
    -y[i]*ln(y_hat[i]) - (1 - y[i])*ln(1 - y_hat[i]).
    """
    losses = []
    for i, label in enumerate(y):
        prob = y_hat[i]
        losses.append(-label*np.log(prob) - (1-label)*np.log(1-prob))
    return losses
def error(y, y_hat):
    """Mean of the per-point cross-entropy errors from error_vector."""
    per_point = error_vector(y, y_hat)
    total = sum(per_point)
    return total / len(per_point)

def dErrors(X, y, y_hat):
    """Return the per-point update terms for w1, w2 and b.

    For every point the term is (y - y_hat) times the matching input
    (x1, x2, or 1 for the bias). This is the negative of the error gradient
    for that point, so adding a multiple of it reduces the error.

    Args:
        X: 2-D array of points with two feature columns, one row per sample.
        y: labels, one per sample.
        y_hat: predictions, one per sample; a column vector of shape (m, 1)
            is accepted as well as a flat array.

    Returns:
        A tuple of three 1-D arrays of length m: the terms for w1, for w2
        and for b.
    """
    X = np.asarray(X)
    # Flatten both operands so a column-shaped y_hat cannot broadcast the
    # difference into a 2-D array.
    errors = np.ravel(y) - np.ravel(y_hat)
    # Index feature COLUMNS; X[0] and X[1] would pick the first two samples.
    DErrorsDx1 = X[:, 0] * errors
    DErrorsDx2 = X[:, 1] * errors
    DErrorsDb = errors
    return DErrorsDx1, DErrorsDx2, DErrorsDb

def gradientDescentStep(X, y, W, b, learn_rate=0.01):
    """Perform one gradient descent update of W and b.

    Calculates the prediction and the per-point update terms, then moves
    each weight by learn_rate times the sum of its own terms and the bias by
    learn_rate times the sum of the bias terms.

    Args:
        X: 2-D array of points, one row per sample.
        y: labels, one per sample.
        W: array holding the two weights; it is updated in place.
        b: bias as a scalar.
        learn_rate: step size of the update.

    Returns:
        The tuple (W, b, e), where e is the error of the prediction made
        before this update (returned for plotting purposes).
    """
    # Calculate the prediction
    y_hat = prediction(X, W, b)
    # Calculate the gradient
    dx1, dx2, db = dErrors(X, y, y_hat)
    # Update the weights: each weight gets the sum of its OWN terms. Summing
    # dx1 and dx2 together would push both weights by the same amount.
    weight_step = np.array([np.sum(dx1), np.sum(dx2)], dtype=float)
    W += weight_step.reshape(np.shape(W)) * learn_rate
    b += np.sum(db) * learn_rate
    # This calculates the error
    e = error(y, y_hat)
    return W, b, e

def trainLR(X, y, learn_rate=0.01, num_epochs=100):
    """Train by repeated gradient descent steps and record the progress.

    Feel free to play with learn_rate and num_epochs and compare the results.

    Args:
        X: 2-D array of points with two feature columns.
        y: labels, one per row of X.
        learn_rate: step size passed to gradientDescentStep.
        num_epochs: number of gradient descent steps to run.

    Returns:
        A tuple (boundary_lines, errors): boundary_lines holds one
        (-W[0]/W[1], -b/W[1]) pair (slope, intercept) per epoch for plotting,
        and errors holds the error value reported by each step.
    """
    # Initialize the weights randomly in [-1, 1). The draw order (weights
    # first, then bias) is kept so that seeded runs are reproducible.
    W = np.random.rand(2, 1)*2 - 1
    b = np.random.rand(1)[0]*2 - 1
    # These are the solution lines that get plotted.
    boundary_lines = []
    errors = []
    for _ in range(num_epochs):
        # In each epoch, we apply the gradient descent step. The result is
        # named epoch_error so it does not shadow the error() function.
        W, b, epoch_error = gradientDescentStep(X, y, W, b, learn_rate)
        boundary_lines.append((-W[0]/W[1], -b/W[1]))
        errors.append(epoch_error)
    return boundary_lines, errors

In [2]:
import pandas as pd
# Load the dataset as a plain NumPy array (the CSV has no header row).
data = pd.read_csv('data/perceptron.csv', header=None).values
# Every column except the last is a feature ...
X = data[:, :-1]
# ... and the last column is the label, mirroring how X drops it
# (presumably a three-column file, where this is column 2).
y = data[:, -1]

In [3]:
# Train with the default settings; result is the (boundary_lines, errors)
# tuple returned by trainLR.
result = trainLR(X, y)