In [34]:
"""
Street Light: Teach a neural network to translate a streetlight pattern into the correct stop/walk pattern.

1. The important takeaway is that an infinite number of matrices exist that perfectly reflect the streetlight patterns in the dataset. It’s
   important to recognize that the underlying pattern isn’t the same as the matrix. It’s a property of the matrix. In fact, it’s a
   property of all of these matrices. The pattern is what each of these matrices is expressing. The pattern also existed in the streetlights.

2. This input data pattern is what you want the neural network to learn to transform into the output data pattern.
"""
    


import numpy as np

def neural_network(streetlights, walk_vs_stop, weights, alpha, iterations=40):
    """Train a single-layer linear network one example at a time.

    On every pass, each (streetlight pattern, target) pair is visited in
    order: the prediction is the dot product of the pattern with the current
    weights, and the weights are immediately moved against the prediction
    error. After each full pass the current weights and the squared error
    summed over that pass are printed.

    Args:
        streetlights: iterable of NumPy rows (e.g. a 2-D array); one input
            pattern per row. Each row must support ``.dot``.
        walk_vs_stop: iterable of target values, one per row.
        weights: initial weight vector. It is never modified in place; each
            update rebinds a fresh array, so the caller's array is untouched.
        alpha: learning rate that scales every weight update.
        iterations: number of full passes over the data (defaults to 40, the
            value that was previously hard-coded).

    Returns:
        None. The learned weights are only printed.
    """
    for _ in range(iterations):
        error_for_all_lights = 0
        # zip pairs every pattern with its target without index bookkeeping
        # and avoids shadowing the builtin ``input``.
        for light, goal_prediction in zip(streetlights, walk_vs_stop):
            prediction = light.dot(weights)
            error_for_all_lights += (goal_prediction - prediction) ** 2

            # delta is the signed miss; scaling it by the input leaves weights
            # whose input is 0 unchanged for this example.
            delta = prediction - goal_prediction
            weights = weights - (alpha * (light * delta))
        print("Weight:", weights)
        print("Error: " + str(error_for_all_lights) + "\n")

    
# Training data. Any matrix carrying the same underlying pattern as
# `streetlights` should be mapped by the network onto a matrix carrying the
# underlying pattern of `walk_vs_stop`.
streetlights = np.array([
    [1, 0, 1],
    [0, 1, 1],
    [0, 0, 1],
    [1, 1, 1],
    [0, 1, 1],
    [1, 0, 1],
])
walk_vs_stop = np.array([0, 1, 0, 1, 1, 0])

# Starting weights (one per streetlight column) and the learning rate.
alpha = 0.1
weights = np.array([0.5, 0.48, -0.7])

neural_network(streetlights, walk_vs_stop, weights, alpha)

Weight: [ 0.540848  0.72112  -0.400432]
Error: 2.6561231104

Weight: [ 0.49944225  0.84194128 -0.28583891]
Error: 0.9628701776715985

Weight: [ 0.43945363  0.91252555 -0.23598163]
Error: 0.5509165866836797

Weight: [ 0.38100946  0.95888996 -0.20924067]
Error: 0.36445836852222424

Weight: [ 0.32955689  0.99146612 -0.19117291]
Error: 0.2516768662079895

Weight: [ 0.28576471  1.01500784 -0.17673288]
Error: 0.17797575048089034

Weight: [ 0.24888702  1.03207349 -0.16412797]
Error: 0.12864460733422164

Weight: [ 0.21787885  1.0442761  -0.15269426]
Error: 0.09511036950476208

Weight: [ 0.19174429  1.05275286 -0.14216186]
Error: 0.07194564247043436

Weight: [ 0.16962672  1.05835728 -0.1324006 ]
Error: 0.05564914990717743

Weight: [ 0.15081598  1.0617496  -0.12333176]
Error: 0.04394763937673939

Weight: [ 0.13473149  1.06344721 -0.11489715]
Error: 0.035357967050948465

Weight: [ 0.12090095  1.06385744 -0.10704822]
Error: 0.02890700056547436

Weight: [ 0.10894061  1.06330118 -0.099742  ]
Error: 

In [None]:
"""
Full/Batch/Stochastic Gradient Descent

1. The idea of learning one example at a time is a variant on gradient descent called stochastic gradient descent, and it’s one of the
   handful of methods that can be used to learn an entire dataset. It performs a prediction and weight update for each training example
   separately. In other words, it takes the first streetlight, tries to predict it, calculates the weight_delta, and updates the weights.
   Then it moves on to the second streetlight, and so on. It iterates through the entire dataset many times until it can find a weight
   configuration that works well for all the training examples.

2. Another method for learning an entire dataset is gradient descent (or average/full gradient descent). Instead of updating the weights once
   for each training example, the network calculates the average weight_delta over the entire dataset, changing the weights only each time
   it computes a full average.

3. The third configuration, batch gradient descent, sort of splits the difference between stochastic gradient descent and full gradient descent. Instead of
   updating the weights after just one example or after the entire dataset of examples, you choose a batch size (typically between 8 and 256)
   of examples, after which the weights are updated.
"""

In [50]:
"""
Neural networks learn correlation

1. In the process of gradient descent, each training example asserts either up pressure or down pressure on the weights. On average,
   there was more up pressure for the middle weight and more down pressure for the other weights.

2. Each node is individually trying to correctly predict the output given the input. For the most part, each node ignores all the other
   nodes when attempting to do so. The only cross communication occurs in that all three weights must share the same error measure. The
   weight update is nothing more than taking this shared error measure and multiplying it by each respective input. A key part of why
   neural networks learn is error attribution, which means given a shared error, the network needs to figure out which weights contributed
   (so they can be adjusted) and which weights did not contribute (so they can be left alone). 

3. The Weight Pressure table helps describe the effect of each training example on each respective weight. + indicates that
   it has pressure toward 1, and – indicates that it has pressure toward 0. Zeros (0) indicate that there is no pressure because
   the input datapoint is 0, so that weight won’t be changed.

4. The prediction is a weighted sum of the inputs. The learning algorithm rewards inputs that correlate with the output with upward pressure
   (toward 1) on their weight while penalizing inputs with decorrelation with downward pressure. The weighted sum of the inputs finds perfect
   correlation between the input and the output by weighting decorrelated inputs to 0. (Rewarding correlation with pressure toward 1 and
   penalizing decorrelation with pressure toward 0.)
"""

def _pressure_symbol(old_weight, new_weight):
    """Return '+', '-' or '0' according to how a weight's magnitude changed."""
    if abs(new_weight) < abs(old_weight):
        return '-'
    if abs(new_weight) > abs(old_weight):
        return '+'
    return '0'


def neural_network(streetlights, walk_vs_stop, weights, alpha, iterations=40):
    """Train the single-layer network and print a weight-pressure table.

    Works like plain per-example gradient descent (dot-product prediction,
    immediate weight update), but for every training example it also prints
    one table row: the input values and target on the left, and one symbol
    per weight on the right describing what that example's update did.

    The symbol compares magnitudes: '+' means abs(weight) grew, '-' means
    abs(weight) shrank (moved toward 0), and '0' means the weight was left
    unchanged by this example.
    NOTE(review): because magnitudes are compared, a negative weight that is
    pushed upward (toward 0) is reported as '-'.

    After each full pass the weights from before and after the pass and the
    summed squared error of that pass are printed.

    Args:
        streetlights: iterable of NumPy rows (e.g. a 2-D array); one input
            pattern per row. Each row must support ``.dot`` and ``.tolist``.
        walk_vs_stop: iterable of target values, one per row.
        weights: initial weight vector. It is never modified in place; each
            update rebinds a fresh array.
        alpha: learning rate that scales every weight update.
        iterations: number of full passes over the data (defaults to 40, the
            value that was previously hard-coded).

    Returns:
        None. All results are printed.
    """
    for _ in range(iterations):
        error_for_all_lights = 0
        # Plain alias is enough: ``weights`` is rebound below, never mutated.
        old_weights = weights
        for light, goal_prediction in zip(streetlights, walk_vs_stop):
            prediction = light.dot(weights)
            error_for_all_lights += (goal_prediction - prediction) ** 2

            delta = prediction - goal_prediction
            new_weights = weights - (alpha * (light * delta))

            # Both halves of the row end with the same "--> target" column.
            target = '-->\t' + str(goal_prediction)
            data = [str(value) for value in light.tolist()] + [target]
            pressure = [
                _pressure_symbol(old, new)
                for old, new in zip(weights, new_weights)
            ] + [target]
            print("\t".join(data) + '\t\t' + "\t".join(pressure))
            weights = new_weights

        print("Old Weight:", old_weights)
        # ``weights`` always holds the latest update, and unlike the inner
        # loop's ``new_weights`` it is defined even for an empty dataset.
        print("New Weight:", weights)
        print("Error: " + str(error_for_all_lights) + "\n")
    
# Initial weights for the pressure-table run; the data and learning rate come
# from the variables already defined in the notebook.
weights = np.array([0.5, 0.48, -0.7])
neural_network(streetlights, walk_vs_stop, weights=weights, alpha=alpha)

1	0	1	-->	0		+	0	-	-->	0
0	1	1	-->	1		0	+	-	-->	1
0	0	1	-->	0		0	0	-	-->	0
1	1	1	-->	1		+	+	-	-->	1
0	1	1	-->	1		0	+	-	-->	1
1	0	1	-->	0		-	0	+	-->	0
Old Weight: [ 0.5   0.48 -0.7 ]
New Weight: [ 0.540848  0.72112  -0.400432]
Error: 2.6561231104

1	0	1	-->	0		-	0	+	-->	0
0	1	1	-->	1		0	+	-	-->	1
0	0	1	-->	0		0	0	-	-->	0
1	1	1	-->	1		-	-	+	-->	1
0	1	1	-->	1		0	+	-	-->	1
1	0	1	-->	0		-	0	+	-->	0
Old Weight: [ 0.540848  0.72112  -0.400432]
New Weight: [ 0.49944225  0.84194128 -0.28583891]
Error: 0.9628701776715985

1	0	1	-->	0		-	0	+	-->	0
0	1	1	-->	1		0	+	-	-->	1
0	0	1	-->	0		0	0	-	-->	0
1	1	1	-->	1		-	-	+	-->	1
0	1	1	-->	1		0	+	-	-->	1
1	0	1	-->	0		-	0	+	-->	0
Old Weight: [ 0.49944225  0.84194128 -0.28583891]
New Weight: [ 0.43945363  0.91252555 -0.23598163]
Error: 0.5509165866836797

1	0	1	-->	0		-	0	+	-->	0
0	1	1	-->	1		0	+	-	-->	1
0	0	1	-->	0		0	0	-	-->	0
1	1	1	-->	1		-	-	+	-->	1
0	1	1	-->	1		0	+	-	-->	1
1	0	1	-->	0		-	0	+	-->	0
Old Weight: [ 0.43945363  0.91252555 -0.23598163]
New W

In [2]:
"""
Edge Cases:

1. Deep learning's greatest weakness: Overfitting. Error is shared among all the weights. If a particular configuration of weights
   accidentally creates perfect correlation between the prediction and the output dataset (such that error == 0) without giving the heaviest
   weight to the best inputs, the neural network will stop learning. (Sometimes correlation happens accidentally.)

2. Neural networks are so flexible that they can find many, many different weight configurations that will correctly predict for a subset of
   training data. If you trained this neural network on the first two training examples, it would likely stop learning at a point where it
   did not work well for the other training examples. In essence, it memorized the two training examples instead of finding the correlation
   that will generalize to any possible streetlight configuration. 

3. The greatest challenge you’ll face with deep learning is convincing your neural network to generalize instead of just memorize.

4. Edge case 2: Conflicting pressure. Sometimes correlation fights itself.
   <Training Data>              <Weight Pressure>
   1  0  1  -->  0				-  0  -  -->  0
   0  1  1  -->  1				0  +  +  -->  1
   0  0  1  -->  0				0  0  -  -->  0
   1  1  1  -->  1				+  +  +  -->  1
   0  1  1  -->  1				0  +  +  -->  1
   1  0  1  -->  0				-  0  -  -->  0
   This column seems to have an equal number of upward and downward pressure moments. But the network correctly pushes this (far-right)
   weight down to 0, which means the downward pressure moments must be larger than the upward ones. How does this work? As other nodes learn,
   they absorb some of the error; they absorb part of the correlation. They cause the network to predict with moderate correlative power,
   which reduces the error. The other weights then only try to adjust their weights to correctly predict what’s left. In this case, because
   the middle weight has consistent signal to absorb all the correlation (because of the 1:1 relationship between the middle input and the
   output), the error when you want to predict 1 becomes very small, but the error to predict 0 becomes large, pushing the far-right weight
   downward.

5. As a preview, the regularization is advantageous because if a weight has equal pressure upward and downward, it isn’t good for anything.
   It’s not helping either direction. In essence, regularization aims to say that only weights with really strong correlation can stay on;
   everything else should be silenced because it’s contributing noise. It’s sort of like natural selection, and as a side effect it would
   cause the neural network to train faster.

6. There is no correlation between any input column and the output column. Every weight has an equal amount of upward pressure and downward
   pressure. This dataset is a real problem for the neural network. Previously, you could solve for input datapoints that had both upward
   and downward pressure because other nodes would start solving for either the positive or negative predictions, drawing the balanced node
   to favor up or down. But in this case, all the inputs are equally balanced between positive and negative pressure.
   <Training Data>              <Weight Pressure>
   1  0  1  -->  1				+  0  +  -->  1
   0  1  1  -->  1				0  +  +  -->  1
   0  0  1  -->  0				0  0  -  -->  0
   1  1  1  -->  0				-  -  -  -->  0
"""




In [None]:
"""
Learning Indirect Correlation

1. Previously, I described a neural network as an instrument that searches for correlation between input and output datasets. I want to
   refine this just a touch. In reality, neural networks search for correlation between their input and output layers.

2. Because the input dataset doesn’t correlate with the output dataset, you’ll use the input dataset to create an intermediate dataset
   that does have correlation with the output. You basically stack two neural networks on top of each other. The middle layer of nodes
   (layer_1) represents the intermediate dataset. The goal is to train this network so that even though there’s no correlation between
   the input dataset and output dataset (layer_0 and layer_2), the layer_1 dataset that you create using layer_0 will have correlation
   with layer_2.

3. The output of the first lower network (layer_0 to layer_1) is the input to the second upper neural network (layer_1 to layer_2). The
   prediction for each of these networks is identical to what you saw before. If you ignore the lower weights and consider their output
   to be the training set, the top half of the neural network (layer_1 to layer_2) is just like the networks trained in the preceding
   chapter. You can use all the same learning logic to help them learn. The part that you don’t yet understand is how to update the weights
   between layer_0 and layer_1.
"""

In [None]:
"""
Backpropagation:  Long-distance error attribution

1. What’s the prediction from layer_1 to layer_2? It’s a weighted average of the values at layer_1. If layer_2 is too high by x amount,
   how do you know which values at layer_1 contributed to the error? The ones with higher weights contributed more. The ones with lower
   weights from layer_1 to layer_2 contributed less. 

2. Important Notes: The weights from layer_1 to layer_2 exactly describe how much each layer_1 node contributes to the layer_2 prediction.
   This means those weights also exactly describe how much each layer_1 node contributes to the layer_2 error. 

3. How do you use the delta at layer_2 to figure out the delta at layer_1? You multiply it by each of the respective weights for layer_1.
   It’s like the prediction logic in reverse. This process of moving delta signal around is called backpropagation.

4. The delta variable told you the direction and amount the value of this node should change next time. If you want this node to be x amount
   higher, then each of these previous four nodes needs to be x*weights_1_2 amount higher/lower, because these weights were amplifying the
   prediction by weights_1_2 times. When used in reverse, the weights_1_2 matrix amplifies the error by the appropriate amount. It amplifies
   the error so you know how much each layer_1 node should move up or down. 
"""

In [1]:
"""
Linear vs Nonlinear

1. For any two consecutive weighted sums of the input, there exists a single weighted sum with exactly identical behavior. Anything that the
   three-layer network can do, the two-layer network can also do. Stacking two neural nets doesn’t give you any more power. Two consecutive
   weighted sums is just a more expensive version of one weighted sum.

2. Each node in the middle layer subscribes to a certain amount of correlation with each input node. If the weight from an input to the
   middle layer is 1.0, then it subscribes to exactly 100% of that node’s movement. If that node goes up by 0.3, the middle node will
   follow. If the weight connecting two nodes is 0.5, each node in the middle layer subscribes to exactly 50% of that node’s movement. 
   The middle nodes don’t get to have correlation of their own. They’re more or less correlated to various input nodes. But because
   you know that in the new dataset there is no correlation between any of the inputs and the output, how can the middle layer help?
   It mixes up a bunch of correlation that’s already useless. What you really need is for the middle layer to be able to selectively
   correlate with the input. 

3. You want the middle layer to sometimes correlate with an input, and sometimes not correlate. That gives it correlation of its own.
   This gives the middle layer the opportunity to not just always be x% correlated to one input and y% correlated to another input. Instead,
   it can be x% correlated to one input only when it wants to be, but other times not be correlated at all. This is called conditional
   correlation or sometimes correlation.

4. By turning off any middle node whenever it would be negative, you allow the network to sometimes subscribe to correlation from various
   inputs. This is impossible for two-layer neural networks, thus adding power to three-layer nets.

5. The fancy term for this “if the node would be negative, set it to 0” logic is nonlinearity. Without this tweak, the neural network is
   linear. Without this technique, the output layer only gets to pick from the same correlation it had in the two-layer network.

"""




In [1]:
"""
The First Deep Neural Network

1. Previously, you learned: You can compute the relationship between the error and any one of the weights so that you know how changing
   the weight changes the error. You can then use this to reduce the error to 0.

2. Now: Adjusting the weights to reduce the error over a series of training examples ultimately searches for correlation between the input
   and the output layers. If no correlation exists, then the error will never reach 0.
   
3. How do you calculate the deltas for layer_1? First, do the obvious: multiply the output delta by each weight attached to it. This gives
   a weighting of how much each weight contributed to that error. There’s one more thing to factor in. If relu set the output of a
   layer_1 node to be 0, then it didn’t contribute to the error. When this is true, you should also set the delta of that node to 0.
   Multiplying each layer_1 node by the relu2deriv function accomplishes this. relu2deriv is either 1 or 0, depending on whether the layer_1
   value is greater than 0.
"""
import numpy as np
np.random.seed(1)

# nonlinearity
def relu(x):
    """Rectified linear unit: keep values greater than 0, zero out the rest.

    The comparison yields a True/False mask; multiplying by it keeps x where
    the mask is True and gives 0 elsewhere. Works on scalars and arrays.
    """
    positive_mask = x > 0
    return positive_mask * x

def relu2deriv(output):
    """Slope of relu at an already-activated value.

    Returns the result of ``output > 0``: True where the value is positive,
    False otherwise. Used as a multiplier, True acts as 1 and False as 0.
    """
    is_active = output > 0
    return is_active

# --- hyperparameters ---
alpha = 0.2        # learning rate applied to every weight update
hidden_size = 4    # number of nodes in layer_1

# --- training data: input patterns and their targets ---
streetlights = np.array([[1, 0, 1],
                         [0, 1, 1],
                         [0, 0, 1],
                         [1, 1, 1]])

# Targets as a column, one row per input pattern.
walk_vs_stop = np.array([[1, 1, 0, 0]]).T

# --- starting weights: random()*2 - 1 rescales the draws from [0, 1) to [-1, 1) ---
weights_0_1 = 2 * np.random.random((3, hidden_size)) - 1
weights_1_2 = 2 * np.random.random((hidden_size, 1)) - 1

# --- predict, compare, learn: one weight update per training example ---
for iteration in range(60):
    layer_2_error = 0
    for i in range(len(streetlights)):
        # Slices (i:i+1) keep the row 2-D so the dot products and transposes line up.
        layer_0 = streetlights[i:i+1]
        goal = walk_vs_stop[i:i+1]

        # Forward pass: the hidden layer goes through relu, the output stays linear.
        layer_1 = relu(layer_0.dot(weights_0_1))
        layer_2 = layer_1.dot(weights_1_2)

        # Output delta, and its square accumulated as this pass's error.
        layer_2_delta = goal - layer_2
        layer_2_error += np.sum(layer_2_delta ** 2)

        # Backpropagate: send the output delta back through weights_1_2 and
        # zero it for hidden nodes that relu switched off.
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * relu2deriv(layer_1)

        # In-place weight updates.
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)

    # Report the error on every tenth pass.
    if (iteration + 1) % 10 == 0:
        print("Error:" + str(layer_2_error))

Error:0.6342311598444467
Error:0.35838407676317513
Error:0.0830183113303298
Error:0.006467054957103705
Error:0.0003292669000750734
Error:1.5055622665134859e-05


In [33]:
import numpy as np

# Rows of an 8x8 table of integer counts.
# NOTE(review): presumably co-occurrence counts for a PMI-style computation — confirm.
l1 = [0, 2, 1, 0, 0, 0, 0, 0]
l2 = [2, 0, 0, 1, 0, 1, 0, 0]
l3 = [1, 0, 0, 0, 0, 0, 1, 0]
l4 = [0, 1, 0, 0, 1, 0, 0, 0]
l5 = [0, 0, 0, 1, 0, 0, 0, 1]
l6 = [0, 1, 0, 0, 0, 0, 0, 1]
l7 = [0, 0, 1, 0, 0, 0, 0, 1]
l8 = [0, 0, 0, 0, 1, 1, 1, 0]

# Per-index normalisers and a global scale used in the log ratio below.
# NOTE(review): t equals sum(ll); their exact meaning is not stated here — confirm.
ll = [3, 2, 1, 1, 1, 1, 1, 3]
t  = 13

# The matrix rows are the very same list objects as l1..l8, so the in-place
# update below is visible through both names.
matrix = [l1, l2, l3, l4, l5, l6, l7, l8]

# Replace every non-zero count, in place, with log(count * t / (ll[x] * ll[y])).
# np.log is used because numpy is already imported in this cell; the `math`
# module was never imported, so math.log raised NameError on a fresh kernel.
for x, row in enumerate(matrix):
    for y, count in enumerate(row):
        if count != 0:
            row[y] = float(np.log(count * t / (ll[x] * ll[y])))

# Print the table: positive entries with two decimals, everything else as 0.
for row in matrix:
    for value in row:
        if value > 0:
            print('%.2f' % value, end='\t')
        else:
            print(0, end='\t')
    # '\n' plus print's own newline leaves a blank line between rows.
    print('\n')

0	1.47	1.47	0	0	0	0	0	

1.47	0	0	1.87	0	1.87	0	0	

1.47	0	0	0	0	0	2.56	0	

0	1.87	0	0	2.56	0	0	0	

0	0	0	2.56	0	0	0	1.47	

0	1.87	0	0	0	0	0	1.47	

0	0	2.56	0	0	0	0	1.47	

0	0	0	0	1.47	1.47	1.47	0	

