In [7]:
"""
Gradient Descent aslo Works with Multiple Inputs
"""

# 1. A network with multiple inputs
def w_sum(a, b):
    assert(len(a) == len(b))
    output = 0
    for i in range(len(a)):
        output += (a[i] * b[i])
    return output


weights = [0.1, 0.2, -.1]
def neural_network(input, weights):
    pred = w_sum(input, weights)
    return pred


# 2. Data
toes  = [8.5 , 9.5, 9.9, 9.0]
wlrec = [0.65, 0.8, 0.8, 0.9]
nfans = [1.2 , 1.3, 0.5, 1.0]
win_or_lose_binary = [1, 1, 0, 1]

true = win_or_lose_binary[0]
input = [toes[0], wlrec[0], nfans[0]]


# 3. Predict & Evaluate
pred = neural_network(input, weights)
error = (pred - true) ** 2
delta = pred - true


# 4. Learning
def ele_mul(number, vector):
    output = [0, 0, 0]
    assert(len(output) == len(vector))
    for i in range(len(vector)):
        output[i] = number * vector[i]
    return output

# The three weights share the same output node, they also share that node’s delta. 
# But the weights have different weight deltas owing to their different input values.
weight_deltas = ele_mul(delta, input)


# 5. Updating
alpha = 0.01

for i in range(len(weights)):
    weights[i] -= alpha * weight_deltas[i]
print("Delta: " + str(delta))
print("Weights: " + str(weights))
print("Weight Deltas: " + str(weight_deltas))

Delta: -0.1399999999999999
Weights: [0.1119, 0.20091, -0.09832]
Weight Deltas: [-1.189999999999999, -0.09099999999999994, -0.16799999999999987]


In [10]:
"""
Watch the several steps of learning

1. delta is a measure of how much you want a node’s value to be different. weight_delta, on the other hand, is an estimate of the direction
   and amount to move the weights to reduce node_delta, inferred by the derivative. How do you transform delta into a weight_delta? You
   multiply delta by a weight’s input. Thus, weight_delta is a sort of input-modified version of delta.

2. In multi-input version of neural network, each weight has a unique input and a shared delta, you use each respective weight’s input
   multiplied by delta to create each respective weight_delta, and then use it to update each respective weight.
   
3. Here are a few additional takeaways. Most of the learning (weight changing) was performed on the weight with the largest input a , because
   the input changes the slope significantly. This isn’t necessarily advantageous in all settings. A subfield called normalization helps
   encourage learning across all weights despite dataset characteristics such as this. 
"""

weights = [0.1, 0.2, -.1]
alpha = 0.01

for iter in range(3):
    pred  = neural_network(input, weights)
    error = (pred - true) ** 2
    delta = pred - true
    weight_deltas = ele_mul(delta, input)
    
    print("Iteration:" + str(iter+1))
    print("Pred:" + str(pred))
    print("Error:" + str(error))
    print("Delta:" + str(delta))
    print("Weights:" + str(weights))
    print("Weight_Deltas:")
    print(str(weight_deltas))
    print()
    
    # Most of the learning (weight changing) was performed on the weight with the largest input
    for i in range(len(weights)):
        weights[i] -= alpha * weight_deltas[i]
    

Iteration:1
Pred:0.8600000000000001
Error:0.01959999999999997
Delta:-0.1399999999999999
Weights:[0.1, 0.2, -0.1]
Weight_Deltas:
[-1.189999999999999, -0.09099999999999994, -0.16799999999999987]

Iteration:2
Pred:0.9637574999999999
Error:0.0013135188062500048
Delta:-0.036242500000000066
Weights:[0.1119, 0.20091, -0.09832]
Weight_Deltas:
[-0.30806125000000056, -0.023557625000000044, -0.04349100000000008]

Iteration:3
Pred:0.9906177228125002
Error:8.802712522307997e-05
Delta:-0.009382277187499843
Weights:[0.11498061250000001, 0.20114557625, -0.09788509000000001]
Weight_Deltas:
[-0.07974935609374867, -0.006098480171874899, -0.011258732624999811]



In [11]:
"""
Freezing One Weight: Leave fist weight unchanged

1. This reveals a potentially damaging property of neural networks: 1st weight may be a powerful input with lots of predictive power,
   but if the network accidentally figures out how to predict accurately on the training data without it, then it will never learn to
   incorporate a into its prediction. 

2. Also notice how 1st weight finds the bottom of the bowl. Instead of the black dot moving, the curve seems to move to the left. What does
   this mean? The black dot can move horizontally only if the weight is updated. Because the weight for 1st is frozen for this experiment,
   the dot must stay fixed. But error clearly goes to 0.

3. The error is determined by the training data. Any network can have any weight value, but the value of error given any particular weight
   configuration is 100% determined by data. You’ve already seen how the steepness of the U shape is affected by the input data. What
   you’re really trying to do with the neural network is find the lowest point on this big error plane, where the lowest point refers to
   the lowest error. 
"""

weights = [0.1, 0.2, -.1]
alpha = 0.3

for iter in range(3):
    pred  = neural_network(input, weights)
    error = (pred - true) ** 2
    delta = pred - true
    weight_deltas = ele_mul(delta, input)
    weight_deltas[0] = 0
    
    print("Iteration:" + str(iter+1))
    print("Pred:" + str(pred))
    print("Error:" + str(error))
    print("Delta:" + str(delta))
    print("Weights:" + str(weights))
    print("Weight_Deltas:")
    print(str(weight_deltas))
    print()

    for i in range(len(weights)):
        weights[i] -= alpha * weight_deltas[i]


Iteration:1
Pred:0.8600000000000001
Error:0.01959999999999997
Delta:-0.1399999999999999
Weights:[0.1, 0.2, -0.1]
Weight_Deltas:
[0, -0.09099999999999994, -0.16799999999999987]

Iteration:2
Pred:0.9382250000000001
Error:0.003816150624999989
Delta:-0.06177499999999991
Weights:[0.1, 0.2273, -0.04960000000000005]
Weight_Deltas:
[0, -0.040153749999999946, -0.07412999999999989]

Iteration:3
Pred:0.97274178125
Error:0.000743010489422852
Delta:-0.027258218750000007
Weights:[0.1, 0.239346125, -0.02736100000000008]
Weight_Deltas:
[0, -0.017717842187500006, -0.032709862500000006]



In [12]:
"""
Gradient Descent Learning with One Input and Multiple Outputs
"""

# 1. A network with multiple outputs
weights = [0.3, 0.2, 0.9]

def neural_network(input, weights):
    pred = ele_mul(input, weights)
    return pred


# 2. Predict and calcuate error and delta
wlrec = [0.65, 1.0, 1.0, 0.9] # input

hurt  = [0.1, 0.0, 0.0, 0.1]  # output (label)
win  =  [1,   1,   0,   1]
sad =   [0.1, 0.0, 0.1, 0.2]

input = wlrec[0]
true = [hurt[0], win[0], sad[0]]
pred = neural_network(input, weights)

error = [0, 0, 0]
delta = [0, 0, 0]
for i in range(len(true)):
    error[i] = (pred[i] - true[i]) ** 2
    delta[i] =  pred[i] - true[i]


# 3. Compare
def scalar_ele_mul(number, vector):
    output = [0, 0, 0]
    assert(len(output) == len(vector))
    for i in range(len(vector)):
        output[i] = number * vector[i]
    return output

# You calculate each delta the same way and then multiply them all by the same, single input. (Delta of each input is different)
# This becomes each weight’s weight_delta. 
weight_deltas = scalar_ele_mul(input,delta)


# 4. Learning
for i in range(len(weights)):
    weights[i] -= (weight_deltas[i] * alpha)

print("Weights:" + str(weights))
print("Weight Deltas:" + str(weight_deltas))

Weights:[0.281475, 0.36965000000000003, 0.8054250000000001]
Weight Deltas:[0.061750000000000006, -0.5655, 0.3152500000000001]


In [13]:
"""
Gradient Descent Learning with Multiple Inputs and Multiple Outputs
"""

# 1. A network with multiple inputs and outputs
weights = [ [0.1, 0.1, -0.3],
            [0.1, 0.2,  0.0],
            [0.0, 1.3,  0.1]]

def vect_mat_mul(vect, matrix):
    assert(len(vect) == len(matrix))
    output = [0, 0, 0]
    for i in range(len(vect)):
        output[i] += w_sum(vect, matrix[i])
    return output

def neural_network(input, weights):
    pred = vect_mat_mul(input, weights)
    return pred

# 2. Predict
toes  = [8.5,  9.5, 9.9, 9.0]
wlrec = [0.65, 0.8, 0.8, 0.9]
nfans = [1.2,  1.3, 0.5, 1.0]

hurt = [0.1, 0.0, 0.0, 0.1]
win  = [ 1,  1,   0,   1]
sad  = [0.1, 0.0, 0.1, 0.2]

alpha = 0.01
input = [toes[0], wlrec[0], nfans[0]]
true  = [hurt[0], win[0],   sad[0]]

pred = neural_network(input,weights)

error = [0, 0, 0]
delta = [0, 0, 0]

for i in range(len(true)):
    error[i] = (pred[i] - true[i]) ** 2
    delta[i] = pred[i] - true[i]

# 3. Compare
def zeros_matrix(len_a, len_b):
    z_mat = []
    for i in range(len_a):
        mat = []
        for j in range(len_b):
            mat.append(0)
        z_mat.append(mat)
    return z_mat

def outer_prod(vec_a, vec_b):
    out = zeros_matrix(len(vec_a), len(vec_b))
    for i in range(len(vec_a)):
        for j in range(len(vec_b)):
            out[i][j] = vec_a[i] * vec_b[j]   # 外积： 3×1 vector * 1×3 vector = 3 × 3 matrix
    return out

print(input)
print(delta)
weight_deltas = outer_prod(input, delta)

# 4. Learn
for i in range(len(weights)):
    for j in range(len(weights[0])):
        weights[i][j] -= alpha * weight_deltas[i][j]
print("Weights:" + str(weights))
print("Weight Deltas:" + str(weight_deltas))


[8.5, 0.65, 1.2]
[0.45500000000000007, -0.019999999999999907, 0.8650000000000001]
Weights:[[0.061325, 0.1017, -0.373525], [0.0970425, 0.20013, -0.005622500000000002], [-0.0054600000000000004, 1.30024, 0.08962]]
Weight Deltas:[[3.8675000000000006, -0.1699999999999992, 7.352500000000001], [0.29575000000000007, -0.01299999999999994, 0.5622500000000001], [0.546, -0.023999999999999886, 1.038]]



In [None]:
"""
Other Notes:

1. If the weight is high, it means the model believes there’s a high degree of correlation between that pixel and the number 2. If the
   number is very low (negative), then the network believes there is a very low correlation (perhaps even negative correlation) between that
   pixel and the number 2. 

2. A dot product is a loose measurement of similarity between two vectors. What does this mean for the weights and inputs? Well, if the
   weight vector is similar to the input vector for 2, then it will output a high score because the two vectors are similar. Inversely, if
   the weight vector is not similar to the input vector for 2, it will output a low score.
"""