In [3]:
import numpy as np

### Gauss-Newton Example 1 parameter
f(x) = exp(w*x)

(x,y)=> 
(0.5, 1), 
(1.5, 2),
(3,3)

In [12]:
def GN_algo(w, x=(0.5, 1.5, 3.0), y=(1.0, 2.0, 3.0)):
    """Perform one Gauss-Newton update for the one-parameter model f(x) = exp(w*x).

    Parameters
    ----------
    w : float or size-1 array
        Current estimate of the parameter. A size-1 array (such as the
        ``newW`` returned by a previous call) is accepted and unwrapped.
    x, y : sequence of float, optional
        Sample inputs and targets. They default to the three points used in
        this notebook: (0.5, 1), (1.5, 2), (3, 3).

    Returns
    -------
    deltaW : numpy.ndarray, shape (1, 1)
        The Gauss-Newton step, i.e. the solution of (J^T J) deltaW = J^T (y - f).
    newW : numpy.ndarray, shape (1, 1)
        The updated parameter ``w + deltaW``.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not contain the same number of samples, or if
        ``w`` holds more than one element.
    numpy.linalg.LinAlgError
        If ``J^T J`` is singular.
    """
    # Unwrap so that a (1, 1) array from a previous step can be fed back in.
    w = np.asarray(w, dtype=float).item()

    # Column vectors keep the (n, 1) layout of the Jacobian and the residual.
    x_col = np.asarray(x, dtype=float).reshape(-1, 1)
    y_col = np.asarray(y, dtype=float).reshape(-1, 1)
    if x_col.shape != y_col.shape:
        raise ValueError("x and y must contain the same number of samples")

    predicted = np.exp(w * x_col)

    # d/dw exp(w*x) = x * exp(w*x)
    J = x_col * predicted
    deltaY = y_col - predicted

    # Solve the normal equations directly rather than forming an explicit inverse.
    deltaW = np.linalg.solve(np.dot(J.T, J), np.dot(J.T, deltaY))
    newW = w + deltaW
    return deltaW, newW

In [13]:
# One Gauss-Newton step starting from w = 0.65217391 (the first iterate
# printed by the 20-step loop further down, which starts from w = 0).
deltaW, newW = GN_algo(.65217391)
print(deltaW, newW, sep="\n")

[[-0.19145178]]
[[0.46072213]]


In [6]:
# Unwrap the (1, 1) array into a plain Python float.
# NOTE(review): the saved output of this cell (0.7153...) does not match the
# newW printed by the preceding cell (0.46072213), and its execution count is
# out of order -- re-run the notebook top to bottom to refresh it.
newW.item()

0.7153083763175929

In [14]:
# Iterate Gauss-Newton from w = 0, printing the estimate after every step.
currentW = 0
for _ in range(20):
    # GN_algo returns (step, updated weight); keep only the updated weight
    # and unwrap it from its (1, 1) array into a float.
    currentW = GN_algo(currentW)[1].item()
    print(currentW)

0.6521739130434783
0.46072212855937256
0.38249402324483234
0.37269499512534165
0.3726266610135114
0.37262716247383704
0.3726271587441802
0.37262715877191704
0.3726271587717108
0.3726271587717124
0.3726271587717123
0.3726271587717123
0.3726271587717123
0.3726271587717123
0.3726271587717123
0.3726271587717123
0.3726271587717123
0.3726271587717123
0.3726271587717123
0.3726271587717123


### Gradient Descent Example 1 parameter
f(x) = exp(w*x)

(x,y)=> 
(0.5, 1), 
(1.5, 2),
(3,3)

In [16]:
def GD_algo(w,x,y,eta):
    """Return the weight after one gradient-descent step for f(x) = exp(w*x).

    The update is ``w - eta * 2 * sum((exp(w*x) - y) * x*exp(w*x))``, i.e. a
    step of size ``eta`` against the derivative of the summed squared error
    ``sum((exp(w*x) - y)**2)`` with respect to ``w``.

    w   : current weight
    x   : array of sample inputs
    y   : array of sample targets, same length as ``x``
    eta : learning rate
    """
    residual = np.exp(w*x) - y
    slope = x*np.exp(w*x)           # d/dw exp(w*x)
    half_gradient = np.dot(residual, slope)
    return w - eta*2*half_gradient

In [36]:
# Hand-check of a single gradient-descent step at w = 1 with eta = 0.001,
# showing each intermediate quantity.
w = 1
x = np.array([0.5, 1.5, 3])
y = np.array([1, 2, 3])

part1 = np.exp(w*x) - y          # residuals f(x) - y
part2 = x*np.exp(w*x)            # d/dw exp(w*x)
gradientEw = np.dot(part1, part2)
Newchange = .001*2*gradientEw    # amount subtracted from w by this step

print(part1, part2, Newchange, sep="\n")

[ 0.64872127  2.48168907 17.08553692]
[ 0.82436064  6.72253361 60.25661077]
2.0934691332443265


In [40]:
# Sample points for the model f(x) = exp(w*x).
x = np.array([0.5, 1.5, 3])
y = np.array([1, 2, 3])

eta = .0001        # learning rate
n_steps = 1000     # number of gradient-descent updates
currentW = 1       # starting weight

for step in range(n_steps):
    currentW = GD_algo(currentW, x, y, eta)
    print(currentW)

0.7906530866755673
0.7396928867002286
0.7044748393412105
0.6774550991813554
0.6555547540242649
0.6371791027707508
0.6213863426921297
0.6075708244384116
0.5953188487429314
0.5843347560071823
0.5743996576257536
0.5653468409744757
0.5570463368266771
0.5493948223832702
0.5423087819943974
0.5357197368454077
0.5295708332497331
0.5238143490294092
0.5184098359431865
0.513322712499349
0.5085231819109661
0.5039853888589073
0.49968675438590493
0.49560744552950464
0.491729948170912
0.48803871986988223
0.4845199053425404
0.48116110147834124
0.4779511618860076
0.47488003324354705
0.4719386174346916
0.46911865474293507
0.4664126243567878
0.46381365919558465
0.4613154726514224
0.45891229530119915
0.45659882000383056
0.4543701540841759
0.45222177753395176
0.45014950634371126
0.4481494602285241
0.4462180341307344
0.44435187298183
0.44254784928646973
0.4408030431585627
0.43911472449469857
0.43748033701635164
0.4358974839508388
0.4343639151533661
0.4328775154997563
0.43143629440249576
0.4300383763222876
0

## Play with the learning rate and the starting w. Notice how easily things can go wrong. 

### That's why most ML algorithms:

1) use a very small learning rate <br>
2) require inputs to be normalized (nothing exponential! everything is typically on a log scale) <br>
3) take great care in choosing the initial weights (related to the vanishing-gradient and exploding-gradient problems)