In [None]:
import numpy as np

### Gauss-Newton example: 1 parameter
f(x) = exp(w*x)

(x,y)=> 
(0.5, 1), 
(1.5, 2),
(3,3)

In [2]:
def GN_algo(w, x=(0.5, 1.5, 3), y=(1, 2, 3)):
    """Perform one Gauss-Newton update for the model f(x) = exp(w*x).

    Parameters
    ----------
    w : float or size-1 array
        Current estimate of the single parameter. A size-1 array (for
        example the ``newW`` returned by a previous call) is unwrapped to a
        scalar, so the function can be fed its own output.
    x, y : array-like of equal length, optional
        Data points to fit. The defaults are the three example points
        (0.5, 1), (1.5, 2) and (3, 3), so ``GN_algo(w)`` behaves as before.

    Returns
    -------
    deltaW : numpy.ndarray, shape (1, 1)
        The Gauss-Newton step, i.e. the solution of (J^T J) deltaW = J^T deltaY.
    newW : numpy.ndarray, shape (1, 1)
        The updated parameter ``w + deltaW``.

    Raises
    ------
    ValueError
        If ``w`` holds more than one element or ``x`` and ``y`` differ in length.
    numpy.linalg.LinAlgError
        If ``J^T J`` is singular (for example when every ``x`` is zero).
    """
    w = np.asarray(w, dtype=float).item()
    # Column vectors, so that J has shape (n_points, 1) like the original.
    x = np.asarray(x, dtype=float).reshape(-1, 1)
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    if x.shape != y.shape:
        raise ValueError("x and y must contain the same number of points")

    fitted = np.exp(w * x)
    J = x * fitted        # d f(x_i) / d w for every data point
    deltaY = y - fitted   # residuals: observed minus predicted

    # Solve the normal equations directly rather than forming an explicit inverse.
    deltaW = np.linalg.solve(J.T @ J, J.T @ deltaY)
    newW = w + deltaW
    return deltaW, newW

In [3]:
# One Gauss-Newton step starting from w = 1; show the step, then the updated w.
deltaW, newW = GN_algo(1)
for result in (deltaW, newW):
    print(result)

[[-0.28469162]]
[[0.71530838]]


In [4]:
# Unwrap the 1x1 array into a plain Python float.
newW.item()

0.7153083763175929

In [5]:
# Iterate Gauss-Newton 20 times from w = 1, printing each new estimate.
currentW = 1
for i in range(20):
    _, currentW = GN_algo(currentW)
    # GN_algo returns a 1x1 array; convert back to a scalar for the next call.
    currentW = currentW.item()
    print(currentW)

0.7153083763175929
0.4988061722819472
0.3925818490187722
0.3730513164563157
0.37262426747160476
0.3726271802861603
0.3726271586117129
0.37262715877290226
0.3726271587717035
0.3726271587717124
0.3726271587717123
0.3726271587717123
0.3726271587717123
0.3726271587717123
0.3726271587717123
0.3726271587717123
0.3726271587717123
0.3726271587717123
0.3726271587717123
0.3726271587717123


### Gradient Descent Example 1 parameter
f(x) = exp(w*x)

(x,y)=> 
(0.5, 1), 
(1.5, 2),
(3,3)

In [36]:
def GD_algo(w,x,y,eta):
    """Take one gradient-descent step for the model f(x) = exp(w*x).

    The loss is the sum of squared errors E(w) = sum((exp(w*x) - y)**2),
    whose derivative is dE/dw = 2 * sum((exp(w*x) - y) * x * exp(w*x)).

    Parameters
    ----------
    w : current parameter value.
    x, y : data points (1-D NumPy arrays where this notebook calls it).
    eta : learning rate.

    Returns
    -------
    The updated parameter ``w - eta * dE/dw``.
    """
    prediction = np.exp(w*x)
    residual = prediction - y
    slope = x*prediction
    # The dot product sums residual * slope over all data points.
    half_gradient = np.dot(residual, slope)
    return w - eta*2*half_gradient

In [58]:
# The same three data points, stored as arrays for GD_algo.
x = np.array([0.5, 1.5, 3])
y = np.array([1,2,3])

# Run 1000 gradient-descent steps from w = 1 with learning rate 0.001,
# printing every iterate.
eta = 0.001
n_steps = 1000
currentW = 1
for i in range(n_steps):
    currentW = GD_algo(currentW,x,y,eta)
    print(currentW)

-1.0934691332443265
-1.0915059997447711
-1.0895362925732195
-1.0875599616994525
-1.085576956510969
-1.083587225803901
-1.0815907177737505
-1.0795873800059457
-1.0775771594662098
-1.075560002490741
-1.0735358547761966
-1.0715046613694785
-1.0694663666573134
-1.067420914355625
-1.0653682474986912
-1.0633083084280823
-1.0612410387813747
-1.0591663794806336
-1.0570842707206614
-1.0549946519570026
-1.0528974618937033
-1.050792638470815
-1.0486801188516404
-1.0465598394097104
-1.0444317357154909
-1.0422957425228065
-1.04015179375498
-1.037999822490675
-1.0358397609494385
-1.0336715404769334
-1.0314950915298529
-1.0293103436605098
-1.0271172255010916
-1.024915664747573
-1.0227055881432756
-1.020486921462068
-1.0182595894911943
-1.0160235160137232
-1.0137786237906057
-1.011524834542333
-1.009262068930183
-1.0069902465370422
-1.0047092858477966
-1.0024191042292743
-1.0001196179097318
-0.9978107419578689
-0.9954923902613608
-0.9931644755048934
-0.9908269091476878
-0.9884796014005003
-0.986122461

## Play with the learning rate and the starting w. Notice how easily things can go wrong. 

### That's why most ML algorithms:

1) use a very small learning rate <br>
2) require inputs to be normalized (nothing exponential! everything is typically on a log scale) <br>
3) take great care in choosing the initial weights (related to the problems of vanishing and exploding gradients)