In [102]:
import numpy as np

In [None]:
# Hyper-parameter and starting point for the optimisation below.
lr = 0.01  # learning rate: scales every update applied to w
w = 10.0   # initial value of w (a fixed starting guess, not drawn at random)

###### We are trying to find the value of w that minimizes F(w).
#### $$F(w) = w^2 - 6w + 1971$$

In [99]:
def F(w):
    """Evaluate the objective F(w) = w^2 - 6w + 1971 at ``w``."""
    squared_term = w ** 2.0
    linear_term = 6.0 * w
    return squared_term - linear_term + 1971

###### The first-order derivative of F(w) above is
#### $$F'(w) = 2w - 6$$

In [100]:
def dF(w):
    """Evaluate the derivative 2w - 6 at ``w``."""
    doubled = 2.0 * w
    return doubled - 6.0

###### Gradient Descent Optimization, is
#### $$w = w - \eta \nabla_{w} F(w) $$
###### Gradient Ascent Optimization, is
#### $$w = w + \eta \nabla_{w} F(w) $$
###### Here we use gradient descent. You can use gradient ascent as well, but then you must multiply dF(w) by -1, because minimizing F(w) is the same as maximizing -F(w).

In [101]:
# Gradient descent on F(w) using the Adam update (Momentum + RMSprop).
# The plain GD, Momentum and RMSprop updates are kept below as commented-out
# alternatives so that they can be swapped in for comparison.
Vdw = 0.0     # running average of the gradient (used by Momentum and Adam)
Sdw = 0.0     # running average of the squared gradient (used by RMSprop and Adam)
beta1 = 0.9
beta2 = 0.999 # 0.9 won't work here!
eps = 1e-8    # added to the denominator so a zero gradient gives a zero step, not 0/0 = NaN
for epoch in range(1, 1001): # Epoch
    # Report the current point before it is updated in this epoch.
    print('Epoch: {}, function value: f({:.4f})={:.4f}'.format(epoch, w, F(w)))
    ## Basic GD
    # w = w - (lr * dF(w))

    ## Momentum
    # Vdw = (beta1 * Vdw) + ((1-beta1) * dF(w)) # VdW = B*VdW + (1-B)*dW
    # w = w - (lr * Vdw)                        # W   = W - (lr * VdW)

    ## RMSprop
    # df = dF(w)
    # Sdw = (beta2 * Sdw) + ((1-beta2) * (df **2.0)) # SdW = B*SdW + (1-B)*(dW^2)
    # w = w - lr * (df / (np.sqrt(Sdw) + eps))       # W   = W - lr * dW / (sqrt(SdW) + eps)

    ## Adam (Momentum + RMSprop)
    df  = dF(w)
    Vdw = (beta1 * Vdw) + ((1-beta1) * df)           # VdW = B1*VdW + (1-B1)*dW
    Sdw = (beta2 * Sdw) + ((1-beta2) * (df ** 2.0))  # SdW = B2*SdW + (1-B2)*(dW^2)

    # Optional bias correction (left disabled, as in the original cell). If it
    # is enabled, keep the corrected values in separate variables so that the
    # running averages themselves are not overwritten:
    # Vdw_hat = Vdw / (1 - np.power(beta1, epoch))
    # Sdw_hat = Sdw / (1 - np.power(beta2, epoch))

    # W = W - lr * VdW / (sqrt(SdW) + eps)
    w   = w - ( lr * (Vdw / (np.sqrt(Sdw) + eps)) )

print('\nThe approximate argument of the function: {:.4f}'.format(w))
print('The minimum approximate value of the function: {:.4f}'.format(F(w)))
print('\n\033[1;31mPlease observe the values of the function, and you will see that gradient descent is trying to accurate in every iteration.')

Epoch: 1, function value: f(10.0000)=2011.0000
Epoch: 2, function value: f(9.9684)=2010.5583
Epoch: 3, function value: f(9.9259)=2009.9679
Epoch: 4, function value: f(9.8764)=2009.2849
Epoch: 5, function value: f(9.8220)=2008.5401
Epoch: 6, function value: f(9.7641)=2007.7536
Epoch: 7, function value: f(9.7037)=2006.9397
Epoch: 8, function value: f(9.6415)=2006.1089
Epoch: 9, function value: f(9.5779)=2005.2693
Epoch: 10, function value: f(9.5136)=2004.4271
Epoch: 11, function value: f(9.4488)=2003.5870
Epoch: 12, function value: f(9.3838)=2002.7529
Epoch: 13, function value: f(9.3188)=2001.9276
Epoch: 14, function value: f(9.2541)=2001.1135
Epoch: 15, function value: f(9.1897)=2000.3122
Epoch: 16, function value: f(9.1258)=1999.5252
Epoch: 17, function value: f(9.0624)=1998.7533
Epoch: 18, function value: f(8.9998)=1997.9973
Epoch: 19, function value: f(8.9378)=1997.2577
Epoch: 20, function value: f(8.8766)=1996.5348
Epoch: 21, function value: f(8.8162)=1995.8287
Epoch: 22, function v