In [55]:
import numpy as np

In [56]:
lr = 0.01        # Learning rate (step-size multiplier used by the update rule)
w  = 10.0        # Initial weight: a fixed starting point, not a random draw

###### We want to find the value of w that minimizes F(w).
#### $$F(w) = w^2 - 6w + 1971$$

In [57]:
def F(w):
    """Objective function F(w) = w^2 - 6w + 1971.

    Arguments:
    w -- point at which to evaluate the function

    Returns:
    The value of F at w.
    """
    quadratic_term = w ** 2.0
    linear_term = 6.0 * w
    return quadratic_term - linear_term + 1971

###### The first-order derivative of F(w) above is
#### $$F'(w) = 2w - 6$$

In [58]:
def dF(w):
    """First derivative of the objective: F'(w) = 2w - 6.

    Arguments:
    w -- point at which to evaluate the derivative

    Returns:
    The slope of F at w.
    """
    doubled = 2.0 * w
    return doubled - 6.0

###### The gradient descent update rule is
#### $$w = w - \eta \nabla_{w} F(w) $$
###### The gradient ascent update rule is
#### $$w = w + \eta \nabla_{w} F(w) $$
###### We use gradient descent here. You can use gradient ascent as well :) but remember to multiply dF(w) by -1 first, because minimizing F(w) is the same as maximizing -F(w).

In [59]:
# Adam optimization (Momentum + RMSprop) of F(w), starting from the current w.
# The simpler update rules are kept as commented-out alternatives inside the
# loop; enable exactly one of the four variants at a time.
Vdw = 0.0        # Running average of the gradient (momentum term)
Sdw = 0.0        # Running average of the squared gradient (RMSprop term)
beta1 = 0.9      # Decay rate of Vdw
beta2 = 0.9999 # 0.9 won't work here!
epsilon = 1e-8   # Keeps the division finite when Sdw is 0 (zero gradient on the first step)
for epoch in range(1, 5001): # Epoch
    print('Epoch: {}, function value: f({:.4f})={:.4f}'.format(epoch, w, F(w)))
    ##1. Basic GD
    # w = w - (lr * dF(w))

    ##2. Momentum
    # Vdw = (beta1 * Vdw) + ((1-beta1) * dF(w)) # VdW = B*VdW + (1-B)*dW
    # w = w - (lr * Vdw)                        # W   = W - (lr * VdW)

    ##3. RMSprop
    # df = dF(w)
    # Sdw = (beta2 * Sdw) + ((1-beta2) * (df **2.0)) # SdW = B*SdW + (1-B)*(dW^2)
    # w = w - lr * (df/(np.sqrt(Sdw) + epsilon))     # W   = W - lr * dW/(sqrt(SdW) + eps)

    ##4. Adam (Momentum + RMSprop)
    df  = dF(w)
    Vdw = (beta1 * Vdw) + ((1-beta1) * df)           # VdW = B1*VdW + (1-B1)*dW
    Sdw = (beta2 * Sdw) + ((1-beta2) * (df ** 2.0))  # SdW = B2*SdW + (1-B2)*(dW^2)

    ## Bias-correction for Adam (optional, disabled by default).
    ## Keep the corrected values in separate names so the running averages
    ## Vdw / Sdw are not overwritten, and use them in the update below.
    # Vdw_hat = Vdw / (1-(beta1 ** epoch))
    # Sdw_hat = Sdw / (1-(beta2 ** epoch))

    # Without bias correction the very first step has size
    # lr * (1-beta1) / sqrt(1-beta2) = 10 * lr.
    # epsilon avoids 0/0 -> NaN when the gradient is exactly zero at the start.
    w   = w - ( lr * (Vdw / (np.sqrt(Sdw) + epsilon)) )

print('\nThe approximate argument of the function: {:.4f}'.format(w))
print('The minimum approximate value of the function: {:.4f}'.format(F(w)))
print('\n\033[1;31mPlease observe the values of the function, and you will see that gradient descent is trying to accurate in every iteration.')

Epoch: 1, function value: f(10.0000)=2011.0000
Epoch: 2, function value: f(9.9000)=2009.6100
Epoch: 3, function value: f(9.7657)=2007.7747
Epoch: 4, function value: f(9.6094)=2005.6846
Epoch: 5, function value: f(9.4379)=2003.4471
Epoch: 6, function value: f(9.2557)=2001.1335
Epoch: 7, function value: f(9.0658)=1998.7944
Epoch: 8, function value: f(8.8708)=1996.4667
Epoch: 9, function value: f(8.6726)=1994.1782
Epoch: 10, function value: f(8.4726)=1991.9492
Epoch: 11, function value: f(8.2721)=1989.7949
Epoch: 12, function value: f(8.0721)=1987.7262
Epoch: 13, function value: f(7.8735)=1985.7506
Epoch: 14, function value: f(7.6769)=1983.8731
Epoch: 15, function value: f(7.4829)=1982.0964
Epoch: 16, function value: f(7.2921)=1980.4218
Epoch: 17, function value: f(7.1048)=1978.8490
Epoch: 18, function value: f(6.9213)=1977.3767
Epoch: 19, function value: f(6.7420)=1976.0027
Epoch: 20, function value: f(6.5671)=1974.7242
Epoch: 21, function value: f(6.3967)=1973.5378
Epoch: 22, function v

In [15]:
"""
How does it work?

Arguments:
X -- input data, shape=(4,2)
Y -- label
learning rate -- scalar

Returns:
parameters --
"""

v = 10
# Print '0' unless v is strictly greater than 10.
if not (v > 10):
    print('0')

0
