In [30]:
import numpy as np
import math

# **Bai tap 1: Gradient Descent**

Objective Function: 
$$
f(w_1, w_2) = 0.1w_1^2 + 2w_2^2 \tag{1}
$$


In [5]:
def function(w1, w2):
    """Evaluate the objective f(w1, w2) = 0.1 * w1^2 + 2 * w2^2.

    Arguments:
    w1 -- first coordinate
    w2 -- second coordinate
    Returns:
    The value of the objective at (w1, w2).
    """
    w1_term = 0.1 * w1**2
    w2_term = 2 * w2**2
    return w1_term + w2_term

In [8]:
def df_w(W):
    """Gradient of f(w1, w2) = 0.1 * w1^2 + 2 * w2^2 evaluated at W.

    Arguments:
    W -- indexable pair [w1, w2]
    Returns:
    np.array: [0.2 * w1, 4 * w2], the partial derivatives w.r.t. w1 and w2
    """
    grad_w1 = 0.2 * W[0]
    grad_w2 = 4 * W[1]
    return np.array([grad_w1, grad_w2])

In [9]:
def sgd(W, dW, lr):
    """
    Apply one Gradient Descent step to update w1 and w2.
    Arguments:
    W -- np.array: [w1, w2]
    dW -- np.array: [dw1, dw2], derivatives with respect to w1 and w2
    lr -- float: learning rate
    Returns:
    np.array: [w1, w2] after the update (the input W is not modified)
    """
    step = lr * dW
    return W - step

In [26]:
# Plain gradient descent for 2 epochs, starting from (w1, w2) = (-5, -2).
lr, n_epochs = 0.4, 2
W = np.array([-5, -2])
W_log = []  # weights after each update

for i in range(n_epochs):
    dW = df_w(W)        # gradient at the current point
    W = sgd(W, dW, lr)  # one descent step
    print(f'epoch: {i+1} || w1: {W[0]} || w2: {W[1]}')
    W_log.append(W)

epoch: 1 || w1: -4.6 || w2: 1.2000000000000002
epoch: 2 || w1: -4.231999999999999 || w2: -0.7200000000000002


In [12]:
# Plain gradient descent for 30 epochs, starting from (w1, w2) = (-5, -2).
lr, n_epochs = 0.4, 30
W = np.array([-5, -2])
W_log = []  # weights after each update

for i in range(n_epochs):
    dW = df_w(W)        # gradient at the current point
    W = sgd(W, dW, lr)  # one descent step
    print(f'epoch: {i+1} || w1: {W[0]} || w2: {W[1]}')
    W_log.append(W)

epoch: 1 || w1: -4.6 || w2: 1.2000000000000002
epoch: 2 || w1: -4.231999999999999 || w2: -0.7200000000000002
epoch: 3 || w1: -3.893439999999999 || w2: 0.43200000000000016
epoch: 4 || w1: -3.5819647999999993 || w2: -0.2592000000000001
epoch: 5 || w1: -3.2954076159999994 || w2: 0.1555200000000001
epoch: 6 || w1: -3.0317750067199993 || w2: -0.09331200000000006
epoch: 7 || w1: -2.7892330061823993 || w2: 0.05598720000000004
epoch: 8 || w1: -2.5660943656878072 || w2: -0.03359232000000004
epoch: 9 || w1: -2.360806816432783 || w2: 0.020155392000000022
epoch: 10 || w1: -2.1719422711181604 || w2: -0.012093235200000017
epoch: 11 || w1: -1.9981868894287076 || w2: 0.007255941120000012
epoch: 12 || w1: -1.838331938274411 || w2: -0.0043535646720000085
epoch: 13 || w1: -1.691265383212458 || w2: 0.0026121388032000056
epoch: 14 || w1: -1.5559641525554613 || w2: -0.0015672832819200039
epoch: 15 || w1: -1.4314870203510244 || w2: 0.0009403699691520025
epoch: 16 || w1: -1.3169680587229424 || w2: -0.00056422

# **Bai tap 2: Gradient Descent + Momentum**
$$
v_t = \beta v_{t-1} + (1 - \beta) dW_t
$$

$$
W_t = W_{t-1} - \alpha v_t
$$

In [27]:
def sgd_momentum(W, dW, v_t_1, beta, lr):
    """
    Apply one Gradient Descent + Momentum step.
    Arguments:
    W -- np.array: [w1, w2], current weights
    dW -- np.array: [dw1, dw2], derivatives with respect to w1 and w2
    v_t_1 -- np.array: momentum term from the previous step
    beta -- float: weight given to the previous momentum term
    lr -- float: learning rate
    Returns:
    (v_t, W) -- the new momentum term and the updated weights
    """
    # Exponential moving average of the gradients.
    v_t = (1 - beta) * dW + beta * v_t_1
    return v_t, W - lr * v_t

In [28]:
# Gradient descent with momentum for 2 epochs, starting from (w1, w2) = (-5, -2).
W = np.array([-5, -2])
v_t_1 = np.array([0, 0])  # momentum term starts at zero
lr = 0.4
beta = 0.9
n_epochs = 2
W_log = []  # weights after each update

for i in range(n_epochs):
    dW = df_w(W)
    v_t_1, W = sgd_momentum(W, dW, v_t_1, beta, lr)
    # Record the trajectory; W_log was previously created but never filled,
    # unlike in the plain gradient-descent cells.
    W_log.append(W)
    print(f'epoch: {i+1} || w1: {W[0]} || w2: {W[1]}')

epoch: 1 || w1: -4.96 || w2: -1.6800000000000002
epoch: 2 || w1: -4.88432 || w2: -1.1232000000000002


In [29]:
# Gradient descent with momentum for 30 epochs, starting from (w1, w2) = (-5, -2).
W = np.array([-5, -2])
v_t_1 = np.array([0, 0])  # momentum term starts at zero
lr = 0.4
beta = 0.9
n_epochs = 30
W_log = []  # weights after each update

for i in range(n_epochs):
    dW = df_w(W)
    v_t_1, W = sgd_momentum(W, dW, v_t_1, beta, lr)
    # Record the trajectory; W_log was previously created but never filled,
    # unlike in the plain gradient-descent cells.
    W_log.append(W)
    print(f'epoch: {i+1} || w1: {W[0]} || w2: {W[1]}')

epoch: 1 || w1: -4.96 || w2: -1.6800000000000002
epoch: 2 || w1: -4.88432 || w2: -1.1232000000000002
epoch: 3 || w1: -4.77713344 || w2: -0.4423680000000002
epoch: 4 || w1: -4.6424484684800005 || w2: 0.24115967999999988
epoch: 5 || w1: -4.4840924063641605 || w2: 0.8177490431999999
epoch: 6 || w1: -4.305699211208991 || w2: 1.205839623168
epoch: 7 || w1: -4.110699741879667 || w2: 1.36218680543232
epoch: 8 || w1: -3.9023146215482383 || w2: 1.284949380601037
epoch: 9 || w1: -3.683549496277566 || w2: 1.0098437973567165
epoch: 10 || w1: -3.4571924875637405 || w2: 0.6006737648597533
epoch: 11 || w1: -3.2258136398207875 || w2: 0.136312933234926
epoch: 12 || w1: -2.9917661677335636 || w2: -0.30342188454500685
epoch: 13 || w1: -2.7571893135131935 || w2: -0.6506357190197454
epoch: 14 || w1: -2.524012630206755 || w2: -0.8590264550038508
epoch: 15 || w1: -2.2939615141893066 || w2: -0.9091338845889295
epoch: 16 || w1: -2.0685638176600882 || w2: -0.8087691496812717
epoch: 17 || w1: -1.849157380242511 

# **Bai tap 3: RMSProp**
$$
S_t = \gamma S_{t-1} + (1 - \gamma) dW_t^2
$$

$$
W_t = W_{t-1} - \alpha * \frac{dW_t}{\sqrt{S_t + \epsilon}}
$$

In [34]:
def RMSProp(W, dW, S_t_1, gamma, lr, eps=1e-10):
    """
    Apply one RMSProp step.
    Arguments:
    W -- np.array: [w1, w2], current weights
    dW -- np.array: [dw1, dw2], derivatives with respect to w1 and w2
    S_t_1 -- np.array: running average of squared gradients from the previous step
    gamma -- float: decay rate of the running average
    lr -- float: learning rate
    eps -- float: small constant added inside the square root to avoid
           division by zero (default 1e-10, the value previously hard-coded)
    Returns:
    (S_t, W) -- the new running average and the updated weights
    """
    # Exponential moving average of the element-wise squared gradients.
    S_t = gamma * S_t_1 + (1 - gamma) * dW**2
    # Scale each coordinate's step by the root of its running average.
    W = W - lr * dW / (np.sqrt(S_t + eps))
    return S_t, W

In [35]:
# RMSProp for 2 epochs, starting from (w1, w2) = (-5, -2).
W = np.array([-5, -2])
S_t_1 = np.array([0, 0])  # running average of squared gradients starts at zero
lr = 0.3
gamma = 0.9
n_epochs = 2
W_log = []  # weights after each update

for i in range(n_epochs):
    dW = df_w(W)
    S_t_1, W = RMSProp(W, dW, S_t_1, gamma, lr)
    # Record the trajectory; W_log was previously created but never filled,
    # unlike in the plain gradient-descent cells.
    W_log.append(W)
    print(f'epoch: {i+1} || w1: {W[0]} || w2: {W[1]}')

epoch: 1 || w1: -4.0513167024238275 || w2: -1.0513167019568976
epoch: 2 || w1: -3.435191235908538 || w2: -0.5915233460714318


In [36]:
# RMSProp for 30 epochs, starting from (w1, w2) = (-5, -2).
W = np.array([-5, -2])
S_t_1 = np.array([0, 0])  # running average of squared gradients starts at zero
lr = 0.3
gamma = 0.9
n_epochs = 30
W_log = []  # weights after each update

for i in range(n_epochs):
    dW = df_w(W)
    S_t_1, W = RMSProp(W, dW, S_t_1, gamma, lr)
    # Record the trajectory; W_log was previously created but never filled,
    # unlike in the plain gradient-descent cells.
    W_log.append(W)
    print(f'epoch: {i+1} || w1: {W[0]} || w2: {W[1]}')

epoch: 1 || w1: -4.0513167024238275 || w2: -1.0513167019568976
epoch: 2 || w1: -3.435191235908538 || w2: -0.5915233460714318
epoch: 3 || w1: -2.9589298311283407 || w2: -0.32943933595909264
epoch: 4 || w1: -2.5654553360268144 || w2: -0.17756476555095968
epoch: 5 || w1: -2.229197709411829 || w2: -0.09163252413048745
epoch: 6 || w1: -1.9362595744603213 || w2: -0.04494496259456326
epoch: 7 || w1: -1.6781688915366537 || w2: -0.020814215264231956
epoch: 8 || w1: -1.449341941284048 || w2: -0.009035577662982902
epoch: 9 || w1: -1.2458742191916645 || w2: -0.0036459014240193023
epoch: 10 || w1: -1.0648954313269696 || w2: -0.0013535079991089513
epoch: 11 || w1: -0.9041949384287257 || w2: -0.0004564436244772687
epoch: 12 || w1: -0.7619894824288593 || w2: -0.00013756262024208462
epoch: 13 || w1: -0.6367718430685567 || w2: -3.625999829297074e-05
epoch: 14 || w1: -0.5272089789683597 || w2: -8.113344520888707e-06
epoch: 15 || w1: -0.4320726789876681 || w2: -1.4747268910040716e-06
epoch: 16 || w1: -0.3

# **Bai tap 4: Adam**
$$
v_t = \beta_1 v_{t-1} + (1 - \beta_1) dW_t
$$

$$
vcorr_t = \frac{v_t}{1 - \beta_1}
$$

$$
S_t = \beta_2 S_{t-1} + (1 - \beta_2) dW_t^2
$$

$$
Scorr_t = \frac{S_t}{1 - \beta_2}
$$

$$
W_t = W_{t-1} - \alpha * \frac{vcorr_t}{\sqrt{Scorr_t} + \epsilon}
$$

In [40]:
def Adam(W, dW, v_t_1, S_t_1, beta1, beta2, lr, t=1, eps=1e-6):
    """
    Apply one Adam step.
    Arguments:
    W -- np.array: [w1, w2], current weights
    dW -- np.array: [dw1, dw2], derivatives with respect to w1 and w2
    v_t_1 -- np.array: first-moment (gradient) average from the previous step
    S_t_1 -- np.array: second-moment (squared gradient) average from the previous step
    beta1 -- float: decay rate of the first-moment average
    beta2 -- float: decay rate of the second-moment average
    lr -- float: learning rate
    t -- int: 1-based step index used for bias correction. The default of 1
         divides by the constant (1 - beta), which is what this function did
         before `t` existed; pass the real step number (1, 2, 3, ...) to get
         the standard Adam correction 1 - beta**t.
    eps -- float: small constant added to the denominator to avoid division
           by zero (default 1e-6, the value previously hard-coded)
    Returns:
    (v_t, S_t, W) -- the uncorrected moment averages to carry into the next
    step, and the updated weights
    Raises:
    ValueError -- if t < 1 (the correction factor would be zero or negative)
    """
    if t < 1:
        raise ValueError("t must be a step index >= 1")

    # Exponential moving averages of the gradient and the squared gradient.
    v_t = beta1 * v_t_1 + (1 - beta1) * dW
    S_t = beta2 * S_t_1 + (1 - beta2) * dW**2

    # Bias correction; with t == 1 this reduces to dividing by (1 - beta).
    vcorr_t = v_t / (1 - beta1**t)
    Scorr_t = S_t / (1 - beta2**t)

    W = W - lr * vcorr_t / (np.sqrt(Scorr_t) + eps)
    return v_t, S_t, W

In [41]:
# Adam for 2 epochs, starting from (w1, w2) = (-5, -2).
W = np.array([-5, -2])
v_t_1 = np.array([0, 0])  # first-moment average starts at zero
S_t_1 = np.array([0, 0])  # second-moment average starts at zero
beta1 = 0.9
beta2 = 0.9
lr = 0.2
n_epochs = 2
W_log = []  # weights after each update

for i in range(n_epochs):
    dW = df_w(W)
    v_t_1, S_t_1, W = Adam(W, dW, v_t_1, S_t_1, beta1, beta2, lr)
    # Record the trajectory; W_log was previously created but never filled,
    # unlike in the plain gradient-descent cells.
    W_log.append(W)
    print(f'epoch: {i+1} || w1: {W[0]} || w2: {W[1]}')

epoch: 1 || w1: -4.8000001999998 || w2: -1.8000000249999968
epoch: 2 || w1: -4.524376784924456 || w2: -1.5247011705704885


In [42]:
# Adam for 30 epochs, starting from (w1, w2) = (-5, -2).
W = np.array([-5, -2])
v_t_1 = np.array([0, 0])  # first-moment average starts at zero
S_t_1 = np.array([0, 0])  # second-moment average starts at zero
beta1 = 0.9
beta2 = 0.9
lr = 0.2
n_epochs = 30
W_log = []  # weights after each update

for i in range(n_epochs):
    dW = df_w(W)
    v_t_1, S_t_1, W = Adam(W, dW, v_t_1, S_t_1, beta1, beta2, lr)
    # Record the trajectory; W_log was previously created but never filled,
    # unlike in the plain gradient-descent cells.
    W_log.append(W)
    print(f'epoch: {i+1} || w1: {W[0]} || w2: {W[1]}')

epoch: 1 || w1: -4.8000001999998 || w2: -1.8000000249999968
epoch: 2 || w1: -4.524376784924456 || w2: -1.5247011705704885
epoch: 3 || w1: -4.195413081516243 || w2: -1.1974739478610454
epoch: 4 || w1: -3.82533158869789 || w2: -0.8331057098110453
epoch: 5 || w1: -3.4224575282903307 || w2: -0.4452642140343425
epoch: 6 || w1: -2.9932150855427517 || w2: -0.050657339222700326
epoch: 7 || w1: -2.5430697710125774 || w2: 0.3273706307850763
epoch: 8 || w1: -2.0771105687176585 || w2: 0.6578817219630834
epoch: 9 || w1: -1.600506753106917 || w2: 0.9091199919694746
epoch: 10 || w1: -1.118941604474419 || w2: 1.0617540579443505
epoch: 11 || w1: -0.6390669800735427 || w2: 1.1150460045302546
epoch: 12 || w1: -0.16896864617094765 || w2: 1.080712409825397
epoch: 13 || w1: 0.28145315172841634 || w2: 0.974448425475911
epoch: 14 || w1: 0.7004120446842047 || w2: 0.8115995828616974
epoch: 15 || w1: 1.0748111442376107 || w2: 0.6063249003795163
epoch: 16 || w1: 1.3917219424476053 || w2: 0.37237293770780483
epoch