In [2]:
import handcalcs.render

# 6.1 매개변수 갱신

### 1. 확률적 경사 하강법 (SGD)
$$W := W - \eta \frac{\partial L}{\partial W}$$

In [14]:
class SGD:
    """Stochastic gradient descent: W := W - lr * dL/dW."""

    def __init__(self, lr=0.01):
        # lr: step size that scales every gradient before it is subtracted.
        self.lr = lr

    def update(self, params, grads):
        """Apply one SGD step to every entry of ``params``.

        ``params`` and ``grads`` are mappings looked up with the same keys.
        Each parameter is updated with ``-=``, so values that support in-place
        subtraction (e.g. NumPy arrays) are modified in place.
        """
        step_size = self.lr
        for name in params:
            params[name] -= step_size * grads[name]

### 2. Momentum
$$v := \alpha v - \eta \frac{\partial L}{\partial W}$$
$$W := W + v$$
<br>
$$v: \text{velocity (속도)}$$
$$\alpha: \text{momentum coefficient (e.g. 0.9)}$$

In [15]:
class Momentum:
    """SGD with momentum: v := momentum * v - lr * dL/dW, then W := W + v."""

    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        # Per-parameter velocity; created on the first call to update().
        self.v = None

    def update(self, params, grads):
        """Advance every entry of ``params`` by one momentum step.

        On the first call a zero velocity with the same shape as each
        parameter is created. Parameters are updated with ``+=``, so values
        that support in-place addition (e.g. NumPy arrays) are modified in
        place.
        """
        if self.v is None:
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            velocity = self.momentum * self.v[name] - self.lr * grads[name]
            self.v[name] = velocity
            params[name] += velocity

### 3. AdaGrad
##### Learning rate decay -> per-element adaptive learning rate

$$h := h + \frac{\partial L}{\partial W} \odot \frac{\partial L}{\partial W}$$
$$W := W - \eta \frac{1}{\sqrt h} \frac{\partial L}{\partial W}$$

- h: 각 기울기 원소 값 제곱하여 더한 후 업데이트
- W: 1/sqrt(h) 를 통해 크게 갱신된 원소는 학습률 낮게 업데이트
- 학습 진행할수록 갱신 강도 0으로 수렴 -> 문제 개선: RMSProp (먼 과거의 기울기 잊고 새 기울기 정보 크게 반영)

In [16]:
class AdaGrad:
    """AdaGrad: scale each element's step by 1 / sqrt(sum of squared gradients)."""

    def __init__(self, lr=0.01):
        self.lr = lr
        # Per-parameter running sum of squared gradients; created lazily.
        self.h = None

    def update(self, params, grads):
        """Accumulate squared gradients and apply one AdaGrad step.

        On the first call a zero accumulator with the same shape as each
        parameter is created. Both the accumulator and the parameters are
        updated with augmented assignment, so NumPy arrays are modified in
        place.
        """
        if self.h is None:
            self.h = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            grad = grads[name]
            self.h[name] += grad * grad
            # The 1e-7 term guards against division by zero when h is still 0.
            scale = np.sqrt(self.h[name]) + 1e-7
            params[name] -= self.lr * grad / scale

### 4. Adam
Momentum + AdaGrad

In [17]:
class Adam:
    """Adam optimizer (http://arxiv.org/abs/1412.6980v8).

    Keeps exponential moving averages of the gradient (``m``) and of the
    squared gradient (``v``) for every parameter.
    """

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        # Number of update() calls made so far.
        self.iter = 0
        # First- and second-moment estimates; created on the first update().
        self.m = None
        self.v = None

    def update(self, params, grads):
        """Apply one Adam step to every entry of ``params``.

        On the first call zero-filled moment estimates with the same shape as
        each parameter are created. Moments and parameters are updated with
        augmented assignment, so NumPy arrays are modified in place.
        """
        if self.m is None:
            self.m = {name: np.zeros_like(value) for name, value in params.items()}
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        self.iter += 1
        t = self.iter
        # Step size with the (1 - beta**t) factors of both moments folded in.
        lr_t = self.lr * np.sqrt(1.0 - self.beta2**t) / (1.0 - self.beta1**t)

        for name in params:
            grad = grads[name]
            # Algebraically the same as m = beta1*m + (1-beta1)*grad and
            # v = beta2*v + (1-beta2)*grad**2, written as in-place increments.
            self.m[name] += (1 - self.beta1) * (grad - self.m[name])
            self.v[name] += (1 - self.beta2) * (grad**2 - self.v[name])

            # The 1e-7 term keeps the denominator away from zero.
            params[name] -= lr_t * self.m[name] / (np.sqrt(self.v[name]) + 1e-7)

# 6.2 가중치 초깃값