# 随机梯度下降法

$$
\nabla J(\theta)=\frac{2}{m} \cdot X_{b}^{T} \cdot\left(X_{b} \theta-y\right)
$$

$$
\eta=\frac{t_{0}}{\text{i\_iters}+t_{1}}
$$

In [17]:
import numpy as np
from sklearn import datasets

In [18]:
# Seed the global NumPy RNG so that the synthetic data (and every later draw
# from np.random in this notebook) is reproducible on Restart & Run All.
np.random.seed(42)

# Number of synthetic samples.
m = 100000

# One standard-normal feature; X is the same data as a column matrix (m, 1).
x = np.random.normal(size=m)
X = x.reshape(-1, 1)
# Linear target y = 4x + 3 plus Gaussian noise with standard deviation 3.
y = 4. * x + 3. + np.random.normal(0, 3, size=m)

In [19]:
 def J(theta, X_b, y):
     """Return the mean squared error of the linear model ``X_b.dot(theta)``.

     Parameters:
         theta: parameter vector multiplied against ``X_b``.
         X_b: design matrix (the callers in this notebook prepend a column
             of ones to the features).
         y: target values; ``len(y)`` is used as the averaging denominator.

     Returns:
         ``sum((y - X_b.dot(theta)) ** 2) / len(y)``, or ``float('inf')``
         when the computation raises an arithmetic error.
     """
     try:
         return np.sum((y - X_b.dot(theta)) ** 2) / len(y)
     except ArithmeticError:
         # Only numeric blow-ups (OverflowError, FloatingPointError, ...) are
         # reported as an infinite cost. Programming errors such as a shape
         # mismatch now propagate instead of being silently masked.
         return float('inf')
        
def dJ(theta, X_b, y):
    """Return the gradient of the mean squared error with respect to theta.

    Computes ``X_b.T.dot(X_b.dot(theta) - y) * 2 / len(X_b)`` over all rows
    of ``X_b`` at once.
    """
    residual = X_b.dot(theta) - y
    n_samples = len(X_b)
    return X_b.T.dot(residual) * 2.0 / n_samples

def gradient_descent(X_b, y, init_theta, eta, n_iters = 1e4, epsilon=1e-8):
    """Minimise the cost ``J`` with batch gradient descent.

    Parameters:
        X_b: design matrix passed through to ``J`` and ``dJ``.
        y: target values passed through to ``J`` and ``dJ``.
        init_theta: starting parameter vector; it is not modified in place.
        eta: fixed learning rate applied to every step.
        n_iters: maximum number of update steps.
        epsilon: stop as soon as the cost changes by less than this amount
            between two consecutive steps.

    Returns:
        The parameter vector after the last update performed.
    """
    theta = init_theta
    # Cost of the current theta. It is carried across iterations so that each
    # step evaluates J once instead of twice.
    cost = J(theta, X_b, y)
    i_iter = 0

    while i_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        theta = theta - eta * gradient

        new_cost = J(theta, X_b, y)
        converged = abs(new_cost - cost) < epsilon
        cost = new_cost
        if converged:
            break

        i_iter += 1

    return theta

In [20]:
%%time
# Design matrix: a leading column of ones (intercept term) followed by X.
X_b = np.hstack((np.ones((len(X), 1)), X))
# Start from the zero vector, one entry per column of X_b.
initial_theta = np.zeros((X_b.shape[1],))
eta = 0.01
theta = gradient_descent(X_b, y, initial_theta, eta=eta)

Wall time: 491 ms


In [21]:
theta

array([3.00771577, 4.00348501])

## 随机梯度下降

In [22]:
# Gradient direction used by stochastic gradient descent.
def dJ_sgd(theta, X_b_i, y_i):
    """Return the squared-error gradient computed from one sample.

    ``X_b_i`` is a single row of the design matrix and ``y_i`` its target;
    the result is ``X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2``.
    """
    residual = X_b_i.dot(theta) - y_i
    return X_b_i.T.dot(residual) * 2.0

In [27]:
def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):
    """Fit theta with stochastic gradient descent and a decaying step size.

    Each iteration draws one row index uniformly at random (with
    replacement, via ``np.random.randint``) and steps against the gradient
    that ``dJ_sgd`` computes from that single sample.

    Parameters:
        X_b: design matrix; one row is used per iteration.
        y: target values, indexed in step with the rows of ``X_b``.
        initial_theta: starting parameter vector; it is not modified in place.
        n_iters: number of single-sample updates to perform (an integer,
            since it is passed to ``range``).
        t0, t1: constants of the learning-rate schedule
            ``eta = t0 / (iteration + t1)``. The defaults 5 and 50 are the
            values that were previously hard-coded.

    Returns:
        The parameter vector after ``n_iters`` updates.
    """
    def learning_rate(t):
        # The step size shrinks as the iteration index t grows.
        return t0 / (t + t1)

    theta = initial_theta
    n_samples = len(X_b)
    for cur_iter in range(n_iters):
        rand_i = np.random.randint(n_samples)
        gradient = dJ_sgd(theta, X_b[rand_i], y[rand_i])
        theta = theta - learning_rate(cur_iter) * gradient

    return theta

In [28]:
%%time
# Design matrix: a leading column of ones (intercept term) followed by X.
X_b = np.hstack((np.ones((len(X), 1)), X))
# Start from the zero vector, one entry per column of X_b.
initial_theta = np.zeros((X_b.shape[1],))
# Run one single-sample update for every three rows of X_b.
theta = sgd(X_b, y, initial_theta, n_iters=len(X_b) // 3)

Wall time: 193 ms


In [29]:
theta

array([3.00838716, 4.00107266])