### 批量梯度下降法

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Seed NumPy's global generator so the synthetic data, and every later draw
# from np.random in this notebook, is identical after Restart Kernel -> Run All.
np.random.seed(666)

m = 100000  # number of synthetic samples

# Ground truth is y = 4x + 3 plus Gaussian noise with standard deviation 3,
# so a correct fit should recover an intercept near 3 and a slope near 4.
x = np.random.normal(size=m)
X = x.reshape(-1, 1)  # the single feature as an (m, 1) column matrix
y = 4. * x + 3. + np.random.normal(0., 3., size=m)

In [3]:
def J(theta,X_b,y):
    """Mean squared error of the linear model ``X_b.dot(theta)`` against ``y``.

    Parameters
    ----------
    theta : coefficient vector, one entry per column of ``X_b``.
    X_b : design matrix; in this notebook it is the feature matrix with a
        leading column of ones.
    y : target values, one per row of ``X_b``.

    Returns
    -------
    The sum of squared residuals divided by ``len(X_b)``, or ``float('inf')``
    if evaluating that expression raises an ordinary exception.
    """
    try:
        return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)
    except Exception:
        # A cost that cannot be evaluated is reported as infinitely bad.
        # Catching Exception instead of using a bare except keeps that
        # fallback while letting KeyboardInterrupt and SystemExit propagate,
        # so a long-running descent can still be interrupted.
        return float('inf')

In [4]:
def dJ(theta,X_b,y):
    """Gradient of the mean-squared-error cost with respect to ``theta``.

    Evaluates ``X_b^T (X_b theta - y) * 2 / n`` with ``n = len(X_b)``, using
    every row of ``X_b`` in a single vectorized expression.
    """
    residual = X_b.dot(theta) - y
    n_samples = len(X_b)
    return X_b.T.dot(residual) * 2 / n_samples

In [5]:
def gradient_descent(initial_theta,X_b,y,eta,n_iters=1e4,epsilon=1e-8):
    """Minimise ``J`` by batch gradient descent and return the final ``theta``.

    Parameters
    ----------
    initial_theta : starting coefficient vector; it is not modified in place.
    X_b : design matrix passed through to ``J`` and ``dJ``.
    y : target values passed through to ``J`` and ``dJ``.
    eta : step size applied to the gradient on every update.
    n_iters : upper bound on the number of updates.
    epsilon : the loop stops as soon as one update changes the cost by less
        than this amount.

    Returns
    -------
    The coefficient vector after the last update performed.
    """
    theta = initial_theta
    # The cost at the current theta is carried from one iteration to the next.
    # The value is identical to recomputing J(last_theta) every time, but J is
    # evaluated once per iteration instead of twice.
    cost = J(theta, X_b, y)
    i_iter = 0

    while i_iter < n_iters:
        gradient = dJ(theta, X_b, y)
        theta = theta - eta * gradient

        new_cost = J(theta, X_b, y)
        if abs(new_cost - cost) < epsilon:
            break
        cost = new_cost

        i_iter += 1
    return theta

In [6]:
%%time

# Step size used for every batch update.
eta = 0.01

# Prepend a column of ones to X so that theta[0] plays the role of the
# intercept, and start the descent from the all-zeros vector.
X_b = np.hstack([np.ones((len(X), 1)), X])
initial_theta = np.zeros(X_b.shape[1])

theta = gradient_descent(initial_theta, X_b, y, eta)

Wall time: 1.22 s


In [7]:
theta

array([2.99565462, 3.99677556])

### 随机梯度下降法

In [8]:
def dJ_sgd(theta,X_b_i,y_i):
    """Gradient of the squared error computed from ``X_b_i`` and ``y_i`` only.

    ``sgd`` calls this with a single row of the design matrix and its target,
    so no division by a sample count is applied here.
    """
    error = X_b_i.dot(theta) - y_i
    return X_b_i.T.dot(error) * 2

In [9]:
def sgd(X_b,y,initial_theta,n_iters,t0=5,t1=50):
    """Stochastic gradient descent with a decaying learning rate.

    Parameters
    ----------
    X_b : design matrix; one row is drawn uniformly at random per update.
    y : target values, indexed in step with the rows of ``X_b``.
    initial_theta : starting coefficient vector; it is not modified in place.
    n_iters : number of single-sample updates to perform.
    t0, t1 : learning-rate schedule constants; update ``t`` uses the step size
        ``t0 / (t + t1)``. The defaults reproduce the previously hard-coded
        schedule.

    Returns
    -------
    The coefficient vector after ``n_iters`` updates.
    """
    def learning_rate(t):
        # Shrinks as t grows so that late, noisy single-sample gradients
        # move theta less and less.
        return t0 / (t + t1)

    theta = initial_theta

    for cur_iter in range(n_iters):
        # Rows are sampled with replacement from the global NumPy generator.
        rand_i = np.random.randint(len(X_b))
        gradient = dJ_sgd(theta, X_b[rand_i], y[rand_i])
        theta = theta - learning_rate(cur_iter) * gradient

    return theta

In [10]:
# Build the design matrix and the starting point inside this cell, under the
# name the call below actually uses, so the cell does not depend on an
# `initial_theta` left over from an earlier cell.
X_b = np.hstack([np.ones((len(X),1)),X])
initial_theta = np.zeros(X_b.shape[1])

# n_iters counts single-sample updates, so this performs as many updates as
# one third of the number of rows.
sgd(X_b,y,initial_theta,n_iters=len(X_b)//3)

array([3.05710708, 4.01275589])

In [11]:
from playML.LinearRegression import LinearRegression

In [12]:
# Fit playML's LinearRegression on the synthetic data with its SGD fitter.
# NOTE(review): n_iters here is presumably a number of passes over the data
# rather than a number of single-sample updates as in sgd() - confirm
# against playML.LinearRegression.fit_sgd.
lin_sgd = LinearRegression()
lin_sgd.fit_sgd(X,y,n_iters=2)

LinearRegression()

In [13]:
lin_sgd.coef_

array([3.99597093])

In [14]:
lin_sgd.interception_

2.976122651439

### 使用真实数据进行 SGD 训练

In [15]:
from sklearn import datasets

In [16]:
# Load the Boston housing dataset. From here on X and y refer to this real
# dataset and no longer to the synthetic data generated at the top.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on an older scikit-learn release.
boston = datasets.load_boston()

X, y = boston.data, boston.target

In [17]:
# Keep only the samples whose target is below 50. Both right-hand sides are
# evaluated against the unfiltered y before either name is rebound, so the
# same rows are selected from X and from y.
X, y = X[y < 50], y[y < 50]

In [18]:
X.shape

(490, 13)

In [19]:
y.shape

(490,)

In [20]:
from playML.model_selection import train_test_split

In [21]:
# Split into train and test sets with playML's train_test_split; the fixed
# seed keeps the split the same on every run.
X_train,X_test,y_train,y_test = train_test_split(X,y,seed=666)

In [22]:
from sklearn.preprocessing import StandardScaler

In [23]:
# Scaler that standardizes each feature; it is fitted and applied in the
# following cells.
standardscaler = StandardScaler()

In [24]:
# Fit the scaler on the training split only; the test split is transformed
# later with these same training statistics.
standardscaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [25]:
# Standardized copy of the training features, used for fitting below.
X_train_standard = standardscaler.transform(X_train)

In [26]:
# Standardize the test features with the scaler fitted on the training split.
X_test_standard = standardscaler.transform(X_test)

In [27]:
from playML.LinearRegression import LinearRegression

In [28]:
# Train playML's LinearRegression with SGD on the standardized training data.
# NOTE(review): n_iters=2 is presumably two passes over the training set -
# confirm against playML.LinearRegression.fit_sgd.
lin_reg = LinearRegression()
lin_reg.fit_sgd(X_train_standard,y_train,n_iters=2)

LinearRegression()

In [29]:
lin_reg.score(X_test_standard,y_test)

0.7923329555425149

In [30]:
# Refit the same model with n_iters=50 instead of 2, timing only the fit call,
# then display the score on the standardized test set for comparison.
%time lin_reg.fit_sgd(X_train_standard,y_train,n_iters=50)
lin_reg.score(X_test_standard,y_test)

Wall time: 230 ms


0.8132440489440966

### scikit-learn 中的 SGD

In [34]:
# Import from the public package path: the private module
# sklearn.linear_model.stochastic_gradient no longer exists in newer
# scikit-learn releases, while this path works in old and new versions.
from sklearn.linear_model import SGDRegressor

In [41]:
# scikit-learn's SGD regressor limited to max_iter=5; only the fit call is
# timed, and the last line displays the score on the standardized test set.
# No random_state is passed, so the fitted model can differ between runs.
sgd_reg = SGDRegressor(max_iter=5)
%time sgd_reg.fit(X_train_standard,y_train)
sgd_reg.score(X_test_standard,y_test)

Wall time: 2 ms


0.8052913229967448

In [38]:
# Same experiment with max_iter raised to 100; this rebinds sgd_reg to a new,
# separately fitted estimator so the two scores can be compared.
sgd_reg = SGDRegressor(max_iter=100)
%time sgd_reg.fit(X_train_standard,y_train)
sgd_reg.score(X_test_standard,y_test)

Wall time: 15 ms


0.8127547985606133