通常需要初始化几个变量：

- 学习率（learning_rate）
学习率是控制梯度下降幅度的参数，亦称步长。学习率设置过大会阻碍收敛，并导致损失函数在最小值附近波动甚至发散；学习率太小又会导致收敛速度缓慢，尤其是在迭代后期，当梯度变动很小的时候，整个收敛过程会变得很缓慢。

- 初始权重（theta）
初始权重的个数等于原始样本中特征值的个数加1，其中新增的1个参数主要考虑偏置项（截距 $\theta_0$）带来的影响。

- 程序终止条件（max_iteration_number / tolerance）
    - 最大迭代次数：防止结果不收敛时，对程序进行强制终止
    - 误差容忍度：当结果改善的变动低于某个阈值时，程序提前终止

In [None]:
import numpy as np

In [None]:
class BatchGradientDescent:
    """Least-squares linear regression fitted with full-batch gradient descent.

    Parameters
    ----------
    eta : float
        Learning rate (step size) applied to every gradient update.
    n_iter : int
        Maximum number of iterations; guards against non-convergence.
    tolerance : float
        Training stops early once the absolute change in loss between two
        consecutive iterations falls below this value.

    Attributes set by ``fit``
    -------------------------
    theta : numpy.ndarray
        Fitted weights; index 0 is the intercept, followed by one weight
        per input feature.
    loss_ : list
        Loss recorded at each iteration. The leading ``0`` is a placeholder
        that gives the first iteration a previous value to compare against;
        it is not a measured loss.
    i : int
        Number of iterations that were executed.
    """

    def __init__(self, eta=0.01, n_iter=1000, tolerance=0.001):
        self.eta = eta
        self.n_iter = n_iter
        self.tolerance = tolerance

    def fit(self, X, y):
        """Fit the weights to ``X`` / ``y`` and return ``self``.

        ``X`` holds one sample per row. ``y`` holds one target per sample
        and may be flat or a single-column array.
        """
        # Flatten the targets: a column vector of shape (n, 1) would
        # otherwise broadcast against the (n,) predictions into an (n, n)
        # matrix and break the loss computation below.
        y = np.asarray(y).ravel()

        n_samples = len(X)
        X = np.c_[np.ones(n_samples), X]  # prepend a column of ones for the intercept
        n_features = X.shape[-1]

        self.theta = np.ones(n_features)
        self.loss_ = [0]

        self.i = 0
        while self.i < self.n_iter:
            self.i += 1
            errors = X.dot(self.theta) - y
            loss = 1 / (2 * n_samples) * errors.dot(errors)
            delta_loss = loss - self.loss_[-1]
            self.loss_.append(loss)
            if np.abs(delta_loss) < self.tolerance:
                break
            # Average gradient of the loss over the whole data set.
            gradient = 1 / n_samples * X.T.dot(errors)
            self.theta -= self.eta * gradient

        return self

In [4]:
class StochasticGradientDescent(BatchGradientDescent):
    """Linear regression fitted with stochastic (per-sample) gradient descent.

    Parameters
    ----------
    shuffle : bool
        When true, the samples are re-ordered at the start of every epoch.
    random_state : int or None
        Seed passed to ``np.random.seed`` (NumPy's global generator) at
        construction time. ``None`` leaves the generator untouched.
    **kwargs
        Forwarded to ``BatchGradientDescent.__init__``.
    """

    def __init__(self, shuffle=True, random_state=None, **kwargs):
        super().__init__(**kwargs)
        self.shuffle = shuffle
        # Compare against None rather than truthiness so that a seed of 0
        # is honoured instead of being silently ignored.
        if random_state is not None:
            np.random.seed(random_state)

    def fit(self, X, y):
        """Fit the weights one sample at a time and return ``self``.

        Each pass of the outer loop is one epoch over all samples. The loss
        recorded for an epoch is half the mean of the squared errors seen
        while the weights were being updated during that epoch.
        """
        # Work on a flat ndarray so the targets can be fancy-indexed when
        # shuffling (a plain list cannot) and each target is a scalar.
        y = np.asarray(y).ravel()
        X = np.c_[np.ones(len(X)), X]  # prepend a column of ones for the intercept
        self.theta = np.ones(X.shape[1])
        # Leading 0 is a placeholder for the first convergence comparison.
        self.loss_ = [0]

        self.i = 0
        while self.i < self.n_iter:
            self.i += 1
            if self.shuffle:
                X, y = self._shuffle(X, y)  # new sample order for this epoch
            squared_errors = []
            for xi, yi in zip(X, y):
                error_i = xi.dot(self.theta) - yi
                squared_errors.append(error_i ** 2)
                gradient_i = xi * error_i  # gradient contributed by this single sample
                self.theta -= self.eta * gradient_i
            loss = 1 / 2 * np.mean(squared_errors)
            delta_loss = loss - self.loss_[-1]
            self.loss_.append(loss)
            if np.abs(delta_loss) < self.tolerance:
                break
        return self

    @staticmethod
    def _shuffle(X, y):
        """Return ``X`` and ``y`` re-ordered by one shared random permutation."""
        location = np.random.permutation(len(y))
        return X[location], y[location]

In [5]:
class MiniBatchGradientDescent(StochasticGradientDescent):
    """Linear regression fitted with mini-batch gradient descent.

    Parameters
    ----------
    batch_size : int
        Number of samples used for each weight update. The final batch of
        an epoch is shorter when the sample count is not a multiple of it.
    **kwargs
        Forwarded to ``StochasticGradientDescent.__init__``.
    """

    def __init__(self, batch_size=10, **kwargs):
        self.batch_size = batch_size
        super().__init__(**kwargs)

    def fit(self, X, y):
        """Fit the weights batch by batch and return ``self``.

        Each pass of the outer loop is one epoch over all samples. The loss
        recorded for an epoch is the total squared error seen during that
        epoch divided by twice the number of samples.
        """
        # Work on a flat ndarray so the targets can be fancy-indexed when
        # shuffling and sliced consistently with the rows of X.
        y = np.asarray(y).ravel()
        X = np.c_[np.ones(len(X)), X]  # prepend a column of ones for the intercept
        n_samples, n_features = X.shape
        self.theta = np.ones(n_features)
        # Leading 0 is a placeholder for the first convergence comparison.
        self.loss_ = [0]

        self.i = 0
        while self.i < self.n_iter:
            self.i += 1
            if self.shuffle:
                X, y = self._shuffle(X, y)

            squared_error_total = 0.0
            for start in range(0, n_samples, self.batch_size):
                stop = start + self.batch_size
                mini_X, mini_y = X[start:stop], y[start:stop]
                error = mini_X.dot(self.theta) - mini_y  # one entry per sample in the batch
                squared_error_total += error.dot(error)
                # Average over the samples actually in this batch: the last
                # batch can be shorter than batch_size, and dividing by the
                # nominal size would under-scale its step.
                mini_gradient = mini_X.T.dot(error) / len(mini_X)
                self.theta -= self.eta * mini_gradient
            # Normalise by the real sample count so the loss is not skewed
            # by a short final batch; this equals the per-batch average
            # whenever batch_size divides n_samples evenly.
            loss = squared_error_total / (2 * n_samples)
            delta_loss = loss - self.loss_[-1]
            self.loss_.append(loss)
            if np.abs(delta_loss) < self.tolerance:
                break

        return self