# 线性回归

In [None]:
from time import time
import numpy as np
import matplotlib.pyplot as plt

# Use the SimHei font for the Chinese axis labels and titles in this notebook.
plt.rcParams["font.sans-serif"] = "SimHei"
# Draw the minus sign as an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams["axes.unicode_minus"] = False
# Apply tight layout to every figure created later. A bare
# plt.tight_layout() call at this point would only create and adjust an
# empty figure, leaving the real plots untouched.
plt.rcParams["figure.autolayout"] = True

使用批量梯度下降法实现线性回归：每次迭代用全部训练样本计算梯度，损失函数为均方误差的一半。

In [None]:
class LinearRegression:
    """Linear regression trained with batch gradient descent.

    Samples are stored column-wise: ``X`` has shape ``(m, n)`` with one row
    per feature and one column per sample. The model adds no intercept of
    its own, so a bias term has to be supplied as a row of ``X``.
    """

    # Number of iterations between two recorded loss values.
    _LOSS_INTERVAL = 100

    def __init__(self, learning_rate=0.1, max_iter=500):
        """Store the step size and the number of gradient-descent iterations."""
        self._learning_rate = learning_rate
        self._max_iter = max_iter
        self._losses = []

    def _loss(self, h, y):
        """Return half the mean squared error between ``h`` and ``y``."""
        return np.mean((h - y) ** 2) / 2

    def fit(self, X, y):
        """Fit the parameters to ``X`` and ``y`` by gradient descent.

        Args:
            X: Array of shape ``(m, n)``: ``m`` features, ``n`` samples.
            y: The ``n`` target values, in any shape holding ``n`` elements.

        Raises:
            ValueError: If ``y`` does not hold exactly ``n`` values.
        """
        X = np.asarray(X, dtype=float)
        m, n = X.shape
        # Force a (1, n) row so the residual below always has the shape of
        # ``h``; a column vector would otherwise broadcast to (n, n).
        y = np.asarray(y, dtype=float).reshape(1, -1)
        if y.shape[1] != n:
            raise ValueError(
                f"y holds {y.shape[1]} values but X holds {n} samples"
            )

        self._theta = np.random.rand(m, 1)
        # Start a fresh history so a refit does not mix in earlier losses.
        self._losses = []

        for iteration in range(1, self._max_iter + 1):
            h = np.dot(self._theta.T, X)
            gradient = np.dot(X, (h - y).T) / n
            self._theta -= self._learning_rate * gradient

            # The recorded loss belongs to the parameters before this update.
            if iteration % self._LOSS_INTERVAL == 0:
                self._losses.append(self._loss(h, y))

    def predict(self, X):
        """Return predictions for ``X``.

        ``X`` is either one sample of shape ``(m,)``, giving a scalar, or a
        matrix of shape ``(m, n)``, giving an array of ``n`` predictions.
        """
        return np.dot(self._theta.T, X)[0]

    def draw_loss(self):
        """Plot the recorded loss values against the iteration number."""
        iterations = np.arange(1, len(self._losses) + 1) * self._LOSS_INTERVAL
        plt.plot(iterations, self._losses)
        plt.xlabel("迭代次数")
        plt.ylabel("损失函数值")
        plt.title("损失函数曲线")
        plt.show()

加载数据集的辅助函数：读取 housing.data，打乱样本顺序，对特征做标准化并添加偏置行，然后划分训练集和测试集。

In [None]:
# Default number of samples placed in the training set.
TRAINSET_SIZE = 400


def load_dataset(path="housing.data", train_size=None):
    """Load, shuffle, standardize and split the housing data.

    The file is read with ``np.loadtxt``; each row is one sample whose last
    column is the price and whose other columns are features. The result is
    stored column-wise (one column per sample).

    Args:
        path: File to read. Defaults to ``"housing.data"``.
        train_size: Number of shuffled samples used for training. Defaults
            to the module-level ``TRAINSET_SIZE``, read at call time.

    Returns:
        ``[train_houses, train_prices], [test_houses, test_prices]``. The
        feature matrices have a leading row of ones (bias term). The test
        samples are ordered by ascending price.
    """
    if train_size is None:
        train_size = TRAINSET_SIZE

    # ndmin=2 keeps a single-row file two-dimensional.
    housing = np.loadtxt(path, ndmin=2).T
    housing = housing[:, np.random.permutation(housing.shape[1])]

    houses = housing[:-1, :]
    # NOTE: the statistics are computed over all samples, before the
    # train/test split.
    mean = houses.mean(axis=1, keepdims=True)
    std = houses.std(axis=1, keepdims=True)
    # A constant feature has zero spread; divide by 1 so it becomes 0, not NaN.
    std[std == 0] = 1.0
    houses = (houses - mean) / std
    houses = np.vstack((np.ones(houses.shape[1]), houses))
    prices = housing[-1, :]

    train_houses = houses[:, :train_size]
    train_prices = prices[:train_size]
    test_houses = houses[:, train_size:]
    test_prices = prices[train_size:]

    # Order the test samples by price.
    sort_indices = test_prices.argsort()
    test_houses = test_houses[:, sort_indices]
    test_prices = test_prices[sort_indices]

    return [train_houses, train_prices], [test_houses, test_prices]

训练模型，绘制损失值曲线。

In [None]:
# Fetch both splits, then unpack each into its feature matrix and prices.
train_set, test_set = load_dataset()
train_houses, train_prices = train_set
test_houses, test_prices = test_set

# Time the construction and fitting of the model; loading and plotting
# stay outside the measured interval.
start_time = time()
model = LinearRegression()
model.fit(train_houses, train_prices)
end_time = time()

print(f"Optimization took {end_time - start_time:.2f} seconds.")
# Show the losses recorded during fitting.
model.draw_loss()

预测房价，绘制散点图与真实值对比。

In [None]:
# predict() accepts the whole (features, samples) matrix, so a single call
# yields every prediction without a Python-level loop over the samples.
predictions = model.predict(test_houses)

# Plot predicted and true prices against the sample index.
x_axis = np.arange(len(test_prices))
plt.scatter(x_axis, predictions, s=5, c="blue", label="预测值")
plt.scatter(x_axis, test_prices, s=5, c="red", label="真实值")
plt.xlabel("编号")
plt.ylabel("价格")
plt.legend()
plt.show()