## 线性回归api使用
### 正规方程优化损失函数 & 梯度下降优化损失函数
- GD 梯度下降(Gradient Descent)
    - 原始的梯度下降法需要计算所有样本的值才能够得出梯度，计算量大，所以后面才有会一系列的改进。
- SGD 随机梯度下降(Stochastic gradient descent)
    - 是一个优化方法，它在一次迭代时只考虑一个训练样本。
- SAG 随机平均梯度法(Stochasitc Average Gradient)
    - 由于SGD收敛的速度太慢，有人提出SAG等基于梯度下降的算法

In [6]:
from sklearn.datasets import load_boston  # 数据集
from sklearn.model_selection import train_test_split  # 数据集划分
from sklearn.preprocessing import StandardScaler  # 特征值标准化
from sklearn.linear_model import LinearRegression  # 线性回归(正规方程优化损失函数)api
from sklearn.metrics import mean_squared_error  # 均方误差
from sklearn.linear_model import SGDRegressor  # 线性回归(随机梯度下降优化损失函数)api

In [7]:
def linear_model1():
    """
    线性回归:特征方程
    :return:None
    """
    # 1.获取数据
    data = load_boston()

    # 2.数据集划分
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, random_state=22)

    # 3.特征工程-标准化
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.fit_transform(x_test)

    # 4.机器学习-线性回归(特征方程)
    estimator = LinearRegression()
    estimator.fit(x_train, y_train)

    # 5.模型评估
    # 5.1 获取系数等值
    y_predict = estimator.predict(x_test)
    print("预测值为:\n", y_predict)
    print("模型中的系数为:\n", estimator.coef_)
    print("模型中的偏置为:\n", estimator.intercept_)

    # 5.2 评价
    # 均方误差
    error = mean_squared_error(y_test, y_predict)
    print("误差为:\n", error)


def linear_model2():
    """
    线性回归:随机梯度下降法
    :return:None
    """
    # 1.获取数据
    data = load_boston()

    # 2.数据集划分
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, random_state=22)

    # 3.特征工程-标准化
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.fit_transform(x_test)

    # 4.机器学习-线性回归(特征方程)
    estimator = SGDRegressor(max_iter=1000)
    estimator.fit(x_train, y_train)

    # 5.模型评估
    # 5.1 获取系数等值
    y_predict = estimator.predict(x_test)
    print("预测值为:\n", y_predict)
    print("模型中的系数为:\n", estimator.coef_)
    print("模型中的偏置为:\n", estimator.intercept_)

    # 5.2 评价
    # 均方误差
    error = mean_squared_error(y_test, y_predict)
    print("误差为:\n", error)

In [8]:
# 正规方程
linear_model1()

预测值为:
 [28.15624208 31.30869316 20.51485702 31.48205292 19.01722351 18.25171434
 20.57503703 18.45503556 18.46192151 32.94820922 20.36213103 27.24752425
 14.81963448 19.21146435 37.02505033 18.32408346  7.70119888 17.56478207
 30.19561854 23.61297215 18.13379616 33.84017096 28.49921616 16.99629682
 34.76148752 26.227388   34.84170356 26.6267998  18.63962161 13.21549955
 30.36603792 14.70412444 37.18508975  8.91445391 15.06484067 16.12468763
  7.21797311 19.16335583 39.57444328 28.24501235 24.62961494 16.72956407
 37.82734499  5.70546434 21.20919004 24.63811904 18.85963528 19.93919917
 15.20065511 26.3036171   7.4251188  27.14868579 29.19076714 16.28206033
  7.94953105 35.46279456 32.39096932 20.83555382 16.41378444 20.87373635
 22.92853043 23.61293997 19.32937197 38.34148716 23.87879591 18.96954218
 12.59209375  6.13512682 41.45864696 21.09486655 16.23896752 21.48997696
 40.7412586  20.4923302  36.81939833 27.05431089 19.80309379 19.61594823
 24.59557969 21.0926586  30.92608611 19.3365

In [9]:
# 随机梯度下降
linear_model2()

预测值为:
 [28.14697117 31.27170717 20.52568257 31.46387226 19.01034244 18.31458025
 20.59056859 18.43136887 18.4382974  32.94471344 20.37575321 27.26956409
 14.88042161 19.25401228 37.00035843 18.34179366  7.78309668 17.56952605
 30.13483013 23.56989535 18.19762879 33.80096911 28.50236598 17.05993361
 34.70431377 26.21391198 34.73898078 26.54526014 18.71590775 13.17464134
 30.33490388 14.75510695 37.0578326   9.03349873 15.0755012  16.18863066
  7.3477447  19.21076098 39.48772402 28.17145489 24.59856694 16.78627561
 37.83462181  5.82715573 21.25165379 24.62073608 18.930441   19.97767277
 15.23699197 26.35835053  7.51678073 27.09803767 29.17168265 16.37120522
  8.03020223 35.42166118 32.30524395 20.79162621 16.42144765 20.81157406
 22.91462114 23.6020753  19.31895889 38.24888431 23.82943372 19.03847227
 12.65683439  6.26754234 41.43351998 21.06296969 16.29041242 21.44189891
 40.67144234 20.44489817 36.76925509 27.04308891 19.71959067 19.60835909
 24.53944285 21.04286016 30.87665013 19.3088

## 线性回归的改进-岭回归
- 岭回归，其实也是一种线性回归。只不过在算法建立回归方程时候，加上L2正则化的限制，从而达到解决过拟合的效果
### 欠拟合&过拟合
- L2正则化
    - Ridge回归
- L1正则化
    - LASSO回归

In [10]:
from sklearn.linear_model import Ridge, RidgeCV
# Ridge具有L2正则化的线性回归, 实现了SGDRegressor()随机梯度下降 优化损失函数, 即SAG 随机平均梯度法
# RidgeCV具有l2正则化的线性回归，可以进行交叉验证

In [11]:
def linear_model3():
    """
    线性回归:岭回归
    Ridge具有L2正则化的随机梯度下降 线性回归，即SAG 随机平均梯度法
    :return:
    """
    # 1.获取数据
    data = load_boston()

    # 2.数据集划分
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, random_state=22)

    # 3.特征工程-标准化
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.fit_transform(x_test)

    # 4.机器学习-线性回归(岭回归)
    estimator = Ridge(alpha=1)
    # estimator = RidgeCV(alphas=(0.1, 1, 10))
    estimator.fit(x_train, y_train)

    # 5.模型评估
    # 5.1 获取系数等值
    y_predict = estimator.predict(x_test)
    print("预测值为:\n", y_predict)
    print("模型中的系数为:\n", estimator.coef_)
    print("模型中的偏置为:\n", estimator.intercept_)

    # 5.2 评价
    # 均方误差
    error = mean_squared_error(y_test, y_predict)
    print("误差为:\n", error)

In [12]:
linear_model3()

预测值为:
 [28.14336439 31.29120593 20.54384341 31.45949883 19.05713232 18.25154031
 20.59333004 18.46668579 18.49439324 32.90278303 20.39074387 27.19391547
 14.82896742 19.22647169 36.99680592 18.30216415  7.77234952 17.59204777
 30.20233488 23.61819202 18.13165677 33.80976641 28.45514573 16.97450477
 34.72448519 26.19876013 34.77528305 26.63056236 18.62636595 13.34630747
 30.34386216 14.5911294  37.18589518  8.96603866 15.1046276  16.0870778
  7.2410686  19.13817477 39.5390249  28.27770546 24.63218813 16.74118324
 37.8401846   5.70041018 21.17142785 24.60567485 18.90535427 19.95506965
 15.19437924 26.28324334  7.54840338 27.10725806 29.18271353 16.27866225
  7.9813597  35.42054763 32.2845617  20.95634259 16.43407021 20.88411873
 22.93442975 23.58724813 19.3655118  38.2810092  23.98858525 18.95166781
 12.62360991  6.12834839 41.45200493 21.09795707 16.19808353 21.5210458
 40.71914496 20.54014744 36.78495192 27.02863306 19.9217193  19.64062326
 24.60418297 21.26677099 30.94032672 19.337703

## 离线模型的保存和加载
- from sklearn.externals import joblib

    - 保存：joblib.dump(estimator, 'test.pkl')
    - 加载：estimator = joblib.load('test.pkl')

In [15]:
from sklearn.externals import joblib

In [18]:
def load_dump_demo():
    """
    线性回归:岭回归
    :return:
    """
    # 1.获取数据
    data = load_boston()

    # 2.数据集划分
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, random_state=22)

    # 3.特征工程-标准化
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.fit_transform(x_test)

    # 4.机器学习-线性回归(岭回归)
    # 4.1 模型训练
#     estimator = Ridge(alpha=1)
#     estimator.fit(x_train, y_train)
    
    # 4.2 模型保存
#     joblib.dump(estimator, "./data/test.pkl")

    # 4.3 模型加载
    estimator = joblib.load("./data/test.pkl")

    # 5.模型评估
    # 5.1 获取系数等值
    y_predict = estimator.predict(x_test)
    print("预测值为:\n", y_predict)
    print("模型中的系数为:\n", estimator.coef_)
    print("模型中的偏置为:\n", estimator.intercept_)

    # 5.2 评价
    # 均方误差
    error = mean_squared_error(y_test, y_predict)
    print("误差为:\n", error)

In [19]:
load_dump_demo()

预测值为:
 [28.14336439 31.29120593 20.54384341 31.45949883 19.05713232 18.25154031
 20.59333004 18.46668579 18.49439324 32.90278303 20.39074387 27.19391547
 14.82896742 19.22647169 36.99680592 18.30216415  7.77234952 17.59204777
 30.20233488 23.61819202 18.13165677 33.80976641 28.45514573 16.97450477
 34.72448519 26.19876013 34.77528305 26.63056236 18.62636595 13.34630747
 30.34386216 14.5911294  37.18589518  8.96603866 15.1046276  16.0870778
  7.2410686  19.13817477 39.5390249  28.27770546 24.63218813 16.74118324
 37.8401846   5.70041018 21.17142785 24.60567485 18.90535427 19.95506965
 15.19437924 26.28324334  7.54840338 27.10725806 29.18271353 16.27866225
  7.9813597  35.42054763 32.2845617  20.95634259 16.43407021 20.88411873
 22.93442975 23.58724813 19.3655118  38.2810092  23.98858525 18.95166781
 12.62360991  6.12834839 41.45200493 21.09795707 16.19808353 21.5210458
 40.71914496 20.54014744 36.78495192 27.02863306 19.9217193  19.64062326
 24.60418297 21.26677099 30.94032672 19.337703