In [30]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, LogisticRegression, Lasso
from sklearn.metrics import mean_squared_error, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [31]:
# Linear regression on the California housing data: load the dataset and
# take a first look at it. The download is cached under ./data.
fe_cal = fetch_california_housing(data_home="data")

# Feature-matrix shape, a separator, then the heading for the target vector.
print("获取特征值", fe_cal.data.shape, "-" * 50, "目标值", sep="\n")
# Target values (in units of 100,000 USD per the original note), followed by
# the dataset's own description text and another separator.
print(fe_cal.target, fe_cal.DESCR, "-" * 50, sep="\n")
# Names of the feature columns.
print(fe_cal.feature_names)

获取特征值
(20640, 8)
--------------------------------------------------
目标值
[4.526 3.585 3.521 ... 0.923 0.847 0.894]
.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hund

MedInc - 中位收入（Median Income）
HouseAge - 房屋年龄（House Age）
AveRooms - 平均房间数（Average Number of Rooms）
AveBedrms - 平均卧室数（Average Number of Bedrooms）
Population - 人口数量（Population）
AveOccup - 平均居住人数（Average Occupancy）
Latitude - 纬度（Latitude）
Longitude - 经度（Longitude）

In [32]:
# Shape of the target vector: a 1-D array with one value per sample.
fe_cal.target.shape

(20640,)

In [33]:
# Hold out 25% of the samples as the test split; the fixed seed keeps the
# split identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    fe_cal.data,
    fe_cal.target,
    test_size=0.25,
    random_state=1,
)

print(x_train.shape)

# Standardize the features. The scaler is fitted on the training split only,
# and the statistics it learned there are applied to both splits.
std_x = StandardScaler()
std_x.fit(x_train)
x_train = std_x.transform(x_train)
x_test = std_x.transform(x_test)

# The target is NOT standardized in this notebook. A second scaler is still
# instantiated so the target-scaling experiment sketched below can be enabled.
std_y = StandardScaler()

# reshape(-1, 1) turns the 1-D target into a single column; -1 lets numpy
# infer that dimension from the number of elements.
temp = y_train.reshape(-1, 1)

# To standardize the target as well (scalers expect 2-D input), enable:
# y_train = std_y.fit_transform(y_train.reshape(-1, 1))
# print(y_train.shape)
# y_test = std_y.transform(y_test.reshape(-1, 1))
# print(y_test.shape)

(15480, 8)


In [34]:
# A 1-D array holding 1, 2, 3 has shape (3,).
test1 = np.arange(1, 4)
print(test1.shape)
# Appending a new axis yields a (3, 1) column, the same shape reshape(-1, 1) gives.
test1[:, np.newaxis].shape

(3,)


(3, 1)

## 1.模型保存

In [35]:
# Fit a plain linear regression (solved in closed form, no iterative training)
# on the standardized features, persist the fitted estimator, and score it on
# the test split.
lr = LinearRegression()
# fit() is the expensive step.
lr.fit(x_train, y_train)
# Because the features were standardized, the coefficient magnitudes can be
# compared across features as a rough indication of their influence.
print('回归系数', lr.coef_)

y_predict = lr.predict(x_test)
# The target was not standardized, so predictions are already in the target's
# original units and no inverse_transform is needed.

# Persist the fitted estimator (learned weights plus model structure). This
# must happen after fit(). joblib.dump does not create missing parent
# directories, so make sure ./tmp exists first; otherwise a fresh checkout
# fails with FileNotFoundError.
os.makedirs("./tmp", exist_ok=True)
joblib.dump(lr, "./tmp/test.pkl")
print("正规方程测试集里面每个房子的预测价格：", y_predict[0:10])
# Test loss as mean squared error: sum((y_test - y_predict) ** 2) / n
print("正规方程的均方误差：", mean_squared_error(y_test, y_predict))

回归系数 [ 0.83167028  0.12159502 -0.26758589  0.30983997 -0.00518054 -0.04040421
 -0.90736902 -0.88212727]
正规方程测试集里面每个房子的预测价格： [2.12391852 0.93825754 2.7088455  1.70873764 2.82954754 3.50376456
 3.0147162  1.62781292 1.74317518 2.01897806]
正规方程的均方误差： 0.5356532845422556


## 2.加载保存的模型

In [36]:
# Simulate deployment: restore the estimator persisted by the training cell.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
model = joblib.load("./tmp/test.pkl")

# The target was never standardized in this notebook, so the predictions are
# directly comparable with y_test. (Had std_y been applied to the target, the
# predictions would need std_y.inverse_transform(...) before scoring.)
y_predict = model.predict(x_test)

print("保存的模型预测的结果：", y_predict)
print("正规方程的均方误差：", mean_squared_error(y_true=y_test, y_pred=y_predict))

保存的模型预测的结果： [2.12391852 0.93825754 2.7088455  ... 1.24263061 2.73771901 1.75800594]
正规方程的均方误差： 0.5356532845422556


In [37]:
# Tiny worked example for mean_squared_error: the residuals are
# 0.5, -0.5, 0 and -1, so the expected result is (0.25 + 0.25 + 0 + 1) / 4.
y_true, y_pred = [3, -0.5, 2, 7], [2.5, 0.0, 2, 8]
mean_squared_error(y_true=y_true, y_pred=y_pred)

0.375

In [38]:
# Mean squared error computed by hand for the example above: square each of
# the four residuals (true - predicted) and average them.
np.mean(np.square([3 - 2.5, -0.5 - 0.0, 2 - 2, 7 - 8]))

np.float64(0.375)

## 3.线性回归之梯度下降去进行房价预测

In [39]:
# House-price prediction with stochastic gradient descent; preferable to the
# closed-form solver when the dataset is large.
#
# Hyper-parameter notes:
# - learning_rate selects the step-size schedule, e.g. 'constant',
#   'invscaling' or 'adaptive'; eta0 is the initial learning rate used by
#   those schedules and can be tuned (e.g. eta0=0.008).
# - With learning_rate='optimal' the step size is derived from alpha (the
#   regularization strength), so alpha then also affects the learning rate.
# - penalty selects the regularization term: 'l1' or 'l2'.
# - random_state fixes the shuffling order so that the fitted coefficients
#   and the reported error are reproducible from run to run.
sgd = SGDRegressor(eta0=0.01, penalty='l2', max_iter=1000, random_state=1)
# Train on the standardized features.
sgd.fit(x_train, y_train)

print('梯度下降的回归系数', sgd.coef_)

# Predict the test-split prices. The target was not standardized, so the
# predictions need no inverse_transform before scoring.
y_predict = sgd.predict(x_test)
print("梯度下降的均方误差：", mean_squared_error(y_test, y_predict))

梯度下降的回归系数 [ 0.84016373  0.11719273 -0.25160832  0.29837928 -0.00866259 -0.20360152
 -0.89912316 -0.88612971]
梯度下降的均方误差： 0.5327285014812231


In [40]:
# Starting point and learning rate (step size) for the hand-rolled
# gradient-descent demo.
w, alpha = 1, 0.7
def loss(w):
    """Return the demo loss 2*w**2 + 3*w + 2 evaluated at ``w``."""
    quadratic_term = 2 * w ** 2
    linear_term = 3 * w
    return quadratic_term + linear_term + 2
def derivative(w):
    """Return the derivative of the demo loss, 4*w + 3, evaluated at ``w``."""
    slope_term = 4 * w
    return slope_term + 3
# Run 30 gradient-descent steps and report the weight and loss after each one.
# Each step maps w to (1 - 4*alpha)*w - 3*alpha, which only shrinks towards the
# minimum when 0 < alpha < 0.5. With alpha = 0.7 the factor is -1.8, so w
# flips sign and grows every step: the learning rate is too large.
for step in range(30):
    w -= alpha * derivative(w)
    print(f'w{w} 损失{loss(w)}')

w-3.8999999999999995 损失20.71999999999999
w4.919999999999999 损失65.17279999999998
w-10.955999999999996 损失209.19987199999983
w17.620799999999992 损失675.8475852799994
w-33.817439999999976 损失2187.786176307197
w58.77139199999995 损失7086.4672112353155
w-107.8885055999999 损失22958.193764402422
w192.09931007999978 损失74382.58779666382
w-347.87875814399956 损失240997.6244611907
w624.0817646591992 损失780830.3432542577
w-1125.4471763865586 损失2529888.352143795
w2023.7049174958051 损失8196836.300945894
w-3644.768851492449 损失26557747.65506469
w6558.483932686408 损失86047100.4424096
w-11807.371078835533 损失278792603.4734071
w21251.167941903957 损失903288033.2938386
w-38254.202295427116 损失2926653225.912036
w68855.4641317688 损失9482356449.994993
w-123941.93543718383 损失30722834896.023777
w223093.38378693088 损失99541985061.15703
w-401570.1908164755 损失322516031596.1886
w722824.2434696557 损失1044951942369.6908
w-1301085.73824538 损失3385644293275.837
w2341952.228841684 损失10969487510211.748
w-4215516.1119150305 损失3554113953308

## 4.岭回归

In [None]:
# House-price prediction with ridge regression, i.e. linear regression with an
# L2 penalty on the sum of squared coefficients. alpha is the strength of that
# penalty; adding it also keeps the normal-equation matrix invertible.
rd = Ridge(alpha=0.02)
rd.fit(x_train, y_train)

print(rd.coef_)

# Predict the test-split prices once and reuse the result for both the shape
# check and the score. The target was not standardized, so no
# inverse_transform is required.
y_predict = rd.predict(x_test)
print(y_predict.shape)

print("岭回归的均方误差：", mean_squared_error(y_test, y_predict))

## 5.逻辑回归

In [None]:
# Natural logarithm of 0.5 (about -0.693).
np.log(0.5)

In [None]:
# Natural logarithm of 0.3 (about -1.204); more negative than log(0.5).
np.log(0.3)

In [None]:
# Plot -ln(x) for x in (0, 1): the curve grows without bound as x approaches 0
# and reaches 0 at x = 1.
# Sample from 0.001 in steps of 0.01; the range starts just above 0 because
# ln(0) is undefined.
x = np.arange(0.001, 1, 0.01)

fig, ax = plt.subplots()
ax.plot(x, -np.log(x))  # np.log is the natural (base-e) logarithm
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(xlabel="x", ylabel="-ln(x)", title="Negative natural log on (0, 1)")
plt.show()