# 1 线性回归房价预测

In [1]:
import numpy as np 
from sklearn import datasets
from sklearn.linear_model import LinearRegression

#### 加载数据

In [2]:
boston = datasets.load_boston()  # 加载波士顿房价
X = boston['data']  # 数据
y = boston['target']  # 房价
feature_names = boston['feature_names'] # 具体指标
"""
CRIM:犯罪
NOX：空气污染
TAX：税收
"""
feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

#### 查看数据

In [41]:
# X是房间
# 506个统计样本  影响房价的13个属性
X.shape
# X 

(506, 13)

In [42]:
# y是价格
# 506个房间，对应506种价格
# X和y对应 有多少数据，就有多少目标值
y.shape 
# y 

(506,)

#### 数据拆分

In [20]:
# 506个数据，样本
# 拆分为两份，一份为80%的训练数据  20%为验证数据
# 用训练数据进行线性回归  学习，拟合，总结
# 20%用来验证
index = np.arange(506) # 数据索引
np.random.shuffle(index)

In [1]:
# index

In [28]:
train_index = index[:405]
test_index = index[405:]
X_train = X[train_index]
y_train = y[train_index]
display(X.shape, y.shape)

(101,)


(506, 13)

(506,)

In [27]:
X_test = X[test_index]
y_test = y[test_index]
display(X.shape, y.shape)

(506, 13)

(506,)

#### 数据建模

In [31]:
np.set_printoptions(suppress=True) # 去掉科学计数法
model = LinearRegression(fit_intercept=True)
model.fit(X_train, y_train)
"""
建模取了斜率，有大有小，有正有负 
正：正相关，面积，越大，房价越高
负：正好相反 
coef_:斜率，系数 
intercept_:截距 
"""
display(model.coef_, model.intercept_)

array([ -0.07531944,   0.04463334,   0.01748097,   3.49863232,
       -19.9274301 ,   3.99323799,   0.01156966,  -1.47326196,
         0.27017698,  -0.01002154,  -0.96867479,   0.00895364,
        -0.53831897])

35.65379437559993

In [32]:
feature_names

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

#### 模型验证

In [39]:
y_ = model.predict(X_test).round(2)
y_[:30] 

array([25.23, 14.31, 26.56, 32.92, 19.37, 16.41, 23.91, 22.75, 23.19,
       20.4 , 19.52, 17.05, 34.91, 11.65, 19.06, 23.17, 17.64, 18.69,
       23.01, 18.13, 19.33, 23.69, 37.28, 14.71, 20.31, 13.31, 27.02,
       32.57, 19.31, 30.62])

In [38]:
y_test[:30]

array([19.4, 15.4, 25.2, 31.6, 20.3, 13.1, 19.2, 17.4, 23.8, 17.8, 14.6,
       19.1, 34.6, 15.6, 16.4, 22.4, 17.2, 18.4, 19.8, 12.7, 20.4, 20.1,
       43.1, 11.7, 19.2, 13.4, 22.3, 28.2, 18.5, 32.9])

#### 模型评估

In [43]:
# 最大值是1,可以小于0
model.score(X_test, y_test)

0.6816170032421283

$R^2=1 - u/v$

In [45]:
"""
评分的实现过程
y_true= y_test   真实值
y_predict = model.predict(X_test)  预测值
u  ((y_true - y_pred) ** 2).sum() 
v  ((y_true - y_true.mean()) ** 2).sum().
"""

'\nu  ((y_true - y_pred) ** 2).sum() \nv  ((y_true - y_true.mean()) ** 2).sum().\n'

In [3]:
# 最小二乘法
from sklearn.metrics import mean_squared_error

In [48]:
# 测试数据 y_true和y_pre
mean_squared_error(y_test, y_)

25.94515544554455

In [50]:
# 80%训练数据 y_true
mean_squared_error(y_train, model.predict(X_train))

21.11230787831343