In [1]:
import os

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, LogisticRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, classification_report, roc_auc_score
import joblib
import pandas as pd
import numpy as np

In [2]:
# Linear regression that predicts house prices directly.
# Step 1: load the California housing dataset (data_home='data' is the
# local folder scikit-learn uses to download/cache the files).
fe_cal = fetch_california_housing(data_home='data')

# Dimensions of the feature matrix.
print("获取特征值", fe_cal.data.shape, sep='\n')
print('-' * 50)
# Target values (unit: 100,000 USD), followed by the dataset description.
print("目标值", fe_cal.target, fe_cal.DESCR, sep='\n')
print('-' * 50)
# Names of the feature columns.
print(fe_cal.feature_names)

获取特征值
(20640, 8)
--------------------------------------------------
目标值
[4.526 3.585 3.521 ... 0.923 0.847 0.894]
.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hund

In [3]:
# Inspect the shape of the target array (1-D: one value per sample).
fe_cal.target.shape

(20640,)

In [4]:
# Split the dataset into a training set (75%) and a test set (25%).
x_train, x_test, y_train, y_test = train_test_split(
    fe_cal.data,
    fe_cal.target,
    test_size=0.25,
    random_state=1,
)

print(x_train.shape)

# Standardize the features: fit the scaler on the training set only, then
# apply that same scaling to both splits.
std_x = StandardScaler()
std_x.fit(x_train)
x_train = std_x.transform(x_train)  # standardized training features
x_test = std_x.transform(x_test)  # test features scaled with the training statistics

# A second scaler is instantiated for the target values, but target
# standardization is currently disabled (see the commented-out lines below).
std_y = StandardScaler()

# reshape(-1, 1): -1 lets NumPy infer that dimension from the remaining elements.
temp = y_train.reshape(-1, 1)

# To standardize the labels as well, note that the target is 1-D while the
# scaler expects 2-D input:
# y_train = std_y.fit_transform(y_train.reshape(-1, 1))
# print(y_train.shape)
# y_test = std_y.transform(y_test.reshape(-1, 1))
# print(y_test.shape)

(15480, 8)


In [5]:
# Small demo of reshape(-1, 1): a flat vector of length 3 becomes a
# column with 3 rows and 1 column.
test1 = np.array([1, 2, 3])
print(np.shape(test1))
# The cell's displayed value is the shape after reshaping.
test1.reshape(-1, 1).shape

(3,)


(3, 1)

In [6]:
# Estimator: ordinary least-squares linear regression, solved in closed form
# rather than by iterative gradient descent.
lr = LinearRegression()
# fit() is the time-consuming step.
lr.fit(x_train, y_train)
# The regression coefficients indicate how each feature relates to the target.
print('回归系数', lr.coef_)

# Predict house prices for the test set. Target standardization is disabled
# in the preprocessing cell, so these predictions are already in the original
# units and no std_y.inverse_transform(...) step is applied.
y_predict = lr.predict(x_test)
# A trained model (its learned weights and its structure) can be saved at any
# point after fit().


回归系数 [ 0.83167028  0.12159502 -0.26758589  0.30983997 -0.00518054 -0.04040421
 -0.90736902 -0.88212727]
