In [26]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [27]:
# Load the housing listings; the path is relative to the notebook's working directory.
df = pd.read_csv('beijing_house_price.csv')

In [28]:
# Preview the first rows to check the column names and value types.
df.head()

Unnamed: 0,公交,写字楼,医院,商场,地铁,学校,小区名字,建造时间,房型,楼层,每平米价格,面积
0,18,18,10,0,2,49,远洋山水,2006,2室1厅,26,60937,96
1,17,42,10,0,4,37,椿树园,1998,3室1厅,14,88686,130
2,18,36,9,0,1,24,永乐小区,1989,3室1厅,18,46621,74
3,15,49,13,0,2,45,主语家园,2007,4室3厅,2,86147,462
4,6,0,0,0,0,0,天伦锦城,2007,1室1厅,13,42500,64


In [29]:
# Remove fully duplicated rows. The index is not reset, so the surviving rows keep
# their original labels.
df = df.drop_duplicates()

In [30]:
# Pairwise Pearson correlations between the numeric columns.
# The text columns (小区名字, 房型) are excluded explicitly: pandas >= 2.0 no longer
# skips non-numeric columns silently in DataFrame.corr() and raises instead.
df.select_dtypes(include='number').corr()

Unnamed: 0,公交,写字楼,医院,商场,地铁,学校,建造时间,楼层,每平米价格,面积
公交,1.0,0.529293,0.503114,,0.552599,0.717265,-0.014225,-0.013684,0.151266,-0.057429
写字楼,0.529293,1.0,0.744593,,0.819552,0.676085,-0.059332,0.120277,0.328803,-0.036594
医院,0.503114,0.744593,1.0,,0.703238,0.680142,-0.048411,0.092576,0.308416,-0.056122
商场,,,,,,,,,,
地铁,0.552599,0.819552,0.703238,,1.0,0.632699,-0.037625,0.071634,0.29767,-0.045099
学校,0.717265,0.676085,0.680142,,0.632699,1.0,-0.05231,0.073141,0.308136,-0.09327
建造时间,-0.014225,-0.059332,-0.048411,,-0.037625,-0.05231,1.0,0.024791,-0.074439,-0.033438
楼层,-0.013684,0.120277,0.092576,,0.071634,0.073141,0.024791,1.0,0.050279,-0.13701
每平米价格,0.151266,0.328803,0.308416,,0.29767,0.308136,-0.074439,0.050279,1.0,0.132288
面积,-0.057429,-0.036594,-0.056122,,-0.045099,-0.09327,-0.033438,-0.13701,0.132288,1.0


In [31]:
# Pick the three features with the strongest absolute Pearson correlation to the
# price column (每平米价格).
# - select_dtypes keeps only numeric columns, so corr() also works on pandas >= 2.0,
#   where non-numeric columns are no longer dropped silently.
# - The target is selected and removed by label instead of relying on its position
#   (iloc[-2]) or on its self-correlation sorting first.
# - NaN correlations (e.g. 商场 in this dataset) are dropped so they can never be selected.
pearson_max = (
    df.select_dtypes(include='number')
    .corr()['每平米价格']
    .drop('每平米价格')
    .abs()
    .dropna()
    .sort_values(ascending=False)
    .head(3)
)

In [32]:
# Labels of the selected feature columns, as a NumPy array.
features_names = pearson_max.index.to_numpy()

In [33]:
# Display the selected feature names.
features_names

array(['写字楼', '医院', '学校'], dtype=object)

In [34]:
# Model inputs: every row, restricted to the selected feature columns.
features = df.loc[:, features_names]

In [35]:
# Display the feature matrix.
features

Unnamed: 0,写字楼,医院,学校
0,18,10,49
1,42,10,37
2,36,9,24
3,49,13,45
4,0,0,0
5,0,0,0
6,0,0,0
7,0,0,0
8,0,0,39
9,0,0,0


In [36]:
# Regression target: the second-to-last column, kept as a one-column DataFrame
# (the list selector preserves the 2-D shape).
target = df.loc[:, [df.columns[-2]]]

In [37]:
# Display the target column.
target

Unnamed: 0,每平米价格
0,60937
1,88686
2,46621
3,86147
4,42500
5,28368
6,97646
7,57083
8,61111
9,26627


In [38]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=10,
)

In [39]:
# Display the training features; the row labels are the original (shuffled) index values.
x_train

Unnamed: 0,写字楼,医院,学校
1549,31,11,31
292,0,0,0
523,0,0,0
1673,48,8,21
1014,49,7,49
1616,49,3,20
2279,0,0,0
2941,0,0,0
2593,0,0,0
299,49,5,27


In [40]:
# Transformer that expands the inputs into polynomial terms up to degree 3.
poly_features = PolynomialFeatures(degree=3)

In [41]:
# Fit the transformer on the training split and build its polynomial feature matrix.
x_train_feature = poly_features.fit_transform(x_train)

In [42]:
# Display the expanded training matrix.
x_train_feature

array([[1.0000e+00, 3.1000e+01, 1.1000e+01, ..., 3.7510e+03, 1.0571e+04,
        2.9791e+04],
       [1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       ...,
       [1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.0000e+00, 4.9000e+01, 1.4000e+01, ..., 9.8000e+03, 3.5000e+04,
        1.2500e+05]])

In [None]:
# Apply the transformer that was already fitted on the training split.
# Test data must only be transformed, never used for fitting, so that preprocessing
# is learned from the training set alone.
x_test_feature = poly_features.transform(x_test)

In [None]:
# Linear regression estimator with scikit-learn's default settings.
model = LinearRegression()

In [None]:
# Fit the regression on the polynomial training features.
model.fit(x_train_feature, y_train)

In [None]:
# Predict prices for the held-out split from its polynomial features.
y_pred = model.predict(x_test_feature)

In [None]:
# Mean absolute error between the held-out prices and the model's predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Display the test-set mean absolute error.
mae