# 台湾房价预测
## 模型评价与验证

---
## 第一步. 导入数据
利用台湾的房屋信息数据训练并测试一个模型，评估该模型的性能和预测能力。用这些数据训练出的好模型可以用来对房屋做特定的预测——尤其是预测房屋的价值。

In [21]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from sklearn.model_selection import ShuffleSplit
import pymongo
import math

# Load every document of the `house` collection in the local `taiwanHouse`
# MongoDB database into a DataFrame.  The client is used as a context manager
# so the connection is closed as soon as the cursor has been fully read,
# instead of staying open for the lifetime of the kernel.
with pymongo.MongoClient("localhost", 27017) as client:
    db = client.taiwanHouse
    items = db.house.find()
    # list() drains the cursor while the connection is still open.
    data = pd.DataFrame(list(items))
print(data.shape)

(74942, 59)


---
## 第二步. 处理数据

In [22]:
# Keep only the columns the rest of the notebook works with, then run
# convert_dtypes with object inference and string / integer / boolean
# conversion enabled.
data = data[[
    '_id', 'age', 'areaBuilding', 'buildingFront', 'cityId', 'cityName',
    'houselandtype', 'floor', 'floors', 'isHasBalcony', 'isParking',
    'layout', 'latitude', 'longitude', 'monthlyFee', 'pingUsed',
    'totalPrice', 'zipName',
]]
data = data.convert_dtypes(
    infer_objects=True,
    convert_string=True,
    convert_integer=True,
    convert_boolean=True,
)

# age: the model predicts second-hand homes, so new-build listings are removed
# (rows whose age is "預售" or "--").  The remaining values have the "年"
# character stripped from their ends and are rounded down to whole years;
# negative ages are discarded.
data = data.loc[~data['age'].isin(["預售", "--"])]
data['age'] = data['age'].map(
    lambda raw_age: math.floor(float(raw_age.strip('年')))
)
data = data.loc[data['age'] >= 0]

# floor: keep only rows whose floor is a single number; any other value is
# coerced to NaN by to_numeric and dropped because its meaning is unclear.
# Negative floors are discarded as well.
data['floor'] = pd.to_numeric(data['floor'], errors='coerce')
data = data.loc[data['floor'].notnull()]
data = data.loc[data['floor'] >= 0]

# isParking: encode the flag as 1 when the value equals the string 'true',
# and as 0 for anything else.
# NOTE(review): if convert_dtypes turned this column into real booleans, the
# comparison with 'true' never matches and every row becomes 0 — confirm the
# stored values are strings.
data['isParking'] = data['isParking'].map(lambda flag: int(flag == 'true'))

# layout: a row is valid only when its layout text mentions bedrooms (房),
# living rooms (廳) and bathrooms (衛).  na=False makes a missing layout count
# as "no match" so the combined mask never contains missing values.
data = data[
    data['layout'].str.contains('房', na=False)
    & data['layout'].str.contains('廳', na=False)
    & data['layout'].str.contains('衛', na=False)
]
bedroom = []
livingroom = []
bathroom = []
# Only the layout column is needed, so iterate over it directly instead of
# building a full row Series per record with iterrows().
for layout in data['layout']:
    # Text looks like "<bedrooms>房<living rooms>廳<bathrooms>衛...": split on
    # each marker once and reuse the pieces for all three counts.
    room_parts = layout.split('房')
    hall_parts = room_parts[1].split('廳')
    # Counts may be fractional in the text; they are rounded up.
    bedroom.append(math.ceil(float(room_parts[0])))
    livingroom.append(math.ceil(float(hall_parts[0])))
    bathroom.append(math.ceil(float(hall_parts[1].split('衛')[0])))
data['bedroom'] = bedroom
data['livingroom'] = livingroom
data['bathroom'] = bathroom

# monthlyFee: the management fee must be a number; values that cannot be
# parsed are coerced to NaN and dropped, and only fees >= 0 are kept.
data['monthlyFee'] = pd.to_numeric(data['monthlyFee'], errors='coerce')
data = data.loc[data['monthlyFee'].notnull()]
data = data.loc[data['monthlyFee'] >= 0]

# houselandtype: drop listings whose type code contains any of the unused
# categories.  A single alternation pattern replaces six separate contains()
# calls; na=True marks rows with a missing code as excluded so that they are
# dropped too (no indicator could be derived for them below).
unusedLandTypes = ['D', 'H', 'G', 'I', 'J', 'K']
data = data[~data['houselandtype'].str.contains('|'.join(unusedLandTypes), na=True)]

# For every retained type, add an indicator column landTypeA, landTypeB,
# landTypeC, landTypeE, landTypeF, landTypeL, landTypeM:
# 1 = the listing's code contains that letter, 0 = it does not.
# The columns are built directly rather than through exec()/eval() on
# dynamically named lists.
landTypes = ['A', 'B', 'C', 'E', 'F', 'L', 'M']
for land_type in landTypes:
    data['landType' + land_type] = [
        1 if land_type in code else 0 for code in data['houselandtype']
    ]

# Target variable: the totalPrice column.
Y = data['totalPrice']
# Remove the columns that are not used as features (the target included).
data = data.drop(
    columns=['layout', '_id', 'houselandtype', 'cityId', 'latitude', 'longitude', 'totalPrice']
)

# One-hot encode the non-numeric features.
X = pd.get_dummies(data, columns=['buildingFront', 'cityName', 'zipName'])

# NOTE(review): these report the pre-encoding frame `data`, not the encoded
# feature matrix `X` that is passed to the model — confirm this is intended.
print(data.dtypes)
print(data.shape)

age                int64
areaBuilding     Float64
buildingFront     string
cityName          string
floor            float64
floors             Int64
isHasBalcony       Int64
isParking          int64
monthlyFee       float64
pingUsed         Float64
zipName           string
bedroom            int64
livingroom         int64
bathroom           int64
landTypeA          int64
landTypeB          int64
landTypeC          int64
landTypeE          int64
landTypeF          int64
landTypeL          int64
landTypeM          int64
dtype: object
(15419, 21)


---
## 第三步. 模型建立-训练-评估

###  定义衡量标准

$R^2$ 的数值通常在0至1之间，表示**目标变量**的预测值和实际值之间相关程度平方的百分比。$R^2$ 值为0的模型，其预测效果与直接用目标变量的**平均值**进行预测相当；而 $R^2$ 值为1的模型则可以对目标变量进行完美的预测。0至1之间的数值，表示目标变量的变化中有百分之多少能够用**特征**来解释。模型也可能出现负值的 $R^2$，这种情况下模型的预测效果比直接使用目标变量的平均值还要差。

In [23]:
from sklearn.metrics import r2_score

def performance_metric(y_true, y_predict):
    """Return the R^2 score of `y_predict` measured against `y_true`.

    Thin wrapper around `r2_score`, so the metric can be handed to
    `make_scorer` under a project-specific name.
    """
    return r2_score(y_true, y_predict)

###  数据分割与重排

In [24]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=43,
)
print("Training and testing split was successful.")

Training and testing split was successful.


###  建立 训练 评估

In [25]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.metrics import make_scorer

def fit_model(X, y, max_depths=range(1, 11), n_splits=10, test_size=0.20,
              cv_random_state=42, tree_random_state=None):
    """Grid-search the depth of a decision-tree regressor and return the best one.

    Args:
        X: Feature matrix used for the search.
        y: Target values aligned with `X`.
        max_depths: Candidate values for the tree's `max_depth`
            (default: 1 through 10).
        n_splits: Number of shuffle-split cross-validation iterations.
        test_size: Fraction of the data held out in each iteration.
        cv_random_state: Seed of the cross-validation splitter.
        tree_random_state: Seed of the tree itself; the default `None`
            leaves the tree unseeded, as before.

    Returns:
        The estimator refitted with the best-scoring `max_depth`
        (`GridSearchCV.best_estimator_`), scored with `performance_metric`.
    """
    cv_sets = ShuffleSplit(n_splits=n_splits, test_size=test_size,
                           random_state=cv_random_state)
    regressor = DecisionTreeRegressor(random_state=tree_random_state)
    # list() lets callers pass any iterable of depths, not only a range.
    params = {'max_depth': list(max_depths)}
    scoring_fnc = make_scorer(performance_metric)
    grid = GridSearchCV(estimator=regressor, param_grid=params,
                        scoring=scoring_fnc, cv=cv_sets)
    # fit() returns the search object itself, so no rebinding is needed.
    grid.fit(X, y)
    return grid.best_estimator_

# Run the depth grid search on the training split and keep the best estimator.
reg = fit_model(X_train, y_train)

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


  return f(*args, **kwargs)


In [26]:
# Report the max_depth of the estimator selected by the grid search.
best_depth = reg.get_params()['max_depth']
print("Parameter 'max_depth' is {} for the optimal model.".format(best_depth))

# Score the selected estimator on the held-out test split.
r2 = reg.score(X_test, y_test)
print("Optimal model has R^2 score {:,.2f} on test data".format(r2))

Parameter 'max_depth' is 7 for the optimal model.
Optimal model has R^2 score 0.83 on test data


  return f(*args, **kwargs)
