In [0]:
# 导入数据
from sklearn.datasets import load_boston
data = load_boston()
x, y = data['data'], data['target']

In [0]:
# 导入常用模型包
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [0]:
# 划分数据集
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=2019)

In [0]:
# 创建xgb矩阵
d_x = xgb.DMatrix(x_train)
d_y = xgb.DMatrix(y_train.reshape(-1, 1))

In [24]:
# 构建几个回归模型
lr = LinearRegression()
ridge = Ridge()
dtr = DecisionTreeRegressor()
rfr = RandomForestRegressor(n_estimators=100)

xgbr = xgb.XGBRegressor()
# 不调参看一下效果
lr.fit(x_train, y_train)
pred = lr.predict(x_valid)
print("MSE:{} in LR".format(mean_squared_error(y_valid, pred)))

ridge.fit(x_train, y_train)
pred = ridge.predict(x_valid)
print("MSE:{} in Ridge".format(mean_squared_error(y_valid, pred)))

dtr.fit(x_train, y_train)
pred = dtr.predict(x_valid)
print("MSE:{} in DTR".format(mean_squared_error(y_valid, pred)))

rfr.fit(x_train, y_train)
pred = rfr.predict(x_valid)
print("MSE:{} in RFR".format(mean_squared_error(y_valid, pred)))

xgbr.fit(x_train, y_train)
pred = xgbr.predict(x_valid)
print("MSE:{} in XGBR".format(mean_squared_error(y_valid, pred)))

MSE:26.202748180423672 in LR
MSE:26.779801430066296 in Ridge
MSE:21.141470588235308 in DTR
MSE:13.922911254901953 in RFR
MSE:13.184715539021276 in XGBR


In [38]:
# 需要调整的参数
# cv_params = {'learning_rate': [0.01, 0.05, 0.07, 0.1, 0.2]}
# 不调整的参数
other_params = {'n_estimators': 500, 'seed': 0, 'n_estimators': 600, 'max_depth': 4, 'min_child_weight' :1, 'gamma': 0.2, 'subsample': 0.8, 'colsample_bytree': 0.9, 'reg_alpha':0.1, 'reg_lambda':0.05, 'learning_rate':0.1,}
xgbr = xgb.XGBRegressor(**other_params)
grid = GridSearchCV(estimator=xgbr, param_grid=cv_params, scoring='r2', cv=5, verbose=1, n_jobs=8)
grid.fit(x_train, y_train)
# print('每轮迭代运行结果:{0}'.format(grid.cv_results_))
print('参数的最佳取值：{0}'.format(grid.best_params_))
print('最佳模型得分:{0}'.format(grid.best_score_))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 out of  25 | elapsed:    4.7s finished


参数的最佳取值：{'learning_rate': 0.1}
最佳模型得分:0.9086713978467437


In [58]:
# 现在使用新的参数构建模型进行预测计算损失
params = {'n_estimators': 10, 'seed': 0, 'n_estimators': 600, 'max_depth': 6, 'min_child_weight' :1, 'gamma': 0.2, 'subsample': 0.8, 'colsample_bytree': 0.9, 'reg_alpha':0.1, 'reg_lambda':0.05, 'learning_rate':0.01}
model = xgb.XGBRegressor(params=params, booster='dart')
model.fit(x_train, y_train)
pred = model.predict(x_valid)
print("MSE:{}".format(mean_squared_error(y_valid, pred)))

MSE:13.184715392998996
