# 导入库以及数据集

In [2]:
# 基本工具
import numpy as np
import pandas as pd 
import time
import os #修改环境变量

#算法/损失/评估指标等
import sklearn
from sklearn.ensemble import RandomForestRegressor as RFR # 随机森林回归
from sklearn.model_selection import KFold,cross_validate

# 优化器
from bayes_opt import BayesianOptimization

In [5]:
data = pd.read_csv(r"C:\Users\ZARD\Desktop\超参\程序学习\AutoML与超参数优化——菜菜\train_encode.csv",index_col=0)

# 查看数据集的信息

In [6]:
data.head(5)

Unnamed: 0,Id,住宅类型,住宅区域,街道接触面积(英尺),住宅面积,街道路面状况,巷子路面状况,住宅形状(大概),住宅现状,水电气,...,泳池面积,泳池质量,篱笆质量,其他配置,其他配置的价值,销售月份,销售年份,销售类型,销售状态,SalePrice
0,0.0,5.0,3.0,36.0,327.0,1.0,0.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2.0,8.0,4.0,208500
1,1.0,0.0,3.0,51.0,498.0,1.0,0.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,1.0,8.0,4.0,181500
2,2.0,5.0,3.0,39.0,702.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,8.0,2.0,8.0,4.0,223500
3,3.0,6.0,3.0,31.0,489.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0,0.0,140000
4,4.0,5.0,3.0,55.0,925.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,2.0,8.0,4.0,250000


In [4]:
x = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [5]:
x.head()

Unnamed: 0,Id,住宅类型,住宅区域,街道接触面积(英尺),住宅面积,街道路面状况,巷子路面状况,住宅形状(大概),住宅现状,水电气,...,半开放式门廊面积,泳池面积,泳池质量,篱笆质量,其他配置,其他配置的价值,销售月份,销售年份,销售类型,销售状态
0,0.0,5.0,3.0,36.0,327.0,1.0,0.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,8.0,4.0
1,1.0,0.0,3.0,51.0,498.0,1.0,0.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,8.0,4.0
2,2.0,5.0,3.0,39.0,702.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2.0,8.0,4.0
3,3.0,6.0,3.0,31.0,489.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0,0.0
4,4.0,5.0,3.0,55.0,925.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.0,2.0,8.0,4.0


In [7]:
y.head() # 价格

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [6]:
x.shape

(1460, 80)

# 做贝叶斯优化（BayesianOptimization）

## 定义目标函数

In [8]:
# 定义目标函数
def bayesopt_objective(n_estimators,max_depth,max_features,min_impurity_decrease):
    # 定义评估器
    # 需要调整的超参数等于目标函数的输入，不需要调整的超参数则直接等于固定值
    # 默认参数输入一定是浮点数，因此需要套上int函数处理成整数
    reg = RFR(
        n_estimators=int(n_estimators)
        ,max_depth=int(max_depth)
        ,max_features=int(max_features)
        ,min_impurity_decrease=min_impurity_decrease
        ,random_state=1412
        ,verbose=False # 可自行决定是否开启森林建树的verbose
        ,n_jobs=-1
    )
    
    # 定义损失的输出，5折交叉验证下的结果，输出负根均方误差（-RMSE）
    # 注意，交叉验证需要使用数据，但我们不能让数据x,y成为目标函数的输入
    cv = KFold(n_splits=5,shuffle=True,random_state=1412)
    validation_loss = cross_validate(
        reg,x,y
        ,scoring="neg_root_mean_squared_error" # 负根均方误差
        ,cv=cv
        ,verbose=False # 不需要打印具体流程
        ,n_jobs=-1 
        ,error_score='raise' # 默认值为None，此时返回“空”
        # 如果交叉验证中算法执行报错，则告诉我们错误的理由
    )
    # 交叉验证输出的评估指标是负根均方误差，因此本来就是负的损失
    # 目标函数可直接输出该损失的均值
    return np.mean(validation_loss["test_score"])

## 设置备选参数空间

In [9]:
#备选参数空间
param_grid_simple = {
    'n_estimators':(80,100)
    ,'max_depth':(10,25)
    ,'max_features':(10,20)
    ,'min_impurity_decrease':(0,1)
    }

## 定义优化目标函数的具体流程

In [9]:
# 定义优化目标函数的具体流程
def param_bayes_opt(init_points,n_iter):
    
    # 定义优化器，先实例化优化器
    opt = BayesianOptimization(
          bayesopt_objective # 需要优化的目标函数
        ,param_grid_simple # 备选参数空间
        ,random_state = 1412 # 随机数种子，虽然无法控制
    )
    
    # 使用优化器，其中bayes_opt只支持最大化
    opt.maximize(
        init_points = init_points  # 抽取多少个观测值
        ,n_iter = n_iter # 一共观测/迭代多少次
    )

    #优化完成，取出最佳最佳参数与分数
    params_best = opt.max['params']
    score_best = opt.max["target"]

    #打印最佳参数与最佳分数
    print("\n","\n","best params: ",params_best,
         "\n","\n","best cvscore: ",score_best_best,
         )
    
    # 返回最佳参数与最佳分数
    return params_best,score_best


## 定义验证函数

In [11]:
# 定义验证函数

def bayes_opt_validation(params_best):
    
    reg = RFR(
        n_estimators = int(params_best["n_estimators"])
        ,max_depth=int(params_best["max_depth"])
        ,min_impurity_decrease = params_best["min_impurity_decrease"]
        ,random_state=1412
        ,verbose=Flase
        ,n_jobs=-1
    )
    cv = KFold(n_splits=5,shuffle=True,random_state=1412)
    validation_loss = cross_validate(
        reg,x,y
        ,scoring = "neg_root_mean_squared_error"
        ,cv = cv
        ,verbose = False
        ,n_jobs = -1
    )# 执行实际优化流程
start = time.time()
params_best,score_best = param_bayes_opt(20,280) # 初始看20个观测值，后面迭代280次
print('It take %s minutes'% ((time.time()-start)/60))
validation_score = bayes_opt_validation(params_best)
print("\n","\n","validation_score: ",validation_score)
    
    return np.mean(validation_loss["test_score"])
    

## 执行实际优化流程

In [12]:
# 执行实际优化流程
start = time.time()
params_best,score_best = param_bayes_opt(20,280) # 初始看20个观测值，后面迭代280次
print('It take %s minutes'% ((time.time()-start)/60))
validation_score = bayes_opt_validation(params_best)
print("\n","\n","validation_score: ",validation_score)

|   iter    |  target   | max_depth | max_fe... | min_im... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m-2.948e+0[0m | [0m23.2     [0m | [0m17.52    [0m | [0m0.06379  [0m | [0m88.79    [0m |
| [95m2        [0m | [95m-2.909e+0[0m | [95m14.8     [0m | [95m17.61    [0m | [95m0.9214   [0m | [95m97.58    [0m |
| [95m3        [0m | [95m-2.9e+04 [0m | [95m15.86    [0m | [95m15.56    [0m | [95m0.2661   [0m | [95m87.98    [0m |
| [95m4        [0m | [95m-2.887e+0[0m | [95m14.05    [0m | [95m16.84    [0m | [95m0.06744  [0m | [95m89.72    [0m |
| [0m5        [0m | [0m-2.887e+0[0m | [0m18.71    [0m | [0m19.17    [0m | [0m0.9315   [0m | [0m83.7     [0m |
| [0m6        [0m | [0m-2.895e+0[0m | [0m17.7     [0m | [0m19.58    [0m | [0m0.7127   [0m | [0m89.18    [0m |
| [0m7        [0m | [0m-2.968e+0[0m | [0m14.21    [0m | [0m12.62    [0m | [0m0.3381   [0m | 

| [0m67       [0m | [0m-2.88e+04[0m | [0m19.42    [0m | [0m17.58    [0m | [0m0.0      [0m | [0m92.78    [0m |
| [0m68       [0m | [0m-2.932e+0[0m | [0m18.38    [0m | [0m17.41    [0m | [0m0.4656   [0m | [0m92.52    [0m |
| [0m69       [0m | [0m-2.895e+0[0m | [0m20.4     [0m | [0m17.11    [0m | [0m0.0      [0m | [0m92.6     [0m |
| [0m70       [0m | [0m-2.932e+0[0m | [0m19.99    [0m | [0m18.57    [0m | [0m0.0      [0m | [0m92.48    [0m |
| [0m71       [0m | [0m-2.92e+04[0m | [0m19.6     [0m | [0m16.9     [0m | [0m0.0      [0m | [0m93.7     [0m |
| [0m72       [0m | [0m-2.891e+0[0m | [0m17.3     [0m | [0m19.87    [0m | [0m0.9839   [0m | [0m80.93    [0m |
| [0m73       [0m | [0m-2.898e+0[0m | [0m20.3     [0m | [0m17.89    [0m | [0m0.9251   [0m | [0m89.54    [0m |
| [0m74       [0m | [0m-2.928e+0[0m | [0m19.45    [0m | [0m18.65    [0m | [0m0.03773  [0m | [0m89.87    [0m |
| [0m75       [0m | [

| [0m134      [0m | [0m-2.913e+0[0m | [0m10.01    [0m | [0m16.65    [0m | [0m0.3608   [0m | [0m89.52    [0m |
| [0m135      [0m | [0m-2.855e+0[0m | [0m15.43    [0m | [0m18.05    [0m | [0m0.2011   [0m | [0m83.39    [0m |
| [0m136      [0m | [0m-2.9e+04 [0m | [0m20.93    [0m | [0m19.89    [0m | [0m0.0004996[0m | [0m91.1     [0m |
| [0m137      [0m | [0m-2.855e+0[0m | [0m15.39    [0m | [0m18.38    [0m | [0m0.873    [0m | [0m83.47    [0m |
| [0m138      [0m | [0m-2.907e+0[0m | [0m14.87    [0m | [0m18.23    [0m | [0m0.4442   [0m | [0m84.1     [0m |
| [0m139      [0m | [0m-2.878e+0[0m | [0m15.09    [0m | [0m17.6     [0m | [0m0.8051   [0m | [0m83.35    [0m |
| [0m140      [0m | [0m-2.852e+0[0m | [0m15.78    [0m | [0m18.09    [0m | [0m0.9062   [0m | [0m81.62    [0m |
| [0m141      [0m | [0m-2.908e+0[0m | [0m14.83    [0m | [0m18.6     [0m | [0m0.957    [0m | [0m83.29    [0m |
| [0m142      [0m | [

| [0m202      [0m | [0m-2.865e+0[0m | [0m23.9     [0m | [0m19.33    [0m | [0m0.6924   [0m | [0m90.25    [0m |
| [0m203      [0m | [0m-2.888e+0[0m | [0m24.41    [0m | [0m19.55    [0m | [0m0.9851   [0m | [0m91.42    [0m |
| [0m204      [0m | [0m-2.89e+04[0m | [0m24.27    [0m | [0m19.76    [0m | [0m0.2488   [0m | [0m90.17    [0m |
| [0m205      [0m | [0m-2.865e+0[0m | [0m23.16    [0m | [0m19.4     [0m | [0m0.616    [0m | [0m89.93    [0m |
| [0m206      [0m | [0m-2.917e+0[0m | [0m23.05    [0m | [0m18.71    [0m | [0m0.3733   [0m | [0m90.52    [0m |
| [0m207      [0m | [0m-2.865e+0[0m | [0m23.52    [0m | [0m19.64    [0m | [0m0.3483   [0m | [0m89.4     [0m |
| [0m208      [0m | [0m-2.863e+0[0m | [0m23.53    [0m | [0m19.91    [0m | [0m0.8679   [0m | [0m90.58    [0m |
| [0m209      [0m | [0m-2.863e+0[0m | [0m23.12    [0m | [0m19.86    [0m | [0m0.8781   [0m | [0m89.0     [0m |
| [0m210      [0m | [

| [0m270      [0m | [0m-2.949e+0[0m | [0m25.0     [0m | [0m17.7     [0m | [0m1.0      [0m | [0m80.0     [0m |
| [0m271      [0m | [0m-2.907e+0[0m | [0m25.0     [0m | [0m15.46    [0m | [0m1.0      [0m | [0m80.0     [0m |
| [0m272      [0m | [0m-2.871e+0[0m | [0m23.44    [0m | [0m19.99    [0m | [0m0.6317   [0m | [0m84.55    [0m |
| [0m273      [0m | [0m-2.904e+0[0m | [0m22.99    [0m | [0m19.29    [0m | [0m0.6946   [0m | [0m84.0     [0m |
| [0m274      [0m | [0m-2.928e+0[0m | [0m25.0     [0m | [0m10.0     [0m | [0m0.0      [0m | [0m100.0    [0m |
| [0m275      [0m | [0m-2.887e+0[0m | [0m24.14    [0m | [0m19.84    [0m | [0m0.004388 [0m | [0m97.23    [0m |
| [0m276      [0m | [0m-2.899e+0[0m | [0m24.36    [0m | [0m19.94    [0m | [0m0.8132   [0m | [0m83.4     [0m |
| [0m277      [0m | [0m-2.864e+0[0m | [0m23.24    [0m | [0m19.96    [0m | [0m0.9658   [0m | [0m97.67    [0m |
| [0m278      [0m | [

NameError: name 'score_best_best' is not defined