In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

from model_helper.hyper_parameter_tuning import param_search

In [2]:
# 构建测试数据
df, label = make_classification(
    n_samples=5000,
    n_features=300,
    n_informative=12,
    n_redundant=7,
    random_state=134985745,
)
df = pd.DataFrame(df, columns=[f"f{i}" for i in range(df.shape[1])])

print("df shape:", df.shape)
print("label: ", label)
df.head()

df shape: (5000, 300)
label:  [1 1 0 ... 1 1 0]


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16,f17,f18,f19,f20,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35,f36,f37,f38,f39,f40,f41,f42,f43,f44,f45,f46,f47,f48,f49,f50,f51,f52,f53,f54,f55,f56,f57,f58,f59,f60,f61,f62,f63,f64,f65,f66,f67,f68,f69,f70,f71,...,f228,f229,f230,f231,f232,f233,f234,f235,f236,f237,f238,f239,f240,f241,f242,f243,f244,f245,f246,f247,f248,f249,f250,f251,f252,f253,f254,f255,f256,f257,f258,f259,f260,f261,f262,f263,f264,f265,f266,f267,f268,f269,f270,f271,f272,f273,f274,f275,f276,f277,f278,f279,f280,f281,f282,f283,f284,f285,f286,f287,f288,f289,f290,f291,f292,f293,f294,f295,f296,f297,f298,f299
0,-0.994557,-1.136878,0.169768,0.768031,0.014296,0.524148,1.023236,-0.172799,0.01204,-0.310725,-0.360682,-0.009369,-1.745508,-0.28582,-0.732836,-8.454865,-0.55282,1.5157,0.150757,0.794646,-0.170764,0.158164,-1.026552,1.363109,-0.199189,-1.35081,-2.83032,-0.934891,0.282713,3.924545,-0.440672,0.833457,1.067953,1.716262,-0.41254,-1.456003,-0.857324,-0.537075,0.504901,1.137019,-1.734692,0.088075,-0.096549,-0.432578,0.608646,-0.306824,-0.213374,-0.354016,0.367735,-0.342739,0.206341,-0.429546,-0.602545,-0.119167,0.326396,-0.538259,0.898635,-2.332112,2.669598,0.840676,-0.797064,-0.469316,0.563072,-1.437217,2.172028,0.300796,-0.205605,-0.22638,-0.496111,-0.789304,-0.538536,-0.029335,...,1.335241,-1.007284,1.640652,3.398284,-1.582772,0.293247,0.388747,-1.167973,0.44149,0.556139,-0.545364,-1.0251,-0.927151,-0.425784,0.360076,0.805664,-0.549137,-0.537014,-0.188337,0.024976,-0.721833,0.141061,-0.121944,0.754324,0.332226,-1.185077,0.838498,-0.226968,-13.374565,-0.757486,0.461798,-1.089996,1.862257,0.417729,-0.29705,-0.482758,0.464344,2.146531,1.513262,-1.389643,-0.47343,-0.597523,0.233291,0.774206,0.689711,-0.382393,-1.162316,-1.234355,2.41624,-2.876688,-0.041498,0.539115,-0.049585,-1.331033,-0.717556,-0.521484,0.139602,-0.199763,3.456952,-0.333085,-0.455004,-0.072169,1.074111,-0.392414,-0.380007,1.463322,0.95008,-0.374694,0.024485,-0.036063,-0.910353,-0.137185
1,-0.601184,-0.470369,-1.054326,0.352207,-0.431754,-0.186422,1.362683,-0.126976,-2.522448,0.626738,0.509184,1.073414,0.970938,-1.163877,0.368396,-0.342664,1.37219,2.367848,-0.032828,0.296787,0.947888,-1.290605,-0.726592,-0.508369,-0.453301,-0.471973,0.262014,0.221347,0.84324,3.416713,0.908989,1.455648,-0.516664,0.43506,-0.705554,0.60666,-0.10829,0.594338,0.317117,-0.350836,-0.175509,-0.438238,-1.652411,0.350066,1.259202,2.180739,1.574884,-0.542642,0.565118,1.645375,0.376335,-0.809269,2.130345,-0.122436,0.291321,-2.206792,1.080359,-0.768021,1.54117,1.185313,-0.066006,0.018902,1.335862,0.730105,-0.836299,0.848551,1.46689,-0.021652,0.897386,-0.291702,-1.820991,-0.250441,...,0.240545,0.505837,0.626965,0.273111,1.070617,-0.859533,-0.231231,2.357933,-0.431621,0.130922,-0.343335,1.509881,-1.085815,1.128897,-0.882096,-0.621342,-0.9208,0.460677,0.505478,1.326087,-2.296611,-1.283698,0.265664,0.142616,-1.747888,0.045265,0.050653,0.778549,-0.570501,-0.099234,0.833948,0.856969,-1.458339,-0.655688,-0.980949,-0.194226,-1.953573,-0.193864,-0.552708,1.05511,-0.972686,-0.216315,0.98842,0.107106,-0.412148,0.600358,0.311073,-0.838539,0.846744,-0.952445,-0.649137,-1.422914,-0.915237,0.898117,-0.650475,-0.526929,-0.260181,1.378182,1.964127,-0.401667,1.823332,-0.98432,0.34633,-2.16588,2.037716,-0.715749,-0.328964,0.091847,-1.090315,-0.632209,0.886497,0.914958
2,-1.597741,0.777435,0.267835,-1.18658,-0.048285,0.638156,0.627173,-0.367807,-2.083738,1.277261,0.233863,0.131141,-1.373667,0.414823,-0.173497,5.956362,-0.149717,1.260773,-1.064643,-1.312648,1.12219,-0.171009,0.267673,-1.726734,0.119917,-1.501998,-1.165411,-0.93271,1.319535,-1.861041,-0.733492,-0.234975,0.069513,0.381036,1.050863,-0.148127,-0.577237,-2.538462,-1.738688,0.129584,-0.485468,1.630482,-0.478737,0.139484,0.006428,-0.633077,-0.062994,-0.693916,-0.151163,0.747786,0.378337,-0.456834,-0.633313,0.411016,-0.996164,1.967859,1.168394,-0.959444,-0.832629,-0.326027,1.202331,0.348863,-0.542074,0.568868,-1.227099,1.106278,0.461695,0.985369,0.75663,-0.23423,-0.714289,-0.067891,...,0.071879,0.464393,-0.876301,4.926311,-1.258687,-1.438384,-0.070598,-0.645337,-1.486247,-1.592182,1.736985,1.889882,0.732795,-0.173453,-0.675912,0.002458,0.897642,-2.905841,-0.129962,-2.894047,0.517944,-0.794494,1.999267,-1.143539,1.226223,3.712762,-0.088807,1.627577,3.178682,-0.068179,0.654648,1.207134,-0.142716,-0.357668,-0.002619,-0.692077,1.537404,1.0956,-1.141594,-0.053931,-0.11917,-0.141481,0.497925,0.284489,0.950702,0.76578,-1.023197,-2.161261,-0.355705,-0.546029,-0.213186,-0.491287,0.303025,-0.850505,0.982653,0.698245,0.820335,-0.367143,-1.703033,-0.180803,-0.101001,0.143985,-2.057728,1.391847,-0.760141,-0.700443,-3.04624,0.323482,1.074514,-1.064665,-1.457375,0.504237
3,0.909784,0.910874,0.435565,-1.951355,0.577979,-0.603668,-0.209334,0.787032,-2.518156,-0.189924,-1.456707,-0.332381,-1.351855,-1.515931,0.918289,4.017587,-0.956495,-0.621731,0.255666,0.711909,-1.155692,0.230411,-0.942624,-1.318674,0.746506,0.80002,-2.35451,-0.717182,1.741811,-3.758324,1.339042,-0.693326,-0.585986,1.256353,1.179539,0.703261,0.724731,0.354745,-0.120537,0.349746,-1.570842,-0.910761,-2.001774,0.631649,-1.680781,-0.992122,-0.059625,0.203114,-0.500091,-0.02585,-0.315273,0.20422,1.095041,1.985156,0.432752,-1.013223,-0.092385,-0.15762,-0.166755,-0.585622,1.310983,-0.116419,0.00936,0.839376,-0.927028,0.205657,0.674152,0.008571,-0.825423,-0.948867,-0.049615,0.585919,...,-1.021141,0.286586,0.347222,-4.902816,0.015721,0.355722,-1.775694,0.204048,-0.590624,1.063819,0.505151,0.015249,1.924744,0.186532,-0.530073,-0.442152,0.1407,-0.529442,0.783219,-3.185634,-0.508324,-0.68459,0.124627,-0.982565,-0.278718,-4.208872,0.200652,-0.131117,4.688569,-0.242202,0.879977,-0.752611,-0.346746,-0.629414,1.621588,0.469005,0.039196,-0.303846,-0.006462,-0.4168,0.197389,1.11428,-1.510443,0.355572,1.48323,-0.341241,-0.293479,-1.548713,-0.108598,-0.179256,0.726325,1.829506,-1.581339,0.044962,0.442572,2.442752,-1.817412,0.798624,-5.368131,1.044332,-1.96026,1.915259,0.052121,0.144603,0.474985,0.024548,0.138589,1.339473,0.518763,0.312907,0.992612,-1.121993
4,0.121554,-1.045082,-1.070315,-0.441583,-1.718309,0.138502,-0.534873,2.212596,0.001896,-1.008905,-1.052491,0.289625,0.793922,0.613178,0.604801,0.435008,0.331924,-0.42009,0.318561,0.099825,0.201441,0.607827,-1.110293,-0.75449,0.939156,2.509579,-0.933295,0.219973,-1.389583,-0.304271,3.290526,0.083557,-0.194253,1.990401,1.237638,0.521053,-0.0566,-0.304837,-0.008689,-1.317874,0.86038,-0.063057,2.087216,-0.714741,-1.198157,0.470778,0.929448,2.205578,-0.585321,-0.107177,-0.201145,0.070646,-0.272854,-0.264822,0.950107,0.06124,0.671026,1.884223,0.211707,0.363608,-1.936927,0.301095,-0.402428,0.094371,0.859033,-1.94995,-0.591759,-1.194383,0.688613,0.413873,0.166518,0.5016,...,2.315708,0.610742,2.062684,2.619779,1.601514,-0.842461,0.564719,0.777999,0.044064,0.112837,-0.049014,-0.797425,-0.820319,-1.656572,-0.555399,1.052226,-0.932612,0.423432,0.35792,-0.78402,-0.626823,0.629402,0.754399,-1.523319,-1.039588,0.6928,0.414251,0.783829,2.097237,1.640938,1.461065,-1.741251,-1.124331,-1.5223,1.321969,0.911481,0.615799,1.010708,0.470715,-0.527636,0.05679,1.147405,-0.875006,-0.907738,-1.183286,2.802049,0.492197,0.005261,0.759036,-0.22327,-2.317633,0.339907,-0.082266,-0.118919,0.788914,-0.317833,-1.461239,-0.382739,-2.467649,0.375166,0.028804,-0.404333,1.267492,2.010374,-1.055319,0.85212,-2.115959,1.801926,-0.32789,-1.539215,0.214853,2.003376


## 超参数选择

目前超参数选择部分，共实现了四种算法，分别是网格搜索、随机搜索、bayes_opt和hyper_opt，下面详细介绍这几种方式的使用方法。

其中网格搜索和随机搜索这两种方式放置于同一个方法中，首先介绍这两种方式，

## 1. param_search

网格搜索搜索，即是从超参数的网格中给出所有可能的参数组合，然后依次遍历每一个参数组合，看一下哪个参数组合效果最优；而随机搜索则是在网格搜索的简化版，在所有可能的组合中随机抽取部分参数组合进行遍历，然后选取最优，往往来说，如果所制定的参数组合较多且数据量较大时，可以考虑随机搜索，可以加快超参数选择的速度。
### 1.1 方法1
该方法使用交叉验证进行参数组合的评分，最终返回最优效果和最佳参数组合，下面看代码使用，

> 内部交叉验证的实现是调用的sklearn的`cross_val_score`方法进行实现，相应的参数传递可以参考下面方法的介绍

In [3]:
# 定义使用的模型和所需要的参数网格
clf = DecisionTreeClassifier()
param_grid = {"max_depth": [1, 2, 3, 4], "min_samples_leaf": [1, 10, 100, 200], "criterion": ["gini", "entropy"]}

# 调用方法进行超参数的遍历
best_effect, best_param = param_search(df, label, clf, param_grid, method="grid", k_fold=3, random_state=666)
print(f"best effect is {best_effect}, best param combination is {best_param}.")

initialize effect 0.7664000213162648, cost time 5, with feat_dim 300, with param {'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'}
round 1/32 start...
round 1/32 end, effect subset is 0.7555990602599767, cost time 1, with feature dim is 300, with param {'criterion': 'entropy', 'max_depth': 2, 'min_samples_leaf': 200}
round 2/32 start...
round 2/32 end, effect subset is 0.7789999023004524, cost time 2, with feature dim is 300, with param {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 1}
round 3/32 start...
round 3/32 end, effect subset is 0.7949986233245547, cost time 3, with feature dim is 300, with param {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 10}
round 4/32 start...
round 4/32 end, effect subset i

### 1.2 方法2
该方法是方法1的升级版，相当于可以传递自定义的参数至`cross_val_score`中，有关`cross_val_score`的API可以参考[链接](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html)。

下面的使用中定义了`cross_val_score`的`scoring`和`n_jobs`两个参数。
> 该方法中参数`X`, `y`和`cv`参数分别对应param_search中的`train_x`, `train_y`和`cv`，不需要再额外定义。

In [4]:
# 定义使用的模型和所需要的参数网格
clf = DecisionTreeClassifier()
param_grid = {"max_depth": [1, 2, 3, 4], "min_samples_leaf": [1, 10, 100, 200], "criterion": ["gini", "entropy"]}

# 定义`cross_val_score`方法的参数
cross_val_param = {"scoring": lambda clf, X, y: roc_auc_score(y_true=y, y_score=clf.predict_proba(X)[:, 1]),
                   "n_jobs": None}

best_effect, best_param = param_search(df, label, clf, param_grid, method="grid", random_state=666, k_fold=5, cross_val_param=cross_val_param)
print(f"best effect is {best_effect}, best param combination is {best_param}.")

initialize effect 0.797015727262909, cost time 12, with feat_dim 300, with param {'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'}
round 1/32 start...
round 1/32 end, effect subset is 0.7559000252001008, cost time 0, with feature dim is 300, with param {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 200}
round 2/32 start...
round 2/32 end, effect subset is 0.8192379785519142, cost time 2, with feature dim is 300, with param {'criterion': 'entropy', 'max_depth': 2, 'min_samples_leaf': 200}
round 3/32 start...
round 3/32 end, effect subset is 0.8667630342521371, cost time 4, with feature dim is 300, with param {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 200}
round 4/32 start...
round 4/32 end, effect subset i

### 1.3 方法3

上面两种方式的核心是通过交叉验证的方式进行参数组合的评估，显然当数据量较大时速度会很慢。下面这种方式是通过随机划分验证集的方式进行参数组合的评估。   
此时需要指定三组参数`create_valid`&`valid_ratio`&`metric_func`，参数`create_valid`是表示需要从输入数据中划分验证集，`valid_ratio`表示划分验证集的比例，最后一组参数`metric_func`则是表示评估验证效果的函数（输入是y_true,y_pred，返回应当只有一个值）。

In [5]:
# 定义使用的模型和所需要的参数网格
clf = DecisionTreeClassifier()
param_grid = {"max_depth": [1, 2, 3, 4], "min_samples_leaf": [1, 10, 100, 200], "criterion": ["gini", "entropy"]}

best_effect, best_param = param_search(df, label, clf, param_grid, create_valid=True, valid_ratio=0.2, metric_func=roc_auc_score, random_state=666)
print(f"best effect is {best_effect}, best param combination is {best_param}.")

initialize effect 0.7970273245921329, cost time 2, with feat_dim 300, with param {'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'}
round 1/32 start...
round 1/32 end, effect subset is 0.8370213191872685, cost time 0, with feature dim is 300, with param {'criterion': 'entropy', 'max_depth': 2, 'min_samples_leaf': 200}
round 2/32 start...
round 2/32 end, effect subset is 0.8595375838254429, cost time 0, with feature dim is 300, with param {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 1}
round 3/32 start...
round 3/32 end, effect subset is 0.8809008107296568, cost time 1, with feature dim is 300, with param {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 10}
round 4/32 start...
round 4/32 end, effect subset i

### 1.4 方法4
方法3中通过随机划分验证集进行参数的评估，当训练样本中虽在时序相关时（即样本是严格按照时间来产生）时，这种方式欠妥，存在拿未来的样本训练来预测过去的样本。    
下面这种方式支持自定义输入验证集。需要指定参数`valid_x`&`valid_y`&`metric_func`。

In [6]:
# 定义使用的模型和所需要的参数网格
clf = DecisionTreeClassifier()
param_grid = {"max_depth": [1, 2, 3, 4], "min_samples_leaf": [1, 10, 100, 200], "criterion": ["gini", "entropy"]}

best_effect, best_param = param_search(df, label, clf, param_grid, valid_x=df[:100], valid_y=label[:100], metric_func=roc_auc_score, random_state=666)
print(f"best effect is {best_effect}, best param combination is {best_param}.")

initialize effect 1.0, cost time 3, with feat_dim 300, with param {'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'}
round 1/32 start...
round 1/32 end, effect subset is 0.8124999999999999, cost time 0, with feature dim is 300, with param {'criterion': 'entropy', 'max_depth': 2, 'min_samples_leaf': 200}
round 2/32 start...
round 2/32 end, effect subset is 0.8666801948051948, cost time 1, with feature dim is 300, with param {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 1}
round 3/32 start...
round 3/32 end, effect subset is 0.8851461038961038, cost time 1, with feature dim is 300, with param {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 10}
round 4/32 start...
round 4/32 end, effect subset is 0.73051948051

### 1.5 方法5
当验证方式使用验证集进行参数组合的评估时，该方法提供了一种高阶的使用方式。该方式最初想法是想融入`early_stopping_rounds`至调参的过程中，即每次调参时最优训练轮数是一个动态调整的值。

该方式需要额外指定一组参数，该参数类型是dict，里面可包含key值为`model_fit_param`或者`set_eval_set`或者`update_param_func`，
- `model_fit_param`  
模型训练时，`fit`方法包含的参数，是一个dict
- `set_eval_set`   
在训练时是否指定验证集合，如果设置为True，会在fit时添加一组参数`eval_set = [(valid_x, valid_y)]`，否则则不添加，往往early_stopping_rounds时需要指定。
- `update_param_func`   
更新参数的function，输入为model和param，输出也是param，在使用early_stopping_rounds需要返回最佳的训练轮次，则可以启用该函数。


In [7]:
# 定义使用的模型和所需要的参数网格
clf = XGBClassifier()
param_grid = {"max_depth": [1, 2, 3], "learning_rate": [0.01, 0.5, 0.1], "reg_lambda": [0, 1, 10]}

def _update(model, param):
    if param is None:
        return model.get_params()
    else:
        param["n_estimators"] = model.best_iteration
    return param
valid_set_param = {"model_fit_param": {"eval_metric": "auc", "verbose": False, "early_stopping_rounds": 5},
                   "set_eval_set": True,
                   "update_param_func": _update}

best_effect, best_param = param_search(df, label, clf, param_grid, method="grid", random_state=666, create_valid=True,
                                       valid_ratio=0.2, metric_func=roc_auc_score, valid_set_param=valid_set_param)
print(f"best effect is {best_effect}, best param combination is {best_param}.")

initialize effect 0.9579461515363827, cost time 7, with feat_dim 300, with param {'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 1, 'missing': None, 'n_estimators': 99, 'n_jobs': 1, 'nthread': None, 'objective': 'binary:logistic', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': True, 'subsample': 1}
round 1/27 start...
round 1/27 end, effect subset is 0.9083274947452707, cost time 2, with feature dim is 300, with param {'learning_rate': 0.1, 'max_depth': 1, 'reg_lambda': 0, 'n_estimators': 99}
round 2/27 start...
round 2/27 end, effect subset is 0.8904173756380742, cost time 4, with feature dim is 300, with param {'learning_rate': 0.01, 'max_depth': 2, 'reg_lambda': 0, 'n_estimators': 64}
round 3/27 start...
round 3/27 end, effect subset is 0.9575858272445201, cost time 9, with feature dim is 300, with para

### 1.6 方法6
以上方式均是单机版一次次进行遍历进行，如果内存允许的情况下，可以启用多进程。下面以交叉验证作为参数组合评选标准进行，对验证集的方式评估该方式完全一致。

In [8]:
# 定义使用的模型和所需要的参数网格
clf = DecisionTreeClassifier()
param_grid = {"max_depth": [1, 2, 3, 4], "min_samples_leaf": [1, 10, 100, 200], "criterion": ["gini", "entropy"]}


# 未启用多进程，测试耗时
s = datetime.now()
best_effect, best_param = param_search(df, label, clf, param_grid, method="grid", k_fold=5, random_state=666, verbose=False)
print(f"best effect is {best_effect}, best param combination is {best_param}.")
e = datetime.now()
print(f"do not use multiprocess cost time {(e-s).seconds}")  
    
# 启用多进程，测试耗时
s = datetime.now()
best_effect, best_param = param_search(df, label, clf, param_grid, method="grid", k_fold=5, random_state=666,
             enable_multiprocess=True, n_jobs=2, verbose=False)
print(f"best effect is {best_effect}, best param combination is {best_param}.")
e = datetime.now()
print(f"use multiprocess cost time {(e-s).seconds}")  

best effect is 0.7974031184031184, best param combination is {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 1}.
do not use multiprocess cost time 101
best effect is 0.7974031184031184, best param combination is {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 1}.
use multiprocess cost time 78


### 1.7 方法7
该方式可以指定算法为随机搜索，以及对应的随机搜索的轮次，其余使用方式和方面的使用方式完全一致，下面随便挑选一种方式示例。

In [9]:
# 定义使用的模型和所需要的参数网格
clf = DecisionTreeClassifier()
param_grid = {"max_depth": [1, 2, 3, 4], "min_samples_leaf": [1, 10, 100, 200], "criterion": ["gini", "entropy"]}

best_effect, best_param = param_search(df, label, clf, param_grid, method="random", max_iter=10,
                                       create_valid=True, valid_ratio=0.2, 
                                       metric_func=roc_auc_score, random_state=666)
print(f"best effect is {best_effect}, best param combination is {best_param}.")

initialize effect 0.7970273245921329, cost time 2, with feat_dim 300, with param {'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'}
round 1/10 start...
round 1/10 end, effect subset is 0.7768591732559305, cost time 0, with feature dim is 300, with param {'criterion': 'gini', 'max_depth': 1, 'min_samples_leaf': 200}
round 2/10 start...
round 2/10 end, effect subset is 0.8370213191872685, cost time 0, with feature dim is 300, with param {'criterion': 'entropy', 'max_depth': 2, 'min_samples_leaf': 1}
round 3/10 start...
round 3/10 end, effect subset is 0.7768591732559305, cost time 0, with feature dim is 300, with param {'criterion': 'entropy', 'max_depth': 1, 'min_samples_leaf': 10}
round 4/10 start...
round 4/10 end, effect subset is 0

### 1.8 分布式调参
该方式和上面的介绍一致，但融入了spark的分布式计算能力，首先在driver端读取数据，之后分发数据至executor端，然后进行参数组合的遍历，
之后汇聚每组参数的效果至driver端，进行比较选择最优的参数。

其使用方式和上面基本一致，不同之处需要传入一个spark session，因为本地没有搭建分布式环境，具体的使用方式可以参考目录`../tests/test_distribute_param_search.py`

## 2. bayes_opt

该方式主要对bayes_opt包进行了封装，该包的具体使用可以参考[链接](https://github.com/rmcantin/bayesopt)。
封装主要保留了可以指定交叉验证或者验证集评估的方式，其余类似更新参数和多进程方式目前均不支持。   

### 2.1 方法1
该方式通过交叉验证的来评估每一轮的迭代效果。

In [10]:
from model_helper.hyper_parameter_tuning import bayes_search

clf = RandomForestClassifier()

param_space = {"max_features": {"interval": (0.1, 0.9), "type": float},
               "n_estimators": {"interval": (10, 250), "type": int},
               "min_samples_split": {"interval": (2, 25), "type": int}
               }
best_result, best_params = bayes_search(df[:100], label[:100], model=clf, param_space=param_space, n_iter=10,
                                        k_fold=3, random_state=666)
print(f"best_result is {best_result}, best_param is {best_params}")

|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.7212  [0m | [0m 0.6603  [0m | [0m 21.42   [0m | [0m 172.4   [0m |
| [0m 2       [0m | [0m 0.6991  [0m | [0m 0.6823  [0m | [0m 23.88   [0m | [0m 13.05   [0m |
| [0m 3       [0m | [0m 0.5925  [0m | [0m 0.4309  [0m | [0m 3.123   [0m | [0m 33.98   [0m |
| [95m 4       [0m | [95m 0.731   [0m | [95m 0.5065  [0m | [95m 6.606   [0m | [95m 188.6   [0m |
| [0m 5       [0m | [0m 0.6801  [0m | [0m 0.2543  [0m | [0m 18.12   [0m | [0m 80.37   [0m |
| [0m 6       [0m | [0m 0.7089  [0m | [0m 0.1     [0m | [0m 25.0    [0m | [0m 250.0   [0m |
| [0m 7       [0m | [0m 0.7004  [0m | [0m 0.9     [0m | [0m 2.0     [0m | [0m 250.0   [0m |
| [0m 8       [0m | [0m 0.7102  [0m | [0m 0.1863  [0m | [0m 24.99   [0m | [0m 211.2   [0m |
| [0m 9       [0m | [0m 0.6593  [0m | [0m 0.1524  

### 2.2 方法2
通过随机产生验证集的方式进行效果的评估。

In [11]:
from model_helper.hyper_parameter_tuning import bayes_search

clf = RandomForestClassifier()

param_space = {"max_features": {"interval": (0.1, 0.9), "type": float},
               "n_estimators": {"interval": (10, 250), "type": int},
               "min_samples_split": {"interval": (2, 25), "type": int}
               }
best_result, best_params = bayes_search(df[:100], label[:100], model=clf, param_space=param_space, n_iter=10,
                                        create_valid=True, valid_ratio=0.2, metric_func=roc_auc_score, random_state=666)
print(f"best_result is {best_result}, best_param is {best_params}")

|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.8125  [0m | [0m 0.6603  [0m | [0m 21.42   [0m | [0m 172.4   [0m |
| [0m 2       [0m | [0m 0.7917  [0m | [0m 0.6823  [0m | [0m 23.88   [0m | [0m 13.05   [0m |
| [0m 3       [0m | [0m 0.651   [0m | [0m 0.4309  [0m | [0m 3.123   [0m | [0m 33.98   [0m |
| [0m 4       [0m | [0m 0.7917  [0m | [0m 0.5065  [0m | [0m 6.606   [0m | [0m 188.6   [0m |
| [0m 5       [0m | [0m 0.7917  [0m | [0m 0.2543  [0m | [0m 18.12   [0m | [0m 80.37   [0m |
| [0m 6       [0m | [0m 0.7708  [0m | [0m 0.1     [0m | [0m 25.0    [0m | [0m 250.0   [0m |
| [95m 7       [0m | [95m 0.875   [0m | [95m 0.9     [0m | [95m 25.0    [0m | [95m 123.4   [0m |
| [0m 8       [0m | [0m 0.7917  [0m | [0m 0.9     [0m | [0m 2.0     [0m | [0m 250.0   [0m |
| [0m 9       [0m | [0m 0.8646  [0m | [0m 0.8053  

### 2.3 方法3
通过自定义验证集的方式进行评估。

In [12]:
from model_helper.hyper_parameter_tuning import bayes_search

clf = RandomForestClassifier()

param_space = {"max_features": {"interval": (0.1, 0.9), "type": float},
               "n_estimators": {"interval": (10, 250), "type": int},
               "min_samples_split": {"interval": (2, 25), "type": int}
               }
best_result, best_params = bayes_search(df[:100], label[:100], model=clf, param_space=param_space, n_iter=10,
                                        valid_x=df[100:150], valid_y=label[100:150], metric_func=roc_auc_score, random_state=666)
print(f"best_result is {best_result}, best_param is {best_params}")

|   iter    |  target   | max_fe... | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.8117  [0m | [0m 0.6603  [0m | [0m 21.42   [0m | [0m 172.4   [0m |
| [0m 2       [0m | [0m 0.7597  [0m | [0m 0.6823  [0m | [0m 23.88   [0m | [0m 13.05   [0m |
| [0m 3       [0m | [0m 0.7744  [0m | [0m 0.4309  [0m | [0m 3.123   [0m | [0m 33.98   [0m |
| [0m 4       [0m | [0m 0.7955  [0m | [0m 0.5065  [0m | [0m 6.606   [0m | [0m 188.6   [0m |
| [0m 5       [0m | [0m 0.7711  [0m | [0m 0.2543  [0m | [0m 18.12   [0m | [0m 80.37   [0m |
| [0m 6       [0m | [0m 0.7841  [0m | [0m 0.1     [0m | [0m 25.0    [0m | [0m 250.0   [0m |
| [0m 7       [0m | [0m 0.8084  [0m | [0m 0.1417  [0m | [0m 24.98   [0m | [0m 197.5   [0m |
| [0m 8       [0m | [0m 0.7987  [0m | [0m 0.1091  [0m | [0m 24.83   [0m | [0m 157.2   [0m |
| [0m 9       [0m | [0m 0.7963  [0m | [0m 0.8963  [0m 

## 2. hyper_opt

该方式主要对hyper_opt包进行了封装，该包的具体使用可以参考[链接](http://hyperopt.github.io/hyperopt/)。
封装主要保留了可以指定交叉验证或者验证集评估的方式，其余类似更新参数和多进程方式目前均不支持。   

### 2.1 方法1
该方式通过交叉验证的来评估每一轮的迭代效果。

In [13]:
from hyperopt import hp
from model_helper.hyper_parameter_tuning import hyperopt_search

clf = RandomForestClassifier()

param_space = {"max_features": hp.uniform("max_features", 0.1, 0.9),
               "n_estimators": hp.choice("n_estimators", range(10, 100)),
               "min_samples_split": hp.choice("min_samples_split", [2, 10, 100])
               }
trials, best_params = hyperopt_search(df[:100], label[:100], model=clf, param_space=param_space, n_iter=10,
                                      k_fold=3, random_state=666)
for i in trials:
    print(i["result"]["stuff"])
print(f"best_param is {best_params}")

100%|██████████| 10/10 [00:03<00:00,  3.32it/s, best loss: -0.701593137254902]
{'param': {'max_features': 0.7717534098692105, 'min_samples_split': 10, 'n_estimators': 63}, 'effect': 0.6415441176470589}
{'param': {'max_features': 0.26018003099419995, 'min_samples_split': 10, 'n_estimators': 61}, 'effect': 0.7009803921568628}
{'param': {'max_features': 0.21371575812946564, 'min_samples_split': 2, 'n_estimators': 41}, 'effect': 0.6709558823529411}
{'param': {'max_features': 0.41707981574580155, 'min_samples_split': 10, 'n_estimators': 19}, 'effect': 0.6819852941176471}
{'param': {'max_features': 0.6356237134216275, 'min_samples_split': 100, 'n_estimators': 69}, 'effect': 0.5600490196078431}
{'param': {'max_features': 0.6379048296438755, 'min_samples_split': 10, 'n_estimators': 19}, 'effect': 0.6617647058823529}
{'param': {'max_features': 0.4336835533075999, 'min_samples_split': 100, 'n_estimators': 78}, 'effect': 0.5600490196078431}
{'param': {'max_features': 0.8945511738511079, 'min_samp

### 2.2 方法2
通过随机产生验证集的方式进行效果的评估。

In [14]:
from hyperopt import hp
from model_helper.hyper_parameter_tuning import hyperopt_search

clf = RandomForestClassifier()

param_space = {"max_features": hp.uniform("max_features", 0.1, 0.9),
               "n_estimators": hp.choice("n_estimators", range(10, 100)),
               "min_samples_split": hp.choice("min_samples_split", [2, 10, 100])
               }
trials, best_params = hyperopt_search(df[:100], label[:100], model=clf, param_space=param_space, n_iter=10,
                                      create_valid=True, valid_ratio=0.2, metric_func=roc_auc_score, random_state=666)
for i in trials:
    print(i["result"]["stuff"])
print(f"best_param is {best_params}")

100%|██████████| 10/10 [00:01<00:00,  8.15it/s, best loss: -0.9583333333333334]
{'param': {'max_features': 0.7717534098692105, 'min_samples_split': 10, 'n_estimators': 63}, 'effect': 0.78125}
{'param': {'max_features': 0.26018003099419995, 'min_samples_split': 10, 'n_estimators': 61}, 'effect': 0.7291666666666667}
{'param': {'max_features': 0.21371575812946564, 'min_samples_split': 2, 'n_estimators': 41}, 'effect': 0.765625}
{'param': {'max_features': 0.41707981574580155, 'min_samples_split': 10, 'n_estimators': 19}, 'effect': 0.9583333333333334}
{'param': {'max_features': 0.6356237134216275, 'min_samples_split': 100, 'n_estimators': 69}, 'effect': 0.5}
{'param': {'max_features': 0.6379048296438755, 'min_samples_split': 10, 'n_estimators': 19}, 'effect': 0.65625}
{'param': {'max_features': 0.4336835533075999, 'min_samples_split': 100, 'n_estimators': 78}, 'effect': 0.5}
{'param': {'max_features': 0.8945511738511079, 'min_samples_split': 2, 'n_estimators': 83}, 'effect': 0.8177083333333

### 2.3 方法3
通过自定义验证集的方式进行评估。

In [15]:
from hyperopt import hp
from model_helper.hyper_parameter_tuning import hyperopt_search

clf = RandomForestClassifier()

param_space = {"max_features": hp.uniform("max_features", 0.1, 0.9),
               "n_estimators": hp.choice("n_estimators", range(10, 100)),
               "min_samples_split": hp.choice("min_samples_split", [2, 10, 100])
               }
trials, best_params = hyperopt_search(df[:100], label[:100], model=clf, param_space=param_space, n_iter=10,
                                      valid_x=df[100:150], valid_y=label[100:150], metric_func=roc_auc_score, random_state=666)
for i in trials:
    print(i["result"]["stuff"])
print(f"best_param is {best_params}")

100%|██████████| 10/10 [00:01<00:00,  5.21it/s, best loss: -0.814935064935065]
{'param': {'max_features': 0.7717534098692105, 'min_samples_split': 10, 'n_estimators': 63}, 'effect': 0.7775974025974026}
{'param': {'max_features': 0.26018003099419995, 'min_samples_split': 10, 'n_estimators': 61}, 'effect': 0.7483766233766234}
{'param': {'max_features': 0.21371575812946564, 'min_samples_split': 2, 'n_estimators': 41}, 'effect': 0.7402597402597403}
{'param': {'max_features': 0.41707981574580155, 'min_samples_split': 10, 'n_estimators': 19}, 'effect': 0.635551948051948}
{'param': {'max_features': 0.6356237134216275, 'min_samples_split': 100, 'n_estimators': 69}, 'effect': 0.5}
{'param': {'max_features': 0.6379048296438755, 'min_samples_split': 10, 'n_estimators': 19}, 'effect': 0.814935064935065}
{'param': {'max_features': 0.4336835533075999, 'min_samples_split': 100, 'n_estimators': 78}, 'effect': 0.5}
{'param': {'max_features': 0.8945511738511079, 'min_samples_split': 2, 'n_estimators': 8