# sklearn

导入库

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

参数设置

In [None]:
# Hyper-parameter space explored by the scikit-learn search objects.
# Only estimator hyper-parameters belong here: the former "scoring" entry was
# removed because the evaluation metric is configured on the search object
# (its `scoring` argument), not on the estimator.
ranking_param = {
    "num_leaves": list(range(31, 50)),
    "learning_rate": [0.1, 0.01],
    "n_estimators": [300, 400, 600, 800, 1000],
    "max_depth": [2, 3, 4, 5, 6, -1],
    "boosting_type": ["gbdt"],
    "objective": ["lambdarank"],
}

# Extra keyword arguments forwarded to the estimator's fit() on every search fit.
fit_params = {
    "early_stopping_rounds": 30,
    "eval_metric": "ndcg",
    "eval_set": [(x_dev, y_dev)],
    "eval_group": [q_dev],
    'eval_at': [1, 3],
    "verbose": True,
    # NOTE(review): one single query group sized 2/3 of the training rows --
    # presumably the size of each training split under cv=3. The value is a
    # float and only matches the split size when the row count is divisible
    # by 3; confirm this is the intended grouping.
    "group": np.array([x_train.shape[0]*2/3]),
}

# Placeholder (was `......`, which is a SyntaxError): assign the estimator to
# tune before running the search cells.
# TODO: presumably a LightGBM ranker, given the lambdarank objective and the
# eval_group/group fit arguments -- confirm.
model = ...

网格调参

In [None]:
# Exhaustive search: every combination in ranking_param, 3-fold CV, one worker.
grid_search = GridSearchCV(
    model,
    ranking_param,
    cv=3,
    n_jobs=1,
    verbose=True,
)
grid_search.fit(x_train, y_train, **fit_params)

随机搜索

In [None]:
# Randomized search: sample 100 configurations from ranking_param, 3-fold CV,
# then refit the best configuration on the full training data.
# The estimator is `model` (the same object the grid search tunes); the
# previous `gbm` is not defined at notebook level.
# NOTE(review): scoring="roc_auc" looks inconsistent with a ranking objective
# evaluated by NDCG elsewhere in this notebook -- confirm the intended scorer.
clf = RandomizedSearchCV(
    estimator=model,
    param_distributions=ranking_param,
    n_iter=100,
    scoring="roc_auc",
    cv=3,
    refit=True,
    random_state=314,  # fixed seed so the sampled configurations are repeatable
    verbose=True,
)
search = clf.fit(x_train, y_train, **fit_params)

print(
    "Best score reached: {} with params: {} ".format(
        search.best_score_, search.best_params_
    )
)

# optuna

https://github.com/optuna/optuna/blob/master/examples/lightgbm_simple.py

https://zhuanlan.zhihu.com/p/138521995

支持的部分库有（全部请参见官网）：

- XGBoost
- LightGBM
- Sklearn
- Keras
- TensorFlow
- tf.keras
- MXNet
- PyTorch
- FastAI

导入库

In [None]:
import optuna
import lightgbm as lgb

参数设置

In [None]:
def objective(trial):
    """Optuna objective: train one LightGBM lambdarank booster and score it.

    Samples a hyper-parameter configuration from ``trial``, trains with early
    stopping against ``dev_data``, writes the booster and its best iteration
    under ``setting.model_root`` (file names are keyed by ``trial.number``),
    and returns the score the study optimizes.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        The first value returned by ``validate(q_test, test_y, test_predict, 60)``;
        per the original author's note this is the NDCG averaged over all
        query ids.
    """
    params = {
        "task": "train",
        "objective": "lambdarank",
        "boosting_type": "gbdt",
        "max_depth": trial.suggest_int("max_depth", 2, 7),
        "metric": "ndcg",
        "ndcg_at": [1, 3, 5],
        "max_position": 5,  # NDCG position to optimize for
        "max_bin": trial.suggest_categorical("max_bin", [255, 512]),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0),  # option: log=True
        # Integer sampled from 31..256 (inclusive).
        "num_leaves": trial.suggest_int("num_leaves", 31, 256),
        "learning_rate": trial.suggest_categorical(
            "learning_rate", [0.01, 0.005, 0.001]
        ),
        # Each Optuna parameter needs its own name: reusing a name makes the
        # trial return the first sampled value again, which previously tied
        # bagging_fraction to feature_fraction. LightGBM requires both
        # fractions to be strictly positive, hence the 0.1 lower bound.
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        # Minimum number of samples in a leaf. `min_child_samples` is an alias
        # of this same LightGBM parameter, so it is no longer sampled separately.
        "min_data_in_leaf": trial.suggest_categorical(
            "min_data_in_leaf", [10, 20, 30, 80]
        ),
    }

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
    # arguments of older LightGBM releases; newer releases expect the
    # equivalent callbacks instead -- confirm against the installed version.
    gbm = lgb.train(
        params,
        train_data,
        num_boost_round=400,
        valid_sets=[dev_data],
        categorical_feature=cate_cols,
        early_stopping_rounds=50,
        verbose_eval=-1,
    )  # verbose_eval=-1: suppresses most of LightGBM output

    # Persist the trained booster and its best iteration, keyed by trial number.
    gbm.save_model(setting.model_root + '{}.mod'.format(trial.number))
    best_iter_path = setting.model_root + '{}.best_iteration.txt'.format(trial.number)
    with open(best_iter_path, "w", encoding="utf-8") as f:
        f.write(str(gbm.best_iteration))

    # Score the booster at its best iteration on the held-out queries.
    test_predict = gbm.predict(test_X, num_iteration=gbm.best_iteration, categorical_feature=cate_cols)
    average_ndcg, _ = validate(q_test, test_y, test_predict, 60)  # mean NDCG over all qids
    return average_ndcg

超参数搜索（Optuna 默认使用 TPE 采样器，并非纯随机搜索）

In [None]:
# objective() returns an NDCG score, so the study is set up to maximize it.
study = optuna.create_study(direction="maximize")
# Evaluate 100 sampled configurations (optionally pass n_jobs=-1 as well).
study.optimize(func=objective, n_trials=100)

输出

In [None]:
# Summarize the study: best parameters, best objective value, trial count,
# then the best trial's value and parameters one per line.
trial = study.best_trial
print("最优超参: ", study.best_params)
print("最优超参下，objective函数返回的值: ", study.best_value)
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
print("  Value: {}".format(trial.value))
print("  Params: ")
for name in trial.params:
    print("    {}: {}".format(name, trial.params[name]))