In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn import metrics
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
canceData=load_breast_cancer()
X=canceData.data
y=canceData.target

In [3]:
X_train,X_test,y_train,y_test=train_test_split(X, y, 
                                               random_state=0,
                                               test_size=0.2)

In [4]:
params = {'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'auc',
          'nthread':4,
          'learning_rate':0.1,
          'num_leaves':30, 
          'max_depth': 5,   
          'subsample': 0.8, 
          'colsample_bytree': 0.8}

In [5]:
data_train = lgb.Dataset(X_train, y_train)
cv_results = lgb.cv(params,
                    data_train,
                    num_boost_round=1000,
                    nfold=5,
                    stratified=False,
                    shuffle=True,
                    metrics='auc',
                    early_stopping_rounds=50,
                    seed=0)

In [6]:
print('best n_estimators:', len(cv_results['auc-mean']))
print('best cv score:', pd.Series(cv_results['auc-mean']).max())

best n_estimators: 188
best cv score: 0.9913471629808542


## 确定max_depth和num_leaves

In [7]:
params_test1={'max_depth': range(3, 8, 1),
              'num_leaves':range(5, 100, 5)}
              
gsearch1 = GridSearchCV(estimator=lgb.LGBMClassifier(boosting_type='gbdt',
                                                     objective='binary',
                                                     metrics='auc',
                                                     learning_rate=0.1,
                                                     n_estimators=188,
                                                     max_depth=6,
                                                     bagging_fraction=0.8,
                                                     feature_fraction = 0.8), 
                        param_grid=params_test1,
                        scoring='roc_auc',
                        cv=5,
                        n_jobs=-1)
gsearch1.fit(X_train,y_train)
gsearch1.best_params_, gsearch1.best_score_

({'max_depth': 4, 'num_leaves': 10}, 0.9943573667711598)

## 确定max_bin和min_data_in_leaf

In [8]:
params_test2={'max_bin': range(5, 256, 10),
              'min_data_in_leaf': range(1, 102, 10)}
              
gsearch2 = GridSearchCV(estimator=lgb.LGBMClassifier(boosting_type='gbdt',
                                                     objective='binary',
                                                     metrics='auc',
                                                     learning_rate=0.1,
                                                     n_estimators=188,
                                                     max_depth=4,
                                                     num_leaves=10,
                                                     bagging_fraction=0.8,
                                                     feature_fraction = 0.8), 
                        param_grid=params_test2,
                        scoring='roc_auc',
                        cv=5,
                        n_jobs=-1)
gsearch2.fit(X_train,y_train)
gsearch2.best_params_, gsearch2.best_score_

({'max_bin': 15, 'min_data_in_leaf': 51}, 0.9952978056426331)

## 确定feature_fraction、bagging_fraction、bagging_freq

In [9]:
params_test3={'feature_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
              'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
              'bagging_freq': range(0, 81, 10)}
              
gsearch3 = GridSearchCV(estimator=lgb.LGBMClassifier(boosting_type='gbdt',
                                                     objective='binary',
                                                     metrics='auc',
                                                     learning_rate=0.1,
                                                     n_estimators=188,
                                                     max_depth=4,
                                                     num_leaves=10,
                                                     max_bin=15,
                                                     min_data_in_leaf=51), 
                        param_grid=params_test3,
                        scoring='roc_auc',
                        cv=5,
                        n_jobs=-1)
gsearch3.fit(X_train,y_train)
gsearch3.best_params_, gsearch3.best_score_

({'bagging_fraction': 0.6, 'bagging_freq': 0, 'feature_fraction': 0.8},
 0.9952978056426331)

## 确定lambda_l1和lambda_l2

In [10]:
params_test4={'lambda_l1': [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
              'lambda_l2': [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]}
              
gsearch4 = GridSearchCV(estimator=lgb.LGBMClassifier(boosting_type='gbdt',
                                                     objective='binary',
                                                     metrics='auc',
                                                     learning_rate=0.1,
                                                     n_estimators=188,
                                                     max_depth=4,
                                                     num_leaves=10,
                                                     max_bin=15,
                                                     min_data_in_leaf=51,
                                                     bagging_fraction=0.6,
                                                     bagging_freq=0,
                                                     feature_fraction=0.8), 
                        param_grid=params_test4,
                        scoring='roc_auc',
                        cv=5,
                        n_jobs=-1)
gsearch4.fit(X_train,y_train)
gsearch4.best_params_, gsearch4.best_score_

({'lambda_l1': 1e-05, 'lambda_l2': 1e-05}, 0.9952978056426331)

## 确定 min_split_gain

In [11]:
params_test5 = {'min_split_gain': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
              
gsearch5 = GridSearchCV(estimator=lgb.LGBMClassifier(boosting_type='gbdt',
                                                     objective='binary',
                                                     metrics='auc',
                                                     learning_rate=0.1,
                                                     n_estimators=188,
                                                     max_depth=4,
                                                     num_leaves=10,
                                                     max_bin=15,
                                                     min_data_in_leaf=51,
                                                     bagging_fraction=0.6,
                                                     bagging_freq=0,
                                                     feature_fraction=0.8,
                                                     lambda_l1=1e-05,
                                                     lambda_l2=1e-05), 
                        param_grid=params_test5,
                        scoring='roc_auc',
                        cv=5,
                        n_jobs=-1)
gsearch5.fit(X_train, y_train)
gsearch5.best_params_, gsearch5.best_score_

({'min_split_gain': 0.0}, 0.9952978056426331)

## 降低学习率，增加迭代次数，验证模型

In [12]:
model = lgb.LGBMClassifier(boosting_type='gbdt',
                           objective='binary',
                           metrics='auc',
                           learning_rate=0.01,
                           n_estimators=1000,
                           max_depth=4,
                           num_leaves=10,
                           max_bin=15,
                           min_data_in_leaf=51,
                           bagging_fraction=0.6,
                           bagging_freq=0,
                           feature_fraction=0.8,
                           lambda_l1=1e-05,
                           lambda_l2=1e-05,
                           min_split_gain=0)
model.fit(X_train,y_train)
y_pre = model.predict(X_test)
print('acc:', metrics.accuracy_score(y_test, y_pre))
print('auc:', metrics.roc_auc_score(y_test, y_pre))

acc: 0.9736842105263158
auc: 0.9744363289933311


## 使用默认参数

In [13]:
model=lgb.LGBMClassifier()
model.fit(X_train, y_train)
y_pre = model.predict(X_test)
print('acc:', metrics.accuracy_score(y_test, y_pre))
print('auc:', metrics.roc_auc_score(y_test, y_pre))

acc: 0.9649122807017544
auc: 0.9637980311209908


# Lightgbm cv函数调参

In [14]:
print('数据转换')
lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train,free_raw_data=False)
 
# 设置初始参数--不含交叉验证参数
print('设置参数')
params = {'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'auc',
          'nthread': 4,
          'learning_rate': 0.1}
 
# 交叉验证(调参)
print('交叉验证')
max_auc = 0.0
best_params = {}


print("调参1：提高准确率")
for num_leaves in range(5, 100, 5):
    for max_depth in range(3, 8, 1):
        params['num_leaves'] = num_leaves
        params['max_depth'] = max_depth
 
        cv_results = lgb.cv(params,
                            lgb_train,
                            seed=1,
                            nfold=5,
                            metrics=['auc'],
                            early_stopping_rounds=10)
            
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
            
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['num_leaves'] = num_leaves
            best_params['max_depth'] = max_depth
if 'num_leaves' and 'max_depth' in best_params.keys():          
    params['num_leaves'] = best_params['num_leaves']
    params['max_depth'] = best_params['max_depth']
print('params: ', params)
print('best_params: ', best_params)
    
    
# 过拟合
print("调参2：降低过拟合")
for max_bin in range(5, 256, 10):
    for min_data_in_leaf in range(1, 102, 10):
        params['max_bin'] = max_bin
        params['min_data_in_leaf'] = min_data_in_leaf
            
        cv_results = lgb.cv(params,
                            lgb_train,
                            seed=1,
                            nfold=5,
                            metrics=['auc'],
                            early_stopping_rounds=10)
                    
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
 
        if mean_auc >= max_auc:
            max_auc = mean_auc
            best_params['max_bin']= max_bin
            best_params['min_data_in_leaf'] = min_data_in_leaf
if 'max_bin' and 'min_data_in_leaf' in best_params.keys():
    params['min_data_in_leaf'] = best_params['min_data_in_leaf']
    params['max_bin'] = best_params['max_bin']
print('params: ', params)
print('best_params: ', best_params)


print("调参3：降低过拟合")
for feature_fraction in [0.6, 0.7, 0.8, 0.9, 1.0]:
    for bagging_fraction in [0.6, 0.7, 0.8, 0.9, 1.0]:
        for bagging_freq in range(0, 50, 5):
            params['feature_fraction'] = feature_fraction
            params['bagging_fraction'] = bagging_fraction
            params['bagging_freq'] = bagging_freq
            
            cv_results = lgb.cv(params,
                                lgb_train,
                                seed=1,
                                nfold=5,
                                metrics=['auc'],
                                early_stopping_rounds=10)
                    
            mean_auc = pd.Series(cv_results['auc-mean']).max()
            boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
 
            if mean_auc >= max_auc:
                max_auc = mean_auc
                best_params['feature_fraction'] = feature_fraction
                best_params['bagging_fraction'] = bagging_fraction
                best_params['bagging_freq'] = bagging_freq


if 'feature_fraction' and 'bagging_fraction' and 'bagging_freq' in best_params.keys():
    params['feature_fraction'] = best_params['feature_fraction']
    params['bagging_fraction'] = best_params['bagging_fraction']
    params['bagging_freq'] = best_params['bagging_freq']
print('params: ', params)
print('best_params: ', best_params)
 
print("调参4：降低过拟合")
for lambda_l1 in [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]:
    for lambda_l2 in [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]:
        params['lambda_l1'] = lambda_l1
        params['lambda_l2'] = lambda_l2
        cv_results = lgb.cv(params,
                            lgb_train,
                            seed=1,
                            nfold=5,
                            metrics=['auc'],
                            early_stopping_rounds=10)
                
        mean_auc = pd.Series(cv_results['auc-mean']).max()
        boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
 
        if mean_auc >= max_auc:
            max_auc=mean_auc
            best_params['lambda_l1'] = lambda_l1
            best_params['lambda_l2'] = lambda_l2
if 'lambda_l1' and 'lambda_l2' in best_params.keys():
    params['lambda_l1'] = best_params['lambda_l1']
    params['lambda_l2'] = best_params['lambda_l2']
print('params: ', params)
print('best_params: ', best_params)
    
print("调参5：降低过拟合2")
for min_split_gain in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]:
    params['min_split_gain'] = min_split_gain
    
    cv_results = lgb.cv(params,
                        lgb_train,
                        seed=1,
                        nfold=5,
                        metrics=['auc'],
                        early_stopping_rounds=10)
            
    mean_auc = pd.Series(cv_results['auc-mean']).max()
    boost_rounds = pd.Series(cv_results['auc-mean']).idxmax()
 
    if mean_auc >= max_auc:
        max_auc=mean_auc
        
        best_params['min_split_gain'] = min_split_gain
if 'min_split_gain' in best_params.keys():
    params['min_split_gain'] = best_params['min_split_gain']
print('params: ', params)
print('best_params: ', best_params)

print(best_params)

数据转换
设置参数
交叉验证
调参1：提高准确率
params:  {'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'nthread': 4, 'learning_rate': 0.1, 'num_leaves': 95, 'max_depth': 5}
best_params:  {'num_leaves': 95, 'max_depth': 5}
调参2：降低过拟合
params:  {'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'nthread': 4, 'learning_rate': 0.1, 'num_leaves': 95, 'max_depth': 5, 'max_bin': 255, 'min_data_in_leaf': 101}
best_params:  {'num_leaves': 95, 'max_depth': 5}
调参3：降低过拟合
params:  {'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'nthread': 4, 'learning_rate': 0.1, 'num_leaves': 95, 'max_depth': 5, 'max_bin': 255, 'min_data_in_leaf': 101, 'feature_fraction': 1.0, 'bagging_fraction': 1.0, 'bagging_freq': 45}
best_params:  {'num_leaves': 95, 'max_depth': 5}
调参4：降低过拟合
params:  {'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'nthread': 4, 'learning_rate': 0.1, 'num_leaves': 95, 'max_depth': 5, 'max_bin': 255, 'min_data_in_leaf': 101, 'feature_fraction': 1.0, 

In [15]:
model=lgb.LGBMClassifier(boosting_type='gbdt',
                         objective='binary',
                         metrics='auc',
                         learning_rate=0.01,
                         n_estimators=1000,
                         max_depth=4,
                         num_leaves=10,
                         max_bin=255,
                         min_data_in_leaf=81,
                         bagging_fraction=0.7,
                         bagging_freq=30,
                         feature_fraction=0.8,
                         lambda_l1=0.1,
                         lambda_l2=0,
                         min_split_gain=0.1)
model.fit(X_train,y_train)
y_pre = model.predict(X_test)
print('acc:', metrics.accuracy_score(y_test,y_pre))
print('auc:', metrics.roc_auc_score(y_test,y_pre))

acc: 0.9824561403508771
auc: 0.9818990155604954


In [16]:
model=lgb.LGBMClassifier(boosting_type='gbdt',
                         objective='binary',
                         metrics='auc',
                         learning_rate=0.01,
                         n_estimators=1000,
                         max_depth=5,
                         num_leaves=95)
model.fit(X_train,y_train)
y_pre = model.predict(X_test)
print('acc:', metrics.accuracy_score(y_test,y_pre))
print('auc:', metrics.roc_auc_score(y_test,y_pre))

acc: 0.9649122807017544
auc: 0.9606224198158145
