In [1]:
# coding: utf-8
import json
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import pickle

In [2]:
# Read the tab-separated train/test tables; neither file has a header row,
# so columns are labelled 0, 1, 2, ...
df_train = pd.read_csv(
    'binary_classification/binary.train',
    sep='\t',
    header=None,
)
df_test = pd.read_csv(
    'binary_classification/binary.test',
    sep='\t',
    header=None,
)

# Companion `.weight` files: keep column 0 of each as a Series.
# NOTE(review): W_train / W_test are not referenced by any later cell shown
# in this notebook -- confirm whether they were meant to be passed to lgb.Dataset.
W_train = pd.read_csv('binary_classification/binary.train.weight', header=None)[0]
W_test = pd.read_csv('binary_classification/binary.test.weight', header=None)[0]

In [3]:
# Column 0 is used as the target; all remaining columns become the feature matrix.
X_train = df_train.drop(columns=[0]).values
X_test = df_test.drop(columns=[0]).values
y_train = df_train[0].values
y_test = df_test[0].values

In [4]:
# Row count and column count of the training matrix.
num_train, num_feature = np.shape(X_train)

In [5]:
# Wrap the arrays in LightGBM Dataset objects.
# free_raw_data=False keeps the raw arrays attached so the same Dataset
# can be re-used by the several lgb.train() calls further down.
lgb_train = lgb.Dataset(
    X_train,
    label=y_train,
    free_raw_data=False,
)
# The evaluation set references the training set.
lgb_eval = lgb.Dataset(
    X_test,
    label=y_test,
    reference=lgb_train,
    free_raw_data=False,
)

In [6]:
# Training configuration handed to every lgb.train() call in this notebook.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [7]:
# Build one name per feature column: feature_0, feature_1, ...
# (original author's note: "I don't think this is necessary")
feature_name = ['feature_{}'.format(col) for col in range(num_feature)]

In [8]:
# Train the initial booster for 10 rounds.
#
# The feature names are attached to the Dataset instead of being passed via
# lgb.train(feature_name=...): that keyword is deprecated in LightGBM 4.x and
# is not part of the train() signature in newer releases, whereas
# Dataset.set_feature_name() works across versions and has the same effect.
lgb_train.set_feature_name(feature_name)

gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,  # number of boosting rounds
)

## save model

## txt

In [24]:
# Write the trained booster to a LightGBM text model file.
gbm.save_model(filename='advanced_example-model.txt')

## pickle (或者 joblib)

In [25]:
# Serialize the whole Booster object with pickle (binary mode is required).
with open('advanced_example-model.pkl', 'wb') as pkl_out:
    pickle.dump(gbm, pkl_out)

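### Restoring the model (model 复原)

- A Booster restored from the txt file can only predict with the best iteration.
- A Booster restored from pickle can predict with any iteration.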

In [27]:
# Rebuild a Booster from the text model file written earlier.
bst = lgb.Booster(model_file='advanced_example-model.txt')

# A booster loaded this way can only predict with the best iteration
# (or the iteration at which it was saved).
y_pred = bst.predict(X_test)

# Report the root of the mean squared error of those predictions on the test set.
print(
    'The rmse of loaded model\'s prediction is:',
    mean_squared_error(y_test, y_pred) ** 0.5,
)


The rmse of loaded model's prediction is: 0.4456811768709752


### from pickle

In [28]:
# Restore the pickled Booster.
# NOTE: pickle.load can execute arbitrary code contained in the file, so only
# load pickle files that you produced yourself or otherwise trust.
with open('advanced_example-model.pkl', 'rb') as pkl_in:
    gbm_pkl = pickle.load(pkl_in)

In [29]:
# Show the classes of the boosters restored from the text file and from pickle.
tuple(map(type, (bst, gbm_pkl)))

(lightgbm.basic.Booster, lightgbm.basic.Booster)

In [30]:
# Mean squared error (not its square root) of the unpickled booster on the
# test set, predicting with the first 10 iterations.
# Arguments follow the (y_true, y_pred) order of mean_squared_error, matching
# the call in the text-model cell; the value is the same either way because
# the squared difference is symmetric.
mean_squared_error(y_test, gbm_pkl.predict(X_test, num_iteration=10))

0.19863171141709746

### continue train
     init_model accepts:
     1. model file name
     2. Booster()

## from file (pickled Booster)

In [31]:
# Original note: "gb2 is for cv".
# NOTE(review): the intent of that note is unclear; in the cells shown, gb2 is
# only used as the init_model of the next training call.
# pickle.load can execute arbitrary code -- only load trusted files.
with open('advanced_example-model.pkl', 'rb') as pkl_in:
    gb2 = pickle.load(pkl_in)

In [34]:
# Continue training for 20 more rounds, starting from the unpickled booster,
# and report the metric on the evaluation set.
gbm20 = lgb.train(
    params,
    lgb_train,
    num_boost_round=20,
    init_model=gb2,
    valid_sets=[lgb_eval],
)

[11]	valid_0's binary_logloss: 0.582247
[12]	valid_0's binary_logloss: 0.578684
[13]	valid_0's binary_logloss: 0.574623
[14]	valid_0's binary_logloss: 0.571333
[15]	valid_0's binary_logloss: 0.56686
[16]	valid_0's binary_logloss: 0.563997
[17]	valid_0's binary_logloss: 0.559482
[18]	valid_0's binary_logloss: 0.556386
[19]	valid_0's binary_logloss: 0.554983
[20]	valid_0's binary_logloss: 0.552832
[21]	valid_0's binary_logloss: 0.550053
[22]	valid_0's binary_logloss: 0.546326
[23]	valid_0's binary_logloss: 0.544424
[24]	valid_0's binary_logloss: 0.542267
[25]	valid_0's binary_logloss: 0.540004
[26]	valid_0's binary_logloss: 0.539057
[27]	valid_0's binary_logloss: 0.53717
[28]	valid_0's binary_logloss: 0.535431
[29]	valid_0's binary_logloss: 0.532915
[30]	valid_0's binary_logloss: 0.53075


## from booster

In [35]:
# Continue training for 20 more rounds, this time starting from the booster
# that was loaded from the text model file (overwrites the previous gbm20).
gbm20 = lgb.train(
    params,
    lgb_train,
    num_boost_round=20,
    init_model=bst,
    valid_sets=[lgb_eval],
)

[11]	valid_0's binary_logloss: 0.582247
[12]	valid_0's binary_logloss: 0.578684
[13]	valid_0's binary_logloss: 0.574623
[14]	valid_0's binary_logloss: 0.571333
[15]	valid_0's binary_logloss: 0.56686
[16]	valid_0's binary_logloss: 0.563997
[17]	valid_0's binary_logloss: 0.559482
[18]	valid_0's binary_logloss: 0.556386
[19]	valid_0's binary_logloss: 0.554983
[20]	valid_0's binary_logloss: 0.552832
[21]	valid_0's binary_logloss: 0.550053
[22]	valid_0's binary_logloss: 0.546326
[23]	valid_0's binary_logloss: 0.544424
[24]	valid_0's binary_logloss: 0.542267
[25]	valid_0's binary_logloss: 0.540004
[26]	valid_0's binary_logloss: 0.539057
[27]	valid_0's binary_logloss: 0.53717
[28]	valid_0's binary_logloss: 0.535431
[29]	valid_0's binary_logloss: 0.532915
[30]	valid_0's binary_logloss: 0.53075




### 改变学习率

In [38]:
# Continue training for 10 more rounds with a decaying learning rate.
#
# The schedule is supplied through the reset_parameter callback. The older
# `learning_rates=` keyword of lgb.train() was just a shortcut for this
# callback and no longer exists in LightGBM >= 4.0.
# reset_parameter(learning_rate=...) accepts:
# 1. list with length = num_boost_round
# 2. function(curr_iter), where curr_iter counts rounds of this train() call
#
# NOTE: this cell rebinds `gbm` to a model that continues from the previous
# `gbm`, so running it again keeps adding rounds to the already-extended model.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    valid_sets=[lgb_eval],
    callbacks=[
        # parameter renamed from `iter` to avoid shadowing the builtin
        lgb.reset_parameter(learning_rate=lambda curr_iter: 0.05 * (0.99 ** curr_iter)),
    ],
)

[11]	valid_0's binary_logloss: 0.583764
[12]	valid_0's binary_logloss: 0.58234
[13]	valid_0's binary_logloss: 0.580202
[14]	valid_0's binary_logloss: 0.578394
[15]	valid_0's binary_logloss: 0.575964
[16]	valid_0's binary_logloss: 0.573929
[17]	valid_0's binary_logloss: 0.572486
[18]	valid_0's binary_logloss: 0.571196
[19]	valid_0's binary_logloss: 0.569816
[20]	valid_0's binary_logloss: 0.568528




In [39]:
# Continue for another 10 rounds while changing a different parameter on the
# fly: bagging_fraction is 0.7 for the first five rounds and 0.6 for the last
# five (one list entry per boosting round of this call).
# NOTE: `gbm` is again both the starting model and the result of this cell.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10,
    init_model=gbm,
    valid_sets=[lgb_eval],
    callbacks=[
        lgb.reset_parameter(bagging_fraction=[0.7] * 5 + [0.6] * 5),
    ],
)

[21]	valid_0's binary_logloss: 0.583192
[22]	valid_0's binary_logloss: 0.581739
[23]	valid_0's binary_logloss: 0.579506
[24]	valid_0's binary_logloss: 0.577723
[25]	valid_0's binary_logloss: 0.575948
[26]	valid_0's binary_logloss: 0.575386
[27]	valid_0's binary_logloss: 0.57392
[28]	valid_0's binary_logloss: 0.572346
[29]	valid_0's binary_logloss: 0.571381
[30]	valid_0's binary_logloss: 0.569988


