### LightGBM. Пример решения задачи.

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# Load the dataset and separate the feature matrix from the target vector.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell
# presumably needs an older scikit-learn -- confirm the pinned version.
data = load_boston()
X_full, y_full = data.data, data.target

# An integer test_size holds out that many samples (100 here) rather than a
# fraction; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_full,
    y_full,
    test_size=100,
    random_state=241,
)

Обучение

In [2]:
print('Starting training...')
# Build a small gradient-boosted regressor: at most 20 trees with up to
# 31 leaves each and a learning rate of 0.05.
gbm = lgb.LGBMRegressor(num_leaves=31,
                        learning_rate=0.05,
                        n_estimators=20)
# Early stopping is requested through a callback: the `early_stopping_rounds`
# keyword of fit() was deprecated and later removed in LightGBM 4.0, while the
# callback form is accepted by both older and newer releases.
# NOTE(review): the test split doubles as the early-stopping validation set,
# so scores reported on it afterwards are optimistic -- consider a separate
# validation split.
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='l1',
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

Starting training...
[1]	valid_0's l1: 6.55152	valid_0's l2: 76.761
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l1: 6.29855	valid_0's l2: 71.5199
[3]	valid_0's l1: 6.08526	valid_0's l2: 67.0386
[4]	valid_0's l1: 5.88219	valid_0's l2: 62.9998
[5]	valid_0's l1: 5.66549	valid_0's l2: 58.889
[6]	valid_0's l1: 5.45931	valid_0's l2: 55.1763
[7]	valid_0's l1: 5.29116	valid_0's l2: 52.2049
[8]	valid_0's l1: 5.10806	valid_0's l2: 49.089
[9]	valid_0's l1: 4.93583	valid_0's l2: 46.2628
[10]	valid_0's l1: 4.79662	valid_0's l2: 44.0623
[11]	valid_0's l1: 4.62366	valid_0's l2: 41.4139
[12]	valid_0's l1: 4.48798	valid_0's l2: 39.5294
[13]	valid_0's l1: 4.32989	valid_0's l2: 37.383
[14]	valid_0's l1: 4.18449	valid_0's l2: 35.4853
[15]	valid_0's l1: 4.07735	valid_0's l2: 34.094
[16]	valid_0's l1: 3.96772	valid_0's l2: 32.4964
[17]	valid_0's l1: 3.8743	valid_0's l2: 31.299
[18]	valid_0's l1: 3.77696	valid_0's l2: 30.0159
[19]	valid_0's l1: 3.68361	valid_0's l2: 28.8212
[20]

LGBMRegressor(learning_rate=0.05, n_estimators=20)

Предсказание и оценка качества

In [3]:
print('Starting predicting...')

# Predict on the held-out rows, passing the model's best_iteration_ as the
# iteration limit.
y_pred = gbm.predict(
    X_test,
    num_iteration=gbm.best_iteration_,
)

# mean_squared_error yields the MSE; raising it to the power 0.5 gives the RMSE.
print(
    'The rmse of prediction is:',
    mean_squared_error(y_test, y_pred) ** 0.5,
)

# Per-feature importance values exposed by the fitted model.
print(
    'Feature importances:',
    list(gbm.feature_importances_),
)

Starting predicting...
The rmse of prediction is: 5.259625962258064
Feature importances: [35, 0, 6, 0, 20, 70, 21, 31, 3, 7, 15, 4, 83]


### Можно задать любую метрику качества — для этого нужно написать свою функцию.

In [4]:
# Custom evaluation metric: Root Mean Squared Logarithmic Error (RMSLE).
# Shape of a self-defined eval metric:
# f(y_true: array, y_pred: array) -> name: string, eval_result: float, is_higher_better: bool
def rmsle(y_true, y_pred):
    """Compute the Root Mean Squared Logarithmic Error between two arrays.

    Args:
        y_true: array of reference target values.
        y_pred: array of predicted values.

    Returns:
        A 3-tuple ``('RMSLE', score, False)``: the metric name, the computed
        value, and a flag saying that a higher value is not better.
    """
    # log1p(x) == log(1 + x); the error is measured between the log-transformed values.
    log_diff = np.log1p(y_pred) - np.log1p(y_true)
    score = np.sqrt(np.mean(np.power(log_diff, 2)))
    return 'RMSLE', score, False


print('Starting training with custom eval function...')
# Refit the same regressor, this time monitoring the custom RMSLE metric.
# Early stopping is requested through a callback: the `early_stopping_rounds`
# keyword of fit() was deprecated and later removed in LightGBM 4.0, while the
# callback form is accepted by both older and newer releases.
gbm.fit(X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric=rmsle,
        callbacks=[lgb.early_stopping(stopping_rounds=5)])

print('Starting predicting...')
# Predict on the held-out rows, passing the model's best_iteration_ as the
# iteration limit.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# rmsle returns (name, value, is_higher_better); index 1 is the metric value.
print('The rmsle of prediction is:', rmsle(y_test, y_pred)[1])

Starting training with custom eval function...
[1]	valid_0's l2: 76.761	valid_0's RMSLE: 0.395874
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l2: 71.5199	valid_0's RMSLE: 0.383203
[3]	valid_0's l2: 67.0386	valid_0's RMSLE: 0.372011
[4]	valid_0's l2: 62.9998	valid_0's RMSLE: 0.361535
[5]	valid_0's l2: 58.889	valid_0's RMSLE: 0.350227
[6]	valid_0's l2: 55.1763	valid_0's RMSLE: 0.33954
[7]	valid_0's l2: 52.2049	valid_0's RMSLE: 0.330704
[8]	valid_0's l2: 49.089	valid_0's RMSLE: 0.320856
[9]	valid_0's l2: 46.2628	valid_0's RMSLE: 0.311518
[10]	valid_0's l2: 44.0623	valid_0's RMSLE: 0.304175
[11]	valid_0's l2: 41.4139	valid_0's RMSLE: 0.295307
[12]	valid_0's l2: 39.5294	valid_0's RMSLE: 0.288303
[13]	valid_0's l2: 37.383	valid_0's RMSLE: 0.280462
[14]	valid_0's l2: 35.4853	valid_0's RMSLE: 0.273098
[15]	valid_0's l2: 34.094	valid_0's RMSLE: 0.26711
[16]	valid_0's l2: 32.4964	valid_0's RMSLE: 0.260523
[17]	valid_0's l2: 31.299	valid_0's RMSLE: 0.255064
[18]	vali

### Подбор гиперпараметров.

In [5]:
# Hyperparameter search using scikit-learn's GridSearchCV around a LightGBM regressor.
estimator = lgb.LGBMRegressor(num_leaves=31)

# Candidate values: 3 learning rates x 2 ensemble sizes = 6 combinations.
param_grid = dict(
    learning_rate=[0.01, 0.1, 1],
    n_estimators=[20, 40],
)

# NOTE: `gbm` is rebound here from the plain regressor used in earlier cells
# to the grid-search object; the following cells rely on this name.
gbm = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=3)
gbm.fit(X_train, y_train)

print('Best parameters found by grid search are:', gbm.best_params_)

Best parameters found by grid search are: {'learning_rate': 0.1, 'n_estimators': 40}


Предсказание и оценка качества

In [6]:
# Predict on the held-out rows with the grid-search object fitted in the previous cell.
pred = gbm.predict(X=X_test)

In [7]:
# This import repeats the one in the first cell of the notebook.
from sklearn.metrics import mean_squared_error

# Mean squared error (not its square root) of the grid-search predictions on
# the held-out rows; left as the last expression so the cell displays it.
mean_squared_error(y_true=y_test, y_pred=pred)

12.703535974721541