### MinMaxScaling X and y

In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


In [2]:
train = pd.read_csv('data/train.csv', index_col=0)

X = train.drop('target', axis=1)
y = train.target
y_log = np.log1p(y)

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y_log, test_size=0.1, random_state=42
)

In [4]:
from sklearn.metrics import make_scorer

def rmsle_metric(y_test, y_pred) : 
    assert len(y_test) == len(y_pred)
    y_test = np.exp(y_test)-1
    y_pred = np.exp(y_pred)-1
    rmsle = np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y_test))**2))
    return ('RMSLE', rmsle, False)

grid_scorer = make_scorer(lambda y_test, y_pred: rmsle_metric(y_test, y_pred)[1], greater_is_better=False)

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, mutual_info_regression


pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('regressor', lgb.LGBMRegressor(
        objective='regression',
        num_leaves=31,
        learning_rate=0.01,
        silent=False
    ))
])

N_FEATURES_OPTIONS = [50, 100, 300]

param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'regressor__boosting_type': ['gbdt', 'dart'], #'goss', 'rf'],
        'regressor__n_estimators': [50, 100, 500]
    },
    {
        'reduce_dim': [SelectKBest(mutual_info_regression)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'regressor__boosting_type': ['gbdt', 'dart'], #'goss', 'rf'],
        'regressor__n_estimators': [50, 100, 500]
    },
]
reducer_labels = ['PCA', 'NMF', 'KBest']

grid = GridSearchCV(pipe, cv=5, n_jobs=1, param_grid=param_grid) #, scoring=grid_scorer)
grid.fit(X_train, y_train) #, **{
    #'regressor__eval_set':[(X_test, y_test)],
    #'regressor__eval_metric':rmsle_metric,
    #'regressor__early_stopping_rounds':100
    #}
#)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('reduce_dim', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('regressor', LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.01, max_depth=-1, min_child_samples=20,
  ...0, reg_lambda=0.0, silent=False, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'reduce_dim': [PCA(copy=True, iterated_power=7, n_components=300, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False), NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=300, random_state=None, shuffle=False, solver='cd',
  tol=0.0001,...100, 300], 'regressor__boosting_type': ['gbdt', 'dart'], 'regressor__n_estimators': [50, 100, 500]}],
       pre_dispatch='2*n_jobs', refit=True, return_tra

In [9]:
y_pred = grid.predict(X_test)
rmsle_metric(y_test, y_pred)

('RMSLE', 1.42029471280403, False)

In [55]:
from sklearn.externals import joblib

joblib.dump(grid.best_estimator_, 'LightGBM-GridSearch-1_420.pkl')

['LightGBM-GridSearch-1_420.pkl']

In [11]:
grid.cv_results_.keys()

dict_keys(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_reduce_dim', 'param_reduce_dim__n_components', 'param_regressor__boosting_type', 'param_regressor__n_estimators', 'param_reduce_dim__k', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split3_test_score', 'split4_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'split3_train_score', 'split4_train_score', 'mean_train_score', 'std_train_score'])

In [13]:
keys = ['param_reduce_dim', 'param_reduce_dim__n_components', 'param_regressor__boosting_type', 'param_regressor__n_estimators', 'param_reduce_dim__k', 'params',
       'mean_test_score', 'std_test_score', 'rank_test_score','mean_train_score', 'std_train_score']

results = pd.DataFrame(grid.cv_results_)[keys]



In [17]:
results.sort_values('mean_test_score')

Unnamed: 0,param_reduce_dim,param_reduce_dim__n_components,param_regressor__boosting_type,param_regressor__n_estimators,param_reduce_dim__k,params,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
10,"PCA(copy=True, iterated_power=7, n_components=...",100.0,dart,100,,"{'reduce_dim': PCA(copy=True, iterated_power=7...",-9.467996,0.421092,54,-9.365244,0.103102
4,"PCA(copy=True, iterated_power=7, n_components=...",50.0,dart,100,,"{'reduce_dim': PCA(copy=True, iterated_power=7...",-9.467711,0.436715,53,-9.37553,0.103566
16,"PCA(copy=True, iterated_power=7, n_components=...",300.0,dart,100,,"{'reduce_dim': PCA(copy=True, iterated_power=7...",-9.461246,0.427381,52,-9.354178,0.102504
34,"NMF(alpha=0.0, beta_loss='frobenius', init=Non...",300.0,dart,100,,"{'reduce_dim': NMF(alpha=0.0, beta_loss='frobe...",-9.452988,0.412115,51,-9.402141,0.102598
22,"NMF(alpha=0.0, beta_loss='frobenius', init=Non...",50.0,dart,100,,"{'reduce_dim': NMF(alpha=0.0, beta_loss='frobe...",-9.446313,0.41581,50,-9.391625,0.106885
28,"NMF(alpha=0.0, beta_loss='frobenius', init=Non...",100.0,dart,100,,"{'reduce_dim': NMF(alpha=0.0, beta_loss='frobe...",-9.445717,0.411241,49,-9.396379,0.106739
52,"SelectKBest(k=300,\n score_func=<function...",,dart,100,300.0,"{'reduce_dim': SelectKBest(k=300,  score_...",-9.440786,0.435784,48,-9.361353,0.100158
40,"SelectKBest(k=300,\n score_func=<function...",,dart,100,50.0,"{'reduce_dim': SelectKBest(k=300,  score_...",-9.440202,0.43508,47,-9.368681,0.100942
46,"SelectKBest(k=300,\n score_func=<function...",,dart,100,100.0,"{'reduce_dim': SelectKBest(k=300,  score_...",-9.439586,0.436684,46,-9.366672,0.101006
17,"PCA(copy=True, iterated_power=7, n_components=...",300.0,dart,500,,"{'reduce_dim': PCA(copy=True, iterated_power=7...",-3.201106,0.17072,45,-2.910553,0.037857


In [19]:
grid.best_index_

50

In [20]:
grid.best_score_

0.3325356411995954

In [51]:
results[results['rank_test_score']==3]['params'].values[0]

{'reduce_dim': SelectKBest(k=100,
       score_func=<function mutual_info_regression at 0x7f7b847aa6a8>),
 'reduce_dim__k': 50,
 'regressor__boosting_type': 'gbdt',
 'regressor__n_estimators': 500}

In [52]:
del pipe2
pipe2 = Pipeline([
    ('reduce_dim', SelectKBest(k=50)),
    ('regressor', lgb.LGBMRegressor(
        objective='regression',
        num_leaves=31,
        learning_rate=0.01,
        n_estimators=500,
        boosting_type='gbdt'
    ))
])
pipe2.fit(X_train, y_train)

  259  276  294  298  301  321  322  330  333  339  341  349  357  360
  364  379  386  405  433  443  448  452  463  470  471  485  503  505
  509  515  516  520  523  527  528  531  556  563  565  579  580  581
  589  600  611  637  654  664  676  679  682  699  701  710  721  751
  773  788  793  805  806  807  810  814  845  888  899  910  919  936
  980 1000 1008 1025 1039 1043 1060 1063 1072 1084 1112 1154 1157 1164
 1177 1212 1231 1232 1247 1249 1251 1252 1258 1264 1265 1268 1278 1281
 1287 1294 1295 1298 1324 1328 1400 1416 1454 1457 1459 1464 1475 1495
 1505 1510 1518 1552 1555 1559 1583 1590 1600 1601 1626 1636 1645 1664
 1671 1677 1687 1691 1706 1735 1736 1737 1739 1752 1760 1762 1780 1786
 1830 1841 1842 1851 1863 1875 1903 1906 1917 1926 1945 1964 1969 1994
 1997 2016 2032 2035 2038 2058 2091 2122 2124 2136 2144 2179 2182 2188
 2191 2222 2227 2251 2254 2285 2287 2291 2293 2300 2327 2344 2347 2356
 2367 2368 2389 2395 2427 2439 2443 2447 2457 2478 2484 2494 2504 2505
 2545 

LightGBMError: Cannot construct Dataset since there are not useful features.
It should be at least two unique rows.
If the num_row (num_data) is small, you can set min_data=1 and min_data_in_bin=1 to fix this.
Otherwise please make sure you are using the right dataset

In [53]:
y_pred = pipe2.predict(X_test)
rmsle_metric(y_test, y_pred)

NotFittedError: No booster found. Need to call fit beforehand.