In [9]:
import lightgbm as lgb
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


In [23]:
# Read the training table; the first CSV column is used as the row index.
train = pd.read_csv('data/train.csv', index_col=0)

# Label vector is the `target` column; the feature matrix is every other column.
y = train['target']
X = train.drop(labels=['target'], axis=1)

In [55]:
from sklearn.decomposition import PCA

# Project the feature matrix onto its first 100 principal components.
# The seed is pinned because svd_solver='auto' may select the randomized
# solver, whose result depends on random_state; with random_state=None the
# projection could differ between runs of this cell.
pca = PCA(
    n_components=100, svd_solver='auto', iterated_power=7,
    copy=True, whiten=False, tol=0.0, random_state=42
)
X_pca = pca.fit_transform(X)

In [60]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column with a min-max scaler.
# NOTE(review): the scaler is fitted on the raw `X`, and its output replaces
# whatever `X_pca` held before (the PCA projection). Confirm whether scaling
# the PCA output was intended instead; behaviour is left unchanged here.
minmax = MinMaxScaler()
X_pca = minmax.fit(X).transform(X)

# The target is modelled on the log(1 + y) scale.
y_log = np.log1p(y)

In [61]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y_log,
    random_state=42,
    test_size=0.2,
)

In [62]:
def rmsle_metric(y_test, y_pred):
    """Custom LightGBM evaluation metric: RMSLE for log1p-transformed targets.

    Both arguments are expected to already be on the log(1 + y) scale (the
    model in this notebook is trained on ``np.log1p(y)``). On that scale the
    RMSLE of the original values is simply the RMSE of the inputs, because
    ``log(1 + (exp(v) - 1)) == v``.

    The metric is therefore computed directly on the inputs instead of
    undoing and re-applying the log transform: the round trip through
    ``np.exp`` overflows to ``inf`` (giving ``nan``) for large values and
    loses precision for values close to zero.

    Parameters
    ----------
    y_test : array-like
        True targets on the log1p scale.
    y_pred : array-like
        Predicted targets on the log1p scale.

    Returns
    -------
    tuple
        ``('RMSLE', value, False)`` -- metric name, metric value, and the
        "higher is better" flag (False: lower is better).

    Raises
    ------
    AssertionError
        If the two inputs have different lengths.
    """
    assert len(y_test) == len(y_pred)
    # asarray makes the subtraction purely positional, whatever container
    # (list, ndarray, pandas Series) each argument arrives in.
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    rmsle = np.sqrt(np.mean((predicted - actual) ** 2))
    return ('RMSLE', rmsle, False)

In [63]:
# Gradient-boosted regressor with an L2 ('regression') objective; a small
# learning rate is paired with a generous cap of 1000 boosting rounds.
gbm = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=31,
    learning_rate=0.01,
    n_estimators=1000
)

# Early stopping is requested through the callback API rather than the
# `early_stopping_rounds` keyword of `fit`, which was removed in LightGBM 4.0.
# Training stops once the validation scores have not improved for 100 rounds.
# NOTE(review): the same hold-out split is used for early stopping here and
# for the score reported afterwards, so that score is likely optimistic.
gbm.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric=rmsle_metric,
    callbacks=[lgb.early_stopping(stopping_rounds=100)]
)

[1]	valid_0's l2: 2.86364	valid_0's RMSLE: 1.69223
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's l2: 2.8509	valid_0's RMSLE: 1.68846
[3]	valid_0's l2: 2.83774	valid_0's RMSLE: 1.68456
[4]	valid_0's l2: 2.82565	valid_0's RMSLE: 1.68097
[5]	valid_0's l2: 2.81304	valid_0's RMSLE: 1.67721
[6]	valid_0's l2: 2.80186	valid_0's RMSLE: 1.67388
[7]	valid_0's l2: 2.79084	valid_0's RMSLE: 1.67058
[8]	valid_0's l2: 2.77968	valid_0's RMSLE: 1.66724
[9]	valid_0's l2: 2.76956	valid_0's RMSLE: 1.6642
[10]	valid_0's l2: 2.75916	valid_0's RMSLE: 1.66107
[11]	valid_0's l2: 2.75014	valid_0's RMSLE: 1.65835
[12]	valid_0's l2: 2.73972	valid_0's RMSLE: 1.65521
[13]	valid_0's l2: 2.7296	valid_0's RMSLE: 1.65215
[14]	valid_0's l2: 2.72009	valid_0's RMSLE: 1.64927
[15]	valid_0's l2: 2.71043	valid_0's RMSLE: 1.64634
[16]	valid_0's l2: 2.70074	valid_0's RMSLE: 1.64339
[17]	valid_0's l2: 2.69136	valid_0's RMSLE: 1.64054
[18]	valid_0's l2: 2.68283	valid_0's RMSLE: 1.63794
[19]	valid_0'

[157]	valid_0's l2: 2.12649	valid_0's RMSLE: 1.45825
[158]	valid_0's l2: 2.12434	valid_0's RMSLE: 1.45751
[159]	valid_0's l2: 2.12323	valid_0's RMSLE: 1.45713
[160]	valid_0's l2: 2.12195	valid_0's RMSLE: 1.45669
[161]	valid_0's l2: 2.12043	valid_0's RMSLE: 1.45617
[162]	valid_0's l2: 2.11943	valid_0's RMSLE: 1.45583
[163]	valid_0's l2: 2.1183	valid_0's RMSLE: 1.45544
[164]	valid_0's l2: 2.11649	valid_0's RMSLE: 1.45482
[165]	valid_0's l2: 2.11485	valid_0's RMSLE: 1.45425
[166]	valid_0's l2: 2.11347	valid_0's RMSLE: 1.45378
[167]	valid_0's l2: 2.11217	valid_0's RMSLE: 1.45333
[168]	valid_0's l2: 2.11073	valid_0's RMSLE: 1.45284
[169]	valid_0's l2: 2.10885	valid_0's RMSLE: 1.45219
[170]	valid_0's l2: 2.108	valid_0's RMSLE: 1.4519
[171]	valid_0's l2: 2.10599	valid_0's RMSLE: 1.4512
[172]	valid_0's l2: 2.10432	valid_0's RMSLE: 1.45063
[173]	valid_0's l2: 2.10284	valid_0's RMSLE: 1.45012
[174]	valid_0's l2: 2.10213	valid_0's RMSLE: 1.44987
[175]	valid_0's l2: 2.10097	valid_0's RMSLE: 1.4494

[314]	valid_0's l2: 2.04139	valid_0's RMSLE: 1.42877
[315]	valid_0's l2: 2.04097	valid_0's RMSLE: 1.42863
[316]	valid_0's l2: 2.04118	valid_0's RMSLE: 1.4287
[317]	valid_0's l2: 2.04134	valid_0's RMSLE: 1.42875
[318]	valid_0's l2: 2.04187	valid_0's RMSLE: 1.42894
[319]	valid_0's l2: 2.04207	valid_0's RMSLE: 1.42901
[320]	valid_0's l2: 2.04181	valid_0's RMSLE: 1.42892
[321]	valid_0's l2: 2.04158	valid_0's RMSLE: 1.42884
[322]	valid_0's l2: 2.04174	valid_0's RMSLE: 1.4289
[323]	valid_0's l2: 2.04161	valid_0's RMSLE: 1.42885
[324]	valid_0's l2: 2.04153	valid_0's RMSLE: 1.42882
[325]	valid_0's l2: 2.04193	valid_0's RMSLE: 1.42896
[326]	valid_0's l2: 2.04176	valid_0's RMSLE: 1.4289
[327]	valid_0's l2: 2.04161	valid_0's RMSLE: 1.42885
[328]	valid_0's l2: 2.04154	valid_0's RMSLE: 1.42883
[329]	valid_0's l2: 2.04142	valid_0's RMSLE: 1.42878
[330]	valid_0's l2: 2.0413	valid_0's RMSLE: 1.42874
[331]	valid_0's l2: 2.04074	valid_0's RMSLE: 1.42855
[332]	valid_0's l2: 2.04076	valid_0's RMSLE: 1.428

[470]	valid_0's l2: 2.04176	valid_0's RMSLE: 1.4289
[471]	valid_0's l2: 2.0416	valid_0's RMSLE: 1.42884
[472]	valid_0's l2: 2.04127	valid_0's RMSLE: 1.42873
[473]	valid_0's l2: 2.04157	valid_0's RMSLE: 1.42883
[474]	valid_0's l2: 2.04138	valid_0's RMSLE: 1.42877
[475]	valid_0's l2: 2.04147	valid_0's RMSLE: 1.4288
[476]	valid_0's l2: 2.04149	valid_0's RMSLE: 1.42881
[477]	valid_0's l2: 2.04196	valid_0's RMSLE: 1.42897
[478]	valid_0's l2: 2.04195	valid_0's RMSLE: 1.42897
[479]	valid_0's l2: 2.04189	valid_0's RMSLE: 1.42895
[480]	valid_0's l2: 2.04169	valid_0's RMSLE: 1.42888
[481]	valid_0's l2: 2.04193	valid_0's RMSLE: 1.42896
[482]	valid_0's l2: 2.04192	valid_0's RMSLE: 1.42896
[483]	valid_0's l2: 2.04163	valid_0's RMSLE: 1.42886
[484]	valid_0's l2: 2.04141	valid_0's RMSLE: 1.42878
[485]	valid_0's l2: 2.04154	valid_0's RMSLE: 1.42883
[486]	valid_0's l2: 2.04146	valid_0's RMSLE: 1.4288
[487]	valid_0's l2: 2.04179	valid_0's RMSLE: 1.42891
[488]	valid_0's l2: 2.04185	valid_0's RMSLE: 1.428

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.01, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=1000,
       n_jobs=-1, num_leaves=31, objective='regression', random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=0)

In [64]:
# Score the fitted model on the hold-out split with the custom metric and
# print the resulting (name, value, higher_is_better) tuple.
y_pred = gbm.predict(X_test)
holdout_score = rmsle_metric(y_test, y_pred)
print(holdout_score)

('RMSLE', 1.4277885030933781, False)


In [66]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23. Prefer the standalone `joblib` package and fall back to the
# vendored copy only on old environments where it is not installed.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted model; the file name records the target transform and
# the hold-out score obtained above.
joblib.dump(gbm, 'LightGBM_log_y_1_427.pkl')

['LightGBM_log_y_1_427.pkl']