In [449]:
import lightgbm as lgb
#%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, ParameterGrid
from tqdm import tqdm
from logging import getLogger, StreamHandler, DEBUG, Formatter, FileHandler
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [450]:
# Read the training and test CSVs the same way; column 0 is parsed as datetimes.
df, test = (
    pd.read_csv(csv_path, parse_dates=[0])
    for csv_path in (
        'D:/ml/kaggle/Bike Sharing Demand/train.csv',
        'D:/ml/kaggle/Bike Sharing Demand/test.csv',
    )
)

In [451]:
# Train on log(count + 1). log1p is numpy's dedicated form of log(x + 1);
# its exact inverse is np.expm1, i.e. exp(x) - 1.
df['count'] = np.log1p(df['count'])

In [452]:
# The transformed target is called 'rentals' from here on.
df = df.rename(columns={'count': 'rentals'})

In [453]:
# Stack train and test rows so the feature engineering below is applied once to both.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Columns missing from one frame (e.g. 'rentals' for the test rows) are filled with NaN.
df = pd.concat([df, test], sort=False)

In [454]:
# Expand the timestamp into one column per calendar component, in this order.
for part in ('year', 'month', 'day', 'dayofweek', 'hour'):
    df[part] = getattr(df['datetime'].dt, part)

In [455]:
# Order all rows chronologically.
df = df.sort_values('datetime')

In [456]:
# Rows without a 'rentals' value go to `test`; rows with one stay in `df`.
is_unlabeled = df['rentals'].isnull()
# .copy() makes both frames independent of the combined frame, so assigning new
# columns to them later does not raise SettingWithCopyWarning.
test = df[is_unlabeled].copy()
df = df[~is_unlabeled].copy()

In [457]:
# Columns that must not be used as model inputs ('rentals' is the target itself).
removed_cols = [
    'rentals',
    'casual',
    'registered',
    'datetime',
    'atemp',
    'holiday',
    'month',
]

In [458]:
# Model inputs: every column of df, in its existing order, that is not in removed_cols.
feats = list(filter(lambda name: name not in removed_cols, df.columns))

In [459]:
def logmse(y, pred):
    """Return the square root of the mean squared error between ``y`` and ``pred``.

    Both arguments are forwarded unchanged to ``mean_squared_error``. When the
    inputs are log-transformed values, the result is an RMSE in log space.
    """
    squared_error = mean_squared_error(y, pred)
    return squared_error ** 0.5

In [460]:
def logmse_xgb(y, pred):
    """Wrap ``logmse`` as a ``(metric_name, value)`` pair.

    Returns the tuple ``("logmse", logmse(pred, y))``.
    """
    # NOTE(review): a disabled line here used to call y.get_label(); presumably
    # that is needed when y arrives as an xgboost DMatrix -- confirm before
    # re-enabling the XGBoost path.
    score = logmse(pred, y)
    return "logmse", score

In [461]:
# Notebook logger. A bare getLogger() inherits the WARNING default and has no
# handler, so the info/debug messages written during the parameter search
# would be dropped; set the level and attach a stream handler explicitly.
logger = getLogger(__name__)
logger.setLevel(DEBUG)
if not logger.handlers:  # re-running the cell must not stack duplicate handlers
    handler = StreamHandler()
    handler.setLevel(DEBUG)
    handler.setFormatter(Formatter('%(asctime)s %(name)s %(levelname)s %(message)s'))
    logger.addHandler(handler)

In [465]:
# Exhaustive grid search over LightGBM hyper-parameters with 5-fold CV.
# Each combination is scored by the mean validation RMSE (computed by logmse);
# the best combination is then refit on the full training set.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# ParameterGrid expands this into 3 * 3 * 2 * 2 * 2 = 72 combinations.
# n_estimators is only an upper bound: early stopping picks the real round count.
# NOTE(review): 'colsample_bylevel' looks like an XGBoost-style name; confirm that
# LightGBM honors it, otherwise it only doubles the search time.
all_params = {'max_depth': [3, 5, 7],
              'learning_rate': [0.1],
              'min_child_weight': [3, 5, 7],
              'n_estimators': [5000],
              'colsample_bytree': [0.8, 0.9],
              'colsample_bylevel': [0.8, 0.9],
              'reg_alpha': [0, 0.1],
              'max_delta_step': [0.1],
              'n_jobs': [-1],
              'random_state': [0]}

x_train = df[feats]
y_train = df['rentals'].values

# Start from +inf so the first evaluated combination always becomes the incumbent.
min_score = float('inf')
min_params = None

for params in tqdm(list(ParameterGrid(all_params))):
    logger.info('params: {}'.format(params))

    list_logmse_score = []
    list_best_iterations = []

    # The second index array must be bound to the same name used for indexing
    # below; the original bound `test_idx` but indexed with `valid_idx`.
    for train_idx, valid_idx in kf.split(x_train, y_train):
        trn_x = x_train.iloc[train_idx, :]
        val_x = x_train.iloc[valid_idx, :]

        trn_y = y_train[train_idx]
        val_y = y_train[valid_idx]

        clf = lgb.sklearn.LGBMRegressor(**params)
        # NOTE(review): newer LightGBM releases (4.x) drop the early_stopping_rounds
        # fit argument in favor of callbacks=[lgb.early_stopping(100)]; it is kept
        # here because older releases only populate best_iteration_ this way.
        clf.fit(trn_x, trn_y, eval_set=[(val_x, val_y)], early_stopping_rounds=100, eval_metric="rmse")
        y_pred = clf.predict(val_x)

        sc_logmse = logmse(val_y, y_pred)

        list_logmse_score.append(sc_logmse)
        list_best_iterations.append(clf.best_iteration_)
        logger.debug('logmse:{}'.format(sc_logmse))

    # Replace the 5000-round cap with the average early-stopped round count so
    # the final refit (which has no validation set) trains a comparable model.
    params['n_estimators'] = int(np.mean(list_best_iterations))
    sc_logmse = np.mean(list_logmse_score)
    if min_score > sc_logmse:
        min_score = sc_logmse
        min_params = params

logger.info('minimam params:{}'.format(min_params))
logger.info('minimam logmse:{}'.format(min_score))
# Fill the placeholder; the original passed the score as a second print argument.
print('minimam logmse:{}'.format(min_score))

# Refit the best configuration on all labelled rows.
clf = lgb.sklearn.LGBMRegressor(**min_params)
clf.fit(x_train, y_train)











  0%|                                                   | 0/72 [00:00<?, ?it/s]

[1]	valid_0's rmse: 1.43429	valid_0's l2: 2.05718
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's rmse: 1.42703	valid_0's l2: 2.0364
[3]	valid_0's rmse: 1.423	valid_0's l2: 2.02492
[4]	valid_0's rmse: 1.41577	valid_0's l2: 2.0044
[5]	valid_0's rmse: 1.40857	valid_0's l2: 1.98408
[6]	valid_0's rmse: 1.40444	valid_0's l2: 1.97244
[7]	valid_0's rmse: 1.39728	valid_0's l2: 1.95239
[8]	valid_0's rmse: 1.39016	valid_0's l2: 1.93254
[9]	valid_0's rmse: 1.3862	valid_0's l2: 1.92156
[10]	valid_0's rmse: 1.37912	valid_0's l2: 1.90196
[11]	valid_0's rmse: 1.37207	valid_0's l2: 1.88256
[12]	valid_0's rmse: 1.36503	valid_0's l2: 1.8633
[13]	valid_0's rmse: 1.35821	valid_0's l2: 1.84473
[14]	valid_0's rmse: 1.35141	valid_0's l2: 1.82631
[15]	valid_0's rmse: 1.34448	valid_0's l2: 1.80764
[16]	valid_0's rmse: 1.33759	valid_0's l2: 1.78913
[17]	valid_0's rmse: 1.33089	valid_0's l2: 1.77128
[18]	valid_0's rmse: 1.32406	valid_0's l2: 1.75313
[19]	valid_0's rmse: 1.32022	valid

KeyboardInterrupt: 

In [437]:
# Fit a LightGBM regressor with library defaults apart from 4000 boosting rounds
# and a fixed seed. This rebinds `clf`.
clf = lgb.sklearn.LGBMRegressor(
    n_estimators=4000,
    random_state=0,
)
clf.fit(x_train, y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=4000, n_jobs=-1, num_leaves=31, objective=None,
       random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [438]:
# The model predicts log(count + 1), so the inverse is exp(x) - 1 (np.expm1).
# Plain np.exp would overstate every prediction by exactly one rental.
# assign() returns a new frame, so no column is written into a slice.
test = test.assign(count=np.expm1(clf.predict(test[feats])))

In [439]:
# Write only the timestamp and the predicted count, without the index column.
test.to_csv('grid_gb.csv', columns=['datetime', 'count'], index=False)

In [306]:
# Preview the first rows of the model input columns.
df.loc[:, feats].head()

Unnamed: 0,season,workingday,weather,temp,humidity,windspeed,year,day,dayofweek,hour
0,1,0,1,9.84,81,0.0,2011,1,5,0
1,1,0,1,9.02,80,0.0,2011,1,5,1
2,1,0,1,9.02,80,0.0,2011,1,5,2
3,1,0,1,9.84,75,0.0,2011,1,5,3
4,1,0,1,9.84,75,0.0,2011,1,5,4


In [322]:
# Print each input column next to the importance reported by the fitted model.
fti = clf.feature_importances_

print('Feature Importances:')
# Iterating a DataFrame yields its column names, paired here with fti by position.
for feat, importance in zip(x_train[feats], fti):
    print('\t{0:10s} : {1:>12.4f}'.format(feat, importance))

Feature Importances:
	workingday :     154.0000
	weather    :     124.0000
	temp       :     341.0000
	humidity   :     277.0000
	windspeed  :     138.0000
	year       :     104.0000
	month      :     308.0000
	day        :     211.0000
	dayofweek  :     327.0000
	hour       :    1016.0000
