In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost
import sklearn
# import seaborn as sns
%matplotlib inline
data_path = '/Users/dirlt/.kaggle/competitions/bike-sharing-demand/'
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import sklearn.metrics
from sklearn.base import BaseEstimator, RegressorMixin

links:
- https://www.kaggle.com/miteshyadav/comprehensive-eda-with-xgboost-top-10-percentile/notebook
- https://www.kaggle.com/viveksrinivasan/eda-ensemble-model-top-10-percentile

# Load Data

In [17]:
df = pd.read_csv('mytrain.csv')
test_df = pd.read_csv('mytest.csv')
X, y = df.drop(['casual', 'registered', 'count'], axis = 1), np.log1p(df[['casual', 'registered', 'count']])

In [18]:
def input_fn(x, casual = True):
    drop_fields = ['dt_day', 'dt_hour', 'season', 'weather', 'dt_year', 'dt_month', 'dt_weekday', 'atemp']
    if 'datetime' in x.columns:
        drop_fields.append('datetime')
    return x.drop(drop_fields, axis = 1)

def make_cv(X,n = 2):
    for i in range(n):
        days = [x + i for x in [18-i, 19-i]]
        train_idx = X[X['dt_day'].apply(lambda x: x not in days)].index
        test_idx = X[X['dt_day'].apply(lambda x: x in days)].index
        yield train_idx, test_idx

def rmse(x, y):
    return mean_squared_error(x, y) ** 0.5

def print_features(names, values, thres = 0.01):
    fts = list(zip(names, values))
    fts.sort(key = lambda x: -x[1])
    ns = []
    for idx, (name, value) in enumerate(fts):
        if value < thres: break
        print('- {} {:.2f}'.format(name, value))
        ns.append(name)
    print(format(','.join(ns)))

class MyEstimator(BaseEstimator, RegressorMixin):
    def __init__(self, **kwargs):
        if 'ma' in kwargs:
            self.ma = kwargs['ma']
            del kwargs['ma']
        if 'mb' in kwargs:
            self.mb = kwargs['mb']
            del kwargs['mb']
        kwargs['init'] = True
        self.set_params(**kwargs)
        
    def fit(self, X, y):
        input_a = input_fn(X, casual=True)
        input_b = input_fn(X, casual=False)
        self.ma.fit(input_a, y['casual'])
        self.mb.fit(input_b, y['registered'])
        self.ca = input_a.columns
        self.cb = input_b.columns
        
    def predict(self, X):
        ya = self.ma.predict(input_fn(X, casual=True))
        yb = self.mb.predict(input_fn(X, casual=False))
        y = np.log1p(np.expm1(ya) + np.expm1(yb))
        return y
    
    def score(self, X, y):
        y2 = self.predict(X)
        return -rmse(y['count'], y2)
        
    def set_params(self, **params):
        pa = {}
        pb = {}
        for k in params:
            if k.startswith('a_'):
                pa[k[2:]] = params[k]
            elif k.startswith('b_'):
                pb[k[2:]] = params[k]
            else:
                pass
        if 'init' not in params:
            #print(pa, pb)
            pass
        self.ma.set_params(**pa)
        self.mb.set_params(**pb)
        return self
        
    def get_params(self, deep = True):
        pa = self.ma.get_params(deep)
        pb = self.ma.get_params(deep)
        p = {}
        for k in pa:
            p['a_' + k] = pa[k]
        for k in pb:
            p['b_' + k] = pb[k]
        p['ma'] = self.ma
        p['mb'] = self.mb
        return p
    
    def print_features(self, thres = 0.005):
        ca = self.ca
        cb = self.cb
        print('=====casual=====')
        print_features(ca, self.ma.feature_importances_, thres)
        print('=====registered=====')
        print_features(cb, self.mb.feature_importances_, thres)

# RF model

In [24]:
%%time
print('cv for rf model')
rf0 = RandomForestRegressor(n_estimators=400, random_state = 42, verbose=0, n_jobs=4)
rf1 = RandomForestRegressor(n_estimators=400, random_state = 42, verbose=0, n_jobs=4)
rf = MyEstimator(ma = rf0, mb = rf1)
params = {'a_min_samples_split': [8,9,10,11,12], 'b_min_samples_split': [4,5,6,7,8]}
rf_cv = GridSearchCV(rf, params, cv = make_cv(X,2), n_jobs = 1, verbose = 1)
rf_cv.fit(X, y)
print(rf_cv.best_score_, rf_cv.best_params_)

cv for rf model
Fitting 2 folds for each of 25 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  8.9min finished


-0.3450205441974117 {'a_min_samples_split': 9, 'b_min_samples_split': 6}
CPU times: user 32min 54s, sys: 21.8 s, total: 33min 15s
Wall time: 9min 6s


In [25]:
%%time
rf_best_params = rf_cv.best_params_.copy()
# rf_best_params = {'a_min_samples_split': 8, 'b_min_samples_split': 5}
rf_best_params['a_n_estimators'] = 2000
rf_best_params['b_n_estimators'] = 2000
rf.set_params(**rf_best_params)
rf.fit(X, y)
output_y = rf.predict(test_df)
output = np.round(np.expm1(output_y)).astype(int)
output[output < 0] = 0
df_output = pd.DataFrame({'datetime': test_df['datetime'], 'count': output}, columns=('datetime', 'count'))
df_output['count'] = df_output['count'].astype(int)
df_output.to_csv('submission-rf.csv', index = False)

CPU times: user 3min 33s, sys: 2.41 s, total: 3min 35s
Wall time: 56.9 s


# GBM

In [20]:
%%time
print('cv for gbm model')
gbm0 = GradientBoostingRegressor(n_estimators=200, random_state = 42, verbose=0)
gbm1 = GradientBoostingRegressor(n_estimators=200, random_state = 42, verbose=0)
gbm = MyEstimator(ma = gbm0, mb = gbm1)
params = {'a_max_depth': [3,4,5,6,7,8], 'b_max_depth': [5,6,7,8]}
gbm_cv = GridSearchCV(gbm, params, cv = make_cv(X,2), n_jobs = 4, verbose = 1)
gbm_cv.fit(X, y)
print(gbm_cv.best_score_, gbm_cv.best_params_)

cv for gbm model
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=4)]: Done  48 out of  48 | elapsed:  3.1min finished


-0.3135079249588759 {'a_max_depth': 4, 'b_max_depth': 6}
CPU times: user 9.59 s, sys: 171 ms, total: 9.76 s
Wall time: 3min 12s


In [22]:
%%time
gbm_best_params = gbm_cv.best_params_.copy()
# gbm_best_params = {'a_min_samples_split': 5, 'b_min_samples_split': 3}
# gbm_best_params = {'a_max_depth': 6, 'b_max_depth':6}
gbm_best_params['a_n_estimators'] = 1000
gbm_best_params['b_n_estimators'] = 1000
gbm.set_params(**gbm_best_params)
gbm.fit(X, y)
output_y = gbm.predict(test_df)
output = np.round(np.expm1(output_y)).astype(int)
output[output < 0] = 0
df_output = pd.DataFrame({'datetime': test_df['datetime'], 'count': output}, columns=('datetime', 'count'))
df_output['count'] = df_output['count'].astype(int)
df_output.to_csv('submission-gbm.csv', index = False)

CPU times: user 45.3 s, sys: 112 ms, total: 45.4 s
Wall time: 45.6 s


# XGB

In [15]:
%%time
print('cv for xgb model')
xgb0 = XGBRegressor(n_estimators=200, random_state = 42, verbose=0, n_jobs=4)
xgb1 = XGBRegressor(n_estimators=200, random_state = 42, verbose=0, n_jobs=4)
xgb = MyEstimator(ma = xgb0, mb = xgb1)
params = {'a_max_depth': [3,4,5,6,7,8], 'b_max_depth': [5,6,7,8]}
xgb_cv = GridSearchCV(xgb, params, cv = make_cv(X,2), n_jobs = 1, verbose = 1)
xgb_cv.fit(X, y)
print(xgb_cv.best_score_, xgb_cv.best_params_)

cv for xgb model
Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  2.2min finished


-0.3114569346253781 {'a_max_depth': 4, 'b_max_depth': 6}
CPU times: user 6min 32s, sys: 39.3 s, total: 7min 11s
Wall time: 2min 12s


In [19]:
%%time
xgb_best_params = xgb_cv.best_params_.copy()
#xgb_best_params = {'a_max_depth': 7, 'b_max_depth':7}
xgb_best_params['a_n_estimators'] = 1000
xgb_best_params['b_n_estimators'] = 1000
xgb.set_params(**xgb_best_params)
xgb.fit(X, y)
output_y = xgb.predict(test_df)
output = np.round(np.expm1(output_y)).astype(int)
output[output < 0] = 0
df_output = pd.DataFrame({'datetime': test_df['datetime'], 'count': output}, columns=('datetime', 'count'))
df_output['count'] = df_output['count'].astype(int)
df_output.to_csv('submission-xgb.csv', index = False)

CPU times: user 32.9 s, sys: 2.61 s, total: 35.5 s
Wall time: 10.6 s


# average

In [5]:
df_rf = pd.read_csv('submission-rf.csv')
df_gbm = pd.read_csv('submission-gbm.csv')
df_xgb = pd.read_csv('submission-xgb.csv')
df_avg = pd.DataFrame(df_rf)
df_avg['count'] = np.round((df_rf['count'] + df_gbm['count'] + df_xgb['count'] + 1) * 0.33).astype(int)
# df_avg['count'] = np.round((df_rf['count'] + df_gbm['count'] + 1) * 0.5).astype(int)
df_avg.to_csv('submission.csv', index = False)

In [27]:
message = 'no outlier, add windspeed_0, grid search cv with low n_estimators, but run with high n_estimators'
!kaggle competitions submit -c bike-sharing-demand -f submission.csv -m "$message"

Successfully submitted to Bike Sharing Demand

[submission link](https://www.kaggle.com/c/bike-sharing-demand/submissions?sortBy=date&group=all&page=1)