In [4]:
import os
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Directory holding the competition CSV files. The original hard-coded
# location remains the default; set BIKE_SHARING_DATA_DIR to run the notebook
# on another machine without editing this cell.
data_path = os.environ.get(
    'BIKE_SHARING_DATA_DIR',
    '/Users/kweonminseong/Documents/git/Kaggle-Notebooks/input/bike-sharing-demand/',
)

# os.path.join accepts the directory with or without a trailing slash.
train = pd.read_csv(os.path.join(data_path, 'train.csv'))
test = pd.read_csv(os.path.join(data_path, 'test.csv'))
submission = pd.read_csv(os.path.join(data_path, 'sampleSubmission.csv'))

# Remove training rows with weather == 4, then renumber the rows so that
# `train` (and the target `y` taken from it below) shares its index with the
# first len(train) rows of `all_data`, which is built with ignore_index=True.
train = train[train['weather'] != 4].reset_index(drop=True)

# Stack train and test so both receive identical feature engineering. The
# rows are separated again further down by whether 'count' is missing.
all_data = pd.concat([train, test], ignore_index=True)

# Parse the timestamp column once and derive the calendar features through
# the vectorised .dt accessor. This yields real integers instead of the
# string fragments produced by splitting the raw text, so the model no longer
# depends on an implicit string-to-number conversion.
timestamps = pd.to_datetime(all_data['datetime'])
all_data['date'] = timestamps.dt.strftime('%Y-%m-%d')
all_data['year'] = timestamps.dt.year
all_data['month'] = timestamps.dt.month
all_data['hour'] = timestamps.dt.hour
all_data['weekday'] = timestamps.dt.weekday  # Monday == 0, same as datetime.weekday()

# Columns excluded from the model input, including helper columns created
# above that are not used as features.
drop_features = ['casual', 'registered', 'datetime', 'date', 'windspeed', 'month']
all_data = all_data.drop(columns=drop_features)

# Rows that carry a target value form the training matrix; the remaining
# rows form the matrix to predict on.
has_target = all_data['count'].notna()
X_train = all_data[has_target].drop(columns=['count'])
X_test = all_data[~has_target].drop(columns=['count'])

y = train['count']

# Guard against a silent feature/target misalignment.
assert len(X_train) == len(y), 'X_train and y must have the same number of rows'

In [5]:
def rmsle(y_true, y_pred, convert_exp: bool = True) -> float:
    """Return the root mean squared logarithmic error between two value sets.

    Parameters
    ----------
    y_true : array-like
        Reference values (list, NumPy array, pandas Series, ...).
    y_pred : array-like
        Predicted values, matched to ``y_true`` by position.
    convert_exp : bool, default True
        When True, both inputs are treated as log-scale values and passed
        through ``exp`` before the error is computed. When False, the inputs
        are used as given.

    Returns
    -------
    float
        ``sqrt(mean((log(1 + y_true) - log(1 + y_pred)) ** 2))``.
    """
    # Coerce up front: plain lists then work in every code path, and the
    # comparison is always positional even if a pandas object is passed in.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    if convert_exp:
        y_true = np.exp(y_true)
        y_pred = np.exp(y_pred)

    # log1p(x) is log(1 + x), computed accurately for small x. nan_to_num
    # replaces NaN/inf results (from values <= -1) with finite numbers so that
    # a few invalid entries do not turn the whole score into NaN.
    log_true = np.nan_to_num(np.log1p(y_true))
    log_pred = np.nan_to_num(np.log1p(y_pred))

    return float(np.sqrt(np.mean((log_true - log_pred) ** 2)))

# Wrap rmsle as a scikit-learn scorer. RMSLE is an error (lower is better),
# so greater_is_better=False is passed to tell scikit-learn to treat it as a
# loss during model selection.
rmsle_scorer = metrics.make_scorer(
    score_func=rmsle,
    greater_is_better=False,
)

In [6]:
# The model is trained on the natural log of the target.
log_y = np.log(y)

random_forest_model = RandomForestRegressor()

# Candidate settings: a fixed seed and three forest sizes.
rf_params = dict(random_state=[42], n_estimators=[100, 120, 140])

# 5-fold grid search over rf_params, ranked with the custom RMSLE scorer.
gridsearch_random_forest_model = GridSearchCV(
    random_forest_model,
    rf_params,
    scoring=rmsle_scorer,
    cv=5,
)
gridsearch_random_forest_model.fit(X_train, log_y)

print('Hyperparameter : ', gridsearch_random_forest_model.best_params_)

Hyperparameter :  {'n_estimators': 140, 'random_state': 42}


In [7]:
# Score the refitted best model on the same rows it was trained on.
# NOTE: this is an in-sample figure, not a held-out estimate.
best_rf = gridsearch_random_forest_model.best_estimator_
preds = best_rf.predict(X_train)
train_rmsle = rmsle(log_y, preds, True)
print(f'RMSLE of Random Forest Regression : {train_rmsle:.4f}')

RMSLE of Random Forest Regression : 0.1127


In [8]:
# Predict on the test matrix with the best model found by the grid search.
final_model = gridsearch_random_forest_model.best_estimator_
randomforest_preds = final_model.predict(X_test)

# The model was fit on log(count), so exponentiate to get back to counts
# before writing the submission file.
predicted_counts = np.exp(randomforest_preds)
submission['count'] = predicted_counts
submission.to_csv('submission.csv', index=False)