In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
import graphviz

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/bike-sharing-demand/sampleSubmission.csv
/kaggle/input/bike-sharing-demand/test.csv
/kaggle/input/bike-sharing-demand/train.csv


In [3]:
train=pd.read_csv("../input/bike-sharing-demand/train.csv")
test=pd.read_csv("../input/bike-sharing-demand/test.csv")
train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [4]:
test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [5]:
full_data=train.append(test, sort=False)
full_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3.0,13.0,16.0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8.0,32.0,40.0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5.0,27.0,32.0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3.0,10.0,13.0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0.0,1.0,1.0


In [6]:
full_data["season"]=full_data.season.map({1:"spring", 2:"summer", 3:"fall", 4:"winter"})
full_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,spring,0,0,1,9.84,14.395,81,0.0,3.0,13.0,16.0
1,2011-01-01 01:00:00,spring,0,0,1,9.02,13.635,80,0.0,8.0,32.0,40.0
2,2011-01-01 02:00:00,spring,0,0,1,9.02,13.635,80,0.0,5.0,27.0,32.0
3,2011-01-01 03:00:00,spring,0,0,1,9.84,14.395,75,0.0,3.0,10.0,13.0
4,2011-01-01 04:00:00,spring,0,0,1,9.84,14.395,75,0.0,0.0,1.0,1.0


In [7]:
full_data["weather"]=full_data.weather.map({1: "Clear, Few clouds, Partly cloudy, Partly cloudy",
                                           2: "Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist",
                                           3: "Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds",
                                           4: "Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog "})

full_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,spring,0,0,"Clear, Few clouds, Partly cloudy, Partly cloudy",9.84,14.395,81,0.0,3.0,13.0,16.0
1,2011-01-01 01:00:00,spring,0,0,"Clear, Few clouds, Partly cloudy, Partly cloudy",9.02,13.635,80,0.0,8.0,32.0,40.0
2,2011-01-01 02:00:00,spring,0,0,"Clear, Few clouds, Partly cloudy, Partly cloudy",9.02,13.635,80,0.0,5.0,27.0,32.0
3,2011-01-01 03:00:00,spring,0,0,"Clear, Few clouds, Partly cloudy, Partly cloudy",9.84,14.395,75,0.0,3.0,10.0,13.0
4,2011-01-01 04:00:00,spring,0,0,"Clear, Few clouds, Partly cloudy, Partly cloudy",9.84,14.395,75,0.0,0.0,1.0,1.0


In [8]:
full_data["datetime"]=pd.to_datetime(full_data["datetime"])
full_data["year"]=full_data["datetime"].dt.year
full_data["month"]=full_data["datetime"].dt.month
full_data["day"]=full_data["datetime"].dt.day
full_data["weekday"]=full_data["datetime"].dt.weekday
full_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,weekday
0,2011-01-01 00:00:00,spring,0,0,"Clear, Few clouds, Partly cloudy, Partly cloudy",9.84,14.395,81,0.0,3.0,13.0,16.0,2011,1,1,5
1,2011-01-01 01:00:00,spring,0,0,"Clear, Few clouds, Partly cloudy, Partly cloudy",9.02,13.635,80,0.0,8.0,32.0,40.0,2011,1,1,5
2,2011-01-01 02:00:00,spring,0,0,"Clear, Few clouds, Partly cloudy, Partly cloudy",9.02,13.635,80,0.0,5.0,27.0,32.0,2011,1,1,5
3,2011-01-01 03:00:00,spring,0,0,"Clear, Few clouds, Partly cloudy, Partly cloudy",9.84,14.395,75,0.0,3.0,10.0,13.0,2011,1,1,5
4,2011-01-01 04:00:00,spring,0,0,"Clear, Few clouds, Partly cloudy, Partly cloudy",9.84,14.395,75,0.0,0.0,1.0,1.0,2011,1,1,5


In [9]:
full_data.drop(["datetime"], axis=1, inplace=True)
full_data.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,weekday
0,spring,0,0,"Clear, Few clouds, Partly cloudy, Partly cloudy",9.84,14.395,81,0.0,3.0,13.0,16.0,2011,1,1,5
1,spring,0,0,"Clear, Few clouds, Partly cloudy, Partly cloudy",9.02,13.635,80,0.0,8.0,32.0,40.0,2011,1,1,5
2,spring,0,0,"Clear, Few clouds, Partly cloudy, Partly cloudy",9.02,13.635,80,0.0,5.0,27.0,32.0,2011,1,1,5
3,spring,0,0,"Clear, Few clouds, Partly cloudy, Partly cloudy",9.84,14.395,75,0.0,3.0,10.0,13.0,2011,1,1,5
4,spring,0,0,"Clear, Few clouds, Partly cloudy, Partly cloudy",9.84,14.395,75,0.0,0.0,1.0,1.0,2011,1,1,5


In [10]:
dum_full_data=pd.get_dummies(full_data, drop_first=True)
dum_full_data.tail()

Unnamed: 0,holiday,workingday,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,weekday,season_spring,season_summer,season_winter,"weather_Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog","weather_Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds","weather_Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist"
6488,0,1,10.66,12.88,60,11.0014,,,,2012,12,31,0,1,0,0,0,0,1
6489,0,1,10.66,12.88,60,11.0014,,,,2012,12,31,0,1,0,0,0,0,1
6490,0,1,10.66,12.88,60,11.0014,,,,2012,12,31,0,1,0,0,0,0,0
6491,0,1,10.66,13.635,56,8.9981,,,,2012,12,31,0,1,0,0,0,0,0
6492,0,1,10.66,13.635,65,8.9981,,,,2012,12,31,0,1,0,0,0,0,0


In [11]:
train.shape

(10886, 12)

In [12]:
X=dum_full_data.drop(["registered", "count", "casual"], axis=1)
y=dum_full_data["count"]
X.head()

Unnamed: 0,holiday,workingday,temp,atemp,humidity,windspeed,year,month,day,weekday,season_spring,season_summer,season_winter,"weather_Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog","weather_Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds","weather_Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist"
0,0,0,9.84,14.395,81,0.0,2011,1,1,5,1,0,0,0,0,0
1,0,0,9.02,13.635,80,0.0,2011,1,1,5,1,0,0,0,0,0
2,0,0,9.02,13.635,80,0.0,2011,1,1,5,1,0,0,0,0,0
3,0,0,9.84,14.395,75,0.0,2011,1,1,5,1,0,0,0,0,0
4,0,0,9.84,14.395,75,0.0,2011,1,1,5,1,0,0,0,0,0


In [13]:
y.head()

0    16.0
1    40.0
2    32.0
3    13.0
4     1.0
Name: count, dtype: float64

In [14]:
X_train=X.iloc[:10885,:]
X_test=X.iloc[10886:,:]
y_train=y.iloc[:10885]
y_test=y.iloc[10886:]
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((10885, 16), (10885,), (6493, 16), (6493,))

In [15]:
depth_range = [3,4,5,6,7,8,9]
minsplit_range = [5,10,20,25,30]
minleaf_range = [5,10,15]

In [16]:
parameters = dict(max_depth=depth_range,
                  min_samples_split=minsplit_range, 
                  min_samples_leaf=minleaf_range)

In [17]:
decTree=DecisionTreeRegressor(random_state=2018)
cv = GridSearchCV(decTree, param_grid=parameters, cv=5)
cv

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=2018,
                                             splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 4, 5, 6, 7, 8, 9],
                         'min_samples_leaf': [5, 10, 15],
                         'min_samples_split': [5, 10, 20, 25, 30]},
             pre_dispatch

In [18]:
cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=2018,
                                             splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'max_depth': [3, 4, 5, 6, 7, 8, 9],
                         'min_samples_leaf': [5, 10, 15],
                         'min_samples_split': [5, 10, 20, 25, 30]},
             pre_dispatch

In [19]:
y_pred=cv.predict(X_test)
y_pred

array([ 54.16504854,  54.16504854,  54.16504854, ..., 150.23300971,
       150.23300971, 150.23300971])

In [20]:
sampleSubmission=pd.read_csv("../input/bike-sharing-demand/sampleSubmission.csv")
sampleSubmission.head()

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,0
1,2011-01-20 01:00:00,0
2,2011-01-20 02:00:00,0
3,2011-01-20 03:00:00,0
4,2011-01-20 04:00:00,0


In [21]:
mysubmission=sampleSubmission.copy()
mysubmission["count"]=y_pred
mysubmission.head()

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,54.165049
1,2011-01-20 01:00:00,54.165049
2,2011-01-20 02:00:00,54.165049
3,2011-01-20 03:00:00,54.165049
4,2011-01-20 04:00:00,54.165049


In [22]:
mysubmission.to_csv("mysubmission.csv", index=False)