In [2]:
import pandas as pd
import numpy as np
import matplotlib as pl
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier  
from sklearn.tree import DecisionTreeRegressor 
from sklearn.model_selection import cross_val_score

In [24]:
data = pd.read_csv("bikes_train.csv")

In [25]:
data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [26]:
data["datetime"] = pd.to_datetime(data["datetime"])
data["dayofweek"] = data["datetime"].dt.dayofweek      #The day of the week with Monday=0, Sunday=6
data["year"] = data["datetime"].dt.year
data["month"] = data["datetime"].dt.month
data["day"] = data["datetime"].dt.day
data["hour"] = data["datetime"].dt.hour

In [27]:
y = data["count"]
features = ["season","holiday","workingday","weather","temp","atemp","humidity","windspeed","dayofweek","year","month","day","hour"]
X = data[features]

In [7]:
X.head()

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,dayofweek,year,month,day,hour
0,1,0,0,1,9.84,14.395,81,0.0,5,2011,1,1,0
1,1,0,0,1,9.02,13.635,80,0.0,5,2011,1,1,1
2,1,0,0,1,9.02,13.635,80,0.0,5,2011,1,1,2
3,1,0,0,1,9.84,14.395,75,0.0,5,2011,1,1,3
4,1,0,0,1,9.84,14.395,75,0.0,5,2011,1,1,4


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
parameters = {'max_depth': range(1, 11),
              'min_samples_split': range(2, 21), 'min_samples_leaf': range(1, 21)}
tree = DecisionTreeRegressor() 

In [10]:
regressor = GridSearchCV(estimator = tree, param_grid = parameters)

In [11]:
regressor.fit(X = X_train, y=y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': range(1, 11), 'min_samples_split': range(2, 21), 'min_samples_leaf': range(1, 21)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [12]:
predicted = regressor.predict(X_test) 

In [13]:
wyniki = pd.DataFrame({"y_true":y_test,"y_pred":predicted}, index = y_test.index) 

In [14]:
wyniki

Unnamed: 0,y_pred,y_true
3133,154.115044,127
5786,60.166667,13
5224,220.052632,163
8953,249.753886,233
8054,201.863636,222
10044,151.200000,166
5337,168.521739,144
2753,382.262712,376
10127,605.250000,601
33,66.062500,53


In [15]:
def rmsle(y_pred, y_true) :
    assert len(y_true) == len(y_pred)
    return np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y_true))**2))

In [39]:
rmsle(y_pred=predicted, y_true=y_test)

0.383604945875327

In [40]:
test = pd.read_csv("https://storage.googleapis.com/kaggle-competitions-data/kaggle/3948/test.csv?GoogleAccessId=web-data@kaggle-161607.iam.gserviceaccount.com&Expires=1528195692&Signature=j5Y0p3gSNPzLQ%2FlUG20AHX8D5YYcXqTJx%2FH6GoUoGzI1ECywxEhvRGlAjwe4V4B73NYHXLY0sYZPpSymKphoPF8as4eRNDrK%2BRu6gTC1mfYDRZoMp7%2F5fppYcQiWHtXJzTT3tZtt0q6SQZXrE5%2BHIFl2TNBnB7yTXrpG1Xif5upFEH6Ja7OLCCsmGlOmOhlI8RNm3uGMAgJhBGrnLojST%2FPCi4TeStZ2QqI4zYA3D4mGHSMGQzLw47tIP9CfOR1I14H53vc7MSLmzxMO7uhSd5h2uqVuDXzj%2B%2Br8HJjCjWpRve1dqhAWLrcW26SYZueG%2FKXXotm4UWvx6MD%2BCJzryg%3D%3D")

In [41]:
test["datetime"] = pd.to_datetime(test["datetime"])
test["dayofweek"] = test["datetime"].dt.dayofweek      #The day of the week with Monday=0, Sunday=6
test["year"] = test["datetime"].dt.year
test["month"] = test["datetime"].dt.month
test["day"] = test["datetime"].dt.day
test["hour"] = test["datetime"].dt.hour

In [42]:
test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,dayofweek,year,month,day,hour
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,3,2011,1,20,0
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,3,2011,1,20,1
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,3,2011,1,20,2
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,3,2011,1,20,3
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,3,2011,1,20,4


In [48]:
kaggle_test = test[["season","holiday","workingday","weather","temp","atemp","humidity","windspeed","dayofweek","year","month","day","hour"]]

In [49]:
regressor.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': range(1, 11), 'min_samples_split': range(2, 21), 'min_samples_leaf': range(1, 21)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [54]:
test["count"] = regressor.predict(kaggle_test)

In [56]:
test[["datetime", "count"]]

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,11.375000
1,2011-01-20 01:00:00,10.500000
2,2011-01-20 02:00:00,7.400000
3,2011-01-20 03:00:00,2.470588
4,2011-01-20 04:00:00,2.470588
5,2011-01-20 05:00:00,7.625000
6,2011-01-20 06:00:00,37.571429
7,2011-01-20 07:00:00,128.678571
8,2011-01-20 08:00:00,128.678571
9,2011-01-20 09:00:00,128.678571


In [58]:
test[["datetime", "count"]].to_csv("kaggle2.csv", index=False)