In [1]:
import numpy as np
import pandas as pd

## Load Dataset

In [7]:
train = pd.read_csv("bike_train.csv")
train.shape
train.columns

Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp',
       'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'],
      dtype='object')

In [8]:
train.head

<bound method NDFrame.head of                datetime  season  holiday  workingday  weather   temp   atemp  \
0       2011-01-01 0:00       1        0           0        1   9.84  14.395   
1       2011-01-01 1:00       1        0           0        1   9.02  13.635   
2       2011-01-01 2:00       1        0           0        1   9.02  13.635   
3       2011-01-01 3:00       1        0           0        1   9.84  14.395   
4       2011-01-01 4:00       1        0           0        1   9.84  14.395   
5       2011-01-01 5:00       1        0           0        2   9.84  12.880   
6       2011-01-01 6:00       1        0           0        1   9.02  13.635   
7       2011-01-01 7:00       1        0           0        1   8.20  12.880   
8       2011-01-01 8:00       1        0           0        1   9.84  14.395   
9       2011-01-01 9:00       1        0           0        1  13.12  17.425   
10     2011-01-01 10:00       1        0           0        1  15.58  19.695   
11     201

In [13]:
train = pd.read_csv("bike_train.csv", parse_dates=["datetime"])

print(train.shape)
train.head()

(10886, 12)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [14]:
test = pd.read_csv("bike_test.csv", parse_dates=["datetime"])

print(test.shape)
test.head()

(6493, 9)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


## Score

In [15]:
feature_names = ["season", "holiday", "workingday", "weather",
                 "temp", "atemp", "humidity", "windspeed"]

X_train = train[feature_names]

print(X_train.shape)
X_train.head()

(10886, 8)


Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,1,0,0,1,9.84,14.395,81,0.0
1,1,0,0,1,9.02,13.635,80,0.0
2,1,0,0,1,9.02,13.635,80,0.0
3,1,0,0,1,9.84,14.395,75,0.0
4,1,0,0,1,9.84,14.395,75,0.0


In [16]:
X_test = test[feature_names]

print(X_test.shape)
X_test.head()

(6493, 8)


Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,1,0,1,1,10.66,11.365,56,26.0027
1,1,0,1,1,10.66,13.635,56,0.0
2,1,0,1,1,10.66,13.635,56,0.0
3,1,0,1,1,10.66,12.88,56,11.0014
4,1,0,1,1,10.66,12.88,56,11.0014


In [17]:
label_name = "count"

y_train = train[label_name]

print(y_train.shape)
y_train.head(48
            )

(10886,)


0      16
1      40
2      32
3      13
4       1
5       1
6       2
7       3
8       8
9      14
10     36
11     56
12     84
13     94
14    106
15    110
16     93
17     67
18     35
19     37
20     36
21     34
22     28
23     39
24     17
25     17
26      9
27      6
28      3
29      2
30      1
31      8
32     20
33     53
34     70
35     93
36     75
37     59
38     74
39     76
40     65
41     53
42     30
43     22
44     31
45      9
46      8
47      5
Name: count, dtype: int64

In [18]:
# from sklearn.tree import DecisionTreeRegressor

# seed = 37

# model = DecisionTreeRegressor(random_state=seed)
# model

from sklearn.ensemble import RandomForestRegressor

seed = 37

model = RandomForestRegressor(random_state=seed)
model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=37, verbose=0, warm_start=False)

** Score ** = Root Mean Squared Logarithmic Error, RMSLE.

$$ \sqrt{\frac{1}{n} \sum_{i=1}^n (\log(p_i + 1) - \log(a_i+1))^2 } $$

In [19]:
from sklearn.metrics import make_scorer
def rmsle(predict, actual):
    predict = np.array(predict)
    actual = np.array(actual)
    
    log_predict = np.log(predict + 1)
    log_actual = np.log(actual + 1)
    
    difference = log_predict - log_actual
    square_difference = difference ** 2
    mean_square_difference = square_difference.mean()
    
    score = np.sqrt(mean_square_difference)
    
    return score

print(rmsle([1, 2, 3], [1, 2, 3]))
print(rmsle([1, 2, 3], [2, 3, 4]))
print(rmsle([1, 2, 3], [1, 4, 9]))

rmsle_score = make_scorer(rmsle)
rmsle_score

0.0
0.314619332526
0.605676350101


make_scorer(rmsle)

In [20]:
from sklearn.cross_validation import cross_val_score

score = cross_val_score(model, X_train, y_train, cv=20, scoring=rmsle_score).mean()

print("Score = {0:.5f}".format(score))



Score = 1.39960


## Submit

In [23]:
model.fit(X_train, y_train)

predictions = model.predict(X_test)

print(predictions.shape)
predictions[:3]

(6493,)


array([ 104.55,   38.85,   38.85])

In [24]:
submission = pd.read_csv("sampleSubmission.csv")

submission["count"] = predictions

print(submission.shape)
submission.head()

(6493, 2)


Unnamed: 0,datetime,count
0,2011-01-20 0:00,104.55
1,2011-01-20 1:00,38.85
2,2011-01-20 2:00,38.85
3,2011-01-20 3:00,97.3
4,2011-01-20 4:00,97.3


In [25]:
from datetime import datetime

current_date = datetime.now()
current_date = current_date.strftime("%Y-%m-%d_%H-%M-%S")

description = "baseline-script"

filename = "{date}_{score:.5f}_{desc}.csv".format(date=current_date, score=score, desc=description)
filepath = "submissions/{filename}".format(filename=filename)

submission.to_csv(filepath, index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'submissions/2018-03-16_11-22-32_1.39960_baseline-script.csv'