In [1]:
import numpy as np
import pandas as pd

## Load Dataset

In [2]:
train = pd.read_csv("data/train.csv", parse_dates=["datetime"])

train.drop("casual", axis=1, inplace=True)
train.drop("registered", axis=1, inplace=True)

print(train.shape)
train.head()

(10886, 10)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,1


In [3]:
train["yyyy"] = train["datetime"].dt.year
train["mm"] = train["datetime"].dt.month
train["hh"] = train["datetime"].dt.hour
train["weekday"] = train["datetime"].dt.dayofweek

In [4]:
def group_week(day):
    if day >= 4 and day <= 6: 
        return 0
    else:
        return 1
    
train["group_week"] = train["weekday"].apply(group_week)

train.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,count,yyyy,mm,hh,weekday,group_week
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,16,2011,1,0,5,0
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,40,2011,1,1,5,0
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,32,2011,1,2,5,0
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,13,2011,1,3,5,0
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,1,2011,1,4,5,0


In [5]:
test = pd.read_csv("data/test.csv", parse_dates=["datetime"])

test["yyyy"] = test["datetime"].dt.year
test["mm"] = test["datetime"].dt.month
test["hh"] = test["datetime"].dt.hour
test["weekday"] = test["datetime"].dt.dayofweek

print(test.shape)
test.head()

(6493, 13)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,yyyy,mm,hh,weekday
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,0,3
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,1,3
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,2,3
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,3,3
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,4,3


In [6]:
test["bin_week"] = test["weekday"].apply(group_week)

test.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,yyyy,mm,hh,weekday,bin_week
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,0,3,1
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,1,3,1
2,2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0,2011,1,2,3,1
3,2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,3,3,1
4,2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014,2011,1,4,3,1


## Score

In [7]:
feature_names = ["season", "holiday", "workingday", "weather", 
                 "temp", "atemp", "humidity","windspeed", 
                 "yyyy", "mm", "hh", "group_week"]

X_train = train[feature_names]

print(X_train.shape)
X_train.head()

(10886, 12)


Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,yyyy,mm,hh,group_week
0,1,0,0,1,9.84,14.395,81,0.0,2011,1,0,0
1,1,0,0,1,9.02,13.635,80,0.0,2011,1,1,0
2,1,0,0,1,9.02,13.635,80,0.0,2011,1,2,0
3,1,0,0,1,9.84,14.395,75,0.0,2011,1,3,0
4,1,0,0,1,9.84,14.395,75,0.0,2011,1,4,0


In [8]:
label_name ="count"

y_train = train[label_name]

print(y_train.shape)
y_train.head()

(10886,)


0    16
1    40
2    32
3    13
4     1
Name: count, dtype: int64

In [9]:
from sklearn.tree import DecisionTreeRegressor

seed=37
model = DecisionTreeRegressor(random_state=seed)

model

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=37,
           splitter='best')

## RMSLE 

$$ \sqrt{\frac{1}{n} \sum_{i=1}^n (\log(p_i + 1) - \log(a_i+1))^2 } $$

In [10]:
## implement RMSLE function

from sklearn.metrics import make_scorer

def rmsle(predict, actual):
    predict = np.array(predict)
    actual = np.array(actual)
    
    log_predict = np.log(predict + 1)
    log_actual = np.log(actual + 1)
    
    difference = log_predict - log_actual
    square_difference = np.square(difference)
    mean_square_difference = np.mean(square_difference)
    
    score = np.sqrt(mean_square_difference)
    
    return score 

rmsle_score = make_scorer(rmsle)
rmsle_score
    

make_scorer(rmsle)

In [11]:
from sklearn.cross_validation import cross_val_score

score = cross_val_score(model, X_train, y_train, cv=20, scoring=rmsle_score).mean()

print("Score = {0:.5f}".format(score))



Score = 0.48496


## Submission

In [12]:
X_test = test[feature_names]

print(X_test.shape)
X_test.head()

KeyError: "['group_week'] not in index"

In [None]:
model.fit(X_train, y_train)

prediction = model.predict(X_test)

print(prediction.shape)
prediction[0:5]

In [None]:
submission = pd.read_csv("data/sampleSubmission.csv")

submission["count"] = prediction

print(submission.shape)
submission.head()

In [None]:
submission.to_csv("baseline-script.csv", index=False)