In [1]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): with no category/module filter this also hides deprecation
# warnings raised by the libraries used below - consider narrowing it.
warnings.filterwarnings('ignore')

# Load and process the data

In [2]:
import pandas as pd
# Read the training set; `datetime` is parsed into timestamps so the `.dt`
# accessor can be used on it in the following cells.
data = pd.read_csv('./train.csv', parse_dates=['datetime'])
# Preview the first rows
data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,0.24,0.2879,0.75,0.0,0,1,1


## Transform raw features

In [3]:
# Split the timestamp into calendar components (same four columns, same order)
stamp = data['datetime'].dt
data['year'], data['month'] = stamp.year, stamp.month
data['dow'], data['hour'] = stamp.dayofweek, stamp.hour

# Show a few random rows to eyeball the derived columns
data[['datetime', 'dow', 'year', 'month', 'hour']].sample(n=5)

Unnamed: 0,datetime,dow,year,month,hour
573,2011-02-07 01:00:00,0,2011,2,1
4519,2011-11-01 09:00:00,1,2011,11,9
8429,2012-07-12 14:00:00,3,2012,7,14
6047,2012-02-08 05:00:00,2,2012,2,5
6192,2012-02-14 06:00:00,1,2012,2,6


### One-hot encoding of categorical features

In [4]:
# One indicator column per day-of-week value found in `data['dow']`,
# named dow_<value>
dow_df = pd.get_dummies(data['dow'], prefix='dow')
dow_df.sample(n=5)

Unnamed: 0,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,dow_6
3963,0,0,0,0,1,0,0
3490,1,0,0,0,0,0,0
8976,0,0,0,1,0,0,0
6983,1,0,0,0,0,0,0
8536,0,1,0,0,0,0,0


In [5]:
# Doing the same for `weather`: one indicator column per weather code found
# in the data, named wthr_<code>
wthr_df = pd.get_dummies(data['weather'], prefix='wthr')
wthr_df.sample(n=5)

Unnamed: 0,wthr_1,wthr_2,wthr_3,wthr_4
10851,1,0,0,0
6747,0,1,0,0
10638,0,0,1,0
5178,1,0,0,0
10396,1,0,0,0


The function `prepare_data` bundles these transformations so that exactly the same steps can be applied to the *test* data as to the training data: every feature the model is trained on must also be available at prediction time.

In [6]:
def prepare_data(filename):
    """Load a bike-sharing CSV and convert it into model-ready arrays.

    Parameters
    ----------
    filename : str or file-like
        Anything ``pd.read_csv`` accepts. The file must contain a ``datetime``
        column and the raw feature columns listed in ``features`` below.

    Returns
    -------
    X : ndarray of shape (n_rows, 20)
        Numeric feature matrix: 9 raw/derived columns, 7 day-of-week
        indicators (dow_0..dow_6) and 4 weather indicators (wthr_1..wthr_4).
    y, y_registered, y_casual : ndarray or None
        Values of the ``count``, ``registered`` and ``casual`` columns; each is
        None when the corresponding column is absent from the file.
    """
    data = pd.read_csv(filename, parse_dates=['datetime'])
    data['year']  = data['datetime'].dt.year
    data['month'] = data['datetime'].dt.month
    data['dow']   = data['datetime'].dt.dayofweek
    data['hour']  = data['datetime'].dt.hour

    dow_columns = ['dow_%d' % i for i in range(7)]
    wthr_columns = ['wthr_%d' % i for i in range(1, 5)]

    # `get_dummies` implements one-hot encoding, but it only creates columns for
    # the categories that actually occur in this file. Re-index to the full,
    # fixed column set (missing categories become all-zero columns) so that the
    # train and test matrices always have the same layout. The integer cast
    # keeps the final matrix numeric even when the dummies come back as booleans.
    dow_df = (pd.get_dummies(data['dow'], prefix='dow')
                .reindex(columns=dow_columns, fill_value=0)
                .astype(int))
    wthr_df = (pd.get_dummies(data['weather'], prefix='wthr')
                 .reindex(columns=wthr_columns, fill_value=0)
                 .astype(int))

    # Attach generated features to the original data frame
    data = pd.concat([data, dow_df, wthr_df], axis=1)

    # Full feature list: raw and calendar columns followed by the one-hot columns
    features = ['year', 'month', 'hour', 'holiday', 'workingday',
                'temp', 'atemp', 'humidity', 'windspeed']
    features += dow_columns
    features += wthr_columns
    X = data[features].values

    # Each target is looked up independently, so a file that lacks any of them
    # yields None for that target instead of raising.
    y = data['count'].values if 'count' in data.columns else None
    y_registered = data['registered'].values if 'registered' in data.columns else None
    y_casual = data['casual'].values if 'casual' in data.columns else None

    return X, y, y_registered, y_casual

# Training features plus the three targets (total, registered, casual)
X, y, y_registered, y_casual = prepare_data('./train.csv')
# Only the feature matrix of the test file is kept; the target slots are discarded
X_test, _, _, _ = prepare_data('./test.csv')

# Model selection and evaluation

## Separate validation set

In [7]:
# `sklearn.cross_validation` was replaced by `sklearn.model_selection` and later
# removed; try the current module first and fall back for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # legacy scikit-learn without model_selection
    from sklearn.cross_validation import train_test_split

# Fixed seed so the hold-out split is identical on every run
X_train, X_validation, y_train, y_validation = train_test_split(X, y, random_state=0)
print (X_train.shape, y_train.shape)
print (X_validation.shape, y_validation.shape)

(8164, 20) (8164,)
(2722, 20) (2722,)




## Cross-validation

In [8]:
# `sklearn.cross_validation` was replaced by `sklearn.model_selection` and later
# removed; try the current module first and fall back for old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # legacy scikit-learn without model_selection
    from sklearn.cross_validation import cross_val_score

## Training and evaluating a model

In [10]:
from sklearn.metrics import make_scorer
import numpy as np

def rmsle_func(y_actual, y_predicted):
    """Root mean squared logarithmic error between targets and predictions.

    Computes ``sqrt(mean((log(1 + y_predicted) - log(1 + y_actual)) ** 2))``.

    Parameters
    ----------
    y_actual, y_predicted : array-like
        True and predicted values (lists, tuples or arrays of equal shape).
        Values must be greater than -1 for the logarithm to be defined.

    Returns
    -------
    float
        The RMSLE; 0.0 for a perfect prediction.
    """
    # Accept plain Python sequences as well as arrays
    y_actual = np.asarray(y_actual, dtype=float)
    y_predicted = np.asarray(y_predicted, dtype=float)
    # log1p(x) == log(1 + x) but stays accurate when x is close to zero
    sle = (np.log1p(y_predicted) - np.log1p(y_actual)) ** 2
    return np.sqrt(np.mean(sle))

# Wrap the metric as a scikit-learn scorer. greater_is_better=False declares it
# a loss, so the scorer reports negated values; code that prints the RMSLE has
# to flip the sign back.
rmsle_loss = make_scorer(rmsle_func, greater_is_better=False)

# Ensembles

## Boosting

In [11]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import  DecisionTreeRegressor

# Baseline: a single shallow tree. Seeded like `rf_model` so that repeated runs
# give the same scores.
tree_model = DecisionTreeRegressor(max_depth=5, random_state=0)
# Boosted ensemble of deeper trees; the ensemble seed also controls the seeds
# handed to its base estimators.
adaboost_model = AdaBoostRegressor(DecisionTreeRegressor(max_depth=15),
                                   n_estimators=1000, random_state=0)

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Random-forest hyper-parameters, kept in a dict so they can be inspected or reused
params = dict(
    n_estimators=1000,
    max_depth=15,
    random_state=0,
    min_samples_split=5,
    n_jobs=-1,   # use all available cores
)
rf_model = RandomForestRegressor(**params)

## Bagging

In [13]:
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import  DecisionTreeRegressor

# Bagged ensemble of deep trees; seeded like `rf_model` so that repeated runs
# give the same scores.
bagging_model = BaggingRegressor(DecisionTreeRegressor(max_depth=15),
                                 n_estimators=1000, random_state=0)

## Evaluate the models

In [14]:
# Build the 3-fold split once so that every model is scored on the same folds.
# `sklearn.cross_validation` (and its KFold(n, n_folds=...) signature) was
# replaced by `sklearn.model_selection`; try the current API first.
try:
    from sklearn.model_selection import KFold
    # Materialise the folds as a list of (train_indices, validation_indices)
    # pairs: such a list can be iterated repeatedly and is also accepted as
    # the `cv` argument of cross_val_score.
    cv_generator = list(KFold(n_splits=3, shuffle=True, random_state=0).split(X))
except ImportError:  # legacy scikit-learn without model_selection
    from sklearn.cross_validation import KFold
    cv_generator = KFold(len(X), n_folds=3, shuffle=True, random_state=0)

def print_cv_score(model, name):
    """Cross-validate `model` on (X, y) and print its mean RMSLE labelled `name`."""
    fold_scores = cross_val_score(model, X, y, cv=cv_generator, scoring=rmsle_loss)
    # The scorer yields negated losses, so negate the average to report RMSLE
    rmsle = -fold_scores.mean()
    print('%25s: RMSLE = %.4f' % (name, rmsle))

# Compare performance of different models (scored in the order listed):
for candidate, label in [
    (tree_model,     'DecisionTreeRegressor'),
    (adaboost_model, 'AdaBoostRegressor'),
    (rf_model,       'RandomForestRegressor'),
    (bagging_model,  'BaggingRegressor'),
]:
    print_cv_score(candidate, label)

    DecisionTreeRegressor: RMSLE = 0.6101
        AdaBoostRegressor: RMSLE = 0.4221
    RandomForestRegressor: RMSLE = 0.3389
         BaggingRegressor: RMSLE = 0.3373


## Use two models: predict `casual` and `registered` separately, then add them

In [15]:
# Per fold: fit the forest on each rental type in turn, add the two predictions
# and score the sum against the total count.
scores = []
for train_indices, validation_indices in cv_generator:
    X_fold_train = X[train_indices]
    X_fold_validation = X[validation_indices]

    # `fit` returns the estimator itself, so fit and predict can be chained
    y_c_pred = rf_model.fit(X_fold_train, y_casual[train_indices]).predict(X_fold_validation)
    y_r_pred = rf_model.fit(X_fold_train, y_registered[train_indices]).predict(X_fold_validation)

    # Floor the combined prediction at zero before taking logarithms
    y_pred = np.maximum(y_c_pred + y_r_pred, 0)
    score = rmsle_func(y[validation_indices], y_pred)
    scores.append(score)
print (np.mean(scores))

0.32833365705
