# Bike Sharing Demand

In [122]:
import pandas
import numpy
from sklearn.ensemble import GradientBoostingRegressor
import sklearn.model_selection as model_selection
import sklearn.metrics as metrics
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import Pipeline

### Evaluation
The scoring function is the Root Mean Squared Logarithmic Error given by

$ \sqrt{\frac{1}{n} \sum_{i=1}^n (\log(p_i + 1) - \log(a_i+1))^2 } $

Where

* $n$ is the number of hours in the test set
* $p_i$ is your predicted count
* $a_i$ is the actual count
* $\log(x)$ is the natural logarithm

In [110]:
def rmsle(y, y_):
    """Return the Root Mean Squared Logarithmic Error between two sets of values.

    Computes ``sqrt(mean((log(y + 1) - log(y_ + 1)) ** 2))``.

    Parameters
    ----------
    y, y_ : array-like of numbers
        The two sets of values to compare (for example actual and predicted
        counts). The metric is symmetric, so the order does not matter.

    Returns
    -------
    float
        The RMSLE of the two inputs.
    """
    def _safe_log1p(values):
        # log1p is vectorised and stays accurate for values close to zero,
        # where log(v + 1) would round v + 1 to exactly 1.
        logs = numpy.log1p(numpy.asarray(values, dtype=float))
        # Keep the original best-effort behaviour: NaN / infinite logs
        # (inputs <= -1) are replaced by finite numbers instead of
        # propagating into the score.
        return numpy.nan_to_num(logs)

    squared_log_diff = (_safe_log1p(y) - _safe_log1p(y_)) ** 2
    return numpy.sqrt(numpy.mean(squared_log_diff))

### Feature Engineering

All of the data is already numeric except for datetime. Convert the datetime into distinct numeric features for hour, day, month and year, then display a summary of the data as a sanity check.

In [106]:
def coerceData(data):
    """Replace the ``datetime`` column with numeric hour/day/month/year columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``datetime`` column of timestamps
        (e.g. ``'2011-01-20 05:00:00'`` strings).

    Returns
    -------
    pandas.DataFrame
        A new frame without ``datetime`` and with integer ``hour``, ``day``,
        ``month`` and ``year`` columns appended in that order. The frame
        passed in is left unmodified.
    """
    timestamps = pandas.to_datetime(data['datetime'])

    # drop() returns a new frame, so adding columns to the result does not
    # leak the derived columns back into the caller's DataFrame.
    result = data.drop(['datetime'], axis=1)
    for part in ('hour', 'day', 'month', 'year'):
        result[part] = getattr(timestamps.dt, part).astype('int')

    return result

# Load the labelled training set, expand its datetime column into numeric
# features, and print summary statistics as a sanity check.
data = pandas.read_csv('train.csv').pipe(coerceData)
print(data.describe())

             season       holiday    workingday       weather         temp  \
count  10886.000000  10886.000000  10886.000000  10886.000000  10886.00000   
mean       2.506614      0.028569      0.680875      1.418427     20.23086   
std        1.116174      0.166599      0.466159      0.633839      7.79159   
min        1.000000      0.000000      0.000000      1.000000      0.82000   
25%        2.000000      0.000000      0.000000      1.000000     13.94000   
50%        3.000000      0.000000      1.000000      1.000000     20.50000   
75%        4.000000      0.000000      1.000000      2.000000     26.24000   
max        4.000000      1.000000      1.000000      4.000000     41.00000   

              atemp      humidity     windspeed        casual    registered  \
count  10886.000000  10886.000000  10886.000000  10886.000000  10886.000000   
mean      23.655084     61.886460     12.799395     36.021955    155.552177   
std        8.474601     19.245033      8.164537     49.96047

### Partition Data

Split the given training data into a training set and a held-out dev (test) set, using all of the available features.

In [107]:
# Feature matrix: every column except the target (`count`) and the
# `casual` / `registered` columns.  Target vector: `count`.
X = data.drop(labels=['count', 'casual', 'registered'], axis='columns').values
y = data['count'].values

# Hold out part of the labelled data as a dev set; the fixed seed keeps the
# split the same on every run.
X_train, X_dev, y_train, y_dev = model_selection.train_test_split(
    X, y, random_state=1)

print('{0} training examples and {1} testing examples'.format(len(X_train),
                                                              len(X_dev)))
print('{0} model parameters'.format(X.shape[1]))

8164 training examples and 2722 testing examples
12 model parameters


### Hyperparameter Tuning

Decision Tree ensembles, particularly Boosted Decision Trees, have fairly good performance over a wide variety of use cases as demonstrated [here](https://ucb-mids.s3.amazonaws.com/prod/DATASCI+W207+Intro+to+Machine+Learning/Readings/caruana.icml06.pdf). So we will use GridSearch to tune over a range of values of max depth for a Gradient Boosted Decision Tree Regressor. We will also use the given RMSLE function as a custom scoring function.

In [134]:
class PositiveIntegerGradientBoostingRegressor(GradientBoostingRegressor):
    """GradientBoostingRegressor whose predictions are non-negative whole numbers.

    Only ``predict`` is overridden; fitting and hyperparameters are inherited
    unchanged from ``GradientBoostingRegressor``.
    """

    def predict(self, X):
        """Predict for ``X``, clip negative values to 0 and round the result.

        Rounding uses ``numpy.round``; the values are whole numbers but the
        array is not cast to an integer dtype here.
        """
        base = super(PositiveIntegerGradientBoostingRegressor, self)
        raw = base.predict(X)
        non_negative = numpy.clip(raw, 0, None)
        return numpy.round(non_negative)
    
# Candidate tree depths for the grid search.
param_grid = [
    {'max_depth': list(range(1, 15))}
]

# Fix the seed so repeated runs build the same trees and therefore select
# the same best depth.
regressor = PositiveIntegerGradientBoostingRegressor(n_estimators=100,
                                                     random_state=1)

# RMSLE is an error (lower is better), hence greater_is_better=False so the
# grid search favours the smallest RMSLE.
scorer = metrics.make_scorer(score_func=rmsle,
                             greater_is_better=False)

# `scoring` is passed by keyword: it is keyword-only in current scikit-learn
# releases, where passing the scorer positionally raises a TypeError.
model = model_selection.GridSearchCV(regressor,
                                     param_grid,
                                     scoring=scorer,
                                     n_jobs=4)

### Model Fit

Fit the model and evaluate the resulting predictions on the held out dev data set.

In [135]:
# Run the grid search on the training split only, report the selected
# hyperparameters, then score the fitted search object on the dev split.
model.fit(X_train, y_train)
print('Best Parameters: {0}'.format(model.best_params_))
print('RMSLE: {0}'.format(rmsle(y_dev, model.predict(X_dev))))

Best Parameters: {'max_depth': 11}
RMSLE: 0.09080789737748393


### Create Submission

In order to submit to Kaggle we have to generate predictions from the test set and write them to a file with the following format:

~~~~
datetime,count
2011-01-20 00:00:00,0
2011-01-20 01:00:00,0
2011-01-20 02:00:00,0
...
...
~~~~

We will regenerate the model using the optimal parameters for the Boosted Decision Tree and fit on all of the given labeled data.

In [136]:
# Refit on ALL labelled rows using the depth selected by the grid search.
# Distinct names (X_full / y_full) are used on purpose: reusing X_train /
# y_train here would overwrite the train split, so re-running the
# "Model Fit" cell afterwards would train on the dev rows and report an
# over-optimistic RMSLE.
feature_columns = data.drop(['count', 'casual', 'registered'], axis=1).columns
X_full = data[feature_columns].values
y_full = data['count'].values

# Seeded so the submission is reproducible from run to run.
estimator = PositiveIntegerGradientBoostingRegressor(
    n_estimators=1000,
    max_depth=model.best_params_['max_depth'],
    random_state=1)
estimator.fit(X_full, y_full)

testData = pandas.read_csv('test.csv')
# Keep the original datetime strings for the submission file before the
# column is replaced by numeric features.
predictions = pandas.DataFrame(testData['datetime'])
testData = coerceData(testData)

# Select the features by name in the training order and predict from a plain
# array, matching how the estimator was fitted.
X_test = testData[list(feature_columns)].values
predictions['count'] = estimator.predict(X_test).astype('int')

predictions.to_csv('submission.csv',
                   sep=',',
                   index=False)