# Bike Sharing Demand

In [137]:
import pandas
import numpy
import time
from datetime import datetime
from sklearn.ensemble import GradientBoostingRegressor
import sklearn.model_selection as model_selection
import sklearn.metrics as metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

### Data Loading
Load the training and testing data from the given CSV files

In [138]:
# X should not contain any of the label columns and y is simply the count
train_data = pandas.read_csv('data/train.csv')
X = train_data.drop(['count', 'casual', 'registered'], axis=1)
y = train_data['count']

test_data = pandas.read_csv('data/test.csv')

### Evaluation
The scoring function is the Root Mean Squared Logarithmic Error given by

$ \sqrt{\frac{1}{n} \sum_{i=1}^n (\log(p_i + 1) - \log(a_i+1))^2 } $

Where

* $n$ is the number of hours in the test set
* $pi$ is your predicted count
* $ai$ is the actual count
* $log(x)$ is the natural logarithm

In [139]:
def rmsle(y, y_):
    log1 = numpy.nan_to_num(numpy.array([numpy.log(v + 1) 
                                         for v 
                                         in y]))
    log2 = numpy.nan_to_num(numpy.array([numpy.log(v + 1) 
                                         for v 
                                         in y_]))
    calc = (log1 - log2) ** 2
    return numpy.sqrt(numpy.mean(calc))

# create a custom scorer to be used in grid search, etc
scorer = metrics.make_scorer(score_func=rmsle, 
                             greater_is_better=False)

### Submission
In order to submit to Kaggle we have to generate predictions from the test set and output them to a file with the following format

~~~~
datetime,count
2011-01-20 00:00:00,0
2011-01-20 01:00:00,0
2011-01-20 02:00:00,0
...
...
~~~~

In [140]:
def generate_kaggle_submission(
        transformer, 
        regressor, 
        X_train, 
        y_train,
        test_data):
    
    # train the final model on the transformed data
    regressor.fit(transformer(X_train), 
                  y_train)

    # create a dataframe containing the datetimes to predict and then
    # add the predictions from the trained pipeline
    predictions = pandas.DataFrame(test_data['datetime'])
    predictions['count'] = regressor.predict(transformer(test_data)).astype('int')

    # create a submission file from the result tagged with the current time
    predictions.to_csv('submissions/submission{0}.csv'.format(str(int(time.time()))), 
                       sep=',', 
                       index=False)

### Partition Data

Split out the given training data into a train and a test set and use all of the available parameters

In [141]:
def partition_train_test(
        X, 
        y,
        split_percentage = .8):
    mask = numpy.random.rand(len(X)) < split_percentage
    X_train = X[mask]
    y_train = y[mask]
    X_test = X[~mask]
    y_test = y[~mask]
    
    print('{0} training examples and {1} testing examples'.format(len(X_train), 
                                                                  len(X_test)))
    
    return X_train, X_test, y_train, y_test

X_train, X_dev, y_train, y_dev = partition_train_test(X, y)

8729 training examples and 2157 testing examples


### Initial Feature Engineering

All of the data is already numeric except for datetime. Replace the datetime with distinct numeric parameters for hour, day, month and year. Then display the summary of the data as a sanity check.

In [142]:
def simple_feature_eng(data):
    copy = data.copy()
    copy['hour'] = copy.datetime.apply(lambda x: x.split()[1].split(':')[0]).astype('int')
    copy['day'] = copy.datetime.apply(lambda x: x.split()[0].split('-')[2]).astype('int')
    copy['month'] = copy.datetime.apply(lambda x: x.split()[0].split('-')[1]).astype('int')
    copy['year'] = copy.datetime.apply(lambda x: x.split()[0].split('-')[0]).astype('int')
    copy = copy.drop(['datetime'], axis=1)
    return copy

print(simple_feature_eng(X).describe())

             season       holiday    workingday       weather         temp  \
count  10886.000000  10886.000000  10886.000000  10886.000000  10886.00000   
mean       2.506614      0.028569      0.680875      1.418427     20.23086   
std        1.116174      0.166599      0.466159      0.633839      7.79159   
min        1.000000      0.000000      0.000000      1.000000      0.82000   
25%        2.000000      0.000000      0.000000      1.000000     13.94000   
50%        3.000000      0.000000      1.000000      1.000000     20.50000   
75%        4.000000      0.000000      1.000000      2.000000     26.24000   
max        4.000000      1.000000      1.000000      4.000000     41.00000   

              atemp      humidity     windspeed          hour           day  \
count  10886.000000  10886.000000  10886.000000  10886.000000  10886.000000   
mean      23.655084     61.886460     12.799395     11.541613      9.992559   
std        8.474601     19.245033      8.164537      6.91583

### Initial Regressor

Decision Tree ensembles, particularly Boosted Decision Trees, have fairly good performance over a wide variety of use cases as demonstrated [here](https://ucb-mids.s3.amazonaws.com/prod/DATASCI+W207+Intro+to+Machine+Learning/Readings/caruana.icml06.pdf). Since the values for count have to be both non-negative and an integer we will subclass the Gradient Boosting Regressor to force the predictions to accomidate that requirement.

In [143]:
class PositiveIntegerGradientBoostingRegressor(GradientBoostingRegressor):
    def predict(
            self, 
            X):
        prediction = super(
            PositiveIntegerGradientBoostingRegressor, 
            self).predict(X)
        return numpy.around(prediction.clip(0))

### Model Fit

We will use GridSearch to tune over a range values of max depth for a Gradient Descent Boosted Decision Tree regressor preceeded by our initial feature engineering transformer. We will also use the given RMSLE error function as a custom scoring function. Fit the model and evaluate the resulting predictions on the held out dev data set.

In [153]:
def simple_grid_search(
        regressor, 
        transformer,
        param_grid, 
        X_train, 
        y_train, 
        X_dev, 
        y_dev):
    pipeline = Pipeline([('reg', regressor)])

    model = model_selection.GridSearchCV(pipeline, 
                                         param_grid, 
                                         scorer,
                                         n_jobs=4)
    
    print('Transforming dataset with {0} features'.format(len(X_train.columns)))
    transformed_X_train = transformer(X_train)
    transformed_X_dev = transformer(X_dev)
    
    print('Fitting model with {0} features'.format(len(transformed_X_train.columns)))
    model.fit(transformed_X_train,
              y_train)
    
    print('Best Parameters: {0}'.format(model.best_params_))
    print('RMSLE: {0}'.format(rmsle(y_dev, 
                                    model.predict(transformed_X_dev))))
    return model.best_params_

hyperparameters = simple_grid_search(
    PositiveIntegerGradientBoostingRegressor(n_estimators=100), 
    simple_feature_eng,
    [{'reg__max_depth': list(range(1, 15))}],
    X_train,
    y_train,
    X_dev,
    y_dev)

Transforming dataset with 9 features
Fitting model with 12 features
Best Parameters: {'reg__max_depth': 10}
RMSLE: 0.33345986936344557


##### Initial Submission Generation
Generate the first submission to Kaggle. This resulted in a score of **.50555**.

In [145]:
generate_kaggle_submission(simple_feature_eng,
                           PositiveIntegerGradientBoostingRegressor(
                               n_estimators=1000, 
                               max_depth=hyperparameters['reg__max_depth']),
                           X,
                           y,
                           test_data

SyntaxError: unexpected EOF while parsing (<ipython-input-145-3f95efc84d7e>, line 7)

### Revised Feature Engineering
After looking at the data there are a couple of changes to the feature engineering we can make:

* Since there are only 2 years worth of data we can reduce the year and month data down to a single parameter month_count
* The training data is only from the first 20 days of each month, and the test data is always from the last 7-11 days. This may be problematic because the model is seeing values outside of its trained range for day of month on EVERY testing example. One way to simplify this and still maintain the periodic nature of the data is to drop the day of the month variable and create a day of the week variable.

In [151]:
def get_date(my_datetime):
    return datetime.strptime(
        my_datetime, 
        '%Y-%m-%d %H:%M:%S')

def revised_feature_eng(data):
    copy = data.copy()
    copy['hour'] = copy.datetime.apply(lambda x: get_date(x).hour).astype('int')    
    copy['month_count'] = copy.datetime.apply(lambda x: (get_date(x).year-2011)*12 + get_date(x).month).astype('int')
    copy['day_of_week'] = copy.datetime.apply(lambda x: get_date(x).weekday())
    copy = copy.drop(['datetime'], axis=1)
    return copy

hyperparameters = simple_grid_search(
    PositiveIntegerGradientBoostingRegressor(n_estimators=100), 
    revised_feature_eng,
    [{'reg__max_depth': list(range(1, 15))}],
    X_train,
    y_train,
    X_dev,
    y_dev)

Transforming dataset with 9 features
Fitting model with 11 features
Best Parameters: {'reg__max_depth': 11}
RMSLE: 0.3300200190195893


### Submission 2
Interestingly in this submission the RMSLE score went up to **.55832**. It appears as if the day_of_the_week parameter is causing problems with the testing data despite the score improving on the held out development data.

In [152]:
generate_kaggle_submission(revised_feature_eng,
                           PositiveIntegerGradientBoostingRegressor(
                               n_estimators=1000, 
                               max_depth=hyperparameters['reg__max_depth']),
                           X,
                           y,
                           test_data)