# Benchmark Stats

This notebook walks through some benchmark models trained on a subset of the training data (the first 500,000 rows) and evaluated on the dev set.

In [1]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import gcsfs
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

  from numpy.core.umath_tests import inner1d


In [2]:
# Columns that are forced to string dtype when the CSVs are parsed.
STRING_COLUMNS = [
    'year', 'month', 'hour', 'dow',
    'same_pickup_dropoff',
    'pickup_latitude_quantile', 'pickup_longitude_quantile',
    'dropoff_latitude_quantile', 'dropoff_longitude_quantile',
    'pickup_jfk', 'dropoff_jfk',
]
dtypes = dict.fromkeys(STRING_COLUMNS, 'str')

fs = gcsfs.GCSFileSystem(project='steadfast-mason-213717')

# Training data: only the first 500,000 rows are read, straight into pandas.
with fs.open('nyc-taxi-fare-prediction-data/train.csv') as f:
    train = pd.read_csv(f, nrows=500000, dtype=dtypes)

# Dev data: read in full through dask, then materialised as a pandas DataFrame.
dev = dd.read_csv('gs://nyc-taxi-fare-prediction-data/dev.csv', dtype=dtypes).compute()

## Summary

| Model | Features | RMSE on Validation Data |
|-------|----------|-------------------------|
| Linear Regression | distance, duration | 5.21 |
| Linear Regression | distance, duration, time, bearing | 4.93 |
| Random Forest (n=50) | distance, duration, time, bearing, coordinates | 3.76 |

## Linear Regression

In [3]:
def evaluate_linear_model(train, dev, features):
    """Fit a linear regression on `train` and print its RMSE on `dev`.

    The selected feature columns are one-hot encoded with ``pd.get_dummies``.
    The dev matrix is then reindexed to the training columns, so both
    matrices always share the same columns in the same order.

    Args:
        train: DataFrame holding the `features` columns and 'fare_amount'.
        dev: DataFrame with the same columns; used only for evaluation.
        features: list of column names to use as predictors.

    Returns:
        The fitted LinearRegression estimator. The RMSE is printed, not returned.
    """
    train_X = pd.get_dummies(train[features])
    # Encoding each split on its own can yield different dummy columns when a
    # level occurs in only one of them. Force the training layout: levels
    # missing from dev become all-zero columns, levels unseen in training are
    # dropped (the model has no coefficient for them).
    dev_X = pd.get_dummies(dev[features]).reindex(
        columns=train_X.columns, fill_value=0)

    lm = LinearRegression()
    lm.fit(train_X, train['fare_amount'])
    # Arguments follow the documented (y_true, y_pred) order.
    rmse = np.sqrt(mean_squared_error(dev['fare_amount'], lm.predict(dev_X)))
    print('RMSE: ', rmse)
    return lm

In [4]:
# Baseline: trip distance and duration only.
baseline_features = ['distance', 'duration']
evaluate_linear_model(train, dev, baseline_features)

RMSE:  5.206930525895996


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [5]:
# Baseline features plus the time fields and the bearing bucket.
extended_features = [
    'distance', 'duration',
    'year', 'month', 'dow', 'hour',
    'bearing_bucket',
]
evaluate_linear_model(train, dev, extended_features)

RMSE:  4.928565470076604


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## Random Forest

In [6]:
def evaluate_random_forest(train, dev, features, encode_time_features=False,
                           n_estimators=50, random_state=None):
    """Fit a random forest regressor on `train` and print its RMSE on `dev`.

    Note that sklearn RF requires that categorical features be one hot
    encoded, so we have to decide whether to encode the time features or
    leave them as continuous.

    Args:
        train: DataFrame holding the `features` columns and 'fare_amount'.
        dev: DataFrame with the same columns; used only for evaluation.
        features: list of column names to use as predictors.
        encode_time_features: if False and 'bearing_bucket' is among
            `features`, only 'bearing_bucket' is one-hot encoded. Otherwise
            ``pd.get_dummies`` chooses the columns to encode itself.
        n_estimators: number of trees in the forest.
        random_state: seed forwarded to the forest; set it to make the
            reported RMSE reproducible between runs.

    Returns:
        The fitted RandomForestRegressor. The RMSE is printed, not returned.
    """
    rf = RandomForestRegressor(
        n_estimators=n_estimators, n_jobs=-1, random_state=random_state)

    # None lets pd.get_dummies pick the columns to encode on its own.
    # NOTE(review): with encode_time_features=False and no 'bearing_bucket' in
    # `features` this also stays None, so the time columns would be encoded
    # anyway if they have a string dtype -- confirm whether that is intended.
    dummies = None
    if not encode_time_features and 'bearing_bucket' in features:
        dummies = ['bearing_bucket']

    train_X = pd.get_dummies(train[features], columns=dummies)
    # Encoding each split on its own can yield different dummy columns when a
    # level occurs in only one of them. Force the training layout: levels
    # missing from dev become all-zero columns, levels unseen in training are
    # dropped.
    dev_X = pd.get_dummies(dev[features], columns=dummies).reindex(
        columns=train_X.columns, fill_value=0)

    rf.fit(train_X, train['fare_amount'])
    # Arguments follow the documented (y_true, y_pred) order.
    rmse = np.sqrt(mean_squared_error(dev['fare_amount'], rf.predict(dev_X)))
    print('RMSE: ', rmse)
    return rf

In [7]:
# Assemble the predictor list from its groups; the order matches the columns
# passed to the model.
trip_features = ['distance', 'duration']
time_features = ['year', 'month', 'dow', 'hour']
coordinate_features = [
    'pickup_latitude', 'pickup_longitude',
    'dropoff_latitude', 'dropoff_longitude',
]
features = trip_features + time_features + ['bearing_bucket'] + coordinate_features
evaluate_random_forest(train, dev, features)

RMSE:  3.763728701297432


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)