# Benchmark Stats

This notebook walks through benchmark models trained on the first 2,000,000 rows of the training data and scored by RMSE on the dev set.

In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [7]:
# Columns that pandas is told to parse as strings instead of inferring a
# dtype from the CSV contents.
dtypes = dict.fromkeys(
  [
    'year',
    'month',
    'hour',
    'dow',
    'same_pickup_dropoff',
    'pickup_latitude_quantile',
    'pickup_longitude_quantile',
    'dropoff_latitude_quantile',
    'dropoff_longitude_quantile',
    'pickup_jfk',
    'dropoff_jfk'
  ],
  'str'
)

# Only the first 2,000,000 rows of the training file are read; the dev file
# is read in full.
train = pd.read_csv('data/train.csv', dtype=dtypes, nrows=2000000)
dev = pd.read_csv('data/dev.csv', dtype=dtypes)

## Summary

| Model | Features | RMSE on Validation Data |
|-------|----------|-------------------------|
| Linear Regression | distance, duration | 5.34 |
| Linear Regression | distance, duration, time, bearing | 5.06 |
| Random Forest (n=100) | distance, duration, time, bearing, coordinates | 3.79 |

## Linear Regression

In [3]:
def evaluate_linear_model(train, dev, features):
  """Fit a linear regression on `train` and print its RMSE on `dev`.

  Args:
    train: DataFrame containing the `features` columns and a 'fare_amount'
      column used as the regression target.
    dev: DataFrame with the same columns; used only for evaluation.
    features: list of column names to use as predictors. Non-numeric
      (object/category) columns among them are one-hot encoded by
      pd.get_dummies.

  Returns:
    The fitted LinearRegression estimator.
  """
  lm = LinearRegression()
  train_X = pd.get_dummies(train[features])
  # The two frames are encoded independently, so a category that appears in
  # only one of them would give dev a different set (or order) of columns than
  # the model was fitted on. Line dev up with the training columns: categories
  # missing from dev become all-zero columns, categories unseen in training
  # are dropped.
  dev_X = pd.get_dummies(dev[features]).reindex(columns=train_X.columns, fill_value=0)
  lm.fit(train_X, train['fare_amount'])
  # mean_squared_error is documented as (y_true, y_pred).
  rmse = np.sqrt(mean_squared_error(dev['fare_amount'], lm.predict(dev_X)))
  # One pre-formatted string prints identically under the Python 2 print
  # statement and the Python 3 print function.
  print('RMSE:  %s' % rmse)
  return lm

In [4]:
# Baseline: only the 'distance' and 'duration' columns as predictors.
evaluate_linear_model(train=train, dev=dev, features=['distance', 'duration'])

RMSE:  5.343056553857415


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [5]:
# Same linear model with the time columns and the bearing bucket added.
evaluate_linear_model(
  train=train,
  dev=dev,
  features=['distance', 'duration', 'year', 'month', 'dow', 'hour', 'bearing_bucket'])

RMSE:  5.064299869090713


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## Random Forest

In [29]:
def haversine(lat1, lon1, lat2, lon2):
  """Great-circle distance between two points using the haversine formula.

  All four arguments are angles in degrees (they are converted with
  np.radians) and may be scalars or array-likes that numpy can broadcast
  together.

  Returns:
    The central angle between the points multiplied by a sphere radius of
    6.371e6 (presumably Earth's mean radius in metres, so the result would be
    in metres -- confirm against how the column is used).
  """
  sphere_radius = 6.371 * 10**6

  phi1 = np.radians(lat1)
  lam1 = np.radians(lon1)
  phi2 = np.radians(lat2)
  lam2 = np.radians(lon2)

  delta_phi = phi2 - phi1
  delta_lam = lam2 - lam1

  # Haversine of the central angle.
  hav = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam/2)**2
  central_angle = 2 * np.arcsin(np.sqrt(hav))
  return central_angle * sphere_radius

def add_hav(df):
  """Add a 'haversine' column to `df` in place.

  The column is computed by haversine() from the frame's pickup and dropoff
  latitude/longitude columns. Nothing is returned; the caller's frame is
  modified.
  """
  pickup = (df['pickup_latitude'], df['pickup_longitude'])
  dropoff = (df['dropoff_latitude'], df['dropoff_longitude'])
  df['haversine'] = haversine(pickup[0], pickup[1], dropoff[0], dropoff[1])

# add_hav modifies its argument in place, so both frames gain the
# 'haversine' column here.
for frame in (train, dev):
  add_hav(frame)

In [22]:
def evaluate_random_forest(train, dev, features, encode_time_features=False,
                           n_estimators=100, random_state=None):
  """Fit a random forest regressor on `train` and print its RMSE on `dev`.

  Note that sklearn RF requires that categorical features be one hot encoded,
  so we have to decide whether to encode the time features or leave them as
  continuous.

  Args:
    train: DataFrame containing the `features` columns and a 'fare_amount'
      column used as the regression target.
    dev: DataFrame with the same columns; used only for evaluation.
    features: list of column names to use as predictors.
    encode_time_features: if True, pd.get_dummies one-hot encodes every
      object/category column in `features`. If False, only 'bearing_bucket'
      is encoded (when it is among `features`) and all other columns are
      passed to the forest unchanged.
    n_estimators: number of trees in the forest.
    random_state: seed forwarded to RandomForestRegressor; pass an int to make
      the fit (and the printed RMSE) reproducible.

  Returns:
    The fitted RandomForestRegressor.
  """
  always_encoded = [col for col in ('bearing_bucket',) if col in features]

  def encode(frame):
    subset = frame[features]
    if encode_time_features:
      return pd.get_dummies(subset)
    if always_encoded:
      return pd.get_dummies(subset, columns=always_encoded)
    # Nothing to encode. Calling get_dummies without a column list here would
    # encode every object column, time features included, which is the
    # opposite of what encode_time_features=False asks for.
    return subset

  train_X = encode(train)
  # The frames are encoded independently; align dev with the columns the
  # forest is fitted on so a category present in only one frame cannot shift
  # or change the feature columns. Missing categories become all-zero columns.
  dev_X = encode(dev).reindex(columns=train_X.columns, fill_value=0)

  rf = RandomForestRegressor(n_estimators=n_estimators, n_jobs=-1, random_state=random_state)
  rf.fit(train_X, train['fare_amount'])
  # mean_squared_error is documented as (y_true, y_pred).
  rmse = np.sqrt(mean_squared_error(dev['fare_amount'], rf.predict(dev_X)))
  # One pre-formatted string prints identically under the Python 2 print
  # statement and the Python 3 print function.
  print('RMSE:  %s' % rmse)
  return rf

In [23]:
# Predictors for the random forest, grouped as in the summary table:
# distance/duration, time, bearing, then the raw pickup/dropoff coordinates.
features = (
  ['distance', 'duration']
  + ['year', 'month', 'dow', 'hour']
  + ['bearing_bucket']
  + ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
)
evaluate_random_forest(train=train, dev=dev, features=features)

RMSE:  3.7924545161052747


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)