# Divvy Bike Ride Prediction

### Introduction

### Importing Libraries and Data

In [1]:
import math
import random
import subprocess

import numpy   as np
import pandas  as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [None]:
%%time

# Relative path of the CSV data file; passed to file_len() for the line count
# and to pd.read_csv() when the data is loaded.
filename = "data/data.csv"

def file_len(fname):
    """Return the line count that ``wc -l`` reports for ``fname``, plus one.

    The count is obtained by running the external ``wc`` program, so this
    only works where ``wc`` is on the PATH.

    Parameters
    ----------
    fname : str
        Path of the file to measure.

    Returns
    -------
    int
        The first field of ``wc -l``'s output converted to an integer, with
        one added to it.

    Raises
    ------
    IOError
        If ``wc`` exits with a non-zero status; the exception carries the
        bytes ``wc`` wrote to stderr.
    """
    completed = subprocess.run(
        ['wc', '-l', fname],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    if completed.returncode != 0:
        raise IOError(completed.stderr)

    # wc prints "<count> <path>"; only the leading count is needed.
    count_field = completed.stdout.strip().split()[0]
    return int(count_field) + 1

# Count the lines in the data file before loading it.
# NOTE(review): file_len() adds 1 to the `wc -l` result, so the number printed
# here may be higher than the number of data rows (the header line is counted
# too) -- confirm this is intended.
n_rows = file_len(filename)
print('Data file contains {} rows'.format(n_rows))

In [26]:
# Read the entire CSV into a DataFrame using pandas' default parsing options.
data_df = pd.read_csv(filename)

# Show the first rows as a quick sanity check of the columns.
data_df.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,trip_id,year,month,week,day,hour,tripduration,temperature,from_station_id,latitude_start,longitude_start,dpcapacity_start,to_station_id,latitude_end,longitude_end,dpcapacity_end
14691560,2017,6,25,3,17,Subscriber,Female,2017-06-22 17:39:00,2017-06-22 17:44:44,5.733333,84.9,cloudy,344,Ravenswood Ave & Lawrence Ave,41.96909,-87.674237,39.0,464,Damen Ave & Foster Ave,41.975615,-87.679459,15.0
3810416,2014,9,40,1,22,Subscriber,Male,2014-09-30 22:11:00,2014-09-30 22:16:00,4.583333,55.0,cloudy,130,Damen Ave & Division St,41.90331,-87.67695,15.0,128,Damen Ave & Chicago Ave,41.895769,-87.67722,15.0
2662940,2014,7,29,5,13,Subscriber,Male,2014-07-19 13:07:00,2014-07-19 13:22:00,14.083333,79.0,cloudy,176,Clark St & Elm St,41.903233,-87.631253,19.0,324,Stockton Dr & Wrightwood Ave,41.93132,-87.638742,15.0
16997598,2017,10,42,2,12,Subscriber,Male,2017-10-18 12:32:00,2017-10-18 12:36:00,4.866667,68.0,clear,195,Columbus Dr & Randolph St,41.884728,-87.619521,47.0,197,Michigan Ave & Madison St,41.882134,-87.625125,19.0
2130730,2014,6,25,0,13,Subscriber,Male,2014-06-16 13:01:00,2014-06-16 13:14:00,13.466667,86.0,cloudy,191,Canal St & Monroe St,41.8807,-87.63947,23.0,24,Fairbanks Ct & Grand Ave,41.89186,-87.62062,15.0


In [25]:
# Hold out a test set (train_test_split's default split sizes are used).
# random_state pins the shuffle so that the split -- and therefore every
# statistic and model fitted below -- is identical after a kernel restart.
# The value 1 matches the random_state used for the models in this notebook.
train_df, test_df = train_test_split(data_df, shuffle=True, random_state=1)

print('There are {} rows in the training dataset'.format(train_df.shape[0]))
print('There are {} rows in the testing dataset'.format(test_df.shape[0]))
print(train_df.describe())

There are 75000 rows in the training dataset
There are 25000 rows in the testing dataset
               week           day   tripduration   from_station_id  \
count  75000.000000  75000.000000   75000.000000      75000.000000   
mean      11.417204     63.013908     180.154227         41.900332   
std        7.155356     17.163324     121.607714          0.034677   
min        2.000000     -8.000000       2.000000         41.736646   
25%        6.000000     52.000000      77.000000         41.881032   
50%        9.616667     66.900000     164.000000         41.892278   
75%       15.200000     75.900000     270.000000         41.920195   
max       59.883333     95.000000     625.000000         42.064313   

        latitude_start   longitude_start   dpcapacity_start   latitude_end  \
count     75000.000000      75000.000000       75000.000000   75000.000000   
mean        -87.644836         21.385587         180.564600      41.900671   
std           0.021856          7.651416      

### Feature Engineering

I want to know how far, as the crow (or Bird scooter) flies, each rider took their Divvy bike. Disclaimer: this only gives us the net displacement of a ride, so a rider who goes 20 miles round trip and ends up back at the same station will register a displacement of 0 with this method. It should still be a useful metric for one-way trips, specifically for people who are using the bike to commute.

We'll use the haversine formula to calculate the great-circle distance, in kilometers, between each ride's start and end stations, and then derive the average rate (displacement divided by trip duration) from that.

Finally, we have to encode the gender and events columns as integers instead of strings.

In [None]:
def haversine(row):
    """Great-circle distance in kilometres between a ride's start and end points.

    Uses the haversine formula on a sphere of radius 6367 km.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by the keys ``'longitude_start'``,
        ``'latitude_start'``, ``'longitude_end'`` and ``'latitude_end'``,
        holding coordinates in decimal degrees. This can be a single row
        (as passed by ``DataFrame.apply(haversine, axis=1)``), a plain dict,
        or a whole DataFrame, in which case all rows are computed at once.

    Returns
    -------
    float or array-like
        The distance in kilometres: a scalar when the coordinates are
        scalars, otherwise one value per element of the input columns.
    """
    # np.radians broadcasts over arrays/Series (math.radians only accepts a
    # single number), which is what allows a whole DataFrame to be passed in.
    lon1 = np.radians(row['longitude_start'])
    lat1 = np.radians(row['latitude_start'])
    lon2 = np.radians(row['longitude_end'])
    lat2 = np.radians(row['latitude_end'])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    # Haversine formula: `a` is the squared half-chord length between the two
    # points and `c` the corresponding central angle in radians.
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    c = 2 * np.arcsin(np.sqrt(a))

    # Scale the central angle by the sphere radius (km) to get a distance.
    km = 6367 * c

    return km

# Add the two engineered target columns to each split:
#   displacement -- haversine distance between start and end station (km)
#   rate         -- displacement / tripduration * 60
#                   (presumably km/h, assuming tripduration is in minutes --
#                   TODO confirm)
# Both splits go through the same loop so that each frame is always divided by
# its OWN tripduration. Dividing the test displacement by the training
# durations would align on index labels that the two splits do not share and
# produce NaN for every test row.
for split_df in (train_df, test_df):
    split_df['displacement'] = split_df.apply(haversine, axis=1)
    split_df['rate'] = split_df['displacement'].div(split_df['tripduration']).multiply(60)

print('==TARGETS==')
print(train_df[['tripduration', 'displacement', 'rate']].describe())

In [None]:
# Encode the string-valued gender/events columns as integer codes.
#
# The string -> code mapping is learned from the training split only and then
# applied to both splits, so a given string always maps to the same integer in
# train and test. (Encoding each split independently numbers the values that
# happen to be present in that split, so a value missing from one split would
# shift the codes of the others.) Missing values, and test values never seen
# in training, are encoded as -1.
cat_columns = ['gender_code', 'events_code']

for source_col in ('gender', 'events'):
    code_col = source_col + '_code'
    known_categories = train_df[source_col].astype('category').cat.categories
    for split_df in (train_df, test_df):
        split_df[code_col] = pd.Categorical(
            split_df[source_col], categories=known_categories
        ).codes

train_df[cat_columns].head()

### Feature Selection
Dataset features that I think will help predict ride stats:
 * gender
 * events
 * temperature
 * latitude_start
 * longitude_start
 * month
 * day
 * hour

In [None]:
# Predictor columns shared by all three models (order is the column order of
# the design matrices below).
features = [
    'gender_code', 'events_code',
    'temperature',
    'latitude_start', 'longitude_start',
    'month', 'day', 'hour',
]

# Design matrices for the two splits.
train_X, test_X = train_df[features], test_df[features]

# One target series per predicted quantity, for each split.
train_y_disp, test_y_disp = train_df['displacement'], test_df['displacement']
train_y_dur,  test_y_dur  = train_df['tripduration'], test_df['tripduration']
train_y_rate, test_y_rate = train_df['rate'],         test_df['rate']

In [None]:
# Fit a decision tree on the training split with displacement as the target.
displacement_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y_disp)

# Spot-check: predictions next to the true values for the first test rows.
pred   = displacement_model.predict(test_X.head())
actual = test_y_disp.head()

for predicted, observed in zip(pred, actual):
    print('Predicted: {}, Actual: {}'.format(predicted, observed))

In [None]:
# Fit a decision tree on the training split with trip duration as the target.
duration_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y_dur)

# Spot-check: predictions next to the true values for the first test rows.
pred   = duration_model.predict(test_X.head())
actual = test_y_dur.head()

for predicted, observed in zip(pred, actual):
    print('Predicted: {}, Actual: {}'.format(predicted, observed))

In [None]:
# Fit a decision tree on the training split with rate as the target.
rate_model = DecisionTreeRegressor(random_state=1).fit(train_X, train_y_rate)

# Spot-check: predictions next to the true values for the first test rows.
pred   = rate_model.predict(test_X.head())
actual = test_y_rate.head()

for predicted, observed in zip(pred, actual):
    print('Predicted: {}, Actual: {}'.format(predicted, observed))