In [1]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon

import json # library to handle JSON files

# import the data set
df = pd.read_csv('/Users/no50free/GitHub/NiceRide/201908-niceride-tripdata.csv')

In [2]:
df.head(3)

Unnamed: 0,tripduration,start_time,end_time,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,bike type
0,811,2019-08-01 00:00:44.8830,2019-08-01 00:14:16.2140,191.0,Park Ave & 4th Street S,44.975907,-93.260061,198.0,5th Street NE & 3rd Ave NE,44.991914,-93.255799,262,Subscriber,1991,2,Classic
1,99,2019-08-01 00:00:48.1090,2019-08-01 00:02:27.5170,43.0,Nicollet Mall & 10th Street,44.973839,-93.274544,8.0,YWCA Downtown,44.972217,-93.276435,477,Subscriber,1990,2,Classic
2,1205,2019-08-01 00:02:34.2950,2019-08-01 00:22:39.7020,69.0,N Washington Ave & 9th Ave N,44.989909,-93.279788,199.0,3rd Street NE & Lowry Ave NE,45.01304,-93.264573,24,Customer,1994,1,Classic


# Convert time

In [3]:
df['start_time'] = pd.to_datetime(df['start_time'])
df['end_time'] = pd.to_datetime(df['end_time'])

# Add start hour
df['start_hour'] = df['start_time'].dt.hour

# Add day of week
df['startdayofweek'] = df['start_time'].dt.dayofweek
df['enddayofweek'] = df['end_time'].dt.dayofweek

# Convert tripduration to minutes instead of seconds
df.tripduration = df.tripduration/60

In [10]:
df.sample(3)

Unnamed: 0,tripduration,start_time,end_time,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,bike type,startdayofweek,enddayofweek,long_trip,start_hour
60932,10.716667,2019-08-26 18:50:30.465,2019-08-26 19:01:14.412,635.0,Hennepin Ave & S 3rd Street,44.980957,-93.27046,69.0,N Washington Ave & 9th Ave N,44.989909,-93.279788,598,Subscriber,1948,1,Classic,0,0,False,18
53066,26.65,2019-08-23 08:21:11.776,2019-08-23 08:47:51.676,,,44.97,-93.28,,,44.97,-93.24,2997,Customer,1997,1,Dockless,4,4,False,8
61375,35.333333,2019-08-27 06:37:17.603,2019-08-27 07:12:38.172,199.0,3rd Street NE & Lowry Ave NE,45.01304,-93.264573,199.0,3rd Street NE & Lowry Ave NE,45.01304,-93.264573,3498,Subscriber,1969,1,Classic,1,1,True,6


In [41]:
# Long trips defined as longer than 30 minutes.
# 
df["long_trip"] = df.tripduration > 30
df.sample(3)

Unnamed: 0,tripduration,start_time,end_time,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,bike type,startdayofweek,enddayofweek,long_trip,start_hour
71190,4.883333,2019-08-31 14:16:33.826,2019-08-31 14:21:27.732,42.0,E Franklin Ave & Bloomington Ave S,44.962854,-93.252195,45.0,Franklin & Portland,44.962841,-93.267521,162,Subscriber,1992,1,Classic,5,5,False,14
53550,21.016667,2019-08-23 12:11:08.810,2019-08-23 12:32:10.091,177.0,Nicollet Mall & Grant Street,44.96995,-93.278133,803.0,Cedar Lake Point Beach,44.961106,-93.324735,771,Customer,1969,0,Classic,4,4,True,12
40688,7.85,2019-08-17 18:48:55.986,2019-08-17 18:56:47.040,74.0,West Broadway & Logan Ave N,45.00252,-93.305949,848.0,Willard Park,44.996866,-93.309435,3491,Subscriber,2000,1,Classic,5,5,False,18


# Problem statement
We want to see if we can predict if a customer will ride a bike longer than 30 minutes. After 30 minutes, additional charges apply. We created a `long_trip` feature for trips longer than 30 minutes (True).

## Feature selection.
We have a small issue that `Dockless` bikes do not have start/end stations. For the moment, I am going to restrict my attention to the `Classic` `bike type`.

The features I will use are `start_hour`, `start station latitude`, `start station longitude`, `end station latitude`, `end station longitude`, `usertype`, `birth year`, `gender`, `startdayofweek`. The response variable will be `long_trip`.

In [42]:
X = df[['start_hour', 'start station latitude', 'start station longitude', 'end station latitude',
        'end station longitude', 'usertype', 'birth year', 'gender', 'startdayofweek', 'bike type']]

X = X[X['bike type'] == 'Classic']

X.drop(columns='bike type', inplace=True)

X.sample(3)

Unnamed: 0,start_hour,start station latitude,start station longitude,end station latitude,end station longitude,usertype,birth year,gender,startdayofweek
42399,15,44.980536,-93.275626,44.96342,-93.277939,Subscriber,1984,1,6
57970,0,44.979951,-93.234787,44.98717,-93.25576,Customer,1993,1,6
69775,19,44.985388,-93.26145,44.985388,-93.26145,Customer,1969,0,4


In [43]:
y = df[df['bike type'] == 'Classic'].long_trip

In [44]:
print(X.shape)
print(y.shape)

(51200, 9)
(51200,)


## Split the data

In [45]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

## Build the model
Use a logistic regression model because its a classification problem

In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier

# Find categorical columns
categorical_cols = [cname for cname in X.columns if
                   X[cname].dtype == 'object']

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps = [
    ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
])

# Bundle preprocessing for pipeline
preprocessor = ColumnTransformer(transformers =[
#    ('num', numerical_transformer, numerical_cols), # numerical columns not missing data. May use when trying dockless
    ('cat', categorical_transformer, categorical_cols)
])

# Use Logistic Regression
#model = LogisticRegression(max_iter=5000, solver='lbfgs', C=0.05)

# Use KNN
model = KNeighborsClassifier(n_neighbors=20)

# Construct pipeline
my_pipe = Pipeline(steps=[
    ('preprocessor', preprocessor), 
    ('model', model)
])


In [56]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(my_pipe, X, y, cv=5, scoring="accuracy")

print("scores are", cv_scores)
print("average score is", cv_scores.mean())

scores are [0.50908203 0.70654297 0.68769531 0.6890625  0.70419922]
average score is 0.65931640625
