## Predicting Bike Rental Station Traffic

#### TODO:
- load up data for one station (one year, 3.5 million trips)
- eda hourly traffic, curious: see if coming and going are roughly equal
- get an lstm working
- try many predict one first, then many predict many (maybe 12-24 hours)
- try out a gru/different architectures, cv
- see if more than one year helps
- include weather
- scale up, get one model for one station for many stations (all if possible)
- flask basic
- flask interactive
- repo writeup (license/requirements)

#### future:
- group stations (takes into account nearby traffic) like RNN that has more than one station's output

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras
from geopy.distance import geodesic

In [64]:
def basic_dist(row):
    # Gives a basic euclidean trip distance in m
    
    if row['round_trip'] == 1:
        return 0
    
    a = (row['start_lat'], row['start_lng'])
    
    b = (row['end_lat'], row['end_lng'])
    
    return geodesic(a, b).km * 1000

In [105]:
def station_data(stationid):
    '''Loads, preps, and filters data for machine learning'''
    
    # Gather one years worth of data
    filelist = []
    for month in [4,5,6,7,8,9,10,11,12]:
        filelist.append('data/2020{:02d}-divvy-tripdata.csv'.format(month))
    for month in [1,2,3]:
        filelist.append('data/2021{:02d}-divvy-tripdata.csv'.format(month))
    
    usecols = ['started_at', 'ended_at', 'start_station_id', 'end_station_id', 'member_casual', 'rideable_type',
               'start_lat', 'start_lng', 'end_lat', 'end_lng']

    for month in filelist:
        lil_df = pd.read_csv(month, usecols=usecols)
        
        # filter for specific station
        lil_df = lil_df[lil_df['end_station_id'] == stationid]
        
        frames.append(lil_df)

    df = pd.concat(frames, ignore_index=True)
    
    df = pd.read_csv('data/202004-divvy-tripdata.csv', usecols=usecols)
    
    # Null data warning
    if df.shape[0] - df.dropna().shape[0] > df.shape[0]*0.01:
        print('NULL WARNING: more than 1% of rows null')
        
    # in case of warning, change handling of missing data
    df = df.dropna()

    # might not be necessary, but sort of a df label
    df = df.reset_index(drop=True)
    df.index = df.index.rename(stationid)

    # prepping for machine learning
    df['ended_at'] = pd.to_datetime(df['ended_at'])
    df['started_at'] = pd.to_datetime(df['started_at'])

    df['date'] = pd.to_datetime(df['ended_at'].dt.date)
    df['hour'] = df['ended_at'].dt.hour

    # daylight savings makes a few negative trip times, a quick approximate fix is okay
    df['trip_time'] = abs((df['ended_at'] - df['started_at']).dt.total_seconds())

    df['round_trip'] = df.apply(lambda x: 1 if x['start_station_id'] == x['end_station_id'] else 0, axis=1)

    df['electric'] = df['rideable_type'].apply(lambda x: 1 if x == 'electric_bike' else 0)

    df['member'] = df['member_casual'].apply(lambda x: 1 if x == 'member' else 0)

    # takes time but is more accurate and in meters. Can convert to euclidean coord dist to save time
    df['trip_dist'] = df.apply(basic_dist, axis=1)
    
    df = df.drop(columns=['rideable_type', 'member_casual', 'started_at', 'ended_at', 'start_station_id',
                          'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng'])
    
    # Can only use aggregate statistics on the hour

    return df

In [106]:
df86 = station_data(86)

In [107]:
df86

Unnamed: 0_level_0,date,hour,trip_time,round_trip,electric,member,trip_dist
86,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2020-04-26,18,1609.0,0,0,1,3981.345842
1,2020-04-17,17,489.0,0,0,1,979.629317
2,2020-04-01,18,863.0,0,0,1,2984.661192
3,2020-04-07,13,732.0,0,0,1,2141.374633
4,2020-04-18,11,3175.0,0,0,0,9119.684872
...,...,...,...,...,...,...,...
84672,2020-04-16,16,775.0,1,0,1,0.000000
84673,2020-04-30,18,1149.0,0,0,0,1058.631622
84674,2020-04-24,21,6790.0,0,0,0,605.093042
84675,2020-04-02,18,2551.0,1,0,0,0.000000
