In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as sps
import sklearn
import math

In [2]:
import re
import os
from datetime import datetime

In [3]:
import random

In [4]:
# First and last calendar year (both inclusive) whose monthly trip files are loaded.
starting_year, ending_year = 2018, 2020

In [5]:
# Path templates for the monthly trip CSVs. The two `{}` slots take the year
# and the month string (get_aggregate_df passes a zero-padded month).
fgb_ft = "../tripdata/{}{}-fordgobike-tripdata.csv"
baywheels_ft = "../tripdata/{}{}-baywheels-tripdata.csv"


def is_fgb(yr, mt):
    """Return True for any month of 2018 and for January-April 2019.

    get_aggregate_df uses the result to pick the fordgobike path template;
    every other (year, month) falls through to the baywheels template.
    """
    if yr == 2018:
        return True
    return yr == 2019 and mt <= 4

In [6]:
def datetime_transform(x):
    """Parse a "YYYY-MM-DD HH:MM:SS[.fraction]" string into a datetime.

    Anything after the first "." (the fractional seconds) is discarded before
    parsing. Values that are not exactly of type str are returned unchanged.
    """
    if type(x) != str:
        return x
    whole_second_text = x.split(".")[0]
    return datetime.strptime(whole_second_text, "%Y-%m-%d %H:%M:%S")

In [7]:
def baywheels_assimilation(baywheels_df):
    """Normalize a Bay Wheels trip frame to the Ford GoBike column layout.

    Renames started_at/ended_at and the start_/end_ lat/lng columns to
    start_time/end_time and *_station_latitude/*_station_longitude, parses the
    two timestamp columns with datetime_transform, and (re)computes
    "duration_sec" from them.

    Args:
        baywheels_df: frame as read from a baywheels CSV. The rename produces a
            new frame, and all later assignments are made on that new frame.

    Returns:
        DataFrame with the renamed columns, parsed timestamps, and a
        "duration_sec" column holding end_time - start_time in whole seconds
        (NaN where the difference is missing).
    """
    def elapsed_seconds(delta):
        # A missing difference (NaT) stays NaN instead of raising on int().
        if pd.isna(delta):
            return float("nan")
        # total_seconds() covers the full span. The `.seconds` attribute is
        # only the 0-86399 seconds component, so it silently drops whole days
        # (trips of 24 h or more) and wraps around for negative spans.
        return int(delta.total_seconds())

    # fix column names for baywheels data
    rn_df = baywheels_df.rename({
            "started_at": "start_time",
            "ended_at": "end_time",
            "start_lat": "start_station_latitude",
            "start_lng": "start_station_longitude",
            "end_lat":   "end_station_latitude",
            "end_lng":   "end_station_longitude"
        }, axis=1)
    rn_df["start_time"] = rn_df["start_time"].apply(datetime_transform)
    rn_df["end_time"] = rn_df["end_time"].apply(datetime_transform)
    rn_df["duration_sec"] = (rn_df["end_time"] - rn_df["start_time"]).apply(elapsed_seconds)
    return rn_df

def fgb_assimilation(fgb_df):
    """Parse the Ford GoBike timestamp columns with datetime_transform.

    "start_time" and "end_time" are converted and written back onto fgb_df
    itself, so the caller's frame is modified; that same frame is returned.
    """
    for time_col in ("start_time", "end_time"):
        fgb_df[time_col] = fgb_df[time_col].apply(datetime_transform)
    return fgb_df

def data_cleanse(rn_df):
    """Placeholder cleansing step: currently returns rn_df untouched.

    TODO: remove duration_sec outliers with the three-sigma rule (drop rows
    more than three standard deviations from the mean). An earlier sketch
    computed the mean and std of duration_sec via groupby but was never
    finished, so no filtering happens yet.
    """
    return rn_df

def get_aggregate_df():
    """Load every monthly trip CSV from starting_year through ending_year.

    Each month is read from the fordgobike or baywheels path template (chosen
    by is_fgb), normalized by the matching assimilation function, and tagged
    with "Year" and "Month" columns. The monthly frames are concatenated once
    with a fresh 0..n-1 index and the result is passed through data_cleanse.

    Returns:
        The cleansed, concatenated DataFrame (an empty DataFrame when the year
        range contains no years).

    Raises:
        Whatever pd.read_csv raises when a month's file is missing or
        unreadable; no file is skipped silently.
    """
    def load_month(yr, mt):
        # File names carry a zero-padded two-digit month.
        mt_str = "{:02d}".format(mt)
        if is_fgb(yr, mt):
            mt_df = fgb_assimilation(pd.read_csv(fgb_ft.format(yr, mt_str)))
        else:
            mt_df = baywheels_assimilation(pd.read_csv(baywheels_ft.format(yr, mt_str)))
        mt_df["Year"] = yr
        mt_df["Month"] = mt
        return mt_df

    # range(1, 13) covers January through December; an upper bound of 12
    # would stop at November and drop every December file.
    monthly_frames = [
        load_month(yr, mt)
        for yr in range(starting_year, ending_year + 1)
        for mt in range(1, 13)
    ]
    # Concatenate once: growing the frame inside the loop re-copies all
    # previously loaded rows on every iteration.
    if monthly_frames:
        full_df = pd.concat(monthly_frames, ignore_index=True)
    else:
        full_df = pd.DataFrame()
    return data_cleanse(full_df)

In [8]:
# Build the combined trip table. get_aggregate_df reads one CSV per month in
# the configured year range, so this cell may take a long time to finish.
agg_df = get_aggregate_df()

  mt_df = pd.read_csv(baywheels_ft.format(yr, mt_str))
  mt_df = pd.read_csv(baywheels_ft.format(yr, mt_str))
  mt_df = pd.read_csv(baywheels_ft.format(yr, mt_str))
  mt_df = pd.read_csv(baywheels_ft.format(yr, mt_str))
  mt_df = pd.read_csv(baywheels_ft.format(yr, mt_str))


KeyboardInterrupt: 

In [None]:
# Coarse bounding boxes keyed by region name.
# Each value is [longitude_range, latitude_range]; each range is [low, high].
city_rectangles = {
    "Nashville": [
        [-90, -80],
        [35, 37],
    ],
    "NorthCal": [
        [-125, -115],
        [37, 39],
    ],
}

In [None]:
# Bounding boxes for Bay Area cities, keyed by city name.
# NOTE(review): values look like [longitude_range, latitude_range] with each
# range given as [low, high] — confirm against the code that consumes them.
bay_rectangles = {
    "San Francisco": [
        [-122.55, -122.35],
        [37.67, 37.82],
    ],
    "Oakland": [
        [-122.35, -122.12],
        [37.7, 37.92],
    ],
    "San Jose": [
        [-122, -121.75],
        [37.2, 37.5],
    ],
}

In [None]:
# Keep trips whose start station lies inside the San Francisco box.
# Series.between is inclusive on both ends and evaluates the whole column at
# once instead of calling a Python lambda per row; missing coordinates
# compare False and are excluded.
# NOTE(review): the latitude bounds used here (37.7 to 37.8) differ from
# bay_rectangles["San Francisco"] (37.67 to 37.82) — confirm which is intended.
agg_sf = agg_df[
    agg_df["start_station_latitude"].between(37.7, 37.8)
    & agg_df["start_station_longitude"].between(-122.55, -122.35)
]

In [None]:
class SourceDest:
    """A route identified by the coordinates of its start and end stations."""

    def __init__(self, st_long, st_lat, end_long, end_lat):
        """Store the start (st_long, st_lat) and end (end_long, end_lat) coordinates."""
        self.st_long  = st_long
        self.st_lat   = st_lat
        self.end_long = end_long
        self.end_lat  = end_lat

    def df_filter(self, df):
        """Return the rows of df whose start and end coordinates equal this route's.

        Args:
            df: frame with start_station_longitude/latitude and
                end_station_longitude/latitude columns.

        Returns:
            The subset of df where all four coordinates match exactly
            (plain == comparison, no tolerance).
        """
        # The end-station columns must be compared with the END coordinates;
        # comparing them with the start coordinates would only ever match
        # round trips back to the start station.
        return df[(df["start_station_longitude"] == self.st_long) &
                  (df["start_station_latitude"] == self.st_lat) &
                  (df["end_station_longitude"] == self.end_long) &
                  (df["end_station_latitude"]  == self.end_lat)]
class SourceDestName:
    """A route identified by the names of its start and end stations."""

    def __init__(self, start_stn, end_stn):
        """Remember the start and end station names."""
        self.start_stn, self.end_stn = start_stn, end_stn

    def df_filter(self, df):
        """Return the rows of df whose start and end station names match this route."""
        starts_here = df["start_station_name"] == self.start_stn
        ends_there = df["end_station_name"] == self.end_stn
        return df[starts_here & ends_there]

In [None]:
# Number of trips (rows with a non-null duration_sec) for every
# (start station, end station) pair among the San Francisco trips.
route_counts = (
    agg_sf.groupby(["start_station_name", "end_station_name"])["duration_sec"]
    .count()
    .reset_index()
    .rename(columns={"duration_sec": "count"})
)
# One row per distinct route.
total_n_routes = len(route_counts)
# Busiest routes first, re-indexed from 0.
routes_by_popularity = (
    route_counts
    .sort_values("count", ascending=False)
    .reset_index(drop=True)
)
#most_popular_routes_2 = most_popular_routes_2[:1000]