In [119]:
import plotly.express as px
from utz import *

### Load September 2021 data

In [2]:
# Read the normalized September 2021 rides file into `d` and display it.
# `read_parquet` is not imported explicitly; it arrives via the `from utz import *` star import.
d = read_parquet('ctbk/normalized/202109.parquet')
d

Unnamed: 0,Ride ID,Rideable Type,Start Time,Stop Time,Start Station Name,Start Station ID,End Station Name,End Station ID,Start Station Latitude,Start Station Longitude,End Station Latitude,End Station Longitude,Gender,User Type,Region
0,22C33F42C6A0E28E,classic_bike,2021-09-01 10:26:45,2021-09-01 10:43:23,Central Park West & W 72 St,7141.07,E 51 St & 1 Ave,6532.06,40.775794,-73.976206,40.754557,-73.965930,0,Subscriber,NYC
1,035F743147FCFCDE,classic_bike,2021-09-04 09:52:40,2021-09-04 10:09:19,William St & Pine St,5065.12,,,40.707179,-74.008873,40.720000,-74.010000,0,Subscriber,NYC
2,9C43CF6A07DACBC6,classic_bike,2021-09-06 17:07:40,2021-09-06 17:34:44,Fulton St & Broadway,5175.08,Jay St & Tech Pl,4710.06,40.711066,-74.009447,40.695065,-73.987167,0,Customer,NYC
3,253A1A5B20CC78EE,classic_bike,2021-09-28 16:53:43,2021-09-28 17:03:00,West Drive & Prospect Park West,3651.04,Ocean Pkwy & Church Ave,3125.09,40.661063,-73.979453,40.644719,-73.974515,0,Subscriber,NYC
4,5E8F164D6798CEFA,classic_bike,2021-09-19 09:37:47,2021-09-19 09:53:42,Lorimer St & Broadway,4965.01,Jay St & Tech Pl,4710.06,40.704118,-73.948186,40.695065,-73.987167,0,Subscriber,NYC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97285,DA808C4DE967D604,classic_bike,2021-09-23 19:08:20,2021-09-23 19:44:02,Harborside,JC104,Union St,JC051,40.719252,-74.034234,40.718211,-74.083639,0,Customer,JC
97286,667A3B00C664DBAF,classic_bike,2021-09-10 15:47:24,2021-09-10 16:36:17,8 St & Washington St,HB603,Union St,JC051,40.745984,-74.028199,40.718211,-74.083639,0,Customer,JC
97287,62E1888ABDE1464E,classic_bike,2021-09-30 13:28:29,2021-09-30 13:54:09,Harborside,JC104,Union St,JC051,40.719252,-74.034234,40.718211,-74.083639,0,Subscriber,JC
97288,475AA9DFCF196D50,classic_bike,2021-09-14 03:31:22,2021-09-14 04:18:35,Harborside,JC104,Union St,JC051,40.719252,-74.034234,40.718211,-74.083639,0,Customer,JC


### Parse Station Names, IDs, Lat/Lngs

In [163]:
# Pull the distinct (ID, name, latitude, longitude) rows recorded for start station
# '3125.09'. The column list is spelled out here rather than borrowed from the
# `columns` dict, which is only defined in a later cell (and ends up holding the
# End-station mapping), so this cell also works on a fresh kernel run top to bottom.
start_station_cols = [
    'Start Station ID',
    'Start Station Name',
    'Start Station Latitude',
    'Start Station Longitude',
]
# NOTE: unpacking into two names assumes exactly two distinct rows exist for this station.
r0, r1 = d.loc[d['Start Station ID'] == '3125.09', start_station_cols].drop_duplicates().values

In [164]:
# First of the two distinct rows found for start station '3125.09'.
r0

array(['3125.09', 'Ocean Pkwy & Church Ave', 40.644718957635746,
       -73.97451460361481], dtype=object)

In [165]:
# Second of the two distinct rows found for start station '3125.09'.
r1

array(['3125.09', 'Ocean Pkwy & Church Ave', 40.644719, -73.974515],
      dtype=object)

In [174]:
# Build one long table of (Station ID, Station Name, Latitude, Longitude) observations
# by stacking the start-station columns and the end-station columns of every ride.
columns = {
    'Start Station ID': 'Station ID',
    'Start Station Name': 'Station Name',
    'Start Station Latitude': 'Latitude',
    'Start Station Longitude': 'Longitude',
}
starts = d[list(columns)].rename(columns=columns)
starts['Start'] = True

columns = {
    'End Station ID': 'Station ID',
    'End Station Name': 'Station Name',
    'End Station Latitude': 'Latitude',
    'End Station Longitude': 'Longitude',
}
ends = d[list(columns)].rename(columns=columns)
# Rows taken from the End-station columns are flagged Start=False. They were previously
# flagged True, which made the 'Start' column of `station_ends` identical for every row.
ends['Start'] = False

station_ends = pd.concat([starts, ends])
# Round coordinates to 4 decimal places so the same location recorded at different
# precisions compares equal in the duplicate checks that follow.
station_ends['Latitude'] = station_ends['Latitude'].apply(lambda f: round(f, 4))
station_ends['Longitude'] = station_ends['Longitude'].apply(lambda f: round(f, 4))


def find_conflicting_stations(df):
    """Return the distinct rows of ``df`` unless there is exactly one.

    When de-duplication leaves a single row, an empty frame with the same
    columns is returned; otherwise all distinct rows are returned.
    """
    distinct = df.drop_duplicates()
    return distinct.iloc[:0] if len(distinct) == 1 else distinct

# For each Station ID, collect its distinct rows when there is more than one;
# find_conflicting_stations yields an empty frame for IDs with a single distinct row.
# 'Start' is dropped first so it does not take part in the duplicate comparison.
conflicting_entries = (
    station_ends
    .drop(columns=['Start'])
    .groupby('Station ID')
    .apply(find_conflicting_stations)
    .reset_index(drop=True)  # discard the group/row index produced by apply
)

def squash_station_group(df):
    """Collapse ``df`` to one row: the last of its distinct rows.

    Duplicates are removed first (keeping first occurrences), then the final
    remaining row is returned as a one-row frame (empty if ``df`` is empty).
    """
    return df.drop_duplicates().tail(1)

# Reduce every Station ID to a single row (squash_station_group keeps the last of
# the group's distinct rows), again ignoring the 'Start' flag.
station_entries = (
    station_ends
    .drop(columns=['Start'])
    .groupby('Station ID')
    .apply(squash_station_group)
    .reset_index(drop=True)  # discard the group/row index produced by apply
)

# Station lookup table keyed by Station ID; later cells merge against its index.
stations = station_entries.set_index('Station ID')
stations

Unnamed: 0_level_0,Station Name,Latitude,Longitude
Station ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2733.03,67 St & Erik Pl,40.6334,-74.0166
2782.02,5 Ave & 66 St,40.6357,-74.0200
2832.03,4 Ave & Shore Road Dr,40.6370,-74.0221
2872.02,63 St & 5 Ave,40.6377,-74.0178
2883.03,3 Ave & Wakeman Pl,40.6382,-74.0247
...,...,...,...
JC108,Bergen Ave & Stegman St,40.7066,-74.0867
JCSYS,JCBS Depot,40.7097,-74.0686
SYS014,NYCBS DEPOT - DELANCEY,40.7164,-73.9823
SYS033,Pier 40 X2,40.7285,-74.0117


### Create unified "dockings" with melted start/end

In [73]:
# Timestamps from which the calendar fields below are derived.
start_times = d['Start Time']
end_times = d['Stop Time']

# One row per ride start: station ID plus day-of-month, hour and weekday of the start time.
starts = (
    d[['Start Station ID']]
    .rename(columns={'Start Station ID': 'Station ID'})
    .assign(
        Day=start_times.dt.day,
        Hour=start_times.dt.hour,
        Weekday=start_times.dt.weekday,
        Start=True,
    )
)

# Same layout for ride ends, derived from the stop time and flagged Start=False.
ends = (
    d[['End Station ID']]
    .rename(columns={'End Station ID': 'Station ID'})
    .assign(
        Day=end_times.dt.day,
        Hour=end_times.dt.hour,
        Weekday=end_times.dt.weekday,
        Start=False,
    )
)

# Stack both halves so each ride contributes two rows (its start and its end).
dockings = pd.concat([starts, ends])
dockings

Unnamed: 0,Station ID,Day,Hour,Weekday,Start
0,7141.07,1,10,2,True
1,5065.12,4,9,5,True
2,5175.08,6,17,0,True
3,3651.04,28,16,1,True
4,4965.01,19,9,6,True
...,...,...,...,...,...
97285,JC051,23,19,3,False
97286,JC051,10,16,4,False
97287,JC051,30,13,3,False
97288,JC051,14,4,1,False


#### JC overall start/end counts

In [149]:
# Keep dockings whose Station ID begins with 'JC' or 'HB' (str.match anchors at the
# start of the string; na=False treats a missing Station ID as a non-match),
# then tally how many of those rows are starts vs. ends.
jcs = dockings[dockings['Station ID'].str.match('(?:JC|HB)', na=False)]
jcs.Start.value_counts()

True     97290
False    96702
Name: Start, dtype: int64

In [179]:
# Display the station lookup table (indexed by Station ID) that the next cell merges against.
stations

Unnamed: 0_level_0,Station Name,Latitude,Longitude
Station ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2733.03,67 St & Erik Pl,40.6334,-74.0166
2782.02,5 Ave & 66 St,40.6357,-74.0200
2832.03,4 Ave & Shore Road Dr,40.6370,-74.0221
2872.02,63 St & 5 Ave,40.6377,-74.0178
2883.03,3 Ave & Wakeman Pl,40.6382,-74.0247
...,...,...,...
JC108,Bergen Ave & Stegman St,40.7066,-74.0867
JCSYS,JCBS Depot,40.7097,-74.0686
SYS014,NYCBS DEPOT - DELANCEY,40.7164,-73.9823
SYS033,Pier 40 X2,40.7285,-74.0117


In [181]:
# Number of ride starts per JC/HB station, as a Series named 'Count'.
jc_starts = jcs.loc[jcs['Start']].groupby('Station ID').size().rename('Count')
# Attach each station's name and coordinates by joining on the Station ID index.
jc_llcs = pd.merge(jc_starts.to_frame(), stations, left_index=True, right_index=True)
jc_llcs

Unnamed: 0_level_0,Count,Station Name,Latitude,Longitude
Station ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HB101,3728,Hoboken Terminal - Hudson St & Hudson Pl,40.7359,-74.0303
HB102,4415,Hoboken Terminal - River St & Hudson Pl,40.7361,-74.0291
HB103,4607,South Waterfront Walkway - Sinatra Dr & 1 St,40.737,-74.0278
HB201,1746,12 St & Sinatra Dr N,40.7506,-74.024
HB202,2222,14 St Ferry - 14 St & Shipyard Ln,40.753,-74.0244
HB203,1193,Bloomfield St & 15 St,40.7545,-74.0266
HB301,961,4 St & Grand St,40.7423,-74.0351
HB302,990,6 St & Grand St,40.7444,-74.0345
HB303,684,Clinton St & 7 St,40.7454,-74.0333
HB304,1190,7 St & Monroe St,40.7464,-74.038


#### Build date range (incl. weekday vs. weekends)

In [92]:
# Every calendar day of September 2021, split into weekday < 5 and weekday >= 5 subsets.
dates = pd.date_range(start='2021-09-01', end='2021-09-30')
week_dts = dates[dates.weekday <= 4]
wknd_dts = dates[dates.weekday > 4]
(len(week_dts), len(wknd_dts))

(22, 8)

### Station Counts helper

In [112]:
from typing import Collection

def station_counts(*keys, jc=None, weekdays=None, daily_avg=None):
    """Count dockings per station, optionally broken down by extra ``dockings`` columns.

    Reads the notebook-level frames ``dockings``, ``stations``, ``dates``,
    ``week_dts`` and ``wknd_dts``.

    Parameters
    ----------
    *keys : str
        Additional ``dockings`` columns to group by besides 'Station ID'
        (e.g. 'Day', 'Hour', 'Weekday').
    jc : bool or None
        True keeps only stations whose ID starts with 'JC' or 'HB'; False keeps only
        the others; None (default) applies no station filter.
    weekdays : bool, collection of int, or None
        True keeps rows with Weekday < 5, False keeps Weekday >= 5, a collection keeps
        exactly those weekday numbers; None (default) keeps everything. The set of
        dates used for the daily average is narrowed the same way.
    daily_avg : bool or None
        Whether to add an 'Avg' column (Count divided by the number of selected dates).
        None (default) enables it unless 'Day' or 'Weekday' is one of ``keys``.

    Returns
    -------
    DataFrame indexed by 'Station ID', sorted by ascending 'Count', with columns
    'Station Name', the requested keys, 'Count' and (when enabled) 'Avg'.

    Raises
    ------
    ValueError
        If ``weekdays`` has an unsupported type, or ``daily_avg`` is explicitly
        requested together with a 'Day'/'Weekday' grouping.
    """
    keys = list(keys)
    df = dockings
    dts = dates
    if weekdays is not None:
        if weekdays is True:
            df = df[df.Weekday < 5]
            dts = week_dts
        elif weekdays is False:
            df = df[df.Weekday >= 5]
            dts = wknd_dts
        elif isinstance(weekdays, Collection):
            weekdays = set(weekdays)
            df = df[df.Weekday.isin(weekdays)]
            # `dts.weekday` is an Index, which has no `.apply`; `.isin` works on both
            # Series and Index.
            dts = dts[dts.weekday.isin(weekdays)]
        else:
            raise ValueError(f'Unrecognized weekdays: {weekdays}')

    counts = df.groupby(['Station ID'] + keys).size().rename('Count')
    counts = counts.reset_index().merge(stations, left_on='Station ID', right_index=True).sort_values('Count')
    cols = ['Station Name'] + keys + ['Count']

    grouped_by_day = 'Day' in keys or 'Weekday' in keys
    if daily_avg is None:
        # Default: average per day only when the counts are not already split by day.
        daily_avg = not grouped_by_day
    # A plain `if` (not `elif`) so that the default chosen above actually takes effect.
    if daily_avg:
        if grouped_by_day:
            raise ValueError("Can only compute daily averages if daily grouping is not applied")
        counts['Avg'] = counts['Count'] / len(dts)
        cols += ['Avg']

    counts = counts.set_index('Station ID')[cols]

    if jc is not None:
        # The index may repeat station IDs (one row per key combination), so build the
        # mask from the index itself to keep it aligned row-for-row.
        is_jc = counts.index.to_series().str.match('(?:JC|HB)', na=False)
        if jc is True:
            counts = counts[is_jc]
        elif jc is False:
            counts = counts[~is_jc]

    return counts

#### JC counts, by {day,station}

In [113]:
# Docking counts per (station, day-of-month), restricted to JC/HB stations.
jcd = station_counts('Day', jc=True)
jcd

Unnamed: 0_level_0,Station Name,Day,Count
Station ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HB404,Mama Johnson Field - 4 St & Jackson St,2,1
HB303,Clinton St & 7 St,30,1
JC099,Montgomery St,19,1
JC018,5 Corners Library,6,1
JC107,Grant Ave & MLK Dr,30,1
...,...,...,...
HB103,South Waterfront Walkway - Sinatra Dr & 1 St,19,452
HB103,South Waterfront Walkway - Sinatra Dr & 1 St,12,458
HB103,South Waterfront Walkway - Sinatra Dr & 1 St,11,461
HB103,South Waterfront Walkway - Sinatra Dr & 1 St,6,491


#### Biggest {station,day}s in JC

In [72]:
# jcd is sorted by ascending Count, so the last 50 rows of the 'JC'-prefixed
# stations are their 50 largest (station, day) counts.
jcd.loc[jcd.index.to_series().str.startswith('JC')].tail(50)

Unnamed: 0_level_0,Station Name,Day,Count
Station ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
JC005,Grove St PATH,12,216
JC009,Hamilton Park,27,216
JC008,Newport Pkwy,15,219
JC008,Newport Pkwy,6,220
JC052,Liberty Light Rail,19,221
JC008,Newport Pkwy,8,223
JC008,Newport Pkwy,19,224
JC105,Hoboken Ave at Monmouth St,12,225
JC013,Marin Light Rail,18,225
JC105,Hoboken Ave at Monmouth St,18,228


### Monthly sum + Daily avg (all JC+HB)

In [96]:
# Whole-month docking counts per JC/HB station, with no weekday filter.
sc_all = station_counts(jc=True)
sc_all

Unnamed: 0_level_0,Station Name,Count,Avg
Station ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
JC107,Grant Ave & MLK Dr,163,5.433333
JC063,Jackson Square,264,8.8
JC051,Union St,413,13.766667
JC108,Bergen Ave & Stegman St,421,14.033333
JC018,5 Corners Library,461,15.366667
JC065,Dey St,485,16.166667
JC080,Leonard Gordon Park,590,19.666667
JC094,Glenwood Ave,765,25.5
JC084,Communipaw & Berry Lane,802,26.733333
JC034,Christ Hospital,893,29.766667


### Monthly sum + Daily avg, weekdays only (all JC+HB)

In [97]:
# Same per-station counts, restricted to dockings with Weekday < 5.
sc_wkdy = station_counts(jc=True, weekdays=True)
sc_wkdy

Unnamed: 0_level_0,Station Name,Count,Avg
Station ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
JC107,Grant Ave & MLK Dr,107,4.863636
JC063,Jackson Square,213,9.681818
JC051,Union St,284,12.909091
JC018,5 Corners Library,315,14.318182
JC108,Bergen Ave & Stegman St,329,14.954545
JC065,Dey St,370,16.818182
JC080,Leonard Gordon Park,429,19.5
JC084,Communipaw & Berry Lane,542,24.636364
JC094,Glenwood Ave,625,28.409091
JC034,Christ Hospital,706,32.090909


### Monthly sum + Daily avg, weekends only (all JC+HB)

In [98]:
# Same per-station counts, restricted to dockings with Weekday >= 5.
sc_wknd = station_counts(jc=True, weekdays=False)
sc_wknd

Unnamed: 0_level_0,Station Name,Count,Avg
Station ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
JC063,Jackson Square,51,6.375
JC107,Grant Ave & MLK Dr,56,7.0
JC108,Bergen Ave & Stegman St,92,11.5
JC065,Dey St,115,14.375
JC051,Union St,129,16.125
JC094,Glenwood Ave,140,17.5
JC018,5 Corners Library,146,18.25
JC080,Leonard Gordon Park,161,20.125
JC034,Christ Hospital,187,23.375
JC022,Oakland Ave,196,24.5


### Busiest 5 stations

In [108]:
# station_counts returns rows sorted by ascending Count, so the last five rows of
# sc_all are the five busiest stations; keep their names (indexed by Station ID).
top_stations = sc_all.iloc[-5:]['Station Name']
top_stations
# Disabled alternative: union of the top five from the overall, weekday and weekend tables.
# top_stations = pd.concat([sc_all.iloc[-5:], sc_wkdy.iloc[-5:], sc_wknd.iloc[-5:]])
# top_stations.index.drop_duplicates()

Station ID
JC008                                    Newport Pkwy
HB101        Hoboken Terminal - Hudson St & Hudson Pl
JC005                                   Grove St PATH
HB102         Hoboken Terminal - River St & Hudson Pl
HB103    South Waterfront Walkway - Sinatra Dr & 1 St
Name: Station Name, dtype: object

### Daily dockings (start+end), top 5 JC+HB stations

In [139]:
# Daily docking counts for the five busiest stations, with a real date and a weekday flag.
ts = jcd[jcd.index.isin(top_stations.index)].copy()
# Build the September 2021 date from the day-of-month in one vectorised parse.
# The former `.astype('datetime64')` (no unit) is rejected by pandas >= 2.0, and the
# per-row `to_dt(...).date()` round trip was unnecessary.
ts['Date'] = pd.to_datetime(ts['Day'].map(lambda day: '2021-09-%02d' % day))
ts = ts.sort_values('Date')
# True where Date.weekday < 5 (pandas numbers Monday as 0), False otherwise.
ts['Weekday'] = ts['Date'].dt.weekday < 5
ts

Unnamed: 0_level_0,Station Name,Day,Count,Date,Weekday
Station ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
JC008,Newport Pkwy,1,67,2021-09-01,True
HB103,South Waterfront Walkway - Sinatra Dr & 1 St,1,76,2021-09-01,True
JC005,Grove St PATH,1,94,2021-09-01,True
HB102,Hoboken Terminal - River St & Hudson Pl,1,103,2021-09-01,True
HB101,Hoboken Terminal - Hudson St & Hudson Pl,1,142,2021-09-01,True
...,...,...,...,...,...
HB102,Hoboken Terminal - River St & Hudson Pl,30,316,2021-09-30,True
HB101,Hoboken Terminal - Hudson St & Hudson Pl,30,249,2021-09-30,True
HB103,South Waterfront Walkway - Sinatra Dr & 1 St,30,227,2021-09-30,True
JC008,Newport Pkwy,30,165,2021-09-30,True


In [128]:
# Line chart of daily counts, one coloured trace per station name.
fig = px.line(
    ts,
    x='Date',
    y='Count',
    color='Station Name',
    labels={'Count': 'Daily ride starts+stops'},
)
# Anchor the legend at the top-left inside the plot, with a transparent background.
fig.update_layout(
    legend={
        'yanchor': "top",
        'y': 0.99,
        'xanchor': "left",
        'x': 0.01,
        'bgcolor': 'rgba(0,0,0,0)',
    },
)
fig

### Daily dockings (start+end), Grove St PATH

In [196]:
# Bar chart of daily counts for the 'Grove St PATH' rows of ts, coloured by the Weekday flag.
title = 'Grove St PATH Citibike Station, daily starts+ends'
fig = px.bar(
    ts.loc[ts['Station Name'] == 'Grove St PATH'],
    x='Date',
    y='Count',
    color='Weekday',
    labels=dict(Count='Daily ride starts+stops'),
)
# Centre the title horizontally.
fig.update_layout(title=dict(text=title, x=0.5, xanchor='center', yanchor='top'))
fig

In [182]:
# Scratch: Plotly's bundled gapminder sample data, filtered to year 2007.
# NOTE(review): binds the generic name `df`; the scatter_geo example in the last cell reads it.
df = px.data.gapminder().query("year==2007")
df

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap,iso_alpha,iso_num
11,Afghanistan,Asia,2007,43.828,31889923,974.580338,AFG,4
23,Albania,Europe,2007,76.423,3600523,5937.029526,ALB,8
35,Algeria,Africa,2007,72.301,33333216,6223.367465,DZA,12
47,Angola,Africa,2007,42.731,12420476,4797.231267,AGO,24
59,Argentina,Americas,2007,75.320,40301927,12779.379640,ARG,32
...,...,...,...,...,...,...,...,...
1655,Vietnam,Asia,2007,74.249,85262356,2441.576404,VNM,704
1667,West Bank and Gaza,Asia,2007,73.422,4018332,3025.349798,PSE,275
1679,"Yemen, Rep.",Asia,2007,62.698,22211743,2280.769906,YEM,887
1691,Zambia,Africa,2007,42.384,11746035,1271.211593,ZMB,894


In [183]:
import plotly.graph_objects as go

In [187]:
# Bubble map of JC/HB stations: one marker per station, sized by its ride-start
# Count (sizemode='area'), with the station name as hover text.
fig = go.Figure()
fig.add_trace(go.Scattergeo(
    lon=jc_llcs['Longitude'],
    lat=jc_llcs['Latitude'],
    text=jc_llcs['Station Name'],
    marker={
        'size': jc_llcs['Count'],
        'color': 'royalblue',
        'sizemode': 'area',
    },
))
# Zoom the map to the plotted points and strip the margins.
fig.update_geos(fitbounds="locations")
fig.update_layout(height=300, margin=dict(r=0, t=0, l=0, b=0))
fig

In [188]:
import pandas as pd
import plotly.express as px

# Scratch example: Plotly's sample "top 1k US cities" CSV on an OpenStreetMap base layer.
us_cities = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/us-cities-top-1k.csv")

fig = px.scatter_mapbox(
    us_cities,
    lat="lat",
    lon="lon",
    hover_name="City",
    hover_data=["State", "Population"],
    color_discrete_sequence=["fuchsia"],
    zoom=3,
    height=300,
)
# Map style and zero margins applied in a single layout update.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show()

In [191]:
# Disabled copies of the imports / sample-data load from the previous cell:
#import pandas as pd
#us_cities = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/us-cities-top-1k.csv")

#import plotly.express as px

# List the names defined in the plotly.express namespace.
dir(px)

['Constant',
 'IdentityMap',
 'NO_COLOR',
 'Range',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_chart_types',
 '_core',
 '_doc',
 '_imshow',
 '_special_inputs',
 'absolute_import',
 'area',
 'bar',
 'bar_polar',
 'box',
 'choropleth',
 'choropleth_mapbox',
 'colors',
 'data',
 'defaults',
 'density_contour',
 'density_heatmap',
 'density_mapbox',
 'ecdf',
 'funnel',
 'funnel_area',
 'get_trendline_results',
 'histogram',
 'icicle',
 'imshow',
 'imshow_utils',
 'line',
 'line_3d',
 'line_geo',
 'line_mapbox',
 'line_polar',
 'line_ternary',
 'optional_imports',
 'parallel_categories',
 'parallel_coordinates',
 'pd',
 'pie',
 'scatter',
 'scatter_3d',
 'scatter_geo',
 'scatter_mapbox',
 'scatter_matrix',
 'scatter_polar',
 'scatter_ternary',
 'set_mapbox_access_token',
 'strip',
 'sunburst',
 'timeline',
 'treemap',
 'trendline_functions',
 'violin']

In [None]:
# Unfinished scratch call removed: a bare `px.scatter_mapbox(` with no closing
# parenthesis is a SyntaxError. The completed call is in the following cell.

In [192]:
# JC/HB stations on an OpenStreetMap base layer; marker size follows each station's Count.
fig = px.scatter_mapbox(
    jc_llcs,
    lat="Latitude",
    lon="Longitude",
    hover_name="Station Name",
    hover_data=["Station Name", "Count"],
    size='Count',
)
# Map style and zero margins applied in a single layout update.
fig.update_layout(
    mapbox_style="open-street-map",
    margin=dict(r=0, t=0, l=0, b=0),
)
fig.show()

In [None]:
# Scratch example: world bubble map of the gapminder frame `df`, coloured by
# continent and sized by population.
px.scatter_geo(
    df,
    locations="iso_alpha",
    color="continent",
    hover_name="country",
    size="pop",
    projection="natural earth",
)