In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import altair as alt
import geopandas as gpd
from shapely.geometry import Point
import pickle
from datetime import datetime

import_new_data=True



## clean counters up

In [8]:
if import_new_data==True:
    import_df = pd.read_csv('data/Bicycle_Counts.csv')
    import_df['date'] = pd.to_datetime(import_df['date'])
    import_df = import_df.set_index('date')
    
    date_retrieved = datetime.now()
    with open('data/retrieval_date.pkl', 'wb') as f:
        pickle.dump(date_retrieved, f)


'''
status col:
"0 = raw
1 = excluded 
2 = deleted
4 = modified
8 = validated
16 = certified"'''

'\nstatus col:\n"0 = raw\n1 = excluded \n2 = deleted\n4 = modified\n8 = validated\n16 = certified"'

In [9]:
# copy after imported to save having to import again
data = import_df[['id','counts']].copy()

In [10]:
counters = pd.read_csv('data/Bicycle_Counters.csv')
counters = counters[['id', 'name', 'latitude', 'longitude']]

In [11]:
counters.sort_values(by='latitude')

Unnamed: 0,id,name,latitude,longitude
6,100055175,Manhattan Bridge Interference Calibration 2019...,0.0,0.0
22,100048744,Manhattan Bridge Interference Calibration 2018...,0.0,0.0
27,100010017,Staten Island Ferry,40.643387,-74.072075
28,300027723,Fountain Ave,40.655606,-73.862951
23,100009425,Prospect Park West,40.671288,-73.971382
13,100009429,Manhattan Bridge 2013 to 2018 Bike Counter,40.699768,-73.98582
19,100051865,Manhattan Bridge 2012 to 2019 Bike Counter,40.69981,-73.98589
17,100005020,Manhattan Bridge 2012 Test Bike Counter,40.69981,-73.98589
9,300020692,Test,40.707381,-73.998845
4,100010022,Brooklyn Bridge Bike Path,40.709274,-74.00099


In [12]:
# merge counter names
data = data.reset_index().merge(counters[['id', 'name']], on='id').set_index('date')

In [13]:
# drop counters that are tests
test_counters = [300020692, 100048744, 100005020, 100055175]
# Drop pedestrian counters
ped_counters = [300029648, 100009426]
# Drop other counters
# bk bridge comprehensive is all we need for the bridge
other_counters = [300020241, 100010022, 300028963, 100051865, 100009429, 100039064, 100057318, 100047029]


counters_to_remove = test_counters + ped_counters + other_counters

data = data[~data['id'].isin(counters_to_remove)]
counters = counters[~counters['id'].isin(counters_to_remove)]


#rename n.8th kent because we replace it with kent ave comprehensive below
counters.loc[counters['name']=='Kent Ave btw North 8th St and North 9th St', ['name', 'id',]] = ('Kent Ave Comprehensive', 999999999)
counters = counters[counters['name']!='Kent Ave btw South 6th St. and Broadway']

In [14]:
# get the first and last date of each counter
first_dates = data.reset_index().groupby('name')['date'].min()
last_dates = data.reset_index().groupby('name')['date'].max()

counter_dates = pd.DataFrame({'first': first_dates, 'last':last_dates}).sort_values(by='last')
counter_dates['runtime'] = (counter_dates['last'] - counter_dates['first']).dt.days
counter_dates.sort_values(by='runtime')

Unnamed: 0_level_0,first,last,runtime
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Willis Ave Bikes,2022-09-02 11:15:00,2023-08-29 23:45:00,361
Fountain Ave,2022-08-23 10:30:00,2023-08-29 23:45:00,371
111th St at 50th Ave,2022-05-09 15:00:00,2023-08-27 23:45:00,475
Kent Ave btw South 6th St. and Broadway,2014-08-14 00:00:00,2016-11-21 23:45:00,830
Columbus Ave at 86th St.,2019-10-24 00:00:00,2023-08-28 23:45:00,1404
Amsterdam Ave at 86th St.,2019-10-24 00:00:00,2023-08-29 23:45:00,1405
2nd Avenue - 26th St S,2015-05-22 09:15:00,2019-06-11 23:45:00,1481
8th Ave at 50th St.,2018-06-14 00:00:00,2023-08-25 23:45:00,1898
Pulaski Bridge,2017-06-24 00:00:00,2023-08-29 23:45:00,2257
1st Avenue - 26th St N - Interference testing,2013-09-11 22:00:00,2019-12-05 16:45:00,2275


In [15]:
# the two kent ave counters about 10 blocks from one another, let's combine them
data.loc[data['name']=='Kent Ave btw South 6th St. and Broadway', ['name', 'id']] = ('Kent Ave Comprehensive', 999999999)

data.loc[data['name']=='Kent Ave btw North 8th St and North 9th St', ['name', 'id']] = ('Kent Ave Comprehensive', 999999999)

### replace with pretty names

In [16]:
pretty_names = {
    '2nd Avenue - 26th St S': '26th St & 2nd Ave', 
    'Prospect Park West':'Prospect Pk W',
    'Williamsburg Bridge Bike Path': 'Williamsburg Br',
    'Ed Koch Queensboro Bridge Shared Path':'Queensboro Br', 
    'Staten Island Ferry':'Staten Isl Ferry St',
    'Pulaski Bridge':'Pulaski Br', 
    '1st Avenue - 26th St N - Interference testing': '1st Ave & 26th St',
    '8th Ave at 50th St.': '8th Ave & 50th St',
    'Amsterdam Ave at 86th St.':'Amsterdam Ave & 86th St', 
    'Manhattan Bridge Bike Comprehensive':'Manhattan Bridge',
    'Comprehensive Brooklyn Bridge Counter':'Brooklyn Br', 
    'Fountain Ave':'Fountain Ave',
    'Columbus Ave at 86th St.':'Columbus Ave & 86th St',
    'Kent Ave Comprehensive': 'Kent Ave', 
    '111th St at 50th Ave':'111th St & 50th Ave',
    'Willis Ave Bikes':'Willis Ave'
}

In [17]:
counters['name'] = counters['name'].map(pretty_names)
data['name'] = data['name'].map(pretty_names)

In [18]:
counters

Unnamed: 0,id,name,latitude,longitude
1,100010018,Pulaski Br,40.742563,-73.951492
3,999999999,Kent Ave,40.720959,-73.96093
8,300024007,111th St & 50th Ave,40.74563,-73.8525
10,100009424,26th St & 2nd Ave,40.73971,-73.97954
11,300020904,Brooklyn Br,40.711644,-74.004109
14,100057316,8th Ave & 50th St,40.762348,-73.98612
15,100009427,Williamsburg Br,40.71053,-73.96145
16,100057320,Columbus Ave & 86th St,40.7877,-73.97505
18,100057319,Amsterdam Ave & 86th St,40.7877,-73.97505
20,100010020,1st Ave & 26th St,40.73883,-73.977165


In [19]:
counters.set_index('id').to_pickle('data/counters.pkl')

# groupby hour

In [57]:
# get month of year given week number
def get_time(hour):
    hour_string = str(hour).zfill(2)
    time_string = hour_string + ':00' + ':00'
    return pd.to_datetime(time_string, format='%H:%M:%S', utc=True)

In [58]:
#groupby hour and pivot
unindexed_data = data[['name', 'id', 'counts']].reset_index()

hr = unindexed_data.groupby([unindexed_data['date'].dt.hour, unindexed_data['id']]).agg(
    {'name': 'max',
    'counts':'mean',
    }
)
hr['display_time'] = hr.index.get_level_values('date').to_series().apply(lambda x: get_time(x)).values

In [59]:
hr.to_pickle('data/by_hour.pkl')

## groupby week

In [60]:
# get month of year given week number
def get_month(week_number, year):
    first_day = datetime.fromisocalendar(year, week_number, 1)
    return first_day.strftime('%m-%d')

In [61]:
clean_unindexed_data = unindexed_data.copy()

criteria = (clean_unindexed_data['id']==100010020) & (clean_unindexed_data['date']<'2016-06-01')
clean_unindexed_data = clean_unindexed_data.loc[~criteria,:]

wk = clean_unindexed_data.groupby([clean_unindexed_data['date'].dt.week,'id']).agg({
    'counts':'mean',
    'name':'max'
})

wk = wk.loc[wk.index.get_level_values('date')!=53]
wk['display_date'] = wk.index.get_level_values('date').to_series().apply(lambda x: get_month(x,2021)).values

  wk = clean_unindexed_data.groupby([clean_unindexed_data['date'].dt.week,'id']).agg({


In [62]:
wk.to_pickle('data/by_week.pkl')

## make app data

In [63]:
# to save time on the color mapping, map colors ahead of time

# colors
cat_20 = ['#1f77b4',
    '#aec7e8',
    '#ff7f0e',
    '#ffbb78',
    '#2ca02c',
    '#98df8a',
    '#d62728',
    '#ff9896',
    '#9467bd',
    '#c5b0d5',
    '#8c564b',
    '#c49c94',
    '#e377c2',
    '#f7b6d2',
    '#7f7f7f',
    '#c7c7c7',
    '#bcbd22',
    '#dbdb8d',
    '#17becf',
    '#9edae5'
    ]

num_counters = len(counters)
counter_ids = counters['id'].unique()
color_indices = np.linspace(0, len(cat_20)-1, num_counters, dtype=int)
colors = [cat_20[x] for x in color_indices]
color_dict = dict(zip(counter_ids, colors))

In [64]:
counter_display = counters.copy()
counter_display['color'] = counter_display['id'].map(color_dict)
counter_display.set_index('id').to_pickle('data/streamlit_counters.pkl')

hr_display = hr.copy()
hr_display['color'] = hr_display.index.get_level_values('id').map(color_dict)
hr_display.to_pickle('data/streamlit_by_hr.pkl')

wk_display = wk.copy()
wk_display['color'] = wk_display.index.get_level_values('id').map(color_dict)
wk_display.to_pickle('data/streamlit_by_wk.pkl')

In [56]:
hr.index.get_level_values('date').to_series().apply(lambda x: get_time(x)).values

array(['1900-01-01T00:00:00.000000000', '1900-01-01T00:00:00.000000000',
       '1900-01-01T00:00:00.000000000', '1900-01-01T00:00:00.000000000',
       '1900-01-01T00:00:00.000000000', '1900-01-01T00:00:00.000000000',
       '1900-01-01T00:00:00.000000000', '1900-01-01T00:00:00.000000000',
       '1900-01-01T00:00:00.000000000', '1900-01-01T00:00:00.000000000',
       '1900-01-01T00:00:00.000000000', '1900-01-01T00:00:00.000000000',
       '1900-01-01T00:00:00.000000000', '1900-01-01T00:00:00.000000000',
       '1900-01-01T00:00:00.000000000', '1900-01-01T00:00:00.000000000',
       '1900-01-01T01:00:00.000000000', '1900-01-01T01:00:00.000000000',
       '1900-01-01T01:00:00.000000000', '1900-01-01T01:00:00.000000000',
       '1900-01-01T01:00:00.000000000', '1900-01-01T01:00:00.000000000',
       '1900-01-01T01:00:00.000000000', '1900-01-01T01:00:00.000000000',
       '1900-01-01T01:00:00.000000000', '1900-01-01T01:00:00.000000000',
       '1900-01-01T01:00:00.000000000', '1900-01-01