In [1]:
import sys

# Log the interpreter version at the top of the notebook so the environment
# that produced the outputs below is visible to the reader.
print('Python version: ', sys.version, sep=' ')

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium

## Position Data

In [3]:
# Geocoded turnstile locations (one row per remote unit / control area pair).
# Upstream source: https://github.com/chriswhong/nycturnstiles/blob/master/geocoded.csv
# NOTE(review): the local file is named geocoded_ca_unit.csv rather than
# geocoded.csv -- confirm how it was derived from the upstream file.
geo_df = pd.read_csv('../data/turnstile/geocoded_ca_unit.csv')
# Discard every row that has at least one missing field.
geo_df = geo_df.dropna()
geo_df

Unnamed: 0,remote unit,control area,station,lines,division,latitude,longitude
0,R470,X002,ELTINGVILLE PK,Z,SRT,40.544600,-74.164581
1,R544,PTH02,HARRISON,1,PTH,40.738879,-74.155533
2,R165,S102,TOMPKINSVILLE,1,SRT,40.636948,-74.074824
3,R070,S101,ST. GEORGE,1,SRT,40.643738,-74.073622
4,R070,S101A,ST. GEORGE,1,SRT,40.643738,-74.073622
...,...,...,...,...,...,...,...
787,R319,N601A,LEXINGTON AVE,F,IND,40.764763,-73.966291
788,R246,C015,PROSPECT AVE,R,BMT,40.665438,-73.992856
789,R001,R101,SOUTH FERRY,1RW,IRT,40.702068,-74.013664
790,R305,R107D,CORTLANDT ST,1,IRT,40.710454,-74.011324


In [4]:
# Build the base map at [40.738, -73.98] with the light 'cartodbpositron' tiles.
station_loc_map = folium.Map(location=[40.738, -73.98],
    zoom_start=11, tiles='cartodbpositron')

# Add one small circle marker per row of geo_df. Only the two coordinate
# columns are needed, so iterate over them directly instead of using
# DataFrame.iterrows(), which builds a full Series for every row.
for lat, lon in zip(geo_df['latitude'], geo_df['longitude']):
    folium.CircleMarker(location=[lat, lon],
        color='#0080bb', fill_color='#0080bb', radius=1).add_to(station_loc_map)

# Display map in Jupyter (the last expression of the cell is rendered).
station_loc_map

### Trip Data from turnstile_cleaning

In [5]:
# Daily entry/exit differences per control area and unit.
# Presumably written by the turnstile cleaning notebook -- confirm.
count_df = pd.read_csv(filepath_or_buffer='../data/turnstile/station_daily_diffs.csv')
count_df

Unnamed: 0,C/A,UNIT,STATION,LINENAME,DATE,ENTRIES_DIFF,EXITS_DIFF
0,A002,R051,59 ST,NQR456W,02/15/2020,6445.0,3259.0
1,A002,R051,59 ST,NQR456W,02/16/2020,6122.0,3888.0
2,A002,R051,59 ST,NQR456W,02/17/2020,9228.0,6073.0
3,A002,R051,59 ST,NQR456W,02/18/2020,11858.0,7907.0
4,A002,R051,59 ST,NQR456W,02/19/2020,12519.0,7999.0
...,...,...,...,...,...,...,...
46913,TRAM2,R469,RIT-ROOSEVELT,R,04/13/2020,238.0,65.0
46914,TRAM2,R469,RIT-ROOSEVELT,R,04/14/2020,433.0,61.0
46915,TRAM2,R469,RIT-ROOSEVELT,R,04/15/2020,395.0,71.0
46916,TRAM2,R469,RIT-ROOSEVELT,R,04/16/2020,408.0,60.0


In [6]:
# Reduce the daily counts to one row per distinct
# (C/A, UNIT, STATION, LINENAME) combination; the date and count columns
# are not needed to identify a station entrance.
station_df = count_df.loc[:, ['C/A', 'UNIT', 'STATION', 'LINENAME']].drop_duplicates()
station_df

Unnamed: 0,C/A,UNIT,STATION,LINENAME
0,A002,R051,59 ST,NQR456W
63,A006,R079,5 AV/59 ST,NQRW
126,A007,R079,5 AV/59 ST,NQRW
189,A010,R080,57 ST-7 AV,NQRW
252,A011,R080,57 ST-7 AV,NQRW
...,...,...,...,...
46605,S101,R070,ST. GEORGE,1
46668,S101A,R070,ST. GEORGE,1
46731,S102,R165,TOMPKINSVILLE,1
46792,TRAM1,R468,RIT-MANHATTAN,R


## Join

In [7]:
# Outer join of the stations seen in the counts with the geocoded table,
# matching (C/A, UNIT) against (control area, remote unit). Using an outer
# join keeps unmatched rows from both sides so they can be inspected below.
geostation_df = station_df.merge(
    geo_df,
    how='outer',
    left_on=['C/A', 'UNIT'],
    right_on=['control area', 'remote unit'],
    suffixes=['_l', '_r'],
)
geostation_df

Unnamed: 0,C/A,UNIT,STATION,LINENAME,remote unit,control area,station,lines,division,latitude,longitude
0,A002,R051,59 ST,NQR456W,R051,A002,LEXINGTON AVE,456NQR,BMT,40.762796,-73.967686
1,A006,R079,5 AV/59 ST,NQRW,R079,A006,5 AVE-59 ST,NQR,BMT,40.764909,-73.973372
2,A007,R079,5 AV/59 ST,NQRW,R079,A007,5 AVE-59 ST,NQR,BMT,40.764909,-73.973372
3,A010,R080,57 ST-7 AV,NQRW,R080,A010,57 ST-7 AVE,NQR,BMT,40.764755,-73.980646
4,A011,R080,57 ST-7 AV,NQRW,R080,A011,57 ST-7 AVE,NQR,BMT,40.764755,-73.980646
...,...,...,...,...,...,...,...,...,...,...,...
781,,,,,R328,R532G,METS-WILLETS PT,7,IRT,40.754622,-73.845625
782,,,,,R414,N182A,HOWARD BCH-JFK,A,IND,40.660476,-73.830301
783,,,,,R459,OB01,ORCHARD BEACH,6,IND,40.852417,-73.828082
784,,,,,R537,JFK04,JFK JAMAICA CT2,E,IND,40.643942,-73.782356


### Repairing some data

In [8]:
# Show every joined row that has a missing value in at least one column.
geostation_df.loc[geostation_df.isnull().any(axis='columns')]

Unnamed: 0,C/A,UNIT,STATION,LINENAME,remote unit,control area,station,lines,division,latitude,longitude
438,PTH01,R549,NEWARK HW BMEBE,1.0,,,,,,,
443,PTH06,R546,PAVONIA/NEWPORT,1.0,,,,,,,
444,PTH07,R550,CITY / BUS,1.0,,,,,,,
446,PTH10,R547,9TH STREET,1.0,,,,,,,
449,PTH13,R541,THIRTY ST,1.0,,,,,,,
450,PTH16,R550,LACKAWANNA,1.0,,,,,,,
452,PTH18,R549,NEWARK BM BW,1.0,,,,,,,
453,PTH19,R549,NEWARK C,1.0,,,,,,,
454,PTH20,R549,NEWARK HM HE,1.0,,,,,,,
456,PTH22,R540,PATH NEW WTC,1.0,,,,,,,


In [9]:
# Keep only the fully populated rows of the joined table, i.e. drop any row
# in which at least one column is missing.
geostation_nona_df = geostation_df.dropna(axis=0, how='any')
geostation_nona_df

Unnamed: 0,C/A,UNIT,STATION,LINENAME,remote unit,control area,station,lines,division,latitude,longitude
0,A002,R051,59 ST,NQR456W,R051,A002,LEXINGTON AVE,456NQR,BMT,40.762796,-73.967686
1,A006,R079,5 AV/59 ST,NQRW,R079,A006,5 AVE-59 ST,NQR,BMT,40.764909,-73.973372
2,A007,R079,5 AV/59 ST,NQRW,R079,A007,5 AVE-59 ST,NQR,BMT,40.764909,-73.973372
3,A010,R080,57 ST-7 AV,NQRW,R080,A010,57 ST-7 AVE,NQR,BMT,40.764755,-73.980646
4,A011,R080,57 ST-7 AV,NQRW,R080,A011,57 ST-7 AVE,NQR,BMT,40.764755,-73.980646
...,...,...,...,...,...,...,...,...,...,...,...
742,S101,R070,ST. GEORGE,1,R070,S101,ST. GEORGE,1,SRT,40.643738,-74.073622
743,S101A,R070,ST. GEORGE,1,R070,S101A,ST. GEORGE,1,SRT,40.643738,-74.073622
744,S102,R165,TOMPKINSVILLE,1,R165,S102,TOMPKINSVILLE,1,SRT,40.636948,-74.074824
745,TRAM1,R468,RIT-MANHATTAN,R,R468,TRAM1,RIT-MANHATTAN,R,RIT,40.761268,-73.964016


In [10]:
# Build map 
station_loc_map = folium.Map(location=[40.738, -73.98],
    zoom_start=11, tiles='cartodbpositron', width=640, height=480)

# Plot coordinates using comprehension list
for index, row in geostation_nona_df.iterrows():
    folium.CircleMarker(location=[row['latitude'], row['longitude']],
    color='#0080bb', fill_color='#0080bb', radius=1).add_to(station_loc_map) 

# Display map in Jupyter
station_loc_map

### Station position not found

In [11]:
# Rows whose 'control area' is null after the outer join, i.e. stations from
# the counts for which the join found no geocoded partner. Only the
# count-side identifying columns are kept.
geo_not_found = geostation_df.loc[
    geostation_df['control area'].isna(),
    ['C/A', 'UNIT', 'STATION', 'LINENAME'],
]
geo_not_found.sort_values(by='STATION')

Unnamed: 0,C/A,UNIT,STATION,LINENAME
699,R550,R072,34 ST-HUDSON YD,7
700,R551,R072,34 ST-HUDSON YD,7
446,PTH10,R547,9TH STREET,1
444,PTH07,R550,CITY / BUS,1
450,PTH16,R550,LACKAWANNA,1
452,PTH18,R549,NEWARK BM BW,1
453,PTH19,R549,NEWARK C,1
454,PTH20,R549,NEWARK HM HE,1
438,PTH01,R549,NEWARK HW BMEBE,1
456,PTH22,R540,PATH NEW WTC,1


In [12]:
# Station to investigate in the following cells.
# NOTE: the variable name `station_namae` is kept as-is because later cells
# reference it.
station_namae = '34 ST-HUDSON YD'
# Its entries among the stations without a geocoded position.
geo_not_found.loc[geo_not_found['STATION'].eq(station_namae)]

Unnamed: 0,C/A,UNIT,STATION,LINENAME
699,R550,R072,34 ST-HUDSON YD,7
700,R551,R072,34 ST-HUDSON YD,7


In [13]:
# Check whether the station appears among the fully matched rows.
geostation_nona_df.loc[geostation_nona_df['STATION'].eq(station_namae)]

Unnamed: 0,C/A,UNIT,STATION,LINENAME,remote unit,control area,station,lines,division,latitude,longitude


In [14]:
# The station's rows in the de-duplicated station table from the counts.
station_df.loc[station_df['STATION'].eq(station_namae)]

Unnamed: 0,C/A,UNIT,STATION,LINENAME
43897,R550,R072,34 ST-HUDSON YD,7
43960,R551,R072,34 ST-HUDSON YD,7


In [15]:
# Daily counts for the station in chronological order.
# DATE holds 'MM/DD/YYYY' strings, so sorting the raw column compares text
# and would put e.g. 01/05/2021 before 02/15/2020. Sort on a temporary
# parsed-date column instead and drop it again so the displayed columns are
# unchanged.
(
    count_df[count_df['STATION'] == station_namae]
    .assign(_date=lambda frame: pd.to_datetime(frame['DATE'], format='%m/%d/%Y'))
    .sort_values('_date')
    .drop(columns='_date')
)

Unnamed: 0,C/A,UNIT,STATION,LINENAME,DATE,ENTRIES_DIFF,EXITS_DIFF
43897,R550,R072,34 ST-HUDSON YD,7,02/15/2020,9927.0,10785.0
43960,R551,R072,34 ST-HUDSON YD,7,02/15/2020,1453.0,2262.0
43898,R550,R072,34 ST-HUDSON YD,7,02/16/2020,8895.0,9532.0
43961,R551,R072,34 ST-HUDSON YD,7,02/16/2020,1266.0,1976.0
43899,R550,R072,34 ST-HUDSON YD,7,02/17/2020,11131.0,12177.0
...,...,...,...,...,...,...,...
43957,R550,R072,34 ST-HUDSON YD,7,04/15/2020,642.0,716.0
44021,R551,R072,34 ST-HUDSON YD,7,04/16/2020,191.0,235.0
43958,R550,R072,34 ST-HUDSON YD,7,04/16/2020,654.0,746.0
43959,R550,R072,34 ST-HUDSON YD,7,04/17/2020,565.0,690.0
