In [73]:
import pandas as pd

In [74]:
# Load the raw events and derive separate 'date' and 'time' columns from
# the 'date_and_time' column.
edf = (
    pd.read_csv('../datasets/events/events_raw.csv')
    # Parse the 'date_and_time' strings into pandas datetimes.
    .assign(date_and_time=lambda df: pd.to_datetime(df['date_and_time']))
    .assign(
        # Calendar date rendered as text, with '/' instead of '-' as separator.
        date=lambda df: df['date_and_time'].dt.date.astype(str).str.replace('-', '/'),
        # Time-of-day component of each timestamp.
        time=lambda df: df['date_and_time'].dt.time,
    )
)


In [75]:
# Rewrite the borough names used in the events data as short borough codes.
event_borough_codes = {
    'Manhattan': 'NY',
    'Bronx': 'BX',
    'Staten Island': 'R',
    'Brooklyn': 'K',
    'Queens': 'Q',
}
for borough_name, borough_code in event_borough_codes.items():
    edf.loc[edf['borough'] == borough_name, 'borough'] = borough_code

# Character ranges of the three fixed-width fields read from the file.
colspecs = [(2, 34), (36, 37), (37, 42)]

# Read the fixed-width file, keeping every field as text.
sc = pd.read_fwf(
    '../snd24Bcow.txt',
    colspecs=colspecs,
    header=None,
    names=["address_name", "borough", "street_code"],
    dtype='string',
)

# Mapping based on the documentation here:
# https://data.cityofnewyork.us/City-Government/Street-Name-Dictionary/w4v2-rv6b/about_data
# The borough digits are rewritten to the same codes used for the events above.
sc_borough_codes = {
    "1": 'NY',
    "2": 'BX',
    "3": 'K',
    "4": 'Q',
    "5": 'R',
}
for borough_digit, borough_code in sc_borough_codes.items():
    sc.loc[sc['borough'] == borough_digit, 'borough'] = borough_code

In [76]:
def apply_location_rules(locations: pd.Series, rules: list) -> pd.Series:
    """Apply ordered text-replacement rules to a Series of location names.

    Parameters
    ----------
    locations : pd.Series
        Location names to rewrite.
    rules : list of (pattern, replacement, is_regex) tuples
        Applied top to bottom. `is_regex` is passed straight to
        `Series.str.replace(regex=...)`, so every rule states explicitly
        whether its pattern is a regular expression or a literal string
        instead of relying on the pandas default, which differs between
        pandas versions.

    Returns
    -------
    pd.Series
        A new Series with every rule applied; the input is not modified.
    """
    for pattern, replacement, is_regex in rules:
        locations = locations.str.replace(pattern, replacement, regex=is_regex)
    return locations


# Rewrite rules for event locations. Order matters: later rules see the
# output of earlier ones. Regex patterns are raw strings so that '\d' and
# '\s' are not treated as (invalid) Python string escapes.
location_rules = [
    # Strip ordinal suffixes: '5th' -> '5', '23rd' -> '23'.
    (r'(\d+)(st|nd|rd|th)', r'\1', True),
    # Expand common street-type and direction abbreviations.
    (' ave. ', ' avenue ', False),
    (' ave ', ' avenue ', False),
    (r' ave$', ' avenue', True),
    (' st ', ' street ', False),
    (r' st$', ' street', True),
    # Literal 'st.' only; as a regex the '.' would also match the 'r' in 'street'.
    ('st.', 'st', False),
    (r'^w ', 'west ', True),
    (r' w$', ' west', True),
    (r'^e ', 'east ', True),
    ('blvd', 'boulevard', False),
    (r' rd$', ' road', True),
    (' rd ', ' road ', False),
    ('saint', 'st', False),
    (r' pl$', ' place', True),
    ('pkwy', 'parkway', False),
    (r' dr$', ' drive', True),
    (r'^s ', 'south ', True),
    (r' cir$', ' circle', True),
    (r' ter$', ' terrace', True),
    (r' expy$', ' expressway', True),
    (r' hwy$', ' highway', True),
    # One-off rewrites of specific location names (all literal matches).
    ('alfred e.', 'alfred e', False),
    ('flushing meadows corona park pool & rink', 'flushing meadows corona park', False),
    ('st james recreation center', 'st james park recreation center', False),
    ('thomas jefferson recreation center', 'thomas jefferson rec center', False),
    ('j. hood wright recreation center', 'hood wright recreation center', False),
    ('jackie robinson recreation center', 'jackie robinson rec center', False),
]

# Lower-case both text columns and collapse runs of whitespace to a single space.
edf['location'] = edf['location'].str.lower().str.replace(r'\s+', ' ', regex=True)
sc['address_name'] = sc['address_name'].str.lower().str.replace(r'\s+', ' ', regex=True)

# Apply the abbreviation and special-case rewrites to the event locations only.
edf['location'] = apply_location_rules(edf['location'], location_rules)
len(edf)

7373

In [80]:
# Inner-join the street-code table to the events: a row is kept only when
# the borough matches and sc['address_name'] equals edf['location'].
merged = sc.merge(
    edf,
    how='inner',
    left_on=['borough', 'address_name'],
    right_on=['borough', 'location'],
)

# Only the last expression of a cell is displayed, so a bare `len(merged)`
# here would be evaluated and discarded; print the row count explicitly.
print(f"{len(merged)} rows after joining {len(edf)} events to street codes")
merged.head()

Unnamed: 0,address_name,borough,street_code,unit,group_name_partner,date_and_time,locationtype,location,event_name,event_type,category,classification,attendance,audience,date,time
0,abraham lincoln playground,NY,12155,,,2022-07-01 20:30:00,,abraham lincoln playground,Movies Under the Stars: Respect,,Mobile Unit,M.U.T.S,105.0,,2022/07/01,20:30:00
1,abraham lincoln playground,NY,12155,,,2023-05-25 19:30:00,,abraham lincoln playground,Movies Under the Stars: Fruitvale Station,,Mobile Unit,M.U.T.S,50.0,,2023/05/25,19:30:00
2,alexander hamilton playground,NY,11705,,,2022-10-01 20:00:00,,alexander hamilton playground,Movies Under the Stars: Minions: The Rise of Gru,,Mobile Unit,M.U.T.S,,,2022/10/01,20:00:00
3,alfred e smith playground,NY,12160,Recreation,,2018-07-11 11:00:00,Playground,alfred e smith playground,Puppet Mobile,Community Based Event,Performance,Movies,95.0,General Public,2018/07/11,11:00:00
4,alfred e smith playground,NY,12160,Recreation,,2019-08-23 10:00:00,Playground,alfred e smith playground,Al Smith Summer Camp,Agency Produced Event,Mobile Unit,Play,19.0,Children,2019/08/23,10:00:00


In [78]:
# Find the events in edf whose (borough, location) pair is not in merged.
# The join that produced `merged` is keyed on borough AND location, so the
# comparison uses the same pair: testing `location` alone would hide an
# event whose location name matched only in a different borough.
matched_keys = set(zip(merged['borough'], merged['location']))
is_unmatched = pd.Series(
    [key not in matched_keys for key in zip(edf['borough'], edf['location'])],
    index=edf.index,
    dtype=bool,
)
not_in_merged = edf[is_unmatched]
print(not_in_merged['location'].head(50))

7                          carmansville park
9           samuel n. bennerson 2 playground
19         gertrude ederle recreation center
34                      arsenal central park
44                  hoover manton playground
45               st mary's recreation center
46         skating day at daniel oâ€™connell
50                        ponomok playground
53                    baisley park extension
79                       160 beach 29 street
80                  109-20 union hall street
86         gertrude ederle recreation center
92                                     mauro
107                          commodore barry
108                                 dry dock
110                       douglas and degraw
113                            west brighton
115                       cpl. thompson park
117                              tottenville
119                                    mapes
120                                claremont
121                                   wagner
124       

In [79]:
# DataFrame.drop returns a new frame and leaves `merged` untouched, so the
# drop must be chained into the write; calling it on its own line discards
# the result and 'location' is still written. After the inner join,
# 'location' always equals 'address_name', so the column is redundant.
# `merged` itself is not reassigned, which keeps this cell safe to re-run.
merged.drop(columns='location').to_parquet('../datasets/events/events_with_sc.parquet', index=False)