In [2]:
import pandas as pd

In [4]:
edf = pd.read_csv('../datasets/events/events_park_raw.csv')

# Convert 'datetime' to datetime
edf['date_and_time'] = pd.to_datetime(edf['date_and_time'])

# Create 'date' and 'time' columns
edf['date'] = edf['date_and_time'].dt.date
edf['time'] = edf['date_and_time'].dt.time.astype('string')

edf['date'] = pd.to_datetime(edf['date']).dt.normalize().dt.strftime('%Y-%m-%dT%H:%M:%S.000')

edf['time'].head()

0    18:30:00
1    00:00:00
2    10:00:00
3    19:00:00
4    11:00:00
Name: time, dtype: string

In [5]:
edf.loc[edf['borough'] == 'Manhattan', 'borough'] = 'NY'
edf.loc[edf['borough'] == 'Bronx', 'borough'] = 'BX'
edf.loc[edf['borough'] == 'Staten Island', 'borough'] = 'R'
edf.loc[edf['borough'] == 'Brooklyn', 'borough'] = 'K'
edf.loc[edf['borough'] == 'Queens', 'borough'] = 'Q'

In [3]:
# Define column widths
colspecs = [(2, 34), (36, 37), (37, 42)]

# Read the file
sc = pd.read_fwf('../snd24Bcow.txt', colspecs=colspecs, header=None, names=["address_name", "borough", "street_code"], dtype='string')

# Mappin based on documentation from here: https://data.cityofnewyork.us/City-Government/Street-Name-Dictionary/w4v2-rv6b/about_data
sc.loc[sc.borough == "1", "borough"] = 'NY'
sc.loc[sc.borough == "2", "borough"] = 'BX'
sc.loc[sc.borough == "3", "borough"] = 'K'
sc.loc[sc.borough == "4", "borough"] = 'Q'
sc.loc[sc.borough == "5", "borough"] = 'R'

sc['address_name'] = sc['address_name'].str.lower()
sc['address_name'] = sc['address_name'].str.replace('\s+', ' ', regex=True)

In [7]:

sc[sc.address_name == 'broadway'].head()

Unnamed: 0,address_name,borough,street_code
2665,broadway,NY,13610
24818,broadway,BX,14920
49618,broadway,K,23230
75057,broadway,Q,37290
101054,broadway,R,20450


In [8]:
edf['location'] = edf['location'].str.lower()

# remove extra whitespaces between words in 'location' and 'address_name'
edf['location'] = edf['location'].str.replace('\s+', ' ', regex=True)

# remove suffixes like 'th', 'st', 'nd', 'rd' from 'location'
edf['location'] = edf['location'].str.replace('(\d+)(st|nd|rd|th)', r'\1', regex=True)
edf['location'] = edf['location'].str.replace(' ave. ', ' avenue ')
edf['location'] = edf['location'].str.replace(' ave ', ' avenue ')
edf['location'] = edf['location'].str.replace(' ave$', ' avenue', regex=True)
edf['location'] = edf['location'].str.replace(' st ', ' street ')
edf['location'] = edf['location'].str.replace(' st$', ' street', regex=True)
edf['location'] = edf['location'].str.replace('st.', 'st')
edf['location'] = edf['location'].str.replace('^w ', 'west ', regex=True)
edf['location'] = edf['location'].str.replace(' w$', ' west', regex=True)
edf['location'] = edf['location'].str.replace('^e ', 'east ', regex=True)
edf['location'] = edf['location'].str.replace('blvd', 'boulevard', regex=True)
edf['location'] = edf['location'].str.replace(' rd$', ' road', regex=True)
edf['location'] = edf['location'].str.replace(' rd ', ' road ', regex=True)
edf['location'] = edf['location'].str.replace('saint', 'st', regex=True)
edf['location'] = edf['location'].str.replace(' pl$', ' place', regex=True)
edf['location'] = edf['location'].str.replace('pkwy', 'parkway', regex=True)
edf['location'] = edf['location'].str.replace(' dr$', ' drive', regex=True)
edf['location'] = edf['location'].str.replace('^s ', 'south ', regex=True)
edf['location'] = edf['location'].str.replace(' cir$', ' circle', regex=True)
edf['location'] = edf['location'].str.replace(' ter$', ' terrace', regex=True)
edf['location'] = edf['location'].str.replace(' expy$', ' expressway', regex=True)
edf['location'] = edf['location'].str.replace(' hwy$', ' highway', regex=True)
edf['location'] = edf['location'].str.replace('alfred e\.', 'alfred e', regex=True)
edf['location'] = edf['location'].str.replace('flushing meadows corona park pool & rink', 'flushing meadows corona park', regex=True)
edf['location'] = edf['location'].str.replace('st james recreation center', 'st james park recreation center', regex=True)
edf['location'] = edf['location'].str.replace('thomas jefferson recreation center', 'thomas jefferson rec center', regex=True)
edf['location'] = edf['location'].str.replace('j. hood wright recreation center', 'hood wright recreation center', regex=True)
edf['location'] = edf['location'].str.replace('jackie robinson recreation center', 'jackie robinson rec center', regex=True)
len(edf)

7373

In [9]:
merged = sc.merge(edf, how='inner', left_on=['borough', 'address_name'], right_on=['borough', 'location'])

merged.head()

Unnamed: 0,address_name,borough,street_code,unit,group_name_partner,date_and_time,locationtype,location,event_name,event_type,category,classification,attendance,audience,date,time
0,abraham lincoln playground,NY,12155,,,2022-07-01 20:30:00,,abraham lincoln playground,Movies Under the Stars: Respect,,Mobile Unit,M.U.T.S,105.0,,2022-07-01T00:00:00.000,20:30:00
1,abraham lincoln playground,NY,12155,,,2023-05-25 19:30:00,,abraham lincoln playground,Movies Under the Stars: Fruitvale Station,,Mobile Unit,M.U.T.S,50.0,,2023-05-25T00:00:00.000,19:30:00
2,alexander hamilton playground,NY,11705,,,2022-10-01 20:00:00,,alexander hamilton playground,Movies Under the Stars: Minions: The Rise of Gru,,Mobile Unit,M.U.T.S,,,2022-10-01T00:00:00.000,20:00:00
3,alfred e smith playground,NY,12160,Recreation,,2018-07-11 11:00:00,Playground,alfred e smith playground,Puppet Mobile,Community Based Event,Performance,Movies,95.0,General Public,2018-07-11T00:00:00.000,11:00:00
4,alfred e smith playground,NY,12160,Recreation,,2019-08-23 10:00:00,Playground,alfred e smith playground,Al Smith Summer Camp,Agency Produced Event,Mobile Unit,Play,19.0,Children,2019-08-23T00:00:00.000,10:00:00


In [10]:
# Find the locations that are in 50642129

7                          carmansville park
9           samuel n. bennerson 2 playground
19         gertrude ederle recreation center
34                      arsenal central park
44                  hoover manton playground
45               st mary's recreation center
46         skating day at daniel oâ€™connell
50                        ponomok playground
53                    baisley park extension
79                       160 beach 29 street
80                  109-20 union hall street
86         gertrude ederle recreation center
92                                     mauro
107                          commodore barry
108                                 dry dock
110                       douglas and degraw
113                            west brighton
115                       cpl. thompson park
117                              tottenville
119                                    mapes
120                                claremont
121                                   wagner
124       

In [11]:
merged.drop(columns='location')
merged.to_parquet('../datasets/events/events_with_sc.parquet', index=False)

Park dataset was not enough so we are trying with another to mach more columns. Source: https://data.cityofnewyork.us/City-Government/NYC-Permitted-Event-Information-Historical/bkfu-528j/about_data . File was too big to commit

In [4]:
ehdf = pd.read_csv('../datasets/events/events_historical_raw.csv')

In [5]:
ehdf['start_date_time'] = pd.to_datetime(ehdf['start_date_time'])
ehdf['end_date_time'] = pd.to_datetime(ehdf['end_date_time'])

# The data we want to join with starts after 2014
ehdf = ehdf[ehdf['start_date_time'].dt.year >= 2014]
# Restrict ourselves to one day events
ehdf = ehdf[ehdf['start_date_time'].dt.date == ehdf['end_date_time'].dt.date]

In [6]:
ehdf = ehdf.dropna(subset=['event_borough'])
ehdf = ehdf.dropna(subset=['event_location'])

In [7]:
ehdf['event_location'] = ehdf['event_location'].str.lower()
ehdf['event_location'] = ehdf['event_location'].str.split(' between').str[0]
ehdf['event_location'] = ehdf['event_location'].str.replace('\s+', ' ', regex=True)

# remove suffixes like 'th', 'st', 'nd', 'rd' from 'event_location'
ehdf['event_location'] = ehdf['event_location'].str.replace('(\d+)(st|nd|rd|th)', r'\1', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace(' ave. ', ' avenue ')
ehdf['event_location'] = ehdf['event_location'].str.replace(' ave ', ' avenue ')
ehdf['event_location'] = ehdf['event_location'].str.replace(' ave$', ' avenue', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace(' st ', ' street ')
ehdf['event_location'] = ehdf['event_location'].str.replace(' st$', ' street', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace('st.', 'st')
ehdf['event_location'] = ehdf['event_location'].str.replace('^w ', 'west ', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace(' w$', ' west', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace('^e ', 'east ', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace('blvd', 'boulevard', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace(' rd$', ' road', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace(' rd ', ' road ', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace('saint', 'st', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace(' pl$', ' place', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace('pkwy', 'parkway', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace(' dr$', ' drive', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace('^s ', 'south ', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace(' cir$', ' circle', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace(' ter$', ' terrace', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace(' expy$', ' expressway', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace(' hwy$', ' highway', regex=True)
ehdf['event_location'] = ehdf['event_location'].str.replace('alfred e\.', 'alfred e', regex=True)

In [8]:
ehdf.loc[ehdf.event_borough == "Manhattan", "event_borough"] = 'NY'
ehdf.loc[ehdf.event_borough == "Bronx", "event_borough"] = 'BX'
ehdf.loc[ehdf.event_borough == "Brooklyn", "event_borough"] = 'K'
ehdf.loc[ehdf.event_borough == "Queens", "event_borough"] = 'Q'
ehdf.loc[ehdf.event_borough == "Staten Island", "event_borough"] = 'R'

In [9]:
# Get only the date of the event
ehdf['date'] = pd.to_datetime(ehdf['start_date_time'].dt.date)
ehdf['time'] = ehdf['start_date_time'].dt.time.astype('string')

In [10]:
print(ehdf['event_borough'].unique())

['NY' 'BX' 'R' 'K' 'Q']


In [11]:
ehdf.shape

(22609911, 14)

In [12]:

ehdf.head()

Unnamed: 0,event_id,event_name,start_date_time,end_date_time,event_agency,event_type,event_borough,event_location,event_street_side,street_closure_type,community_board,police_precinct,date,time
0,368421.0,Big Apple Circus,2017-11-18 19:00:00,2017-11-18 20:00:00,Parks Department,Special Event,NY,"damrosch park: damrosch park ,damrosch park: t...",,,7,20,2017-11-18,19:00:00
1,330050.0,Mt. Eden Farmer's Market,2017-11-16 08:00:00,2017-11-16 16:00:00,Parks Department,Special Event,BX,mount eden malls: mount eden malls,,,4,44,2017-11-16,08:00:00
2,314111.0,Columbia Greenmarket Thursday,2017-11-21 08:00:00,2017-11-21 17:00:00,Street Activity Permit Office,Farmers Market,NY,broadway,East,Sidewalk and Curb Lane Closure,9,26,2017-11-21,08:00:00
3,369850.0,Lawn Maintenance,2017-11-23 00:00:00,2017-11-23 23:58:00,Parks Department,Construction,NY,madison square park: center lawn,,,5,13,2017-11-23,00:00:00
4,335783.0,"October, November December model aircraft flying",2017-11-22 09:00:00,2017-11-22 20:00:00,Parks Department,Special Event,R,latourette park & golf course: model airplane ...,,,2,122,2017-11-22,09:00:00


In [13]:
merged = sc.merge(ehdf, how='inner', left_on=['borough', 'address_name'], right_on=['event_borough', 'event_location'])

In [14]:
merged.drop(columns=['strat_date_time','end_date_time', 'event_location', 'event_borough', 'community_board'], inplace=True)

In [15]:
merged.head()

Unnamed: 0,address_name,borough,street_code,event_id,event_name,start_date_time,event_agency,event_type,event_street_side,street_closure_type,police_precinct,date,time
0,1 avenue,NY,10010,552649.0,Lenox Hill Food Box,2021-09-14 11:00:00,Street Activity Permit Office,Farmers Market,West,Partial Sidewalk Closure,19,2021-09-14,11:00:00
1,1 avenue,NY,10010,552649.0,Lenox Hill Food Box,2021-09-28 11:00:00,Street Activity Permit Office,Farmers Market,West,Partial Sidewalk Closure,19,2021-09-28,11:00:00
2,1 avenue,NY,10010,552649.0,Lenox Hill Food Box,2021-09-07 11:00:00,Street Activity Permit Office,Farmers Market,West,Partial Sidewalk Closure,19,2021-09-07,11:00:00
3,1 avenue,NY,10010,552649.0,Lenox Hill Food Box,2021-09-21 11:00:00,Street Activity Permit Office,Farmers Market,West,Partial Sidewalk Closure,19,2021-09-21,11:00:00
4,1 avenue,NY,10010,552649.0,Lenox Hill Food Box,2021-09-14 11:00:00,Street Activity Permit Office,Farmers Market,West,Partial Sidewalk Closure,19,2021-09-14,11:00:00


In [16]:
merged.shape

(247314, 13)

In [17]:
mask = ehdf['event_location'].str.contains(':')

ehdf.loc[mask, 'event_location'] = ehdf.loc[mask, 'event_location'].str.split(':').str[0]

In [18]:
merged.sort_values('date', ascending=True).head(100)

Unnamed: 0,address_name,borough,street_code,event_id,event_name,start_date_time,event_agency,event_type,event_street_side,street_closure_type,police_precinct,date,time
125904,east 161 street,BX,26490,562475.0,Test Open Culture Event,2021-03-18 05:00:00,Street Activity Permit Office,Open Culture,Full,Full Street Closure,44,2021-03-18,05:00:00
125916,east 161 street,BX,26490,562475.0,Test Open Culture Event,2021-03-18 05:00:00,Street Activity Permit Office,Open Culture,Full,Full Street Closure,44,2021-03-18,05:00:00
125938,east 161 street,BX,26490,562475.0,Test Open Culture Event,2021-03-18 05:00:00,Street Activity Permit Office,Open Culture,Full,Full Street Closure,44,2021-03-18,05:00:00
125912,east 161 street,BX,26490,562475.0,Test Open Culture Event,2021-03-18 05:00:00,Street Activity Permit Office,Open Culture,Full,Full Street Closure,44,2021-03-18,05:00:00
125908,east 161 street,BX,26490,562475.0,Test Open Culture Event,2021-03-18 05:00:00,Street Activity Permit Office,Open Culture,Full,Full Street Closure,44,2021-03-18,05:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
57919,hamilton place,NY,22190,564647.0,NDI 45th Annual virtual Gala Filming,2021-03-30 12:00:00,Street Activity Permit Office,Open Culture,Full,Full Street Closure,30,2021-03-30,12:00:00
57921,hamilton place,NY,22190,564647.0,NDI 45th Annual virtual Gala Filming,2021-03-30 12:00:00,Street Activity Permit Office,Open Culture,Full,Full Street Closure,30,2021-03-30,12:00:00
57929,hamilton place,NY,22190,564647.0,NDI 45th Annual virtual Gala Filming,2021-03-30 12:00:00,Street Activity Permit Office,Open Culture,Full,Full Street Closure,30,2021-03-30,12:00:00
41446,east 103 street,NY,19030,562779.0,MOVEMENT SPEAKS Dance Classes @UES,2021-04-03 10:00:00,Street Activity Permit Office,Open Culture,Full,Full Street Closure,23,2021-04-03,10:00:00


In [19]:
sample = merged.sample(100000, random_state=42)
sample.to_parquet("../datasets/events/events_historical.parquet", compression= 'snappy')

In [20]:
merged.to_parquet("../datasets/events/events_historical_whole.parquet", compression= 'snappy')