In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('NYC_Health___Hospitals_Facilities_-_2011.csv', keep_default_na=True, delimiter=',', skipinitialspace=True)

In [3]:
df

Unnamed: 0,Facility Type,Borough,Facility Name,Cross Streets,Phone,Location 1,Postcode,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA
0,Child Health Center,Manhattan,La Clinica Del Barrio,,212-410-7940,"413 120th Street\nNew York, NY 10035\n(40.7982...",,,,,,,,,
1,Acute Care Hospital,Queens,Elmhurst Hospital Center,,718-334-4000,"79 01\nElmhurst, NY 11373\n(40.738710402563, -...",,,,,,,,,
2,Child Health Center,Brooklyn,Ida G. Israel Community Health Center,W. 22nd St. & W. 23rd St.,718-946-3400,"2201 Neptune Avenue\nBrooklyn, NY 11224\n(40.5...",11224.0,40.578468,-73.989614,13.0,47.0,348.0,3188417.0,3.069900e+09,Seagate-Coney Island ...
3,Child Health Center,Queens,South Queens Community Health Center,,718-883-6699,"114 02 Guy R Brewer Blvd\nJamaica, NY 11434\n(...",11434.0,40.688615,-73.785593,12.0,28.0,276.0,4264631.0,4.122000e+09,Baisley Park ...
4,Child Health Center,Bronx,Melrose Houses Child Health Clinic,between Morris Ave. & Courtlandt Ave.,718-292-2820,"348 156th Street\nBronx, NY 10451\n(40.8213011...",,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,Child Health Center,Brooklyn,Bushwick Community Health Center,,718-919-1200,"1420 Bushwick Avenue\nBrooklyn, NY 11207\n(40....",11207.0,40.684318,-73.909056,4.0,37.0,403.0,3080067.0,3.034440e+09,Bushwick South ...
74,Child Health Center,Manhattan,Bellevue Hospital Center,,212-562-4141,"462 First Avenue\nNew York, NY 10016\n(40.7396...",10016.0,40.739173,-73.976862,6.0,4.0,62.0,1086515.0,1.009620e+09,Murray Hill-Kips Bay ...
75,Child Health Center,Queens,Ridgewood Communicare Clinic,between Woodbine St. & Madison St.,718-334-6190,"769 Onderdonk Avenue\nRidgewood, NY 11385\n(40...",11385.0,40.702972,-73.905489,5.0,34.0,549.0,4083018.0,4.034700e+09,Ridgewood ...
76,Child Health Center,Manhattan,Smith Communicare Health Center,corner of Catherine St.,212-346-0500,"60 Madison Street\nNew York, NY 10038\n(40.711...",10038.0,40.712019,-73.997309,3.0,1.0,25.0,1077421.0,1.001110e+09,Chinatown ...


In [4]:
df.shape

(78, 15)

In [5]:
# check duplicated rows
print('Number of duplicate (excluding original) rows is:', df.duplicated().sum())
print('Number of duplicate rows (including first) in the table is:', df[df.duplicated(keep=False)].shape[0])
# Show duplicate row data that can be dropped
df[df.duplicated(keep=False)]

Number of duplicate (excluding original) rows is: 0
Number of duplicate rows (including first) in the table is: 0


Unnamed: 0,Facility Type,Borough,Facility Name,Cross Streets,Phone,Location 1,Postcode,Latitude,Longitude,Community Board,Council District,Census Tract,BIN,BBL,NTA


In [6]:
df = df[['Facility Name', 'Facility Type', 'Borough', 'Longitude', 'Latitude']]

In [7]:
df

Unnamed: 0,Facility Name,Facility Type,Borough,Longitude,Latitude
0,La Clinica Del Barrio,Child Health Center,Manhattan,,
1,Elmhurst Hospital Center,Acute Care Hospital,Queens,,
2,Ida G. Israel Community Health Center,Child Health Center,Brooklyn,-73.989614,40.578468
3,South Queens Community Health Center,Child Health Center,Queens,-73.785593,40.688615
4,Melrose Houses Child Health Clinic,Child Health Center,Bronx,,
...,...,...,...,...,...
73,Bushwick Community Health Center,Child Health Center,Brooklyn,-73.909056,40.684318
74,Bellevue Hospital Center,Child Health Center,Manhattan,-73.976862,40.739173
75,Ridgewood Communicare Clinic,Child Health Center,Queens,-73.905489,40.702972
76,Smith Communicare Health Center,Child Health Center,Manhattan,-73.997309,40.712019


In [8]:
df[df['Longitude'].isnull() | df['Latitude'].isnull()]

Unnamed: 0,Facility Name,Facility Type,Borough,Longitude,Latitude
0,La Clinica Del Barrio,Child Health Center,Manhattan,,
1,Elmhurst Hospital Center,Acute Care Hospital,Queens,,
4,Melrose Houses Child Health Clinic,Child Health Center,Bronx,,
5,Daniel Webster Houses Child Health Clinic,Child Health Center,Bronx,,
6,Segundo Ruiz Belvis Diagnostic & Treatment Center,Diagnostic & Treatment Center,Bronx,,
9,Cumberland Diagnostic & Treatment Center,Child Health Center,Brooklyn,,
21,Sydenham Health Center,Child Health Center,Manhattan,,
23,Lincoln Medical & Mental Health Center,Child Health Center,Bronx,,
27,Segundo Ruiz Belvis Diagnostic & Treatment Center,Child Health Center,Bronx,,
29,Renaissance Health Care Network Diagnostic & T...,Child Health Center,Manhattan,,


In [9]:
borough_lat_long = {
    'Manhattan': {'Longitude':-73.971321, 'Latitude': 40.776676},
    'Brooklyn': {'Longitude':-73.949997, 'Latitude': 40.650002},
    'Bronx': {'Longitude':-73.865433, 'Latitude': 40.837048},
    'Queens': {'Longitude':-73.769417, 'Latitude': 40.742054}
}

In [10]:
for index, row in df.iterrows():
    borough = row['Borough']
    if pd.isna(row['Longitude']) and pd.isna(row['Latitude']):
        if borough in borough_lat_long:
            coordinates = borough_lat_long[borough]
            df.loc[index, 'Longitude'] = coordinates['Longitude']
            df.loc[index, 'Latitude'] = coordinates['Latitude']

print(df)

                               Facility Name        Facility Type    Borough  \
0                      La Clinica Del Barrio  Child Health Center  Manhattan   
1                   Elmhurst Hospital Center  Acute Care Hospital     Queens   
2      Ida G. Israel Community Health Center  Child Health Center   Brooklyn   
3       South Queens Community Health Center  Child Health Center     Queens   
4         Melrose Houses Child Health Clinic  Child Health Center      Bronx   
..                                       ...                  ...        ...   
73          Bushwick Community Health Center  Child Health Center   Brooklyn   
74                  Bellevue Hospital Center  Child Health Center  Manhattan   
75              Ridgewood Communicare Clinic  Child Health Center     Queens   
76           Smith Communicare Health Center  Child Health Center  Manhattan   
77  Morrisania Diagnostic & Treatment Center  Child Health Center      Bronx   

    Longitude   Latitude  
0  -73.97132

In [11]:
df

Unnamed: 0,Facility Name,Facility Type,Borough,Longitude,Latitude
0,La Clinica Del Barrio,Child Health Center,Manhattan,-73.971321,40.776676
1,Elmhurst Hospital Center,Acute Care Hospital,Queens,-73.769417,40.742054
2,Ida G. Israel Community Health Center,Child Health Center,Brooklyn,-73.989614,40.578468
3,South Queens Community Health Center,Child Health Center,Queens,-73.785593,40.688615
4,Melrose Houses Child Health Clinic,Child Health Center,Bronx,-73.865433,40.837048
...,...,...,...,...,...
73,Bushwick Community Health Center,Child Health Center,Brooklyn,-73.909056,40.684318
74,Bellevue Hospital Center,Child Health Center,Manhattan,-73.976862,40.739173
75,Ridgewood Communicare Clinic,Child Health Center,Queens,-73.905489,40.702972
76,Smith Communicare Health Center,Child Health Center,Manhattan,-73.997309,40.712019


In [12]:
df[df['Longitude'].isnull() | df['Latitude'].isnull()]

Unnamed: 0,Facility Name,Facility Type,Borough,Longitude,Latitude


In [13]:
df.isnull().sum()

Facility Name    0
Facility Type    0
Borough          0
Longitude        0
Latitude         0
dtype: int64

In [14]:
# check the data type
df.dtypes

Facility Name     object
Facility Type     object
Borough           object
Longitude        float64
Latitude         float64
dtype: object

In [15]:
df['Industry'] = 'Health Care'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Industry'] = 'Health Care'


In [16]:
df

Unnamed: 0,Facility Name,Facility Type,Borough,Longitude,Latitude,Industry
0,La Clinica Del Barrio,Child Health Center,Manhattan,-73.971321,40.776676,Health Care
1,Elmhurst Hospital Center,Acute Care Hospital,Queens,-73.769417,40.742054,Health Care
2,Ida G. Israel Community Health Center,Child Health Center,Brooklyn,-73.989614,40.578468,Health Care
3,South Queens Community Health Center,Child Health Center,Queens,-73.785593,40.688615,Health Care
4,Melrose Houses Child Health Clinic,Child Health Center,Bronx,-73.865433,40.837048,Health Care
...,...,...,...,...,...,...
73,Bushwick Community Health Center,Child Health Center,Brooklyn,-73.909056,40.684318,Health Care
74,Bellevue Hospital Center,Child Health Center,Manhattan,-73.976862,40.739173,Health Care
75,Ridgewood Communicare Clinic,Child Health Center,Queens,-73.905489,40.702972,Health Care
76,Smith Communicare Health Center,Child Health Center,Manhattan,-73.997309,40.712019,Health Care


In [17]:
import geopandas as gpd
from shapely.geometry import Point, Polygon

In [18]:
# change 'Longitude' and 'Latitude' into GeoDataFrame
df['Coordinates'] = list(zip(df.Longitude, df.Latitude))
df['Coordinates'] = df['Coordinates'].apply(Point)

gdf_HealthCare = gpd.GeoDataFrame(df, geometry='Coordinates')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Coordinates'] = list(zip(df.Longitude, df.Latitude))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Coordinates'] = df['Coordinates'].apply(Point)


In [19]:
gdf_HealthCare

Unnamed: 0,Facility Name,Facility Type,Borough,Longitude,Latitude,Industry,Coordinates
0,La Clinica Del Barrio,Child Health Center,Manhattan,-73.971321,40.776676,Health Care,POINT (-73.97132 40.77668)
1,Elmhurst Hospital Center,Acute Care Hospital,Queens,-73.769417,40.742054,Health Care,POINT (-73.76942 40.74205)
2,Ida G. Israel Community Health Center,Child Health Center,Brooklyn,-73.989614,40.578468,Health Care,POINT (-73.98961 40.57847)
3,South Queens Community Health Center,Child Health Center,Queens,-73.785593,40.688615,Health Care,POINT (-73.78559 40.68861)
4,Melrose Houses Child Health Clinic,Child Health Center,Bronx,-73.865433,40.837048,Health Care,POINT (-73.86543 40.83705)
...,...,...,...,...,...,...,...
73,Bushwick Community Health Center,Child Health Center,Brooklyn,-73.909056,40.684318,Health Care,POINT (-73.90906 40.68432)
74,Bellevue Hospital Center,Child Health Center,Manhattan,-73.976862,40.739173,Health Care,POINT (-73.97686 40.73917)
75,Ridgewood Communicare Clinic,Child Health Center,Queens,-73.905489,40.702972,Health Care,POINT (-73.90549 40.70297)
76,Smith Communicare Health Center,Child Health Center,Manhattan,-73.997309,40.712019,Health Care,POINT (-73.99731 40.71202)


In [22]:
# Read the geojson file
taxi_zone = gpd.read_file('NYC Taxi Zones.geojson')
taxi_zone

Unnamed: 0,shape_area,objectid,shape_leng,location_id,zone,borough,geometry
0,0.0007823067885,1,0.116357453189,1,Newark Airport,EWR,"MULTIPOLYGON (((-74.18445 40.69500, -74.18449 ..."
1,0.00486634037837,2,0.43346966679,2,Jamaica Bay,Queens,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ..."
2,0.000314414156821,3,0.0843411059012,3,Allerton/Pelham Gardens,Bronx,"MULTIPOLYGON (((-73.84793 40.87134, -73.84725 ..."
3,0.000111871946192,4,0.0435665270921,4,Alphabet City,Manhattan,"MULTIPOLYGON (((-73.97177 40.72582, -73.97179 ..."
4,0.000497957489363,5,0.0921464898574,5,Arden Heights,Staten Island,"MULTIPOLYGON (((-74.17422 40.56257, -74.17349 ..."
...,...,...,...,...,...,...,...
258,0.000168611097013,256,0.0679149669603,256,Williamsburg (South Side),Brooklyn,"MULTIPOLYGON (((-73.95834 40.71331, -73.95681 ..."
259,0.000394552487366,259,0.126750305191,259,Woodlawn/Wakefield,Bronx,"MULTIPOLYGON (((-73.85107 40.91037, -73.85207 ..."
260,0.000422345326907,260,0.133514154636,260,Woodside,Queens,"MULTIPOLYGON (((-73.90175 40.76078, -73.90147 ..."
261,0.0000343423231652,261,0.0271204563616,261,World Trade Center,Manhattan,"MULTIPOLYGON (((-74.01333 40.70503, -74.01327 ..."


In [23]:
gdf_taxi_zone = gpd.GeoDataFrame(taxi_zone, geometry='geometry')
gdf_taxi_zone

Unnamed: 0,shape_area,objectid,shape_leng,location_id,zone,borough,geometry
0,0.0007823067885,1,0.116357453189,1,Newark Airport,EWR,"MULTIPOLYGON (((-74.18445 40.69500, -74.18449 ..."
1,0.00486634037837,2,0.43346966679,2,Jamaica Bay,Queens,"MULTIPOLYGON (((-73.82338 40.63899, -73.82277 ..."
2,0.000314414156821,3,0.0843411059012,3,Allerton/Pelham Gardens,Bronx,"MULTIPOLYGON (((-73.84793 40.87134, -73.84725 ..."
3,0.000111871946192,4,0.0435665270921,4,Alphabet City,Manhattan,"MULTIPOLYGON (((-73.97177 40.72582, -73.97179 ..."
4,0.000497957489363,5,0.0921464898574,5,Arden Heights,Staten Island,"MULTIPOLYGON (((-74.17422 40.56257, -74.17349 ..."
...,...,...,...,...,...,...,...
258,0.000168611097013,256,0.0679149669603,256,Williamsburg (South Side),Brooklyn,"MULTIPOLYGON (((-73.95834 40.71331, -73.95681 ..."
259,0.000394552487366,259,0.126750305191,259,Woodlawn/Wakefield,Bronx,"MULTIPOLYGON (((-73.85107 40.91037, -73.85207 ..."
260,0.000422345326907,260,0.133514154636,260,Woodside,Queens,"MULTIPOLYGON (((-73.90175 40.76078, -73.90147 ..."
261,0.0000343423231652,261,0.0271204563616,261,World Trade Center,Manhattan,"MULTIPOLYGON (((-74.01333 40.70503, -74.01327 ..."


In [24]:
# Ensure both GeoDataFrames are using the same CRS
gdf_HealthCare = gdf_HealthCare.set_crs("EPSG:4326")
gdf_taxi_zone = gdf_taxi_zone.set_crs("EPSG:4326")

# Perform spatial join with the 'predicate' parameter
result = gpd.sjoin(gdf_HealthCare, gdf_taxi_zone, how="inner", predicate='intersects')

In [25]:
result

Unnamed: 0,Facility Name,Facility Type,Borough,Longitude,Latitude,Industry,Coordinates,index_right,shape_area,objectid,shape_leng,location_id,zone,borough
0,La Clinica Del Barrio,Child Health Center,Manhattan,-73.971321,40.776676,Health Care,POINT (-73.97132 40.77668),44,0.000379662912054,43,0.0997386183576,43,Central Park,Manhattan
21,Sydenham Health Center,Child Health Center,Manhattan,-73.971321,40.776676,Health Care,POINT (-73.97132 40.77668),44,0.000379662912054,43,0.0997386183576,43,Central Park,Manhattan
29,Renaissance Health Care Network Diagnostic & T...,Child Health Center,Manhattan,-73.971321,40.776676,Health Care,POINT (-73.97132 40.77668),44,0.000379662912054,43,0.0997386183576,43,Central Park,Manhattan
30,Washington Heights Child Health Care Center,Child Health Center,Manhattan,-73.971321,40.776676,Health Care,POINT (-73.97132 40.77668),44,0.000379662912054,43,0.0997386183576,43,Central Park,Manhattan
33,Renaissance Health Care Network Diagnostic & T...,Diagnostic & Treatment Center,Manhattan,-73.971321,40.776676,Health Care,POINT (-73.97132 40.77668),44,0.000379662912054,43,0.0997386183576,43,Central Park,Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,Woodside Houses Child Health Clinic,Child Health Center,Queens,-73.910752,40.753164,Health Care,POINT (-73.91075 40.75316),6,0.000389787989274,7,0.107417171123,7,Astoria,Queens
66,Health Center at Tremont,Child Health Center,Bronx,-73.894482,40.844083,Health Care,POINT (-73.89448 40.84408),81,0.000191114419551,78,0.0935944495806,78,East Tremont,Bronx
68,Crown Heights Child Health Clinic,Child Health Center,Brooklyn,-73.935845,40.673530,Health Care,POINT (-73.93585 40.67353),60,0.000510772092295,61,0.117310808097,61,Crown Heights North,Brooklyn
75,Ridgewood Communicare Clinic,Child Health Center,Queens,-73.905489,40.702972,Health Care,POINT (-73.90549 40.70297),198,0.000499809882564,198,0.134094242763,198,Ridgewood,Queens


In [26]:
result = result[['Facility Name', 'Facility Type', 'Industry', 'Longitude', 'Latitude', 'zone', "location_id", "borough" ]]
result

Unnamed: 0,Facility Name,Facility Type,Industry,Longitude,Latitude,zone,location_id,borough
0,La Clinica Del Barrio,Child Health Center,Health Care,-73.971321,40.776676,Central Park,43,Manhattan
21,Sydenham Health Center,Child Health Center,Health Care,-73.971321,40.776676,Central Park,43,Manhattan
29,Renaissance Health Care Network Diagnostic & T...,Child Health Center,Health Care,-73.971321,40.776676,Central Park,43,Manhattan
30,Washington Heights Child Health Care Center,Child Health Center,Health Care,-73.971321,40.776676,Central Park,43,Manhattan
33,Renaissance Health Care Network Diagnostic & T...,Diagnostic & Treatment Center,Health Care,-73.971321,40.776676,Central Park,43,Manhattan
...,...,...,...,...,...,...,...,...
62,Woodside Houses Child Health Clinic,Child Health Center,Health Care,-73.910752,40.753164,Astoria,7,Queens
66,Health Center at Tremont,Child Health Center,Health Care,-73.894482,40.844083,East Tremont,78,Bronx
68,Crown Heights Child Health Clinic,Child Health Center,Health Care,-73.935845,40.673530,Crown Heights North,61,Brooklyn
75,Ridgewood Communicare Clinic,Child Health Center,Health Care,-73.905489,40.702972,Ridgewood,198,Queens


In [27]:
result.isnull().sum()

Facility Name    0
Facility Type    0
Industry         0
Longitude        0
Latitude         0
zone             0
location_id      0
borough          0
dtype: int64

# the code below used for model

In [28]:
result = result[['Industry', 'Facility Type','location_id']]
result

Unnamed: 0,Industry,Facility Type,location_id
0,Health Care,Child Health Center,43
21,Health Care,Child Health Center,43
29,Health Care,Child Health Center,43
30,Health Care,Child Health Center,43
33,Health Care,Diagnostic & Treatment Center,43
...,...,...,...
62,Health Care,Child Health Center,7
66,Health Care,Child Health Center,78
68,Health Care,Child Health Center,61
75,Health Care,Child Health Center,198


In [29]:
df_combined_Industry = result.groupby(['location_id', 'Industry']).size().reset_index(name='Counts')

df_combined_Industry

Unnamed: 0,location_id,Industry,Counts
0,10,Health Care,1
1,112,Health Care,1
2,118,Health Care,1
3,121,Health Care,2
4,127,Health Care,1
5,130,Health Care,1
6,137,Health Care,2
7,144,Health Care,1
8,156,Health Care,1
9,166,Health Care,1


In [30]:
df_combined_FacilityType = result.groupby(['location_id', 'Facility Type']).size().reset_index(name='Counts')

df_combined_FacilityType

Unnamed: 0,location_id,Facility Type,Counts
0,10,Child Health Center,1
1,112,Child Health Center,1
2,118,Nursing Home,1
3,121,Acute Care Hospital,1
4,121,Child Health Center,1
...,...,...,...
57,80,Child Health Center,2
58,85,Child Health Center,2
59,85,Diagnostic & Treatment Center,1
60,98,Acute Care Hospital,1


In [31]:
# check the outcome is correct
total_count = df_combined_FacilityType['Counts'].sum()
total_count

78

In [32]:
df_combined_Industry.to_csv("basic_HospitalData_version_1.csv", index=False)

In [33]:
df_combined_FacilityType.to_csv("basic_HospitalData_version_2.csv", index=False)