In [10]:
import pandas as pd
import zipfile
import os
import geopandas as gpd

# Part 1 - Add block info to datasets
Several of our datasets record each row's location as latitude/longitude, but we ultimately want to do our analysis at the census-block level. We therefore define a function that adds census-block information to each dataset before the detailed analysis.

In [44]:
# Extract the census-block archive into its own folder.
with zipfile.ZipFile('2020_census_blocks.zip', mode='r') as archive:
    archive.extractall(path='2020_census_blocks')

# Show the extracted files so the shapefile (.shp) name can be read off.
os.listdir('./2020_census_blocks')

['geo_export_0cf61a4c-2518-43af-b17b-0ae104636fd0.prj',
 'geo_export_0cf61a4c-2518-43af-b17b-0ae104636fd0.dbf',
 'geo_export_0cf61a4c-2518-43af-b17b-0ae104636fd0.shx',
 'geo_export_0cf61a4c-2518-43af-b17b-0ae104636fd0.shp']

In [61]:
# Load the census-block shapefile using geopandas.
# The exported file name appears to embed a generated id
# (geo_export_<id>.shp), so look the .shp up in the extracted folder
# instead of hard-coding a name that changes with every new export.
blocks_dir = './2020_census_blocks'
shapefile_names = sorted(
    name for name in os.listdir(blocks_dir) if name.lower().endswith('.shp')
)
if len(shapefile_names) != 1:
    # Fail loudly rather than silently picking an arbitrary layer.
    raise FileNotFoundError(
        "expected exactly one .shp file in " + blocks_dir
        + ", found: " + repr(shapefile_names)
    )

blocks = gpd.read_file(os.path.join(blocks_dir, shapefile_names[0]))
blocks.head()

Unnamed: 0,cb2020,borocode,boroname,ct2020,bctcb2020,geoid,shape_leng,shape_area,geometry
0,1000,1,Manhattan,100,10001001000,360610001001000,6437.853745,1202838.0,"POLYGON ((-74.03995 40.70089, -74.03945 40.700..."
1,1001,1,Manhattan,100,10001001001,360610001001001,4395.190183,640166.4,"POLYGON ((-74.04388 40.69019, -74.04351 40.689..."
2,1000,1,Manhattan,201,10002011000,360610002011000,1569.384823,129276.3,"POLYGON ((-73.98511 40.71379, -73.98706 40.713..."
3,1001,1,Manhattan,201,10002011001,360610002011001,1594.262855,139360.4,"POLYGON ((-73.98506 40.71308, -73.98544 40.713..."
4,2000,1,Manhattan,201,10002012000,360610002012000,2055.295961,263308.4,"POLYGON ((-73.98495 40.71236, -73.98485 40.711..."


In [75]:
def load_df_and_add_block(dataset_name, blocks_gdf=None):
    """Load a cleaned dataset and attach the census block containing each row.

    Parameters
    ----------
    dataset_name : str
        Base name of the CSV under ``../cleaned_datasets/`` (without the
        ``.csv`` extension). The file must have ``longitude`` and
        ``latitude`` columns.
    blocks_gdf : geopandas.GeoDataFrame, optional
        Polygon layer to join against. Defaults to the notebook-level
        ``blocks`` GeoDataFrame.

    Returns
    -------
    geopandas.GeoDataFrame
        The dataset rows with a point ``geometry`` column plus the columns
        of the block layer. The join is a left join, so rows whose point
        falls within no block are kept, with missing values in the block
        columns.

    Raises
    ------
    KeyError
        If the CSV lacks a ``longitude`` or ``latitude`` column.
    """
    if blocks_gdf is None:
        # Fall back to the block layer loaded earlier in the notebook.
        blocks_gdf = blocks

    # read the requested dataset from csv
    dataset = pd.read_csv("../cleaned_datasets/" + dataset_name + ".csv")

    missing = [col for col in ("longitude", "latitude") if col not in dataset.columns]
    if missing:
        raise KeyError(
            "dataset '" + dataset_name + "' is missing column(s): " + ", ".join(missing)
        )

    # convert lat/lon to geopandas df
    # NOTE(review): assumes the longitude/latitude columns are WGS84 degrees
    # (EPSG:4326) -- confirm against the data sources. Declaring the points
    # in their own CRS and reprojecting keeps the join correct even if the
    # block layer is ever loaded in a projected CRS.
    target_crs = blocks_gdf.crs
    gdf = gpd.GeoDataFrame(
        dataset,
        geometry=gpd.points_from_xy(dataset["longitude"], dataset["latitude"]),
        crs="EPSG:4326" if target_crs is not None else None,
    )
    if target_crs is not None:
        gdf = gdf.to_crs(target_crs)

    return gpd.sjoin(gdf, blocks_gdf, predicate='within', how='left')

In [71]:
# Sanity-check the helper on the crime data: matched records per borough.
# `crime_with_block` is not created by any cell visible above, so build it
# here explicitly; otherwise this cell raises NameError on a fresh kernel.
# NOTE(review): presumably this was load_df_and_add_block('crime_data') -- confirm.
crime_with_block = load_df_and_add_block('crime_data')
crime_with_block[['boroname', 'id']].groupby(by='boroname').count()

Unnamed: 0_level_0,id
boroname,Unnamed: 1_level_1
Bronx,108984
Brooklyn,144565
Manhattan,133018
Queens,111978
Staten Island,22066


In [98]:
# Datasets whose cleaned CSVs carry latitude/longitude columns.
datasets_with_lat_lon = [
    'crime_data',
    'grocery',
    'restaurant',
    'subway_stations',
]

for dataset_name in datasets_with_lat_lon:
    dataset_df = load_df_and_add_block(dataset_name)

    # Publish the result as a notebook-level variable named `<name>_df`
    # (e.g. `crime_data_df`).
    globals()[dataset_name + "_df"] = dataset_df

    # confirm that data successfully mapped to blocks
    value_check = (
        dataset_df[['boroname', 'longitude']]
        .groupby(by='boroname')
        .count()
        .rename(columns={"longitude": "records"})
    )

    # groupby drops rows with a missing boroname, so rows that matched no
    # block would be invisible in the table above; count them explicitly.
    unmatched = int(dataset_df['boroname'].isna().sum())

    print(dataset_name)
    display(value_check)
    print(f"rows without a matching block: {unmatched} of {len(dataset_df)}")

crime_data


Unnamed: 0_level_0,records
boroname,Unnamed: 1_level_1
Bronx,108984
Brooklyn,144565
Manhattan,133018
Queens,111978
Staten Island,22066


grocery


Unnamed: 0_level_0,records
boroname,Unnamed: 1_level_1
Bronx,425
Brooklyn,833
Manhattan,570
Queens,650
Staten Island,64


restaurant


Unnamed: 0_level_0,records
boroname,Unnamed: 1_level_1
Bronx,2251
Brooklyn,6484
Manhattan,9452
Queens,5855
Staten Island,914


subway_stations


Unnamed: 0_level_0,records
boroname,Unnamed: 1_level_1
Bronx,238
Brooklyn,537
Manhattan,757
Queens,325
