**Libraries and Imports**

In [2]:
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely import wkt
from sklearn.preprocessing import StandardScaler

**Load in Data**

In [3]:
# Input files are expected in the notebook's working directory.
crime_csv_path = 'NYPD_Arrests_Data__Historic_.csv'
food_csv_path = 'Retail_Food_Stores.csv'

# Raw crime (arrest) records, loaded unmodified from CSV.
crime_raw = pd.read_csv(crime_csv_path)

# Raw retail food store records, loaded unmodified from CSV.
food_raw = pd.read_csv(food_csv_path)

**Finding Zipcodes**

In [4]:
# Parse the arrest dates (MM/DD/YYYY). Values that do not match the format
# are coerced to NaT instead of raising.
crime_raw['ARREST_DATE'] = pd.to_datetime(
    crime_raw['ARREST_DATE'],
    format='%m/%d/%Y',
    errors='coerce',
)

# Keep only arrests dated 2010-01-01 or later. NaT never satisfies the
# comparison, so rows whose date could not be parsed are excluded as well.
on_or_after_2010 = crime_raw['ARREST_DATE'].ge(pd.Timestamp('2010-01-01'))
date_filtered = crime_raw.loc[on_or_after_2010].copy()

# Keep only rows that actually carry a coordinate string. wkt.loads expects a
# WKT string, so a single missing (NaN) value would otherwise abort the whole
# cell. Rows without coordinates could never be matched to a zipcode polygon,
# so they would be discarded by the later dropna on 'modzcta' anyway.
# Working on copies also leaves food_raw / date_filtered untouched, which keeps
# this cell safe to re-run.
food_located = food_raw.dropna(subset=['Georeference']).copy()
crime_located = date_filtered.dropna(subset=['Lon_Lat']).copy()

# Converting from str (WKT) to shapely geometry objects for processing into zipcodes
food_located['geometry'] = food_located['Georeference'].apply(wkt.loads)
crime_located['geometry'] = crime_located['Lon_Lat'].apply(wkt.loads)

# Converting to GeoDataFrame and declaring the coordinate reference system.
# NOTE(review): assumes the source coordinates are lon/lat in EPSG:4326 -- confirm
# against the dataset documentation.
geo_food = gpd.GeoDataFrame(food_located, geometry='geometry', crs="EPSG:4326")
geo_crime = gpd.GeoDataFrame(crime_located, geometry='geometry', crs="EPSG:4326")

# Zipcode boundary polygons (MODZCTA), read from a local GeoJSON export and
# reprojected to EPSG:4326 so they share a CRS with the point layers.
# Source:
# https://data.cityofnewyork.us/Health/Modified-Zip-Code-Tabulation-Areas-MODZCTA-/pri4-ifjk/about_data
zip_boundaries = gpd.read_file(
    'Modified Zip Code Tabulation Areas (MODZCTA)_20250326.geojson'
).to_crs("EPSG:4326")

# Only the polygon and its zipcode label are needed from the boundary layer.
zip_polygons = zip_boundaries[['geometry', 'modzcta']]

# Tag every food store with the zipcode polygon it falls within; points that
# land in no polygon come back with a missing 'modzcta' and are discarded.
food_with_zip = (
    gpd.sjoin(geo_food, zip_polygons, how='left', predicate='within')
    .dropna(subset=['modzcta'])
)

# Same assignment for the crime records.
crime_with_zip = (
    gpd.sjoin(geo_crime, zip_polygons, how='left', predicate='within')
    .dropna(subset=['modzcta'])
)



**Aggregating Counts and Final Dataset**

In [7]:
# Distinct zipcodes and crime categories present in the joined crime data
# (categories noted by the author: 9, F, I, M, V).
all_zipcodes = crime_with_zip['modzcta'].unique()
crime_categories = crime_with_zip['LAW_CAT_CD'].unique()

# One row per zipcode, one column per crime category, values are counts.
crime_counts = pd.crosstab(
    index=crime_with_zip['modzcta'],
    columns=crime_with_zip['LAW_CAT_CD'],
).reset_index()
# Remove the placeholder '(null)' category (and a stray 'LAW_CAT_CD' column)
# if either is present; absent columns are ignored.
crime_counts = crime_counts.drop(columns=['LAW_CAT_CD', '(null)'], errors='ignore')

# Same table for food stores: one column per establishment type.
food_categories = food_with_zip['Establishment Type'].unique()
food_counts = pd.crosstab(
    index=food_with_zip['modzcta'],
    columns=food_with_zip['Establishment Type'],
).reset_index()

# Combine both count tables on zipcode. The default (inner) merge keeps only
# zipcodes that appear in both tables.
final = crime_counts.merge(food_counts, on='modzcta')

# Drop zipcode 99999 --> doesn't actually exist.
# This has to happen BEFORE scaling: if the placeholder row is still present
# when the scaler is fitted, its counts leak into every column's mean and
# standard deviation and the remaining rows are no longer truly standardized.
final_valid = final[final['modzcta'] != '99999'].copy()

# Initialize StandardScaler
scaler = StandardScaler()

# Scale only the count columns; 'modzcta' is an identifier, not a feature.
numeric_cols = final_valid.columns.difference(['modzcta'])
final_scaled = final_valid.copy()  # Separate frame so the unscaled counts stay available
final_scaled[numeric_cols] = scaler.fit_transform(final_valid[numeric_cols])

# Save to CSV
final_scaled.to_csv('final.csv', index=False)