In [6]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import hvplot.pandas
from citipy import citipy
import geopandas as gpd
from shapely.geometry import Point

In [2]:
# Read the raw arrest CSV into a DataFrame and preview its first rows.
# NOTE(review): the path is absolute and machine-specific; confirm the file
# exists at this location wherever the notebook is re-run.
arrest_csv_path = r"C:\Users\maria\Data_Project_1\NYPD_Arrest_Data__Year_to_Date__20240711.csv"
nypd_data_df = pd.read_csv(arrest_csv_path)

nypd_data_df.head()

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,New Georeferenced Column
0,280255493,01/10/2024,397.0,"ROBBERY,OPEN AREA UNCLASSIFIED",105.0,ROBBERY,PL 1601001,F,M,26,0,<18,M,BLACK,996342,236149,40.814845,-73.956312,POINT (-73.956312 40.814845)
1,279805419,01/02/2024,157.0,RAPE 1,104.0,RAPE,PL 1303501,F,K,77,0,25-44,M,WHITE HISPANIC,1003509,185018,40.674496,-73.930571,POINT (-73.9305713255961 40.6744956865259)
2,279895750,01/03/2024,101.0,ASSAULT 3,344.0,ASSAULT 3 & RELATED OFFENSES,PL 1200001,M,Q,106,0,65+,F,WHITE,1026836,180689,40.662526,-73.846499,POINT (-73.846499 40.662526)
3,280809090,01/19/2024,511.0,"CONTROLLED SUBSTANCE, POSSESSI",235.0,DANGEROUS DRUGS,PL 2200300,M,B,49,0,45-64,M,BLACK,1027430,251104,40.855793,-73.843908,POINT (-73.843908 40.855793)
4,280357135,01/11/2024,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200502,F,K,81,2,25-44,F,BLACK,1002457,192292,40.694456,-73.934343,POINT (-73.934343 40.694456)


In [3]:
# Build the analysis table: keep only the columns the later cells use, give
# them reader-friendly names, and drop every arrest that is missing a value in
# any of those columns.
# Written as one chain with no inplace mutation, so each step returns a new
# frame and nypd_data_df is never touched.
clean_data_df = (
    nypd_data_df[['ARREST_KEY', 'ARREST_DATE', 'OFNS_DESC', 'AGE_GROUP', 'PERP_SEX', 'Latitude', 'Longitude']]
    .rename(columns={
        'ARREST_KEY': 'Arrest Key',
        'ARREST_DATE': 'Date',
        'OFNS_DESC': 'Offense',
        'AGE_GROUP': 'Age Group',
        'PERP_SEX': 'Sex',
        # 'Latitude' and 'Longitude' already carry their final names.
    })
    .dropna()
    .copy()  # hand later cells an independent frame
)

clean_data_df.head()

Unnamed: 0,Arrest Key,Date,Offense,Age Group,Sex,Latitude,Longitude
0,280255493,01/10/2024,ROBBERY,<18,M,40.814845,-73.956312
1,279805419,01/02/2024,RAPE,25-44,M,40.674496,-73.930571
2,279895750,01/03/2024,ASSAULT 3 & RELATED OFFENSES,65+,F,40.662526,-73.846499
3,280809090,01/19/2024,DANGEROUS DRUGS,45-64,M,40.855793,-73.843908
4,280357135,01/11/2024,FELONY ASSAULT,25-44,F,40.694456,-73.934343


In [4]:
# For every age group, pick the offense that occurs most often.
# mode() may return several tied values; .iloc[0] keeps the first of them.
offense_age = (
    clean_data_df
    .groupby('Age Group')['Offense']
    .agg(lambda offenses: offenses.mode().iloc[0])
)

# Turn the grouped Series into a two-column table for display.
offense_age_df = offense_age.rename('Most Common Offense').reset_index()

offense_age_df

Unnamed: 0,Age Group,Most Common Offense
0,18-24,ASSAULT 3 & RELATED OFFENSES
1,25-44,ASSAULT 3 & RELATED OFFENSES
2,45-64,PETIT LARCENY
3,65+,ASSAULT 3 & RELATED OFFENSES
4,<18,ROBBERY


In [26]:
# SUBSET: prototype the age-group map on the first 100 cleaned arrests.
subset_df_age = clean_data_df.head(100)

# Borough polygons, used below to discard arrests located outside them.
boroughs = gpd.read_file(r"C:\Users\maria\Data_Project_1\Borough Boundaries")

# One shapely Point per arrest, built from (longitude, latitude) pairs.
geometry = list(map(Point, zip(subset_df_age['Longitude'], subset_df_age['Latitude'])))
geo_df = gpd.GeoDataFrame(subset_df_age, geometry=geometry)

# Grid step, applied to both axes.
# NOTE(review): 1/69 degree is roughly one mile of latitude; a degree of
# longitude is shorter at this latitude, so east-west cells are presumably
# narrower than a mile -- confirm this approximation is acceptable.
mile_in_degrees = 1 / 69

# Snap each coordinate down to the lower edge of its grid cell.
geo_df['lat_bin'] = geo_df['Latitude'].floordiv(mile_in_degrees).mul(mile_in_degrees)
geo_df['lon_bin'] = geo_df['Longitude'].floordiv(mile_in_degrees).mul(mile_in_degrees)

# Keep only arrests whose point lies within the union of the borough polygons.
geo_df = geo_df.loc[geo_df.geometry.within(boroughs.geometry.union_all())]

# Most frequent age group per grid cell (first value wins on ties), as a flat table.
common_age_group_per_mile = (
    geo_df
    .groupby(['lat_bin', 'lon_bin'])['Age Group']
    .agg(lambda ages: ages.mode().iloc[0])
    .reset_index()
)

# One point per grid cell, tagged with the CRS of the borough layer.
geometry = list(map(Point, zip(common_age_group_per_mile['lon_bin'], common_age_group_per_mile['lat_bin'])))
common_age_group_geo_df = gpd.GeoDataFrame(
    common_age_group_per_mile,
    geometry=geometry,
    crs=boroughs.crs,
)

# Age groups ordered by how many grid cells they dominate (not used by the plot below).
age_group_order = common_age_group_geo_df['Age Group'].value_counts().index

# Colour-coded points over an OSM tile layer.
map_plot_age = common_age_group_geo_df.hvplot.points(
    x='lon_bin',
    y='lat_bin',
    title='Most Common Offender Age Group Per Mile (Subset)',
    color='Age Group',
    size=10,
    tiles='OSM',
    geo=True,
)

map_plot_age

In [8]:
# Full data set: map the most common offender age group per grid cell.

# Borough polygons, used below to discard arrests located outside them.
boroughs = gpd.read_file(r"C:\Users\maria\Data_Project_1\Borough Boundaries")

# One shapely Point per arrest, built from (longitude, latitude) pairs.
geometry = list(map(Point, zip(clean_data_df['Longitude'], clean_data_df['Latitude'])))
geo_df = gpd.GeoDataFrame(clean_data_df, geometry=geometry)

# Grid step, applied to both axes.
# NOTE(review): 1/69 degree is roughly one mile of latitude; a degree of
# longitude is shorter at this latitude, so east-west cells are presumably
# narrower than a mile -- confirm this approximation is acceptable.
mile_in_degrees = 1 / 69

# Snap each coordinate down to the lower edge of its grid cell.
geo_df['lat_bin'] = geo_df['Latitude'].floordiv(mile_in_degrees).mul(mile_in_degrees)
geo_df['lon_bin'] = geo_df['Longitude'].floordiv(mile_in_degrees).mul(mile_in_degrees)

# Keep only arrests whose point lies within the union of the borough polygons.
geo_df = geo_df.loc[geo_df.geometry.within(boroughs.geometry.union_all())]

# Most frequent age group per grid cell (first value wins on ties), as a flat table.
common_age_group_per_mile = (
    geo_df
    .groupby(['lat_bin', 'lon_bin'])['Age Group']
    .agg(lambda ages: ages.mode().iloc[0])
    .reset_index()
)

# One point per grid cell, tagged with the CRS of the borough layer.
geometry = list(map(Point, zip(common_age_group_per_mile['lon_bin'], common_age_group_per_mile['lat_bin'])))
common_age_group_geo_df = gpd.GeoDataFrame(
    common_age_group_per_mile,
    geometry=geometry,
    crs=boroughs.crs,
)

# Age groups ordered by how many grid cells they dominate (not used by the plot below).
age_group_order = common_age_group_geo_df['Age Group'].value_counts().index

# Colour-coded points over an OSM tile layer.
map_plot_age = common_age_group_geo_df.hvplot.points(
    x='lon_bin',
    y='lat_bin',
    title='Most Common Offender Age Group Per Mile in NYC',
    color='Age Group',
    size=10,
    tiles='OSM',
    geo=True,
)

map_plot_age

In [7]:
# For each recorded sex, pick the offense that occurs most often.
# mode() may return several tied values; .iloc[0] keeps the first of them.
offense_sex = (
    clean_data_df
    .groupby('Sex')['Offense']
    .agg(lambda offenses: offenses.mode().iloc[0])
)

# Turn the grouped Series into a two-column table for display.
offense_sex_df = offense_sex.rename('Most Common Offense').reset_index()

offense_sex_df

Unnamed: 0,Sex,Most Common Offense
0,F,ASSAULT 3 & RELATED OFFENSES
1,M,ASSAULT 3 & RELATED OFFENSES


In [7]:
# SUBSET: prototype the offender-sex map on the first 100 cleaned arrests.
subset_df_sex = clean_data_df.head(100)

# Borough polygons, used below to discard arrests located outside them.
boroughs = gpd.read_file(r"C:\Users\maria\Data_Project_1\Borough Boundaries")

# One shapely Point per arrest, built from (longitude, latitude) pairs.
geometry = list(map(Point, zip(subset_df_sex['Longitude'], subset_df_sex['Latitude'])))
geo_df = gpd.GeoDataFrame(subset_df_sex, geometry=geometry)

# Grid step, applied to both axes.
# NOTE(review): 1/69 degree is roughly one mile of latitude; a degree of
# longitude is shorter at this latitude, so east-west cells are presumably
# narrower than a mile -- confirm this approximation is acceptable.
mile_in_degrees = 1 / 69

# Snap each coordinate down to the lower edge of its grid cell.
geo_df['lat_bin'] = geo_df['Latitude'].floordiv(mile_in_degrees).mul(mile_in_degrees)
geo_df['lon_bin'] = geo_df['Longitude'].floordiv(mile_in_degrees).mul(mile_in_degrees)

# Keep only arrests whose point lies within the union of the borough polygons.
geo_df = geo_df.loc[geo_df.geometry.within(boroughs.geometry.union_all())]

# Most frequent sex per grid cell (first value wins on ties), as a flat table.
common_sex_per_mile = (
    geo_df
    .groupby(['lat_bin', 'lon_bin'])['Sex']
    .agg(lambda sexes: sexes.mode().iloc[0])
    .reset_index()
)

# One point per grid cell, tagged with the CRS of the borough layer.
geometry = list(map(Point, zip(common_sex_per_mile['lon_bin'], common_sex_per_mile['lat_bin'])))
common_sex_geo_df = gpd.GeoDataFrame(
    common_sex_per_mile,
    geometry=geometry,
    crs=boroughs.crs,
)

# Sex values ordered by how many grid cells they dominate (not used by the plot below).
sex_order = common_sex_geo_df['Sex'].value_counts().index

# Colour-coded points over an OSM tile layer, legend in the top-left corner.
map_plot_sex = common_sex_geo_df.hvplot.points(
    x='lon_bin',
    y='lat_bin',
    title='Most Common Offender Sex Per Mile in NYC Boroughs (Subset)',
    legend='top_left',
    color='Sex',
    size=10,
    tiles='OSM',
    geo=True,
)

map_plot_sex

In [20]:
# Full data set: map the most common offender sex per grid cell.

# Borough polygons, used below to discard arrests located outside them.
boroughs = gpd.read_file(r"C:\Users\maria\Data_Project_1\Borough Boundaries")

# One shapely Point per arrest, built from (longitude, latitude) pairs.
geometry = [Point(xy) for xy in zip(clean_data_df['Longitude'], clean_data_df['Latitude'])]
geo_df = gpd.GeoDataFrame(clean_data_df, geometry=geometry)

# Grid step, applied to both axes.
# NOTE(review): 1/69 degree is roughly one mile of latitude; a degree of
# longitude is shorter at this latitude, so east-west cells are presumably
# narrower than a mile -- confirm this approximation is acceptable.
mile_in_degrees = 1 / 69

# Snap each coordinate down to the lower edge of its grid cell.
geo_df['lat_bin'] = (geo_df['Latitude'] // mile_in_degrees) * mile_in_degrees
geo_df['lon_bin'] = (geo_df['Longitude'] // mile_in_degrees) * mile_in_degrees

# Keep only arrests whose point lies within the union of the borough polygons.
geo_df = geo_df[geo_df.geometry.within(boroughs.geometry.union_all())]

# Most frequent sex per grid cell; mode() can tie, .iloc[0] keeps the first value.
common_sex_per_mile = geo_df.groupby(['lat_bin', 'lon_bin'])['Sex'].agg(lambda x: x.mode().iloc[0])

common_sex_per_mile = common_sex_per_mile.reset_index()

# One point per grid cell, tagged with the CRS of the borough layer.
geometry = [Point(xy) for xy in zip(common_sex_per_mile['lon_bin'], common_sex_per_mile['lat_bin'])]
common_sex_geo_df = gpd.GeoDataFrame(common_sex_per_mile, geometry=geometry, crs=boroughs.crs)

# Sex values ordered by how many grid cells they dominate (not used by the plot below).
sex_order = common_sex_geo_df['Sex'].value_counts().index

# This cell plots every cleaned arrest, so the title must not carry the
# "(Subset)" suffix used by the 100-row prototype cell.
map_plot_sex = common_sex_geo_df.hvplot.points(
    'lon_bin', 'lat_bin',
    geo=True,
    tiles='OSM',
    color='Sex',
    size=10,
    legend='top_left',
    title='Most Common Offender Sex Per Mile in NYC Boroughs'
)

map_plot_sex

In [18]:
# SUBSET: prototype the most-common-offense map on the first 100 cleaned arrests.
subset_df = clean_data_df.head(100)

# One shapely Point per arrest, built from (longitude, latitude) pairs.
geometry = [Point(xy) for xy in zip(subset_df['Longitude'], subset_df['Latitude'])]
geo_df = gpd.GeoDataFrame(subset_df, geometry=geometry)

# Grid step, applied to both axes.
# NOTE(review): 1/69 degree is roughly one mile of latitude; a degree of
# longitude is shorter at this latitude, so east-west cells are presumably
# narrower than a mile -- confirm this approximation is acceptable.
mile_in_degrees = 1 / 69

# Snap each coordinate down to the lower edge of its grid cell.
geo_df['lat_bin'] = (geo_df['Latitude'] // mile_in_degrees) * mile_in_degrees
geo_df['lon_bin'] = (geo_df['Longitude'] // mile_in_degrees) * mile_in_degrees

# Borough polygons, used to discard arrests located outside them.
boroughs = gpd.read_file(r"C:\Users\maria\Data_Project_1\Borough Boundaries")

# Keep only arrests whose point lies within the union of the borough polygons.
geo_df = geo_df[geo_df.geometry.within(boroughs.union_all())]

# Most frequent offense per grid cell; mode() can tie, .iloc[0] keeps the first value.
common_offense_per_mile = geo_df.groupby(['lat_bin', 'lon_bin'])['Offense'].agg(lambda x: x.mode().iloc[0])

common_offense_per_mile = common_offense_per_mile.reset_index()

# One point per grid cell, tagged with the CRS of the borough layer.
# The frame already holds exactly one row per (lat_bin, lon_bin), so no further
# per-cell aggregation is needed before plotting.
geometry = [Point(xy) for xy in zip(common_offense_per_mile['lon_bin'], common_offense_per_mile['lat_bin'])]
common_offense_geo_df = gpd.GeoDataFrame(common_offense_per_mile, geometry=geometry, crs=boroughs.crs)

# Offenses ordered by how many grid cells they dominate.
offense_order = common_offense_geo_df['Offense'].value_counts().index

# NOTE(review): confirm that hvplot honours `legend_sort_field` and a dict-valued
# `value_label` for a points plot; both are kept exactly as they were.
map_plot = common_offense_geo_df.hvplot.points(
    'lon_bin', 'lat_bin',
    geo=True,
    tiles='OSM',
    color='Offense',
    size=10,
    legend_sort_field='ascending',
    value_label={offense: offense for offense in offense_order},
    title='Most Common Offense Per Mile in NYC Boroughs (Subset)'
)

map_plot

  geo_df = geo_df[geo_df.geometry.within(boroughs.unary_union)]


In [11]:
# Full data set: map the most common offense per grid cell.

# Borough polygons, used below to discard arrests located outside them.
boroughs = gpd.read_file(r"C:\Users\maria\Data_Project_1\Borough Boundaries")

# One shapely Point per arrest, tagged with the CRS of the borough layer.
geometry = [Point(xy) for xy in zip(clean_data_df['Longitude'], clean_data_df['Latitude'])]
geo_df = gpd.GeoDataFrame(clean_data_df, geometry=geometry, crs=boroughs.crs)

# Grid step, applied to both axes.
# NOTE(review): 1/69 degree is roughly one mile of latitude; a degree of
# longitude is shorter at this latitude, so east-west cells are presumably
# narrower than a mile -- confirm this approximation is acceptable.
mile_in_degrees = 1 / 69

# Snap each coordinate down to the lower edge of its grid cell.
geo_df['lat_bin'] = (geo_df['Latitude'] // mile_in_degrees) * mile_in_degrees
geo_df['lon_bin'] = (geo_df['Longitude'] // mile_in_degrees) * mile_in_degrees

# Keep only arrests whose point lies within the union of the borough polygons.
# union_all() replaces the deprecated `unary_union` attribute and matches the
# call used by the other map cells in this notebook.
geo_df = geo_df[geo_df.geometry.within(boroughs.union_all())]

# Most frequent offense per grid cell; mode() can tie, .iloc[0] keeps the first value.
common_offense_per_mile = geo_df.groupby(['lat_bin', 'lon_bin'])['Offense'].agg(lambda x: x.mode().iloc[0])

common_offense_per_mile = common_offense_per_mile.reset_index()

# One point per grid cell, tagged with the CRS of the borough layer.
geometry = [Point(xy) for xy in zip(common_offense_per_mile['lon_bin'], common_offense_per_mile['lat_bin'])]
common_offense_geo_df = gpd.GeoDataFrame(common_offense_per_mile, geometry=geometry, crs=boroughs.crs)

# Build the label list from this cell's own result so the cell runs on a fresh
# kernel and covers every offense present in the full data, instead of reading
# a variable left behind by the subset cell.
offense_order = common_offense_geo_df['Offense'].value_counts().index

# NOTE(review): confirm that hvplot honours `legend_sort_field` for a points
# plot; it is kept exactly as it was.
map_plot = common_offense_geo_df.hvplot.points(
    'lon_bin', 'lat_bin',
    geo=True,
    tiles='OSM',
    color='Offense',
    size=10,
    legend_sort_field='ascending',
    legend_labels={offense: offense for offense in offense_order},
    title='Most Common Offense Per Mile in NYC Boroughs'
)

map_plot

