In [1]:
import pandas as pd

DATA = '/kaggle/input/nypd-complaint-data-historic/NYPD_Complaint_Data_Historic.csv'

# Load every complaint record, parsing the report date so it can be bucketed by year.
# low_memory=False makes pandas infer each column's dtype from the whole column
# rather than chunk by chunk, so no column ends up with mixed types
# (presumably the cause of the warning this call emitted without it).
df = pd.read_csv(
    filepath_or_buffer=DATA,
    parse_dates=['RPT_DT'],
    low_memory=False,
)
df['year'] = df['RPT_DT'].dt.year  # calendar year taken from the RPT_DT column
df.shape

  df = pd.read_csv(filepath_or_buffer=DATA, parse_dates=['RPT_DT'])


(8914838, 36)

We have so many records; let's just look at one year's worth.

In [2]:
# Keep only the rows whose derived `year` column equals 2023.
year_df = df.loc[df['year'].eq(2023)]

In [3]:
# Number of rows per `year`, ordered by year and shown as one wide row.
(
    df['year']
    .value_counts()
    .sort_index()
    .to_frame()
    .T
)

year,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
count,530891,536330,529973,512985,509731,498589,504351,497264,491332,478583,478934,469017,464138,461926,413639,450042,531996,555117


Let's make a map of our one year of data. A half-million points is too many to plot on an interactive map, so we have to take a sample.

In [4]:
from plotly import express
from plotly import io

io.renderers.default = 'iframe'

# Map a 5,000-row sample of the year's complaints, coloured by BORO_NM.
# random_state pins the sample so the figure is identical on every
# Restart & Run All instead of changing with each execution.
express.scatter_mapbox(
    data_frame=year_df.sample(n=5000, random_state=0),
    lat='Latitude',
    lon='Longitude',
    color='BORO_NM',
    mapbox_style='open-street-map',
    height=800,
    zoom=10,
)

We can see the boroughs pretty clearly. Let's plot by offense classification code (`KY_CD`).

In [5]:
# Map a 5,000-row sample of the year's complaints, coloured by KY_CD.
# random_state pins the sample so the figure is reproducible across runs.
express.scatter_mapbox(
    data_frame=year_df.sample(n=5000, random_state=0),
    lat='Latitude',
    lon='Longitude',
    color='KY_CD',
    mapbox_style='open-street-map',
    height=800,
    zoom=10,
)

In [6]:
# Map a 5,000-row sample of the year's complaints, coloured by PD_CD.
# random_state pins the sample so the figure is reproducible across runs.
express.scatter_mapbox(
    data_frame=year_df.sample(n=5000, random_state=0),
    lat='Latitude',
    lon='Longitude',
    color='PD_CD',
    mapbox_style='open-street-map',
    height=800,
    zoom=10,
)

What are our top crimes by code?

In [7]:
# Ten most frequent PD_CD values in the selected year, shown as one wide row.
(
    year_df['PD_CD']
    .value_counts()
    .iloc[:10]
    .to_frame()
    .T
)

PD_CD,638.0,333.0,101.0,109.0,637.0,639.0,922.0,259.0,441.0,198.0
count,64807,51234,44137,20770,19147,16548,13683,12389,11429,10892


What are they by name?

In [8]:
# Ten most frequent PD_DESC values in the selected year, shown as one wide row.
(
    year_df['PD_DESC']
    .value_counts()
    .iloc[:10]
    .to_frame()
    .T
)

PD_DESC,"HARASSMENT,SUBD 3,4,5","LARCENY,PETIT FROM STORE-SHOPL",ASSAULT 3,"ASSAULT 2,1,UNCLASSIFIED","HARASSMENT,SUBD 1,CIVILIAN",AGGRAVATED HARASSMENT 2,"TRAFFIC,UNCLASSIFIED MISDEMEAN","CRIMINAL MISCHIEF,UNCLASSIFIED 4","LARCENY,GRAND OF AUTO",CRIMINAL CONTEMPT 1
count,64807,51234,44137,20770,19147,16548,13683,12389,11429,10892


Let's try plotting our GTAs (grand theft auto, recorded here as `LARCENY,GRAND OF AUTO`).

In [9]:
# Map every 'LARCENY,GRAND OF AUTO' complaint for the year.
# Colour by BORO_NM rather than PD_CD: after filtering to a single PD_DESC the
# PD_CD value appears to be constant (TODO confirm), so it would colour every
# point the same, whereas borough shows the geographic split.
express.scatter_mapbox(
    data_frame=year_df.loc[year_df['PD_DESC'] == 'LARCENY,GRAND OF AUTO'],
    lat='Latitude',
    lon='Longitude',
    color='BORO_NM',
    mapbox_style='open-street-map',
    height=800,
    zoom=10,
)

Wow. They are not evenly distributed geographically. Let's try counting them by borough.

In [10]:
# Count of 'LARCENY,GRAND OF AUTO' complaints per BORO_NM, shown as one wide row.
(
    year_df
    .loc[year_df['PD_DESC'] == 'LARCENY,GRAND OF AUTO', 'BORO_NM']
    .value_counts()
    .to_frame()
    .T
)

BORO_NM,BRONX,QUEENS,BROOKLYN,MANHATTAN,STATEN ISLAND
count,3991,3276,2599,1135,428
