In [28]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt

# Notebook display limits: render at most 50 rows and 500 columns per frame.
for option_name, option_value in (
    ("display.max_rows", 50),
    ("display.max_columns", 500),
):
    pd.set_option(option_name, option_value)


Fonte dos dados: https://data.world/data-society/nyc-crime-data

In [112]:
# Load the NYPD complaint records from the local CSV and preview the first rows.
CRIME_CSV_PATH = "data/NYPD_Complaint_Data_Current_YTD.csv"
tb_crime = pd.read_csv(CRIME_CSV_PATH)
tb_crime.head()


Unnamed: 0,CMPLNT_NUM,CMPLNT_FR_DT,CMPLNT_FR_TM,CMPLNT_TO_DT,CMPLNT_TO_TM,RPT_DT,KY_CD,OFNS_DESC,PD_CD,PD_DESC,CRM_ATPT_CPTD_CD,LAW_CAT_CD,JURIS_DESC,BORO_NM,ADDR_PCT_CD,LOC_OF_OCCUR_DESC,PREM_TYP_DESC,PARKS_NM,HADEVELOPT,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Lat_Lon
0,736216184,09/30/2016,23:25:00,09/30/2016,23:25:00,09/30/2016,236,DANGEROUS WEAPONS,782.0,"WEAPONS, POSSESSION, ETC",COMPLETED,MISDEMEANOR,N.Y. TRANSIT POLICE,BRONX,42.0,,TRANSIT - NYC SUBWAY,,,1015308.0,244373.0,40.837376,-73.887761,"(40.837376359, -73.887760929)"
1,294332956,09/30/2016,23:16:00,09/30/2016,23:21:00,09/30/2016,344,ASSAULT 3 & RELATED OFFENSES,101.0,ASSAULT 3,COMPLETED,MISDEMEANOR,N.Y. POLICE DEPT,BROOKLYN,71.0,OPPOSITE OF,STREET,,,997932.0,180172.0,40.661205,-73.950687,"(40.661204871, -73.950686652)"
2,852981427,09/30/2016,23:00:00,09/30/2016,23:05:00,09/30/2016,235,DANGEROUS DRUGS,567.0,"MARIJUANA, POSSESSION 4 & 5",COMPLETED,MISDEMEANOR,N.Y. HOUSING POLICE,BRONX,43.0,INSIDE,RESIDENCE - PUBLIC HOUSING,,CASTLE HILL,1025580.0,236918.0,40.816872,-73.850685,"(40.816872438, -73.850684927)"
3,369976063,09/30/2016,23:00:00,,,09/30/2016,118,DANGEROUS WEAPONS,793.0,WEAPONS POSSESSION 3,COMPLETED,FELONY,N.Y. POLICE DEPT,QUEENS,103.0,,STREET,,,1038464.0,192970.0,40.696177,-73.804492,"(40.696177006, -73.804492266)"
4,117213771,09/30/2016,23:00:00,09/30/2016,23:10:00,09/30/2016,578,HARRASSMENT 2,637.0,"HARASSMENT,SUBD 1,CIVILIAN",COMPLETED,VIOLATION,N.Y. POLICE DEPT,QUEENS,110.0,FRONT OF,STREET,,,1016301.0,209428.0,40.741458,-73.884339,"(40.741458245, -73.884339073)"


In [113]:
# Convert the three date columns to datetime in place. With errors="coerce",
# values that cannot be parsed become NaT instead of raising.
for date_col in ("CMPLNT_FR_DT", "CMPLNT_TO_DT", "RPT_DT"):
    tb_crime[date_col] = pd.to_datetime(tb_crime[date_col], errors="coerce")


# Dataset para Clusterização Hierárquica

Dataset agregado por distrito policial (precinct) e tipo de crime, para uso na clusterização hierárquica.

In [114]:
# An offense type is kept only if it has more than this many complaints.
MIN_COMPLAINTS_PER_OFFENSE = 10000

# Count complaints per offense description once and reuse the counts for both
# the boolean mask and the selection. `relevant_crimes` is the Index of
# offense descriptions whose count exceeds the threshold.
offense_counts = tb_crime["OFNS_DESC"].value_counts()
relevant_crimes = offense_counts[offense_counts > MIN_COMPLAINTS_PER_OFFENSE].index


In [115]:
# Number of complaints for every (offense, precinct) pair. After the count,
# the CMPLNT_NUM column holds the number of complaints, not an identifier.
tb_crime_agg = (
    tb_crime.groupby(["OFNS_DESC", "ADDR_PCT_CD"])["CMPLNT_NUM"].count().reset_index()
)

# Keep only the frequent offense types selected in `relevant_crimes`.
tb_crime_agg = tb_crime_agg[tb_crime_agg["OFNS_DESC"].isin(relevant_crimes)].copy()

# Total number of (relevant) complaints per precinct, broadcast to every row.
# The string "sum" is used instead of the Python builtin `sum`: passing the
# builtin to groupby transform is deprecated in recent pandas versions.
tb_crime_agg["NUM_CRIME_PCT"] = tb_crime_agg.groupby("ADDR_PCT_CD")[
    "CMPLNT_NUM"
].transform("sum")

# Share of each offense type within its precinct.
tb_crime_agg["PER_CMPLNT"] = tb_crime_agg["CMPLNT_NUM"] / tb_crime_agg["NUM_CRIME_PCT"]

# Normalise offense names to lowercase snake_case so they can serve as column
# names after pivoting (vectorised string methods, literal replacement).
tb_crime_agg["OFNS_DESC"] = (
    tb_crime_agg["OFNS_DESC"].str.lower().str.replace(" ", "_", regex=False)
)


In [116]:
# Reshape to one row per precinct and one column per offense type, holding the
# offense's share of the precinct's complaints, then write it to disk.
# NOTE(review): precinct/offense pairs absent from tb_crime_agg come out of the
# pivot as NaN (empty cells in the CSV) - confirm the consumer handles them.
crime_share_by_pct = tb_crime_agg.pivot(
    index="ADDR_PCT_CD",
    columns="OFNS_DESC",
    values="PER_CMPLNT",
)
crime_share_by_pct.reset_index().to_csv("data/nypd_crime_pct.csv", index=False)


# Dados para DBSCAN

Utilizar a latitude/longitude dos BOs (boletins de ocorrência) em Nova Iorque para identificar clusters geográficos com alta incidência de crimes.

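Um grau de latitude equivale a aproximadamente 111.139 m. Atenção: um grau de longitude só tem esse comprimento no equador; na latitude de Nova Iorque (~40,7° N) ele mede cerca de 84 km.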

In [123]:
# Murder complaints that have usable coordinates, with the coordinate columns
# renamed to LAT / LONG.
is_murder = tb_crime["OFNS_DESC"] == "MURDER & NON-NEGL. MANSLAUGHTER"
tb_crime_nona = (
    tb_crime.loc[is_murder]
    .rename(columns={"Latitude": "LAT", "Longitude": "LONG"})
    .dropna(subset=["LAT", "LONG"])
    .copy()
)

# NOTE(review): a notebook renders only the last expression of a cell, so this
# preview shows nothing while other statements follow it in the same cell.
tb_crime_nona.head()
# Columns of interest to export. Each name appears exactly once: a repeated
# name would produce a duplicated column in the exported CSV.
int_vars = [
    "CMPLNT_NUM",
    "CMPLNT_FR_DT",
    "LOC_OF_OCCUR_DESC",
    "BORO_NM",
    "ADDR_PCT_CD",
    "LAT",
    "LONG",
]


In [124]:
# Write the selected columns of the murder records to disk, without the index.
murder_locations = tb_crime_nona[int_vars]
murder_locations.to_csv("data/nypd_murder_locations.csv", index=False)
