Preprocessing of the arrest 2010 Census Tracts dataset

In [36]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point # used to find the corresponding spatial zone
import sys
import os # to use functions defined in other scripts

# add scripts dir to path
# Add the relative path to the system path
edit_df_folder = os.path.abspath('../..')
if edit_df_folder not in sys.path:
    sys.path.append(edit_df_folder)

# Import the editing scripts
from edit_funcs import *

# add data subdirectories to path    
core_folder_arrests =  os.path.abspath('../../../core_datasets/arrests')
if core_folder_arrests not in sys.path:
    sys.path.append(core_folder_arrests)
    
final_datasets_folder =  os.path.abspath('../../../final_datasets')
if final_datasets_folder not in sys.path:
    sys.path.append(final_datasets_folder)
    
coordinates_maps_folder =  os.path.abspath('../../../coordinates_maps')
if coordinates_maps_folder not in sys.path:
    sys.path.append(coordinates_maps_folder)
    
var_to_keep_folder =  os.path.abspath('../')
if coordinates_maps_folder not in sys.path:
    sys.path.append(coordinates_maps_folder)

census_folder =  os.path.abspath('../../../census_data')
if census_folder not in sys.path:
    sys.path.append(census_folder)

census_var_to_keep_folder =  os.path.abspath('../../census')
if census_var_to_keep_folder not in sys.path:
    sys.path.append(census_var_to_keep_folder)


# Absolute paths of files
# Actual data of interest
core_df_path = os.path.join(core_folder_arrests,
                                   "NYPD_Arrests_Data__Historic__20241202.csv")

# first processing
var_v1_to_keep_path = os.path.join(var_to_keep_folder,"arrests_var_to_keep.csv")
# second processing
var_v2_to_keep_path = os.path.join(var_to_keep_folder,"arrests_v2_var_to_keep.csv")

coordinates_file_path = os.path.join(coordinates_maps_folder, "2020 Census Tracts_20241216.geojson")

output_df_path = os.path.join(core_folder_arrests,
                                   "arrests_2010_cta.csv")

# census related
census_data_path = os.path.join(census_folder,
                                "nyc_decennialcensusdata_2010_core-geographies.CSV")
census_var_to_keep_path = os.path.join(census_var_to_keep_folder,
                                "census_variables_to_keep.CSV")


# Other useful variables
date_variable_name = "ARREST_DATE"
age_group_variable_name = "AGE_GROUP"

considered_years_list = ["2010"]

# coordinates name variable: we only keep this info
census_coord_var_name = "GeoType"
census_coord_value_to_keep = ["CT2020"]

# join by space keys: they should have the same variable
df_space_var_name = "geoid"
census_df_space_var_name = "GeoID"

Filter columns

In [37]:


# load first df of variables to keep

var_v1_to_keep_df = pd.read_csv(var_v1_to_keep_path)

df = FilterColumns(df = pd.read_csv(core_df_path),
                   var_df = var_v1_to_keep_df)


Filter rows: keep only year 2010

In [38]:
df = FilterRowsContains(df = df,
                        var_name = date_variable_name,
                        accepted_var_values = considered_years_list)

Add MONTH variable

In [39]:
df = AddMONTH(df = df,
    date_var_name= date_variable_name,
    date_format = '%m/%d/%Y')

Add CT indicator variables

In [40]:

# Read the GeoJSON file into a GeoDataFrame
gdf = gpd.read_file(coordinates_file_path)

df = ConvertToGeodf(df,
                    long = "Longitude",
                    lat = "Latitude",
                    crs = gdf.crs)
# actually join the two by inclusion:
# df coordinates which are in polygons defined by gdf data
df = SJoinWithinGeo(geodf_units = df,
                    geodf2_polygons = gdf)

Second column filtering.
Remove latitude and longitude coordinates variables.
Remove complete date time variable.
(WARNING: we're using a different columns to keep file: arrests_v2).
Here we keep both Census tract variable (CTLabel) both NTA variable (NTA2020).

In [41]:
# load first df of variables to keep
var_v2_to_keep_df = pd.read_csv(var_v2_to_keep_path)

# filter columns
df = FilterColumns(df = df, var_df = var_v2_to_keep_df)

Look for missing and most likely values.

In [42]:
for column in df.columns:
    print(f"Unique values in column '{column}': { df[column].value_counts()}")

Unique values in column 'ARREST_DATE': ARREST_DATE
2010-01-20    1773
2010-05-20    1725
2010-03-05    1700
2010-01-22    1694
2010-03-03    1661
              ... 
2010-12-24     356
2010-11-25     280
2010-12-26     224
2010-12-25     171
2010-12-27      91
Name: count, Length: 365, dtype: int64
Unique values in column 'KY_CD': KY_CD
235.0    82064
344.0    36842
343.0    26019
677.0    24634
117.0    22939
         ...  
102.0        7
349.0        5
882.0        4
577.0        2
357.0        1
Name: count, Length: 69, dtype: int64
Unique values in column 'LAW_CAT_CD': LAW_CAT_CD
M    292227
F     97524
V     29548
I      1890
Name: count, dtype: int64
Unique values in column 'AGE_GROUP': AGE_GROUP
25-44    183083
18-24    118629
45-64     75032
<18       42577
65+        3001
Name: count, dtype: int64
Unique values in column 'PERP_SEX': PERP_SEX
M    352850
F     69472
Name: count, dtype: int64
Unique values in column 'PERP_RACE': PERP_RACE
BLACK                             208789


First we uniform missing values to UNKNOWN

In [43]:
df.replace(['(null)'], 'UNKNOWN', inplace=True)

Second we remove non understandable age group values

In [44]:
df[age_group_variable_name].replace(["2020", "2019", "-977", "-962", "-71", "-12", "-942", "1020", "-965", "1925", "-928",
            "-948", "-967", "-4", "-958", "943", "-968", "949", "-973", "-2", "932", "-31", "-938",
            "1016", "1014", "-60", "-1", "938", "950", "-963"],
           'UNKNOWN',
           inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[age_group_variable_name].replace(["2020", "2019", "-977", "-962", "-71", "-12", "-942", "1020", "-965", "1925", "-928",


In [45]:
for column in df.columns:
    print(f"Unique values in column '{column}': { df[column].value_counts()}")

Unique values in column 'ARREST_DATE': ARREST_DATE
2010-01-20    1773
2010-05-20    1725
2010-03-05    1700
2010-01-22    1694
2010-03-03    1661
              ... 
2010-12-24     356
2010-11-25     280
2010-12-26     224
2010-12-25     171
2010-12-27      91
Name: count, Length: 365, dtype: int64
Unique values in column 'KY_CD': KY_CD
235.0    82064
344.0    36842
343.0    26019
677.0    24634
117.0    22939
         ...  
102.0        7
349.0        5
882.0        4
577.0        2
357.0        1
Name: count, Length: 69, dtype: int64
Unique values in column 'LAW_CAT_CD': LAW_CAT_CD
M    292227
F     97524
V     29548
I      1890
Name: count, dtype: int64
Unique values in column 'AGE_GROUP': AGE_GROUP
25-44    183083
18-24    118629
45-64     75032
<18       42577
65+        3001
Name: count, dtype: int64
Unique values in column 'PERP_SEX': PERP_SEX
M    352850
F     69472
Name: count, dtype: int64
Unique values in column 'PERP_RACE': PERP_RACE
BLACK                             208789


Join with selected census dataset by space location

In [46]:
# read census data
census_df = pd.read_csv(census_data_path, 
                        sep = ";")

# filter type of coordinates
census_df = FilterRowsContains(df = census_df,
                               var_name = census_coord_var_name,
                               accepted_var_values = census_coord_value_to_keep)

# read census variables to keep
census_var_to_keep_df = pd.read_csv(census_var_to_keep_path,
                                    sep = ";")

# filter columns

census_df = FilterColumns(df = census_df,
                          var_df = census_var_to_keep_df)

# Ensure key columns have the same data type 
df[df_space_var_name] = df[df_space_var_name].astype(str)
census_df[census_df_space_var_name] = census_df[census_df_space_var_name].astype(str)

# debug
print(df[df_space_var_name].head())
print(census_df[census_df_space_var_name].head())

# join with df based on GeoID
df = pd.merge(df, census_df,
              left_on = df_space_var_name,
              right_on = census_df_space_var_name,
              how = "inner")

  census_df = pd.read_csv(census_data_path,


471     36005031900
478     36047030700
545     36061010900
1452    36047001501
1453    36005006500
Name: geoid, dtype: object
390    36005000100
391    36005000200
392    36005000400
393    36005001600
394    36005001901
Name: GeoID, dtype: object


Save to csv

In [47]:

df.to_csv(output_df_path, index = False)