Preprocessing of the 2010 arrests dataset with Census Tract indicators

In [5]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point # used to find the corresponding spatial zone
import sys
import os # to use functions defined in other scripts

# Make the shared editing scripts importable.
# The scripts directory sits two levels above the working directory; resolve it
# to an absolute path and register it on sys.path only if it is not there yet.
edit_df_folder = os.path.abspath('../..')
if sys.path.count(edit_df_folder) == 0:
    sys.path.append(edit_df_folder)

# Import the editing scripts
from edit_funcs import *

# Resolve the data directories used by this notebook. All of them are given
# relative to the working directory and converted to absolute paths.
core_folder_arrests = os.path.abspath('../../core_datasets/arrests')
final_datasets_folder = os.path.abspath('../../final_datasets')
coordinates_maps_folder = os.path.abspath('../../coordinates_maps')
var_to_keep_folder = os.path.abspath('../')

# Register every directory on sys.path exactly once, in declaration order.
# Fix: the last guard previously re-tested coordinates_maps_folder, so
# var_to_keep_folder was never appended.
for _data_folder in (core_folder_arrests,
                     final_datasets_folder,
                     coordinates_maps_folder,
                     var_to_keep_folder):
    if _data_folder not in sys.path:
        sys.path.append(_data_folder)

Filter columns

In [None]:
# Location of the historic arrests extract inside the core arrests folder.
core_df_path = os.path.join(core_folder_arrests, "NYPD_Arrest_Data_Historic_20241202.csv")

# First list of variables to keep.
var_v1_to_keep_df = pd.read_csv(
    os.path.join(var_to_keep_folder, "arrests_var_to_keep.csv")
)

# Read the raw arrests file and filter its columns against that list.
df = FilterColumns(
    df=pd.read_csv(core_df_path),
    var_df=var_v1_to_keep_df,
)


FileNotFoundError: [Errno 2] No such file or directory: '../../arrests_var_to_keep.csv'

Filter rows: keep only year 2010

In [3]:
# Restrict the rows using ARREST_DATE and the single accepted value "2010".
df = FilterRowsContains(
    df=df,
    var_name="ARREST_DATE",
    accepted_var_values=["2010"],
)

Add MONTH variable

In [4]:
# Add the MONTH variable from ARREST_DATE, whose values use the
# month/day/four-digit-year format given below.
df = AddMONTH(
    df=df,
    date_var_name="ARREST_DATE",
    date_format='%m/%d/%Y',
)

Add CT indicator variables

In [5]:

# Load the census-tract layer into a GeoDataFrame.
# NOTE(review): the file name ends in .csv, so this is not obviously a GeoJSON
# source - confirm that read_file recovers the geometry and a CRS from it.
gdf = gpd.read_file(
    os.path.join(coordinates_maps_folder, "2020 Census Tracts_20241216.csv")
)

# Convert the arrests frame using its Longitude/Latitude columns, reusing the
# CRS reported by the tract layer so both sides share one reference system.
df = ConvertToGeodf(df, long="Longitude", lat="Latitude", crs=gdf.crs)

# Join by inclusion: arrest coordinates that fall inside the polygons of gdf.
df = SJoinWithinGeo(geodf_units=df, geodf2_polygons=gdf)

Second column filtering.
Remove the latitude and longitude coordinate variables.
Remove the complete date-time variable.
(WARNING: we are using a different columns-to-keep file here: arrests_v2_var_to_keep.csv.)
Here we keep both the Census tract variable (CTLabel) and the NTA variable (NTA2020).

In [6]:
# Second (v2) list of variables to keep, read from the working directory.
var_v2_to_keep_df = pd.read_csv("arrests_v2_var_to_keep.csv")

# Filter the columns of the joined frame against the v2 list.
df = FilterColumns(
    df=df,
    var_df=var_v2_to_keep_df,
)

Look for missing values and the most frequent values in each column.

In [7]:
# Print the frequency table of every column.
# dropna=False is required here: value_counts() excludes NaN by default, which
# would hide genuinely missing entries in a cell meant to reveal them.
for column in df.columns:
    column_counts = df[column].value_counts(dropna=False)
    print(f"Unique values in column '{column}': {column_counts}")

Unique values in column 'CRM_ATPT_CPTD_CD': CRM_ATPT_CPTD_CD
COMPLETED    407023
ATTEMPTED      6616
Name: count, dtype: int64
Unique values in column 'JURISDICTION_CODE': JURISDICTION_CODE
0     372249
2      30684
1       7224
97      1551
3       1041
88       252
72       214
14       150
4         87
11        79
15        26
87        21
13        18
12        15
9         13
6          6
85         5
7          4
Name: count, dtype: int64
Unique values in column 'KY_CD': KY_CD
341    82008
578    66799
344    43381
109    35738
351    35649
       ...  
102        2
234        2
357        1
676        1
571        1
Name: count, Length: 64, dtype: int64
Unique values in column 'LAW_CAT_CD': LAW_CAT_CD
MISDEMEANOR    210678
FELONY         135643
VIOLATION       67318
Name: count, dtype: int64
Unique values in column 'LOC_OF_OCCUR_DESC': LOC_OF_OCCUR_DESC
INSIDE         217057
FRONT OF       111733
(null)          66113
OPPOSITE OF     10031
REAR OF          8382
OUTSIDE         

First, we standardize missing values to UNKNOWN.

In [8]:
# Map the literal "(null)" placeholder to the UNKNOWN label across the whole
# frame, modifying df in place.
df.replace(to_replace=['(null)'], value='UNKNOWN', inplace=True)

Second, we replace unintelligible age group values with UNKNOWN.

In [9]:
# AGE_GROUP entries to be relabelled as UNKNOWN.
invalid_age_groups = [
    "2020", "2019", "-977", "-962", "-71", "-12", "-942", "1020", "-965", "1925", "-928",
    "-948", "-967", "-4", "-958", "943", "-968", "949", "-973", "-2", "932", "-31", "-938",
    "1016", "1014", "-60", "-1", "938", "950", "-963",
]

# Assign the result back to the column instead of calling replace(inplace=True)
# on df["AGE_GROUP"]: that chained in-place form acts on a temporary Series,
# triggers a FutureWarning in recent pandas and leaves df unchanged once
# Copy-on-Write is enabled.
df["AGE_GROUP"] = df["AGE_GROUP"].replace(invalid_age_groups, 'UNKNOWN')

In [10]:
# Print the frequency table of every column again to check the cleaning steps.
# dropna=False keeps NaN in the counts; value_counts() would otherwise drop it
# and any remaining missing entries would go unnoticed.
for column in df.columns:
    column_counts = df[column].value_counts(dropna=False)
    print(f"Unique values in column '{column}': {column_counts}")

Unique values in column 'CRM_ATPT_CPTD_CD': CRM_ATPT_CPTD_CD
COMPLETED    407023
ATTEMPTED      6616
Name: count, dtype: int64
Unique values in column 'JURISDICTION_CODE': JURISDICTION_CODE
0     372249
2      30684
1       7224
97      1551
3       1041
88       252
72       214
14       150
4         87
11        79
15        26
87        21
13        18
12        15
9         13
6          6
85         5
7          4
Name: count, dtype: int64
Unique values in column 'KY_CD': KY_CD
341    82008
578    66799
344    43381
109    35738
351    35649
       ...  
102        2
234        2
357        1
676        1
571        1
Name: count, Length: 64, dtype: int64
Unique values in column 'LAW_CAT_CD': LAW_CAT_CD
MISDEMEANOR    210678
FELONY         135643
VIOLATION       67318
Name: count, dtype: int64
Unique values in column 'LOC_OF_OCCUR_DESC': LOC_OF_OCCUR_DESC
INSIDE         217057
FRONT OF       111733
UNKNOWN         66113
OPPOSITE OF     10031
REAR OF          8382
OUTSIDE         

Save to csv

In [11]:
# Write the processed frame, without the index column, to the core arrests
# folder.
output_df_path = os.path.join(core_folder_arrests, "arrests_2010_cta.csv")
df.to_csv(path_or_buf=output_df_path, index=False)