Preprocessing of the arrest dataset: copy the json file in this folder, change its parameters and execute this script to save the new dataset

In [9]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point # used to find the corresponding spatial zone
import json
import sys
import os # to use functions defined in other scripts

# add scripts dir to path
# Add the relative path to the system path
# Load configuration from JSON file 
with open('arrests_nta_2010_2020_pars.json', 'r') as f: 
    config = json.load(f) # Add directories to the system path 
    for folder in config['folders'].values():
        abs_path = os.path.abspath(folder)
        if abs_path not in sys.path:
            sys.path.append(abs_path)


import os
import sys
import json

# Load configuration from JSON file
with open('arrests_nta_2010_2020_pars.json', 'r') as f:
    config = json.load(f)

# Add directories to the system path
for folder in config['folders'].values():
    abs_path = os.path.abspath(folder)
    if abs_path not in sys.path:
        sys.path.append(abs_path)

# Import the editing scripts
from edit_funcs import *

# Absolute paths of files
core_df_path = os.path.join(os.path.abspath(config['folders']['core_folder_arrests']), config['files']['core_df_path'])
var_v1_to_keep_path = os.path.join(os.path.abspath(config['folders']['var_to_keep_folder']), config['files']['var_v1_to_keep_path'])
var_v2_to_keep_path = os.path.join(os.path.abspath(config['folders']['var_to_keep_folder']), config['files']['var_v2_to_keep_path'])
coordinates_file_path = os.path.join(os.path.abspath(config['folders']['coordinates_maps_folder']), config['files']['coordinates_file_path'])
output_df_path = os.path.join(os.path.abspath(config['folders']['final_datasets_folder']), config['files']['output_df_path'])
census_data_path = os.path.join(os.path.abspath(config['folders']['census_folder']), config['files']['census_data_path'])
census_var_to_keep_path = os.path.join(os.path.abspath(config['folders']['census_var_to_keep_folder']), config['files']['census_var_to_keep_path'])

# Other useful variables
date_variable_name = config['variables']['date_variable_name']
age_group_variable_name = config['variables']['age_group_variable_name']
considered_years_list = config['variables']['considered_years_list']
census_coord_var_name = config['variables']['census_coord_var_name']
census_coord_value_to_keep = config['variables']['census_coord_value_to_keep']
df_space_var_name = config['variables']['df_space_var_name']
census_df_space_var_name = config['variables']['census_df_space_var_name']


Filter columns

In [10]:


# load first df of variables to keep

var_v1_to_keep_df = pd.read_csv(var_v1_to_keep_path)

df = FilterColumns(df = pd.read_csv(core_df_path),
                   var_df = var_v1_to_keep_df)


In [11]:
print(df.head())

  ARREST_DATE  KY_CD LAW_CAT_CD AGE_GROUP PERP_SEX       PERP_RACE   Latitude  \
0  08/07/2018    NaN          F     45-64        M           BLACK  40.671110   
1  11/13/2020  105.0          F     25-44        M           BLACK  40.810398   
2  07/01/2019    NaN          F     25-44        M  BLACK HISPANIC  40.789348   
3  02/22/2020  235.0          M     25-44        M           BLACK  40.829163   
4  11/10/2020  344.0          M     25-44        M           WHITE  40.854826   

   Longitude  
0 -73.915881  
1 -73.924895  
2 -73.947352  
3 -73.937272  
4 -73.854880  


Filter rows: keep only years selected

In [12]:
#df = FilterRowsContains(df = df,
                        #var_name = date_variable_name,
                        #accepted_var_values = considered_years_list)

Add MONTH variable

In [13]:
df = AddMONTH(df = df,
    date_var_name= date_variable_name,
    date_format = '%m/%d/%Y')

In [14]:
print(df.head())

  ARREST_DATE  KY_CD LAW_CAT_CD AGE_GROUP PERP_SEX       PERP_RACE   Latitude  \
0  2018-08-07    NaN          F     45-64        M           BLACK  40.671110   
1  2020-11-13  105.0          F     25-44        M           BLACK  40.810398   
2  2019-07-01    NaN          F     25-44        M  BLACK HISPANIC  40.789348   
3  2020-02-22  235.0          M     25-44        M           BLACK  40.829163   
4  2020-11-10  344.0          M     25-44        M           WHITE  40.854826   

   Longitude  MONTH  
0 -73.915881      8  
1 -73.924895     11  
2 -73.947352      7  
3 -73.937272      2  
4 -73.854880     11  


Add YEAR variable

In [15]:
df = AddYEAR(df = df,
    date_var_name= date_variable_name,
    date_format = '%m/%d/%Y')

In [16]:
print(df.head())

  ARREST_DATE  KY_CD LAW_CAT_CD AGE_GROUP PERP_SEX       PERP_RACE   Latitude  \
0  2018-08-07    NaN          F     45-64        M           BLACK  40.671110   
1  2020-11-13  105.0          F     25-44        M           BLACK  40.810398   
2  2019-07-01    NaN          F     25-44        M  BLACK HISPANIC  40.789348   
3  2020-02-22  235.0          M     25-44        M           BLACK  40.829163   
4  2020-11-10  344.0          M     25-44        M           WHITE  40.854826   

   Longitude  MONTH  YEAR  
0 -73.915881      8  2018  
1 -73.924895     11  2020  
2 -73.947352      7  2019  
3 -73.937272      2  2020  
4 -73.854880     11  2020  


Add NTA indicator variables

In [17]:

# Read the GeoJSON file into a GeoDataFrame
gdf = gpd.read_file(coordinates_file_path)

df = ConvertToGeodf(df,
                    long = "Longitude",
                    lat = "Latitude",
                    crs = gdf.crs)
# actually join the two by inclusion:
# df coordinates which are in polygons defined by gdf data
df = SJoinWithinGeo(geodf_units = df,
                    geodf2_polygons = gdf)

In [18]:
print(df.head())

  ARREST_DATE  KY_CD LAW_CAT_CD AGE_GROUP PERP_SEX       PERP_RACE   Latitude  \
0  2018-08-07    NaN          F     45-64        M           BLACK  40.671110   
1  2020-11-13  105.0          F     25-44        M           BLACK  40.810398   
2  2019-07-01    NaN          F     25-44        M  BLACK HISPANIC  40.789348   
3  2020-02-22  235.0          M     25-44        M           BLACK  40.829163   
4  2020-11-10  344.0          M     25-44        M           WHITE  40.854826   

   Longitude  MONTH  YEAR  ...   BoroName  CountyFIPS  NTA2020  \
0 -73.915881      8  2018  ...   Brooklyn         047   BK1601   
1 -73.924895     11  2020  ...      Bronx         005   BX0101   
2 -73.947352      7  2019  ...  Manhattan         061   MN1101   
3 -73.937272      2  2020  ...  Manhattan         061   MN1002   
4 -73.854880     11  2020  ...      Bronx         005   BX1102   

                  NTAName  NTAAbbrev NTAType CDTA2020  \
0              Ocean Hill      OcnHl       0     BK16   
1 

Second column filtering.
Remove latitude and longitude coordinates variables.
Remove complete date time variable.
(WARNING: we're using a different columns to keep file: arrests_v2).
Here we keep both Census tract variable (CTLabel) both NTA variable (NTA2020).

In [19]:
# load first df of variables to keep
var_v2_to_keep_df = pd.read_csv(var_v2_to_keep_path)

# filter columns
df = FilterColumns(df = df, var_df = var_v2_to_keep_df)

In [20]:
print(df.head())

  ARREST_DATE  KY_CD LAW_CAT_CD AGE_GROUP PERP_SEX       PERP_RACE   Latitude  \
0  2018-08-07    NaN          F     45-64        M           BLACK  40.671110   
1  2020-11-13  105.0          F     25-44        M           BLACK  40.810398   
2  2019-07-01    NaN          F     25-44        M  BLACK HISPANIC  40.789348   
3  2020-02-22  235.0          M     25-44        M           BLACK  40.829163   
4  2020-11-10  344.0          M     25-44        M           WHITE  40.854826   

   Longitude NTA2020  MONTH  YEAR  
0 -73.915881  BK1601      8  2018  
1 -73.924895  BX0101     11  2020  
2 -73.947352  MN1101      7  2019  
3 -73.937272  MN1002      2  2020  
4 -73.854880  BX1102     11  2020  


Look for missing and most likely values.

In [21]:
for column in df.columns:
    print(f"Unique values in column '{column}': { df[column].value_counts()}")

Unique values in column 'ARREST_DATE': ARREST_DATE
2010-01-20    1773
2009-05-13    1772
2012-03-07    1750
2009-02-11    1738
2012-02-01    1726
              ... 
2012-10-29     150
2021-02-01     139
2006-12-25     138
2014-12-25     136
2010-12-27      91
Name: count, Length: 6574, dtype: int64
Unique values in column 'KY_CD': KY_CD
235.0    819593
344.0    606989
343.0    320160
117.0    305947
341.0    279096
          ...  
357.0        12
577.0        11
123.0        10
575.0         1
362.0         1
Name: count, Length: 76, dtype: int64
Unique values in column 'LAW_CAT_CD': LAW_CAT_CD
M         3719626
F         1658694
V          295559
I           26974
9            1067
(null)          2
Name: count, dtype: int64
Unique values in column 'AGE_GROUP': AGE_GROUP
25-44    2722384
18-24    1448162
45-64    1063911
<18       438174
65+        52696
          ...   
959            1
910            1
309            1
446            1
330            1
Name: count, Length: 91, dtype

First we uniform missing values to UNKNOWN

In [22]:
df.replace(['(null)'], 'UNKNOWN', inplace=True)

Second we remove non understandable age group values

In [23]:
df[age_group_variable_name].replace(["2020", "2019", "-977", "-962", "-71", "-12", "-942", "1020", "-965", "1925", "-928",
            "-948", "-967", "-4", "-958", "943", "-968", "949", "-973", "-2", "932", "-31", "-938",
            "1016", "1014", "-60", "-1", "938", "950", "-963"],
           'UNKNOWN',
           inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[age_group_variable_name].replace(["2020", "2019", "-977", "-962", "-71", "-12", "-942", "1020", "-965", "1925", "-928",


In [24]:
for column in df.columns:
    print(f"Unique values in column '{column}': { df[column].value_counts()}")

Unique values in column 'ARREST_DATE': ARREST_DATE
2010-01-20    1773
2009-05-13    1772
2012-03-07    1750
2009-02-11    1738
2012-02-01    1726
              ... 
2012-10-29     150
2021-02-01     139
2006-12-25     138
2014-12-25     136
2010-12-27      91
Name: count, Length: 6574, dtype: int64
Unique values in column 'KY_CD': KY_CD
235.0    819593
344.0    606989
343.0    320160
117.0    305947
341.0    279096
          ...  
357.0        12
577.0        11
123.0        10
575.0         1
362.0         1
Name: count, Length: 76, dtype: int64
Unique values in column 'LAW_CAT_CD': LAW_CAT_CD
M          3719626
F          1658694
V           295559
I            26974
9             1067
UNKNOWN          2
Name: count, dtype: int64
Unique values in column 'AGE_GROUP': AGE_GROUP
25-44    2722384
18-24    1448162
45-64    1063911
<18       438174
65+        52696
          ...   
959            1
910            1
309            1
446            1
330            1
Name: count, Length: 87,

Join with selected census dataset by space location

In [25]:
# read census data
census_df = pd.read_csv(census_data_path, 
                        sep = ",")

# keep all
# filter type of coordinates
# census_df = FilterRowsContains(df = census_df,
                               # var_name = census_coord_var_name,
                               # accepted_var_values = census_coord_value_to_keep)

# read census variables to keep
# census_var_to_keep_df = pd.read_csv(census_var_to_keep_path,
                                    # sep = ";")

# filter columns

# census_df = FilterColumns(df = census_df,
                          # var_df = census_var_to_keep_df)

# Ensure key columns have the same data type 
# df[df_space_var_name] = df[df_space_var_name].astype(str)
# census_df[census_df_space_var_name] = census_df[census_df_space_var_name].astype(str)

# debug
print(df[df_space_var_name].head())
print(census_df[census_df_space_var_name].head())

# join with df based on GeoID
df = pd.merge(df, census_df,
              on = ["YEAR", "NTA2020"],
              how = "inner")

0    BK1601
1    BX0101
2    MN1101
3    MN1002
4    BX1102
Name: NTA2020, dtype: object
0    BK0101
1    BK0101
2    BK0101
3    BK0101
4    BK0101
Name: NTA2020, dtype: object


In [26]:
print(df.columns)

Index(['ARREST_DATE', 'KY_CD', 'LAW_CAT_CD', 'AGE_GROUP', 'PERP_SEX',
       'PERP_RACE', 'Latitude', 'Longitude', 'NTA2020', 'MONTH', 'YEAR',
       'Pop1', 'MaleP', 'MdAge', 'Hsp1P', 'WNHP', 'BNHP', 'ANHP', 'OthNHP',
       'MIncome'],
      dtype='object')


In [27]:
print(df.head())

  ARREST_DATE  KY_CD LAW_CAT_CD AGE_GROUP PERP_SEX       PERP_RACE   Latitude  \
0  2018-08-07    NaN          F     45-64        M           BLACK  40.671110   
1  2020-11-13  105.0          F     25-44        M           BLACK  40.810398   
2  2019-07-01    NaN          F     25-44        M  BLACK HISPANIC  40.789348   
3  2020-02-22  235.0          M     25-44        M           BLACK  40.829163   
4  2020-11-10  344.0          M     25-44        M           WHITE  40.854826   

   Longitude NTA2020  MONTH  YEAR   Pop1  MaleP  MdAge  Hsp1P   WNHP   BNHP  \
0 -73.915881  BK1601      8  2018  36749  45.84   33.7  19.82   5.72  67.30   
1 -73.924895  BX0101     11  2020  57718  46.19   32.6  67.31   2.72  26.78   
2 -73.947352  MN1101      7  2019  59623  44.63   35.9  43.36  19.74  22.26   
3 -73.937272  MN1002      2  2020  83327  45.45   36.4  25.21  10.39  56.75   
4 -73.854880  BX1102     11  2020  25077  48.30   37.6  33.15  42.47   6.68   

    ANHP  OthNHP   MIncome  
0   1.72 

Save to csv a different file for each year, due to computational issues

In [28]:
# save the entire dataset to one unique file


years_list = [2010, 2011,2012, 2013,2014,2015,2016,2017,2018,2019,2020]

for year in years_list:
    df.loc[df["YEAR"] == year].to_csv(f"../../../final_datasets/arrests_{year}_nta.csv" ,index = False)

In [29]:
df.to_csv(output_df_path, index = False)