Preprocessing of the arrest dataset: copy the json file in this folder, change its parameters and execute this script to save the new dataset

In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point # used to find the corresponding spatial zone
import json
import sys
import os # to use functions defined in other scripts

# add scripts dir to path
# Add the relative path to the system path
# Load configuration from JSON file 
with open('arrests_nta_2010_2020_pars.json', 'r') as f: 
    config = json.load(f) # Add directories to the system path 
    for folder in config['folders'].values():
        abs_path = os.path.abspath(folder)
        if abs_path not in sys.path:
            sys.path.append(abs_path)


import os
import sys
import json

# Load configuration from JSON file
with open('arrests_nta_2010_2020_pars.json', 'r') as f:
    config = json.load(f)

# Add directories to the system path
for folder in config['folders'].values():
    abs_path = os.path.abspath(folder)
    if abs_path not in sys.path:
        sys.path.append(abs_path)

# Import the editing scripts
from edit_funcs import *

# Absolute paths of files
core_df_path = os.path.join(os.path.abspath(config['folders']['core_folder_arrests']), config['files']['core_df_path'])
var_v1_to_keep_path = os.path.join(os.path.abspath(config['folders']['var_to_keep_folder']), config['files']['var_v1_to_keep_path'])
var_v2_to_keep_path = os.path.join(os.path.abspath(config['folders']['var_to_keep_folder']), config['files']['var_v2_to_keep_path'])
coordinates_file_path = os.path.join(os.path.abspath(config['folders']['coordinates_maps_folder']), config['files']['coordinates_file_path'])
output_df_path = os.path.join(os.path.abspath(config['folders']['final_datasets_folder']), config['files']['output_df_path'])
census_data_path = os.path.join(os.path.abspath(config['folders']['census_folder']), config['files']['census_data_path'])
census_var_to_keep_path = os.path.join(os.path.abspath(config['folders']['census_var_to_keep_folder']), config['files']['census_var_to_keep_path'])

# Other useful variables
date_variable_name = config['variables']['date_variable_name']
age_group_variable_name = config['variables']['age_group_variable_name']
considered_years_list = config['variables']['considered_years_list']
census_coord_var_name = config['variables']['census_coord_var_name']
census_coord_value_to_keep = config['variables']['census_coord_value_to_keep']
df_space_var_name = config['variables']['df_space_var_name']
census_df_space_var_name = config['variables']['census_df_space_var_name']


Filter columns

In [2]:


# load first df of variables to keep

var_v1_to_keep_df = pd.read_csv(var_v1_to_keep_path)

df = FilterColumns(df = pd.read_csv(core_df_path),
                   var_df = var_v1_to_keep_df)


Filter rows: keep only years selected

In [3]:
df = FilterRowsContains(df = df,
                        var_name = date_variable_name,
                        accepted_var_values = considered_years_list)

Add MONTH variable

In [4]:
df = AddMONTH(df = df,
    date_var_name= date_variable_name,
    date_format = '%m/%d/%Y')

Add YEAR variable

In [5]:
df = AddYEAR(df = df,
    date_var_name= date_variable_name,
    date_format = '%m/%d/%Y')

Add NTA indicator variables

In [6]:

# Read the GeoJSON file into a GeoDataFrame
gdf = gpd.read_file(coordinates_file_path)

df = ConvertToGeodf(df,
                    long = "Longitude",
                    lat = "Latitude",
                    crs = gdf.crs)
# actually join the two by inclusion:
# df coordinates which are in polygons defined by gdf data
df = SJoinWithinGeo(geodf_units = df,
                    geodf2_polygons = gdf)

Second column filtering.
Remove latitude and longitude coordinates variables.
Remove complete date time variable.
(WARNING: we're using a different columns to keep file: arrests_v2).
Here we keep both Census tract variable (CTLabel) both NTA variable (NTA2020).

In [7]:
# load first df of variables to keep
var_v2_to_keep_df = pd.read_csv(var_v2_to_keep_path)

# filter columns
df = FilterColumns(df = df, var_df = var_v2_to_keep_df)

Look for missing and most likely values.

In [8]:
for column in df.columns:
    print(f"Unique values in column '{column}': { df[column].value_counts()}")

Unique values in column 'ARREST_DATE': ARREST_DATE
2010-01-20    1773
2012-03-07    1750
2012-02-01    1726
2010-05-20    1725
2010-03-05    1700
              ... 
2020-03-29     152
2020-06-21     151
2012-10-29     150
2014-12-25     136
2010-12-27      91
Name: count, Length: 4018, dtype: int64
Unique values in column 'KY_CD': KY_CD
235.0    497406
344.0    381255
343.0    237115
341.0    193962
348.0    178706
          ...  
349.0        50
455.0        25
123.0         9
577.0         5
357.0         4
Name: count, Length: 73, dtype: int64
Unique values in column 'LAW_CAT_CD': LAW_CAT_CD
M    2344897
F     985323
V     195291
I      15573
Name: count, dtype: int64
Unique values in column 'AGE_GROUP': AGE_GROUP
25-44    1665397
18-24     917388
45-64     679711
<18       259459
65+        33404
Name: count, dtype: int64
Unique values in column 'PERP_SEX': PERP_SEX
M    2948940
F     606419
Name: count, dtype: int64
Unique values in column 'PERP_RACE': PERP_RACE
BLACK             

First we uniform missing values to UNKNOWN

In [9]:
df.replace(['(null)'], 'UNKNOWN', inplace=True)

Second we remove non understandable age group values

In [10]:
df[age_group_variable_name].replace(["2020", "2019", "-977", "-962", "-71", "-12", "-942", "1020", "-965", "1925", "-928",
            "-948", "-967", "-4", "-958", "943", "-968", "949", "-973", "-2", "932", "-31", "-938",
            "1016", "1014", "-60", "-1", "938", "950", "-963"],
           'UNKNOWN',
           inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[age_group_variable_name].replace(["2020", "2019", "-977", "-962", "-71", "-12", "-942", "1020", "-965", "1925", "-928",


In [11]:
for column in df.columns:
    print(f"Unique values in column '{column}': { df[column].value_counts()}")

Unique values in column 'ARREST_DATE': ARREST_DATE
2010-01-20    1773
2012-03-07    1750
2012-02-01    1726
2010-05-20    1725
2010-03-05    1700
              ... 
2020-03-29     152
2020-06-21     151
2012-10-29     150
2014-12-25     136
2010-12-27      91
Name: count, Length: 4018, dtype: int64
Unique values in column 'KY_CD': KY_CD
235.0    497406
344.0    381255
343.0    237115
341.0    193962
348.0    178706
          ...  
349.0        50
455.0        25
123.0         9
577.0         5
357.0         4
Name: count, Length: 73, dtype: int64
Unique values in column 'LAW_CAT_CD': LAW_CAT_CD
M    2344897
F     985323
V     195291
I      15573
Name: count, dtype: int64
Unique values in column 'AGE_GROUP': AGE_GROUP
25-44    1665397
18-24     917388
45-64     679711
<18       259459
65+        33404
Name: count, dtype: int64
Unique values in column 'PERP_SEX': PERP_SEX
M    2948940
F     606419
Name: count, dtype: int64
Unique values in column 'PERP_RACE': PERP_RACE
BLACK             

Join with selected census dataset by space location

In [12]:
# read census data
census_df = pd.read_csv(census_data_path, 
                        sep = ",")

# keep all
# filter type of coordinates
# census_df = FilterRowsContains(df = census_df,
                               # var_name = census_coord_var_name,
                               # accepted_var_values = census_coord_value_to_keep)

# read census variables to keep
# census_var_to_keep_df = pd.read_csv(census_var_to_keep_path,
                                    # sep = ";")

# filter columns

# census_df = FilterColumns(df = census_df,
                          # var_df = census_var_to_keep_df)

# Ensure key columns have the same data type 
df[df_space_var_name] = df[df_space_var_name].astype(str)
census_df[census_df_space_var_name] = census_df[census_df_space_var_name].astype(str)

# debug
print(df[df_space_var_name].head())
print(census_df[census_df_space_var_name].head())

# join with df based on GeoID
df = pd.merge(df, census_df,
              left_on = df_space_var_name,
              right_on = census_df_space_var_name,
              how = "inner")

0    BK1601
1    BX0101
2    MN1101
3    MN1002
4    BX1102
Name: NTA2020, dtype: object
0    BK0101
1    BK0101
2    BK0101
3    BK0101
4    BK0101
Name: NTA2020, dtype: object


In [14]:
print(df.columns)

Index(['ARREST_DATE', 'KY_CD', 'LAW_CAT_CD', 'AGE_GROUP', 'PERP_SEX',
       'PERP_RACE', 'NTA2020', 'MONTH', 'YEAR', 'Pop1', 'MaleP', 'MdAge',
       'Hsp1P', 'WNHP', 'BNHP', 'ANHP', 'OthNHP', 'MIncome'],
      dtype='object')


Save to csv a different file for each year, due to computational issues

In [15]:
# save the entire dataset to one unique file
# df.to_csv(output_df_path, index = False)

years_list = [2010, 2011,2012, 2013,2014,2015,2016,2017,2018,2019,2020]

for year in years_list:
    df.loc[df["YEAR"] == year].to_csv(f"../../../final_datasets/arrests_{year}_nta.csv" ,index = False)