# Cleaning the Chicago Police Investigatory Stops Data
#### Mihir Bhaskar
#### 11/28/2021

The following file reads in raw .csv data files on Investigatory Stops by the Chicago Police Department sourced from: https://home.chicagopolice.org/statistics-data/isr-data/ (accessed on 28th November, 2021). 

The code then processes the data, counts the unique stops recorded in each police beat, merges those counts onto the police beat boundaries from the Chicago Open Data Portal, does a spatial merge of the beats with the CleanACSFile data outputted from 2_CleanACS to find the census tracts each beat maps to, and aggregates the stop counts up to the tract level. 

It then exports a .csv file called 'CleanStopReports', which has all the tract IDs in Chicago, along with columns relating to the number of stops (e.g. total # of stops made in each tract).

In [64]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely import wkt
from pyprojroot import here
from sodapy import Socrata
from shapely.geometry import shape


In [65]:
# Import data (downloaded from website linked above as .csv files)

# One raw file per reporting period
isr_16 = pd.read_csv(here('./data/raw/ISR_2016.csv'))
isr_17 = pd.read_csv(here('./data/raw/ISR_2017.csv'))
isr_1819 = pd.read_csv(here('./data/raw/ISR_2018-2019.csv'))

# Stacking data from multiple years into one dataframe.
# pd.concat is used instead of chained DataFrame.append calls: append is
# deprecated (and removed in pandas 2.0), and a single concat avoids re-copying
# the accumulated rows for every additional file. Like append, concat keeps each
# file's original row index and takes the union of the columns.
isr_df = pd.concat([isr_16, isr_17, isr_1819])

# Quick sanity check on the combined size (rows, columns)
print(isr_df.shape)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


(508640, 169)


In [66]:
#isr_df.describe()
#isr_df.info()

#isr_df.nunique(axis=0)

### Creating a simple dataset of stop report counts per census tract

In [67]:
# Keeping only relevant variables and dropping duplicates in CARD_NO
# Note: according to the data description (found on the same website from which data was sourced), a repeated card number
# basically means that the same incident may have had updated information. Since for now we are only interested in total number
# of incidents, we can drop the duplicates (the first row seen for each CARD_NO is kept).
#
# The de-duplication returns a new dataframe rather than using inplace=True: modifying the column subset of isr_df
# in place is what raises pandas' SettingWithCopyWarning ("A value is trying to be set on a copy of a slice").
basic_df = isr_df[['CARD_NO', 'BEAT']].drop_duplicates(subset=['CARD_NO'])

# Checking the quality/missingness of beat data
print(basic_df.isnull().sum()) 

CARD_NO    0
BEAT       0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basic_df.drop_duplicates(subset=['CARD_NO'], inplace=True)


In [68]:
# Pull the police beat boundaries from the Chicago Open Data Portal. The request code
# follows the portal's API documentation: https://dev.socrata.com/foundry/data.cityofchicago.org/n9it-hstw

# Client created without an app token (second argument is None)
client = Socrata("data.cityofchicago.org", None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata("data.cityofchicago.org",
#                  MyAppToken,
#                  username="user@example.com",
#                  password="AFakePassword")

# Request at most 2000 records. sodapy turns the JSON response into a Python list of
# dictionaries, which is loaded directly into a pandas DataFrame; beat_num is then
# converted to a numeric column so it can later be matched against the stops' BEAT values.
beat_bounds = (
    pd.DataFrame.from_records(client.get("n9it-hstw", limit=2000))
    .assign(beat_num=lambda df: pd.to_numeric(df['beat_num']))
)

# Display the boundaries table
beat_bounds



Unnamed: 0,the_geom,district,sector,beat,beat_num
0,"{'type': 'MultiPolygon', 'coordinates': [[[[-8...",17,1,1,1713
1,"{'type': 'MultiPolygon', 'coordinates': [[[[-8...",31,0,0,3100
2,"{'type': 'MultiPolygon', 'coordinates': [[[[-8...",16,5,5,1651
3,"{'type': 'MultiPolygon', 'coordinates': [[[[-8...",19,1,1,1914
4,"{'type': 'MultiPolygon', 'coordinates': [[[[-8...",19,1,1,1915
...,...,...,...,...,...
272,"{'type': 'MultiPolygon', 'coordinates': [[[[-8...",03,1,1,314
273,"{'type': 'MultiPolygon', 'coordinates': [[[[-8...",08,2,2,825
274,"{'type': 'MultiPolygon', 'coordinates': [[[[-8...",03,1,1,313
275,"{'type': 'MultiPolygon', 'coordinates': [[[[-8...",08,2,2,823


In [69]:
# Attach the number of stops in each beat to the beat boundaries dataset

# Count the (already de-duplicated) CARD_NO rows per beat. BEAT stays as the index of
# the grouped result, and the left merge keeps every beat in the boundaries data,
# whether or not any stops were recorded for it.
basic_df = beat_bounds.merge(
    basic_df.groupby('BEAT').count(),
    how='left',
    left_on='beat_num',
    right_on='BEAT',
)

# Turn each GeoJSON-style geometry dictionary into a shapely geometry object
basic_df['the_geom'] = basic_df['the_geom'].map(shape)

# Convert the dataset to a geodataframe, using the_geom as the geometry column
basic_df = gpd.GeoDataFrame(basic_df, geometry='the_geom', crs='epsg:4326')

basic_df

Unnamed: 0,the_geom,district,sector,beat,beat_num,CARD_NO
0,"MULTIPOLYGON (((-87.70473 41.97577, -87.70472 ...",17,1,1,1713,1100
1,"MULTIPOLYGON (((-87.83365 41.97535, -87.83366 ...",31,0,0,3100,1162
2,"MULTIPOLYGON (((-87.90684 41.97656, -87.91070 ...",16,5,5,1651,468
3,"MULTIPOLYGON (((-87.64492 41.96973, -87.64431 ...",19,1,1,1914,2424
4,"MULTIPOLYGON (((-87.63724 41.96599, -87.63644 ...",19,1,1,1915,790
...,...,...,...,...,...,...
272,"MULTIPOLYGON (((-87.58879 41.78612, -87.58844 ...",03,1,1,314,648
273,"MULTIPOLYGON (((-87.67917 41.78656, -87.67912 ...",08,2,2,825,4708
274,"MULTIPOLYGON (((-87.60854 41.78583, -87.60808 ...",03,1,1,313,1698
275,"MULTIPOLYGON (((-87.69354 41.78458, -87.69354 ...",08,2,2,823,3595


In [70]:
# Checking whether any beat failed to merge with any stops (such a beat would have a missing CARD_NO count
# after the left merge and would need to be set to 0)
basic_df.isna().sum()

# Distribution of the number of stops per beat
basic_df.loc[:, 'CARD_NO'].describe()

## We find that every beat actually has a minimum of 144 stops - that is, every beat in the police beat boundaries data
## matched to beats from our stop reports data, so no counts need to be replaced with 0.

count     277.000000
mean     1844.404332
std      1302.178797
min       144.000000
25%       926.000000
50%      1528.000000
75%      2427.000000
max      7809.000000
Name: CARD_NO, dtype: float64

In [71]:
# Bring in the census tract boundaries so the police beats can be mapped to census tracts via a spatial merge

# Read the cleaned ACS file and keep only the tract ID and its geometry, which is all the spatial merge needs
acs = pd.read_csv(here('./data/CleanACSFile.csv')).loc[:, ['geo_id', 'geometry']]

# The geometry column is stored as WKT text in the .csv; parse it into shapely objects
# and build a GeoDataFrame from the result
acs = gpd.GeoDataFrame(
    acs.assign(geometry=acs['geometry'].map(wkt.loads)),
    crs='epsg:4326',
)

# Spatial merge: every beat gets one row per census tract it matches (how='left' keeps beats with no match).
# NOTE(review): no join predicate is passed, so geopandas' default match rule applies - confirm this is intended.
basic_df = gpd.sjoin(basic_df, acs, how='left')

**Now, we have a dataframe where every beat has multiple rows, because a specific beat maps to many different census tracts.** The methodology for resolving these and getting down to a unique tract-level database is as follows:
1. For every beat, divide its number of stops evenly among the tracts that match to it
2. Aggregate the stops up to the tract level, so that if a tract falls in multiple beats, the stops assigned to it from each beat are added up


In [72]:
# Dividing up the stops in every beat to the different tracts that match to it
# matching_tract_count = number of (non-missing) tract IDs that the row's beat was joined to
basic_df['matching_tract_count'] = basic_df['geo_id'].groupby(basic_df['beat_num']).transform('count')

# Each matching tract receives an equal share of the beat's stops
basic_df['assigned_stops'] = basic_df['CARD_NO'] / basic_df['matching_tract_count']

# Aggregating up the assigned stops for each geo_id (i.e. each census tract).
# The sum must be taken over assigned_stops: summing matching_tract_count (as was done previously) adds up
# how many tracts each overlapping beat touches, which is not a number of stops at all.
basic_df['inv_stops_pertract'] = basic_df['assigned_stops'].groupby(basic_df['geo_id']).transform('sum')

In [73]:
# Collapse to the tract level: inv_stops_pertract is already identical on every row of a given tract,
# so keep one row per geo_id, retain only the relevant variables, and give the count its final name
basic_df = (
    basic_df.drop_duplicates(subset=['geo_id'])[['geo_id', 'inv_stops_pertract']]
    .rename(columns={'inv_stops_pertract': 'inv_stop_count'})
)

# Merging these counts back with the full dataset of census tract IDs (left merge keeps every tract)
merged = acs.merge(basic_df, on='geo_id', how='left')

# Replacing missing stop count values with 0 (i.e. missing means no stops were assigned to that tract)
merged.loc[merged['inv_stop_count'].isna(), 'inv_stop_count'] = 0

In [74]:
# Write the tract-level stop counts to a .csv file to be used in other scripts
# (only the tract ID and its stop count are exported; the row index is left out)
export_cols = ['geo_id', 'inv_stop_count']
merged[export_cols].to_csv(here('./data/CleanStopReports.csv'), index=False, encoding='utf-8')

