# Readme

The MCPD claim that they allocate officers (and make traffic stops) based on accidents in each area. This file examines the veracity of that claim

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:97% !important; }</style>"))

In [26]:
import pandas as pd
import geopandas as gpd
pd.set_option('display.max_columns', None)
import numpy as np
import itertools
import shapely
# from tqdm import tqdm

In [3]:
df = pd.read_csv(r"C:\Users\mikha\Dropbox\mikhael_misc\Projects\Policing Thesis\Modified Dataset.csv")

In [4]:
#convert "Fatal" field to boolean

df['Fatal'] = df['Fatal']=='Yes'

In [5]:
race_set = {'ASIAN', 'BLACK', 'HISPANIC', 'NATIVE AMERICAN', 'OTHER', 'WHITE'}
race_set_w_total = {'ASIAN', 'BLACK', 'HISPANIC', 'NATIVE AMERICAN', 'OTHER', 'TOTAL', 'WHITE'}

In [8]:
df = df[['Number of Citations', 'Latitude', 'Longitude', 'Accident', 'Personal Injury', 'Property Damage', 'Fatal', 'Race']]

In [12]:
MIN_LONGITUDE = -77.6
MAX_LONGITUDE = -76.8

MIN_LATITUDE = 38.92
MAX_LATITUDE = 39.5

## Drop observations from outside (rough) MC bounds

In [21]:
df = df[(MIN_LONGITUDE <= df['Longitude']) & (df['Longitude'] <= MAX_LONGITUDE)]
df = df[(MIN_LATITUDE <= df['Latitude']) & (df['Latitude'] <= MAX_LATITUDE)]

In [22]:
len(df)

1555863

# New Method

Following https://james-brennan.github.io/posts/fast_gridding_geopandas/

In [23]:
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['Longitude'], df['Latitude']))

In [24]:
gdf.head()

Unnamed: 0,Number of Citations,Latitude,Longitude,Accident,Personal Injury,Property Damage,Fatal,Race,geometry
0,0,39.00102,-77.033344,0,0,0,False,WHITE,POINT (-77.03334 39.00102)
21,1,39.039913,-77.055008,0,0,0,False,BLACK,POINT (-77.05501 39.03991)
22,1,39.039913,-77.055008,0,0,0,False,BLACK,POINT (-77.05501 39.03991)
23,1,39.039913,-77.055008,0,0,0,False,BLACK,POINT (-77.05501 39.03991)
24,0,39.184314,-77.261493,0,0,0,False,ASIAN,POINT (-77.26149 39.18431)


In [28]:
# how many cells across and down
n_cells=1000
cell_size = (MAX_LONGITUDE-MIN_LONGITUDE)/n_cells


# create the cells in a loop
grid_cells = []
for x0 in np.arange(MIN_LONGITUDE, MAX_LONGITUDE+cell_size, cell_size ):
    for y0 in np.arange(MIN_LATITUDE, MAX_LATITUDE+cell_size, cell_size):
        # bounds
        x1 = x0-cell_size
        y1 = y0+cell_size
        grid_cells.append( shapely.geometry.box(x0, y0, x1, y1)  )
cell = gpd.GeoDataFrame(grid_cells, columns=['geometry'])

In [56]:
merged = gpd.sjoin(gdf, cell, how='left', op='within')

In [57]:
merged.head()

Unnamed: 0,Number of Citations,Latitude,Longitude,Accident,Personal Injury,Property Damage,Fatal,Race,geometry,index_right
0,0,39.00102,-77.033344,0,0,0,False,WHITE,POINT (-77.03334 39.00102),514835
21,1,39.039913,-77.055008,0,0,0,False,BLACK,POINT (-77.05501 39.03991),495281
22,1,39.039913,-77.055008,0,0,0,False,BLACK,POINT (-77.05501 39.03991),495281
23,1,39.039913,-77.055008,0,0,0,False,BLACK,POINT (-77.05501 39.03991),495281
24,0,39.184314,-77.261493,0,0,0,False,ASIAN,POINT (-77.26149 39.18431),308154


## Add dummies (should maybe do earlier)

In [40]:
merged = gpd.GeoDataFrame(pd.concat([pd.get_dummies(gdf['Race']), merged], axis=1))

In [55]:
merged.head()

Unnamed: 0,ASIAN,BLACK,HISPANIC,NATIVE AMERICAN,OTHER,WHITE,ASIAN.1,BLACK.1,HISPANIC.1,NATIVE AMERICAN.1,OTHER.1,WHITE.1,Number of Citations,Latitude,Longitude,Accident,Personal Injury,Property Damage,Fatal,Race,geometry,index_right
0,0,0,0,0,0,1,0,0,0,0,0,1,0,39.00102,-77.033344,0,0,0,False,WHITE,POINT (-77.03334 39.00102),514835
21,0,1,0,0,0,0,0,1,0,0,0,0,1,39.039913,-77.055008,0,0,0,False,BLACK,POINT (-77.05501 39.03991),495281
22,0,1,0,0,0,0,0,1,0,0,0,0,1,39.039913,-77.055008,0,0,0,False,BLACK,POINT (-77.05501 39.03991),495281
23,0,1,0,0,0,0,0,1,0,0,0,0,1,39.039913,-77.055008,0,0,0,False,BLACK,POINT (-77.05501 39.03991),495281
24,1,0,0,0,0,0,1,0,0,0,0,0,0,39.184314,-77.261493,0,0,0,False,ASIAN,POINT (-77.26149 39.18431),308154


In [59]:
# dissolve = merged.dissolve(by="index_right", aggfunc="count")
merged.dissolve(aggfunc="count", by='index_right', as_index=False)

Unnamed: 0,index_right,geometry,Number of Citations,Latitude,Longitude,Accident,Personal Injury,Property Damage,Fatal,Race
0,48386,POINT (-77.54764 39.29671),2,2,2,2,2,2,2,2
1,59354,POINT (-77.53596 39.35898),1,1,1,1,1,1,1,1
2,61642,POINT (-77.53318 39.44647),1,1,1,1,1,1,1,1
3,63801,"MULTIPOINT (-77.53058 39.43180, -77.53055 39.4...",13,13,13,13,13,13,13,13
4,65265,POINT (-77.52942 39.44117),2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...
38261,710251,"MULTIPOINT (-76.81784 39.09909, -76.81775 39.0...",4,4,4,4,4,4,4,4
38262,710252,"MULTIPOINT (-76.81791 39.09923, -76.81772 39.0...",5,5,5,5,5,5,5,5
38263,721275,"MULTIPOINT (-76.80597 39.20617, -76.80592 39.2...",23,23,23,23,23,23,23,23
38264,722001,"MULTIPOINT (-76.80554 39.20631, -76.80551 39.2...",4,4,4,4,4,4,4,4


# Old Method

## Data Wrangling

*coordinates_dict[ longitude_i ][ latitude_j ][ FIELD** ][ race ] = information for that area*

** "field" equals things like "Number of Charges @ 9 MPH", "Average Speed NOT in 9,14 MPH", etc

### Set up *coordinates_dict*

In [26]:
CELL_WIDTH = 0.005
DECIMAL_PLACES = 3

MIN_LONGITUDE = -77.6
MAX_LONGITUDE = -76.8

MIN_LATITUDE = 38.92
MAX_LATITUDE = 39.5

def rounder(x):#rounds to nearest CELL_WIDTH; X decimal places
    return np.around(CELL_WIDTH * round(x/CELL_WIDTH), DECIMAL_PLACES)

TOTAL_CELLS = round((((MIN_LONGITUDE-CELL_WIDTH)-(MAX_LONGITUDE+CELL_WIDTH))/CELL_WIDTH)*(((MIN_LATITUDE-CELL_WIDTH)-(MAX_LATITUDE+CELL_WIDTH))/CELL_WIDTH))
print('total # cells ~ ', TOTAL_CELLS)

coordinates_dict = {}
for x in np.arange(MIN_LONGITUDE-CELL_WIDTH, MAX_LONGITUDE+CELL_WIDTH, CELL_WIDTH):
    coordinates_dict[rounder(np.around(x,DECIMAL_PLACES))] = {}

for x in coordinates_dict:
    for y in np.arange(MIN_LATITUDE-CELL_WIDTH, MAX_LATITUDE+CELL_WIDTH, CELL_WIDTH):
        coordinates_dict[x][rounder(np.around(y,DECIMAL_PLACES))] = {
#                                                 "All Speeds Over Limit": {race: [] for race in race_set_w_total},
#                                                "Number of Speeding Charges":{race: 0 for race in race_set_w_total},
#                                                "Average Speed NOT in 9,14 MPH": {race: 0 for race in race_set_w_total},
                                               
#                                                  "Number of Charges @ 9 MPH": {race: 0 for race in race_set_w_total},
#                                                 "Number of Charges in 9,14 MPH": {race: 0 for race in race_set_w_total},
#                                                  "Number of Charges in 10,14 MPH": {race: 0 for race in race_set_w_total},
#                                                 "Number of Charges NOT in 9,14 MPH": {race: 0 for race in race_set_w_total}, 
                                                
#                                                  "Percent of Own Race's Charges @ 9 MPH": {race: 0 for race in race_set_w_total},
#                                                 "Percent of Own Race's Charges in 9,14 MPH": {race: 0 for race in race_set_w_total},
#                                                  "Percent of Own Race's Charges in 10,14 MPH": {race: 0 for race in race_set_w_total},
#                                                 "Percent of Own Race's Charges NOT in 9,14 MPH": {race: 0 for race in race_set_w_total}, 

#                                                 "Percent of Charges that were CHANGED": {race: 0 for race in race_set_w_total}, 
#                                                 "Percent of Charges that were NOT CHANGED": {race: 0 for race in race_set_w_total}, 
                                                                    
# #                                                  "As Percent of All Races' Charges @ 9 MPH" : {race: 0 for race in race_set_w_total},
# #                                                 "As Percent of All Races' in 9,14 MPH": {race: 0 for race in race_set_w_total},
# #                                                  "As Percent of All Races in 10,14 MPH": {race: 0 for race in race_set_w_total},
# #                                                 "As Percent of All Races in 9,14 MPH": {race: 0 for race in race_set_w_total},                                                                      
                                                
#                                                 "Number of Citations for Each Stop": {race: [] for race in race_set_w_total},
#                                                 "Racial Composition": {race: 0 for race in race_set_w_total},
                                                "Total Stops": 0,
                                                "Total Citations": 0,
                                                "Total Accidents": 0,
                                                "Total Injuries": 0,
                                                "Total Fatalities": 0,                                                                     
                                                                    }

        
        


total # cells ~  19116


#### Def Rounding Function

In [27]:
def rounder(x):
    """
    Rounds to nearest CELL_WIDTH; decimal places
    """
    
    return np.around(CELL_WIDTH * round(x/CELL_WIDTH), DECIMAL_PLACES)

### Fill *coordinates_dict*

In [28]:
# zipped_vars=list(zip(df['Longitude'], df['Latitude'], df['Race'], df['Speed Over Posted Limit'], df['Number of Citations']))

for i,x in enumerate(list(zip(df['Longitude'], df['Latitude'], df['Race'], df['Speed Over Posted Limit'], df['Citation - (D_Search Outcome)'], df['Accident'], df['Personal Injury'], df['Fatal']))):
        if MIN_LONGITUDE <= x[0] <= MAX_LONGITUDE: #-77.6 <= x <= -76.8
            if MIN_LATITUDE <= x[1] <= MAX_LATITUDE:

#                 coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Racial Composition" ][ x[2] ] += 1
#                 coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Racial Composition" ][ 'TOTAL' ] += 1
                
                coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Total Stops" ] += 1
                coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Total Citations" ] += x[4]
                coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Total Accidents" ] += x[5] # ==1 if accident
                coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Total Injuries" ] += x[6]
                coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Total Fatalities" ] += x[7]
                

                
#                 if not np.isnan(x[3]): # if df["Speed Over Posted Limit"].iloc[i] != np.nan
                    
#                     coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Number of Speeding Charges" ][ x[2] ] += 1
#                     coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Number of Speeding Charges" ][ 'TOTAL' ] += 1

#                     coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "All Speeds Over Limit" ][ x[2] ].append(x[3])
#                     coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "All Speeds Over Limit" ]['TOTAL'].append(x[3])

#                     coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Number of Speeding Charges" ][ x[2] ] += 1
#                     coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Number of Speeding Charges" ][ 'TOTAL' ] +=1

#                     coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Number of Citations for Each Stop" ][ x[2] ].append(x[4])
#                     coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Number of Citations for Each Stop" ][ 'TOTAL' ].append(x[4])


#                     if x[3] == 9:
#                         coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Number of Charges @ 9 MPH" ][ x[2] ] += 1
#                         coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Number of Charges @ 9 MPH" ][ 'TOTAL' ] += 1

#                     if 9 <= x[3] <= 14:
#                         coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Number of Charges in 9,14 MPH" ][ x[2] ] += 1
#                         coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Number of Charges in 9,14 MPH" ][ 'TOTAL' ] += 1

#                     if 10 <= x[3] <= 14:
#                         coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Number of Charges in 10,14 MPH" ][ x[2] ] += 1
#                         coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Number of Charges in 10,14 MPH" ][ 'TOTAL' ] += 1
                        
#                     if 9 > x[3] or x[3] > 14:
#                         coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Number of Charges NOT in 9,14 MPH" ][ x[2] ] += 1
#                         coordinates_dict[ rounder(x[0]) ][ rounder(x[1]) ][ "Number of Charges NOT in 9,14 MPH" ][ 'TOTAL' ] += 1

### Create pd dataframe from coordinates_dict

In [47]:
#convert dict to pd df
allocation_balance_df = pd.DataFrame.from_dict({(i,j): coordinates_dict[i][j] 
                           for i in coordinates_dict.keys() 
                           for j in coordinates_dict[i].keys()},
                       orient='index')

#give names to multi-index
allocation_balance_df.index.names = ['long', 'lat']

#convert indices to cols
allocation_balance_df.reset_index(level=allocation_balance_df.index.names, inplace=True)

In [48]:
#convert to geopandas dataframe
gdf = gpd.GeoDataFrame(
    allocation_balance_df, geometry=gpd.points_from_xy(allocation_balance_df['long'], allocation_balance_df['lat']))

In [49]:
# create fields
gdf['Stops per Accident'] = gdf['Total Stops'] / gdf['Total Accidents']
gdf['Accidents per Stop'] = gdf['Total Accidents'] / gdf['Total Stops']

gdf['Stops per Injury'] = gdf['Total Stops'] / gdf['Total Injuries']
gdf['Injuries per Stop'] = gdf['Total Injuries'] / gdf['Total Stops']

gdf['Stops per Fatality'] = gdf['Total Stops'] / gdf['Total Fatalities']
gdf['Fatalities per Stop'] = gdf['Total Fatalities'] / gdf['Total Stops']

#replace inf with 0
gdf.replace(np.inf, 0, inplace=True)

#drop cells with less than 30 stops
gdf = gdf[gdf['Total Stops']>=30]

# replace nan with zero
gdf.fillna(0, inplace=True)

# normalize fields
for field in list(set(gdf.columns) - set(['long', 'lat', 'geometry'])): #all cols excep long, lat, geometry
    gdf['Normalized ' + field] = (gdf[field]-gdf[field].mean())/gdf[field].std()
    
# replace nan with zero again after normalizing
gdf.fillna(0, inplace=True)

### Export to CSV

In [39]:
gdf.to_csv("Police Stops vs Accidents - width=.005.csv", index=False)