# Import & Setup

In [53]:
import sys
sys.path.append(r"C:/Users/mikha/Dropbox/mikhael_misc/Projects/My-Package")

import pandas as pd
import numpy as np
import h3

import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Load the 2021 one-row-per-stop dataset; the 'Stop ID' column becomes the index.
df = pd.read_csv(
    r"C:/Users/mikha/Dropbox/mikhael_misc/Projects/Policing Thesis/Modified Dataset - 2021 - One Row per Stop.csv",
    index_col='Stop ID',
)

## Replace cols with Sparse-filled cols

In [54]:
# Every column whose name contains 'Sparse' is treated as a sparse-filled variant.
# NOTE(review): the filter matches any name containing 'Sparse', while only the
# ' - Sparse Filled' suffix is stripped below; a column matching the filter but
# lacking that suffix would be copied onto itself and then dropped - confirm
# that no such column exists.
sparse_cols = df.filter(like='Sparse').columns

# Write each sparse-filled column under its plain name (overwriting the
# non-filled column of that name if there is one).
for filled_name in sparse_cols:
    plain_name = filled_name.replace(' - Sparse Filled', '')
    df[plain_name] = df[filled_name]

# The sparse-filled originals are now redundant copies; remove them in place.
df.drop(columns=sparse_cols, inplace=True)

# Constants

## H3 Encoding

* Resolution options:
  * Official list w/ area https://h3geo.org/docs/core-library/restable/
  * Visualize at https://observablehq.com/@four43/h3-index-visualizer
* Resolution 7 or 8 looks like the best choice

In [55]:
# H3 resolutions to aggregate at (6 through 9 inclusive).
RESOLUTIONS = list(range(6, 10))

# Functions

## Create "per" (ratio) columns

In [56]:
def create_PER_cols(dataframe:pd.DataFrame, numerator_cols:list, denominator_cols:list) -> None:
    """
    Add one '<numerator> per <denominator>' ratio column for every pair of columns.

    The frame is modified in place; nothing is returned. Columns are added in
    numerator-major order (all denominators for the first numerator, then the
    next numerator, and so on). The division is element-wise, so a zero
    denominator yields inf (or NaN for 0/0) rather than raising.

    Parameters
    ----------
    dataframe : frame holding every column named in the two lists
    numerator_cols : names of the columns to divide
    denominator_cols : names of the columns to divide by
    """
    ratio_pairs = [(top, bottom) for top in numerator_cols for bottom in denominator_cols]
    for top, bottom in ratio_pairs:
        ratio_name = f'{top} per {bottom}'
        dataframe[ratio_name] = dataframe[top].div(dataframe[bottom])

## Create grouped dataframes

In [106]:
def create_grouped_dataframes(dataframe:pd.DataFrame, resolutions:list, numerator_cols:list, denominator_cols:list) -> None:
    """
    Creates the module-level 'grouped_dataframes' dict: one aggregated frame per H3 resolution.

    Rows whose recorded Latitude/Longitude fall outside a fixed bounding box are
    discarded first. For each resolution the remaining rows are grouped by the
    'H3 Encoding - Res={resolution}' column and every group receives:

    * the sum of each numerator and denominator column,
    * 'Stops' - the count of non-null 'Latitude' values in the group,
    * 'Longitude' / 'Latitude' decoded from the cell index with h3.h3_to_geo,
    * a '<numerator> per <denominator>' column for every pair, with 'Stops'
      appended to the numerators (built by create_PER_cols).

    Note that numerator_cols and denominator_cols should all be bools (for aggregation to work properly)

    Parameters
    ----------
    dataframe : one row per stop; must contain 'Latitude', 'Longitude', the H3
        encoding column for every requested resolution, and all listed columns
    resolutions : H3 resolutions to aggregate at
    numerator_cols : columns summed per cell and used as ratio numerators
    denominator_cols : columns summed per cell and used as ratio denominators

    Returns
    -------
    None - the result is published through the global 'grouped_dataframes',
    keyed by resolution (the global is reset on every call).
    """

    # One named aggregation per requested column, plus the stop count. Running
    # them in a single .agg() call avoids grouping the frame twice per resolution.
    named_aggs = {col: pd.NamedAgg(column=col, aggfunc='sum')
                  for col in [*numerator_cols, *denominator_cols]}
    named_aggs['Stops'] = pd.NamedAgg(column='Latitude', aggfunc='count')

    # only count stops that occurred in MC (based on recorded Longitude & Latitude)
    # Series.between is inclusive on both ends, matching the original <= bounds.
    in_mc_mask = (dataframe['Latitude'].between(38.915292, 39.414658)
                  & dataframe['Longitude'].between(-77.554198, -76.864813))
    dataframe = dataframe[in_mc_mask]

    global grouped_dataframes
    grouped_dataframes = dict()
    for resolution in resolutions:
        per_cell = dataframe.groupby(by=f'H3 Encoding - Res={resolution}').agg(**named_aggs)

        # add geo-coordinates (decode each cell once instead of once per column)
        # NOTE(review): h3.h3_to_geo is documented as returning a (lat, lng) pair,
        # yet element [0] is stored as 'Longitude' here. That is only right if the
        # H3 encodings were generated with the coordinates swapped - TODO confirm
        # against the encoding step before changing this assignment.
        cell_centers = [h3.h3_to_geo(cell) for cell in per_cell.index]
        per_cell['Longitude'] = [center[0] for center in cell_centers]
        per_cell['Latitude'] = [center[1] for center in cell_centers]

        # Create "... Per Stop", "... Per Citation"
        # list(...) lets callers pass any sequence (e.g. a tuple) of numerators.
        create_PER_cols(dataframe=per_cell,
                        numerator_cols=list(numerator_cols) + ['Stops'],
                        denominator_cols=denominator_cols)

        grouped_dataframes[resolution] = per_cell


# Run
Note that numerator and denominator columns can be switched, since it's just a ratio.


In [107]:
# Numerators: actions the police can decide on - e.g., stopping somebody.
num_cols = [
    'Citation',
    'Arrest',
    'Search Conducted',
]

# Denominators: "negative" traffic events that simply occur - e.g., accidents, fatalities.
denom_cols = [
    'Fatal',
    'Alcohol',
    'Accident',
    'Personal Injury',
    'Property Damage',
]

# Populates the global 'grouped_dataframes' (one aggregated frame per resolution).
create_grouped_dataframes(
    dataframe=df,
    resolutions=RESOLUTIONS,
    numerator_cols=num_cols,
    denominator_cols=denom_cols,
)

# Analysis

## Stops per Accident

In [95]:
# One histogram panel per H3 resolution, stacked vertically with shared axes.
# Rows are sized from RESOLUTIONS because that is the list the loop iterates.
fig = make_subplots(cols=1, rows=len(RESOLUTIONS),
                    shared_xaxes=True,
                    shared_yaxes=True,
                    vertical_spacing=.1)

for row_number, resolution in enumerate(RESOLUTIONS, start=1):
    # add_trace(row=..., col=...) replaces the deprecated append_trace.
    fig.add_trace(go.Histogram(x=grouped_dataframes[resolution]['Stops per Accident'],
                               nbinsx=1000,
                               name=str(resolution)),
                  row=row_number, col=1)

# Fix the visible axis ranges for every panel.
fig.update_xaxes(range=[0, 2000])
fig.update_yaxes(range=[0, 100])
fig.update_layout(title_text="Stops per Accident by H3 Resolution", height=700, width=700)
fig.show()


In [51]:
# Overlay the 'Stops per Accident' histograms of all resolutions in one figure.
stops_per_accident_traces = [
    go.Histogram(x=grouped_dataframes[resolution]['Stops per Accident'])
    for resolution in RESOLUTIONS
]
fig = go.Figure()
fig.add_traces(stops_per_accident_traces)

fig.show()

# QA

## Is #Stops correct?

In [40]:
# The grouped 'Stops' value for one res-9 cell should equal the number of raw
# rows carrying that cell id.
raw_rows_in_cell = df[df['H3 Encoding - Res=9']=='89f04280923ffff']
grouped_dataframes[9].loc['89f04280923ffff', 'Stops'] == len(raw_rows_in_cell)

True

## Is #Arrests correct?

In [42]:
# The grouped 'Arrest' total for one res-9 cell should equal the raw sum over
# the rows of that SAME cell. Both sides use one id so the check cannot pass by
# coincidence (the original compared two different cells).
qa_arrest_cell = '89f042803abffff'
grouped_dataframes[9].loc[qa_arrest_cell, 'Arrest'] == df[df['H3 Encoding - Res=9']==qa_arrest_cell]['Arrest'].sum()

True

## Are divisions being done correctly?

In [49]:
# Recompute 'Stops per Accident' by hand for one res-9 cell and compare it with
# the value stored in the grouped frame.
sample_rows = df[df['H3 Encoding - Res=9']=='89f04280927ffff']
sample_num_stops = len(sample_rows)
sample_num_accidents = sample_rows['Accident'].sum()

grouped_dataframes[9].loc['89f04280927ffff', 'Stops per Accident'] == sample_num_stops / sample_num_accidents

True

# CHECK EQUALITY ACROSS AGGREGATIONS

In [84]:
# Print each column's grand total at every resolution so the totals can be
# compared across resolutions.
for column_name in ['Stops', 'Citation', 'Accident']:
    for resolution_level in RESOLUTIONS:
        column_total = grouped_dataframes[resolution_level][column_name].sum()
        print(resolution_level, column_name, column_total)

6 Stops 925117
7 Stops 925117
8 Stops 925117
9 Stops 925117
6 Citation 368364.0
7 Citation 368364.0
8 Citation 368364.0
9 Citation 368364.0
6 Accident 18715.0
7 Accident 18715.0
8 Accident 18715.0
9 Accident 18715.0
