# Import and Setup

In [1]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject CSS so the notebook's `.container` element uses the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import pandas as pd
import numpy as np
from typing import Tuple

In [3]:
# Columns kept when reading the raw CSV (passed to `usecols` in `pd.read_csv`).
cols_to_use = [
    'SeqID',
    'Date Of Stop',
    'Time Of Stop',
    'Agency',
    'SubAgency',
    'Description',
    'Location',
    'Latitude',
    'Longitude',
    'Accident',
    'Belts',
    'Personal Injury',
    'Property Damage',
    'Fatal',
    'Commercial License',
    'HAZMAT',
    'Commercial Vehicle',
    'Alcohol',
    'Work Zone',
    'Search Conducted',
    'Search Disposition',
    'Search Outcome',
    'Search Reason',
    'Search Reason For Stop',
    'Search Type',
    'Search Arrest Reason',
    'State',
    'VehicleType',
    'Year',
    'Make',
    'Model',
    'Color',
    'Violation Type',
    'Charge',
    'Article',
    'Contributed To Accident',
    'Race',
    'Gender',
    'Arrest Type',
]

In [4]:
# Load the raw export, keeping only the columns named in `cols_to_use`.
df = pd.read_csv(
    filepath_or_buffer="C:/Users/mikha/Dropbox/mikhael_misc/Projects/Policing Thesis/Traffic_Violations - Oct 6 2021.csv",
    # nrows=25000,
    usecols=cols_to_use,
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Cleaning

## Assign Unique Stop ID 

In [5]:
def assign_stop_IDs(dataframe:pd.DataFrame) -> None:
    """
    Assigns a unique integer ID to each stop and makes it the index.

    Rows belong to the same stop when they share the same combination of
    'SeqID', 'Date Of Stop' and 'Time Of Stop'. IDs are numbered from 0 in
    order of first appearance.

    The frame is modified IN PLACE and nothing is returned:
      * the index is replaced by the stop IDs (index name 'Stop ID');
      * the 'SeqID' column is deleted.

    :INPUT:
    'dataframe' must contain string columns 'SeqID', 'Date Of Stop' and 'Time Of Stop'.
    """
    # Build the composite key as a local Series so that no temporary column
    # ever appears in the caller's frame.
    stop_keys = dataframe['SeqID'] + ' _ ' + dataframe['Date Of Stop'] + ' _ ' + dataframe['Time Of Stop']

    # `unique()` preserves order of first appearance, so IDs follow row order.
    stop_ID_dict = {stop_info: ID for ID, stop_info in enumerate(stop_keys.unique())}
    stop_ids = stop_keys.map(stop_ID_dict)

    del dataframe['SeqID']

    # Assign by position (plain array) so no label alignment is involved.
    dataframe.index = pd.Index(stop_ids.to_numpy(), name='Stop ID')

# Mutates `df` in place: drops 'SeqID' and makes 'Stop ID' the index.
assign_stop_IDs(df)

## Convert strings to boolean int (0, 1)

In [6]:
def find_str_bool_cols(dataframe:pd.DataFrame) -> list:
    """
    Returns the names of the columns of 'dataframe' whose non-missing values
    are all either 'Yes' or 'No'.

    :INPUT:
    'dataframe' is inspected only; it is not modified.

    :OUTPUT:
    list of column labels, in the frame's column order.
    """
    str_bools_set = {'Yes', 'No'}

    bool_cols = []
    # Iterate over the frame that was passed in, not a global.
    for col in dataframe:
        # Missing values are dropped first so NaN never has to be compared
        # against the allowed set (NaN != NaN makes that comparison unreliable).
        observed_values = set(dataframe[col].dropna().unique())
        if observed_values <= str_bools_set:
            bool_cols.append(col)

    return bool_cols

def convert_str_bool_cols(dataframe:pd.DataFrame) -> None:
    """
    Converts every 'Yes'/'No' column of 'dataframe' to 1/0.

    The columns to convert are those returned by find_str_bool_cols.
    The frame is modified IN PLACE and nothing is returned.
    Values that are neither 'Yes' nor 'No' (i.e. missing values) become NaN.
    """
    str_bool_mapping_dict = {'No':0, 'Yes':1}

    for col in find_str_bool_cols(dataframe=dataframe):
        dataframe[col] = dataframe[col].map(str_bool_mapping_dict)
    
# Mutates `df` in place: columns detected as 'Yes'/'No' are mapped to 1/0.
convert_str_bool_cols(df)

## Create columns

In [7]:
def create_cols(dataframe:pd.DataFrame) -> None:
    """
    Adds indicator columns and a combined 'DateTime' column to 'dataframe'.

    The frame is modified IN PLACE and nothing is returned:
      * 'Citation', 'Warning', 'Male', 'Female', 'Probable Cause' and 'Arrest'
        are added as int columns: 1 where the source column equals the matching
        value, 0 otherwise (including where the source value is missing);
      * 'DateTime' is parsed from 'Date Of Stop' + ' ' + 'Time Of Stop';
      * 'Date Of Stop' and 'Time Of Stop' are deleted.
    """
    # (new column, source column, value that yields 1)
    indicator_specs = [
        ('Citation', 'Violation Type', 'Citation'),
        ('Warning', 'Violation Type', 'Warning'),
        ('Male', 'Gender', 'M'),
        ('Female', 'Gender', 'F'),
        ('Probable Cause', 'Search Reason', 'Probable Cause'),
        ('Arrest', 'Search Outcome', 'Arrest'),
    ]
    for new_col, source_col, matching_value in indicator_specs:
        dataframe[new_col] = (dataframe[source_col]==matching_value).astype(int)

    # Read the date/time parts from the frame that was passed in, not a global.
    dataframe['DateTime'] = pd.to_datetime(dataframe['Date Of Stop'] + ' ' + dataframe['Time Of Stop'])
    del dataframe['Date Of Stop'], dataframe['Time Of Stop']

# Mutates `df` in place: adds the indicator columns and 'DateTime', and drops 'Date Of Stop' / 'Time Of Stop'.
create_cols(df)

# Speed Columns

In [8]:
def get_speed_and_limit_from_split_str(split_description:list) -> Tuple[float, float]:
    """
    Extracts the whole-number tokens from one tokenised description.

    :INPUT:
    'split_description' = one element of df['Description'].str.split(),
    i.e. a list of str tokens.

    :OUTPUT:
    A 2-tuple (posted_limit, recorded_speed):
      * two numbers found  -> (smaller, larger); the smaller one is taken to be the posted limit
      * one number found   -> (number, np.nan); the recorded speed is often not given
      * any other count    -> (np.nan, np.nan); zero numbers, or an ambiguous three or more
    """
    # isdecimal() (rather than isdigit()) only accepts characters that int()
    # can actually parse, so tokens such as a superscript '²' cannot raise.
    speeds_from_description = [int(word) for word in split_description if word.isdecimal()]

    if len(speeds_from_description)==2: # if posted limit and driver's speed are both recorded
        return tuple(sorted(speeds_from_description))
    if len(speeds_from_description)==1: # if only posted limit is recorded
        return (speeds_from_description[0], np.nan)

    # Nothing usable: always return a pair so callers never receive None.
    return (np.nan, np.nan)
    

In [9]:
def create_speed_columns(dataframe:pd.DataFrame) -> pd.DataFrame:
    """
    Returns a copy of 'dataframe' with two extra columns, 'Speed Limit' and
    'Recorded Speed', parsed from the 'Description' column.

    Each description is split on whitespace and passed to
    get_speed_and_limit_from_split_str; a missing description is treated as
    an empty string. The input frame is not modified.
    """
    split_descriptions = dataframe['Description'].fillna('').str.split()

    missing_pair = (np.nan, np.nan)
    limit_and_speed_pairs = []
    for split_description in split_descriptions:
        pair = get_speed_and_limit_from_split_str(split_description)
        # Guard against a helper that yields None for an unparseable description.
        limit_and_speed_pairs.append(missing_pair if pair is None else tuple(pair))

    # Build both columns in a single constructor call instead of creating one
    # Series per row, and reuse the original index so concat lines rows up
    # even though stop IDs repeat.
    limit_and_speed_cols = pd.DataFrame(limit_and_speed_pairs,
                                        index=dataframe.index,
                                        columns=['Speed Limit', 'Recorded Speed'])

    return pd.concat([dataframe, limit_and_speed_cols], axis=1)
    
# Rebinds `df` to a new frame with 'Speed Limit' and 'Recorded Speed' appended.
df = create_speed_columns(df)

# Create cols for "ID Corresponds to (accident, search, etc.)"
Each new column equals 1 on every row of a stop ID if any row with that stop ID has the flag (accident, search, etc.) equal to 1.

Honestly not sure if this is necessary (didn't see a need for it in my sample) but better to be safe...

In [10]:
def fill_sparse_fields(dataframe:pd.DataFrame, cols:list) -> None:
    """
    For each column in 'cols', adds a '<col> - Sparse Filled' column.

    The new column is 1 on every row whose index label (stop ID) has at least
    one row where 'col' equals 1; all other rows keep their original value.
    The frame is modified IN PLACE and nothing is returned.

    :INPUT:
    'dataframe' is expected to be indexed by stop ID (labels may repeat).
    'cols' is the list of column names to process.

    NOTE(review): a column that never equals 1 (e.g. a speed value) is simply
    copied unchanged -- confirm that this is what is wanted for the speed columns.
    """
    for col in cols:
        stop_IDs_equal_1_somewhere = dataframe.index[dataframe[col]==1]

        # A boolean row mask replaces the original `.loc[<set>]` lookups:
        # pandas >= 2.0 rejects set indexers, and a positional mask needs no
        # label alignment on the repeated stop IDs.
        row_belongs_to_flagged_stop = dataframe.index.isin(stop_IDs_equal_1_somewhere)

        filled_col_name = f'{col} - Sparse Filled'

        # 1.0 (not 1) keeps numeric results as float, as when the column was
        # first created as NaN and then filled.
        dataframe[filled_col_name] = np.where(row_belongs_to_flagged_stop,
                                              1.0,
                                              dataframe[col].to_numpy())


# Columns whose set of distinct values is exactly {0, 1}.
potentially_sparse_cols = [
    name
    for name in df.columns
    if set(df[name].unique().tolist()) == {0, 1}
]

# The two parsed speed columns are processed as well.
sparse_cols = potentially_sparse_cols + ['Speed Limit', 'Recorded Speed']

# Adds a '<col> - Sparse Filled' column to `df` for every listed column.
fill_sparse_fields(dataframe=df, cols=sparse_cols)

# Export

In [11]:
def get_first_row_of_each_stop(dataframe:pd.DataFrame) -> pd.DataFrame:
    """
    Returns only the first row for each distinct index label.

    Rows whose index label has already appeared earlier in the frame are
    dropped; the input frame itself is left unchanged.
    """
    is_repeat_of_earlier_label = dataframe.index.duplicated(keep='first')
    keep_row = ~is_repeat_of_earlier_label
    return dataframe[keep_row]

In [12]:
# Export the full frame (every row, indexed by 'Stop ID').
df.to_csv(path_or_buf="C:/Users/mikha/Dropbox/mikhael_misc/Projects/Policing Thesis/Modified Dataset - 2021.csv")

# Export a reduced frame that keeps only the first row for each stop ID.
get_first_row_of_each_stop(dataframe=df).to_csv(
    path_or_buf="C:/Users/mikha/Dropbox/mikhael_misc/Projects/Policing Thesis/Modified Dataset - 2021 - One Row per Stop.csv"
)