In [71]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:97% !important; }</style>"))

In [72]:
import pandas as pd
import numpy as np

df = pd.read_csv(r"C:\Users\mikha\Dropbox\mikhael_misc\Projects\Policing Thesis\OG Traffic Violations.csv")

  interactivity=interactivity, compiler=compiler, result=result)


# Functions

## Homogenize Entries

Purpose of this function is explained in Dummy Variables, Alcohol section (3.1.1)

In [73]:
def homogenize_entries(dataframe, var: str, desired_value, to_return) -> list:
    """
    
    df[var] == homogenize_entries(df, 'Alcohol', 1) will make all citations of a stop have dataframe['Alcohol']==1 
    if ANY of them already have that.
    
    var == column in dataframe
    
    *DEPRECATED*: default == default entry (usually "no")
    
    desired_value == atypical entry (usually "yes")
    
    to_return == new value to return
    """
    
    
    index_dict = {ind:0 for ind in dataframe.index}
    for x in list(zip(dataframe.index,dataframe[var])):
        if x[1]==desired_value:
            index_dict[x[0]] = 1
    
    desired_value_col = list(dataframe[var])
    
    for i, x in enumerate(list(zip(dataframe.index,desired_value_col))):
        if index_dict[x[0]]==1:
            desired_value_col[i] = to_return
        
    return desired_value_col

## Create Dummy Vars

In [74]:
def create_dummy_vars(dataframe, var):
    """
    df = create_dummy_vars(df, var)
    
    Horizontally concatenates boolean dummy columns to df
    """
    dummy_df = pd.get_dummies(dataframe[var])
    
    dummy_df.columns =  [col + ' - (D_' + var + ')' for col in dummy_df.columns]
    
    return pd.concat([dataframe, dummy_df], axis=1)

# New Variables

## Misc

### New (genuinely) Unique ID

In [75]:
df.insert(0, 'Unique Stop ID', [x[0]+x[1]+x[2] for x in zip(df['SeqID'], df['Date Of Stop'], df['Time Of Stop'])])
df = df.sort_values(by=['Date Of Stop', 'Time Of Stop', 'Unique Stop ID'])
df.set_index('Unique Stop ID', inplace=True)

### Total Citations

In [76]:
citation_counting_dict = {ID: 0 for ID in df.index}

for i, ID in enumerate(df.index):
    if df['Violation Type'].iloc[i] == 'Citation':
        citation_counting_dict[ID] += 1    

df.insert(3, 'Number of Citations', [citation_counting_dict[ID] for ID in df.index])

### Total Write-Ups

A write-up is equivalent to a row/observation of the original dataset.
\

In other words, any *recorded* stop is a write-up - whether that stop resulted in a warning, citation, repair-order, or anything else.

In [77]:
writeup_counting_dict = {ID: 0 for ID in df.index}

for i, ID in enumerate(df.index):
    writeup_counting_dict[ID] += 1    

df.insert(3, 'Number of writeups', [writeup_counting_dict[ID] for ID in df.index])

### Car Age

In [78]:
year_of_stop=[int(x[-4:]) for x in df['Date Of Stop']]

car_year = []
for x in df['Year']:
    try:
        car_year.append(int(x))
    except:
        car_year.append(np.nan)

def subtract(a,b):
    try:
        return a-b
    except:
        return np.nan

df['Car Age'] = [subtract(x[0], x[1]) for x in zip(year_of_stop, car_year)]

### Speed Parsing

In [79]:
def speed_parser(string):
    if 'mph' in string or 'MPH' in string:
        temp_holder = []
        for word in string.split():
            if word.isdigit():
                temp_holder.append(int(word))
        if len(temp_holder) == 2:
            return sorted(temp_holder)
        elif len(temp_holder) == 1:
            return [temp_holder[0], np.nan]
        else:
            return [np.nan, np.nan]
    else:
        return [np.nan, np.nan]


Speed_Limit = [np.nan] * len(df)
Driving_Speed = [np.nan] * len(df)
Speed_over_posted_limit = [np.nan] * len(df)

for index, x in enumerate(df['Description']):
    if type(x) == str:
        temp = speed_parser(x)
        Speed_Limit[index] = temp[0]
        Driving_Speed[index] = temp[1]
        if Driving_Speed[index] != np.nan:
            Speed_over_posted_limit[index] = Driving_Speed[index]-Speed_Limit[index]
        else:
            Speed_over_posted_limit[index] = np.nan
    else:
        Speed_over_posted_limit[index] = np.nan
        
for i,x in enumerate(Speed_over_posted_limit):
    if x>100:
        Speed_over_posted_limit[i] = np.nan

        
        
df.insert(5, 'Speed Limit', Speed_Limit)

df.insert(6, 'Driving Speed', Driving_Speed)

df.insert(7, 'Speed Over Posted Limit', Speed_over_posted_limit)

### Cited Speed Modified (or not)

In [80]:
altered = []
not_altered = []

for x in df['Speed Over Posted Limit']:
    if x==9:
        altered.append(1)
        not_altered.append(0)
    elif 10 <= x <= 14:
        altered.append(0)
        not_altered.append(1)
    else:
        altered.append(np.nan)
        not_altered.append(np.nan)

df['Speed Altered'] = altered
df['Speed NOT Altered'] = not_altered

## Dummy Variables

### Alcohol

Take the example of a DUI stop that resulted in 15 citations. In this dataset, it might be the case that only 1 of those 15 recorded citations has *df['Alcohol']=="Yes"* (and df['Alcohol']=="No" for the other 14).  
\
In reality, that entire stop is alcohol-related, so df['Alcohol'] should equal "Yes" for all entries pertaining to this stop.
\

I call this *homogenizing* an entry (see the function *homogenize_entries(df, var, desired_value, to_return)*)

#### Fixing df['Alcohol'] and df['Description'] discrepancies

Sometimes *df['Alcohol'].iloc[i]=="No"* but the word "alcohol" is mentioned in *df['Description'].iloc[i]*.

\
When this happens, I assign *df['Alcohol'].iloc[i]="Yes"*

In [81]:
from tqdm import tqdm

for i in tqdm(range(len(df))):
    if df['Alcohol'].iloc[i] == 'No':
        if type(df['Description'].iloc[i]) == str:
            if 'ALCOHOL' in df['Description'].iloc[i]:
                df['Alcohol'].iloc[i] = 'Yes'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
100%|██████████████████████████████████████████████████████████████████████| 1675856/1675856 [03:44<00:00, 7459.93it/s]


#### Homogenizing Entries

In [82]:
df['Alcohol'] = homogenize_entries(df, 'Alcohol', 'Yes', 1)
df['Alcohol'] = df['Alcohol'].replace(to_replace='No', value=0)

### Simple Binary Dummy Assignment

In [83]:
DUMMY_VARS = ['Accident','Belts', 'Commercial Vehicle', 'HAZMAT',
              'Work Zone', 'Personal Injury', 'Property Damage']


dummy_dict = {'No':0, 'Yes':1, np.nan:np.nan}

for var in DUMMY_VARS:
#     print(var)
    df[var] = [dummy_dict[x] for x in df[var]]
    
    df[var] = homogenize_entries(df, var, 1, 1)

### Non-Binary Dummy Assignment

In [84]:
#Might want to add more vars to this
MULTI_CAT_DUMMY_VARS = ['Race', 'SubAgency', 'Violation Type', 'Search Conducted', 'Search Outcome']

for var in MULTI_CAT_DUMMY_VARS:
    df = create_dummy_vars(df, var)
    
# df['Search Outcome']

## Special Cases

### Gender

In [85]:
gender_dummy_dict = {"F":0, "M":1, "U":np.nan} #these are all 3 gender values in the dataset

df['Male'] = [gender_dummy_dict[x] for x in df['Gender']]

### Driver's State, Driver's License State

In [86]:
# def driver_state_dummifier(x):
#     return x!='MD'

df['Driver State != MD'] = [x != "MD" for x in df['Driver State']]
df['DL State != MD'] = [x != "MD" for x in df['DL State']]

# Homogenize Remaining variables

In [87]:
df['Arrest - (D_Search Outcome)'] = homogenize_entries(df, 'Arrest - (D_Search Outcome)', 1, 1)

# Deletions

In [88]:
del df['SeqID']
del df['Geolocation']
del df['Year']
del df['Gender']

# Screening Outliers (not done yet)

Will see if necessary

# Save Dataset

In [89]:
df.to_csv(r"Modified Dataset.csv", index=True)

In [90]:
"""
Import the dataset with the following command to have the unique stop id as the new index
"""

# df = pd.read_csv(r"Modified Dataset.csv", index_col=(0,1))

'\nImport the dataset with the following command to have the unique stop id as the new index\n'