In [906]:
# Load in necessary packages

import pandas as pd

# Suppress setting with copy warnings in preprocessing
pd.set_option('mode.chained_assignment', None)

In [907]:
# Read in the data from the .csv file

# This dataset covers US Police Shootings from 2015-2024. More information can be found at 
# https://www.kaggle.com/datasets/aquibahmad7/police-shootings-in-the-united-states-2015-2024?resource=download
data = pd.read_csv('C:/Users/bourb/OneDrive/Datasets/2024-07-23-washington-post-police-shootings-export.csv')

In [908]:
# Inspect the data to confirm successful load
data.head(3)

Unnamed: 0,date,name,age,gender,armed,race,city,state,flee,body_camera,signs_of_mental_illness,police_departments_involved
0,2015-01-02,Lewis Lee Lembke,47.0,male,gun,White,Aloha,OR,not,False,False,"Washington County Sheriff's Office, OR"
1,2015-01-02,Tim Elliot,53.0,male,gun,Asian,Shelton,WA,not,False,True,"Mason County Sheriff's Office, WA"
2,2015-01-03,John Paul Quintero,23.0,male,unarmed,Hispanic,Wichita,KS,not,False,False,"Wichita Police Department, KS"


In [909]:
# Check datatypes, row count, and # of non-null values in each column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9893 entries, 0 to 9892
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   date                         9893 non-null   object 
 1   name                         9556 non-null   object 
 2   age                          9509 non-null   float64
 3   gender                       9865 non-null   object 
 4   armed                        9682 non-null   object 
 5   race                         9893 non-null   object 
 6   city                         9821 non-null   object 
 7   state                        9893 non-null   object 
 8   flee                         8548 non-null   object 
 9   body_camera                  9893 non-null   bool   
 10  signs_of_mental_illness      9893 non-null   bool   
 11  police_departments_involved  9892 non-null   object 
dtypes: bool(2), float64(1), object(9)
memory usage: 792.3+ KB


In [910]:
def fill_column_nulls(df, column, value):
    df[column] = df[column].fillna(value)

In [911]:


def preprocess(df):
    
    # Convert the 'date' column values to a datetime datatype
    df['date'] = pd.to_datetime(df['date'])

    # Filter the data from to remove values from 2024
    df = df[df['date'] < '2024-01-01']

    # Title case all column names
    df.columns = df.columns.str.title()

    # Title case the below columns
    df['Gender'] = df['Gender'].str.title()
    df['Armed'] = df['Armed'].str.title()
    df['Flee'] = df['Flee'].str.title()

    # Create a list of tuples with column names in index 0 and replacement values in index 1 to populate the nulls in columns of the DataFrame
    null_fillers = [('Name', 'Unknown'), ('Gender', 'Unknown'), ('Armed', 'Unknown'), ('City', 'Unknown'), ('Flee', 'Unknown'), ('Age', 0.0)]

    # Iterate through null_fillers passing each column and replacement into the function which will then coerce the nulls silently
    for i in null_fillers:
        fill_column_nulls(df, i[0], i[1])

    # Iterate through the values in the armed column looking for values with a ','. If ',' is found then replace the value with
    # 'multiple weapons'
    for value in df['Armed']:
        if ',' in value:
            df['Armed'] = df['Armed'].replace(value, 'Multiple Weapons')

    # Create a dictionary of state abbreviation to name mappings
    state_names = {'AL':'Alabama', 'AK':'Alaska', 'AZ':'Arizona', 'AR':'Arkansas', 'CA':'California', 'CO':'Colorado', 
                'CT':'Connecticut', 'DE':'Delaware', 'FL':'Florida', 'GA':'Georgia', 'HA':'Hawaii', 'ID':'Idaho',
                'IL':'Illinois', 'IN':'Indiana', 'IA':'Iowa', 'KS':'Kansas', 'KY':'Kentucky', 'LA':'Louisiana',
                'ME':'Maine', 'MD':'Maryland', 'MA':'Massachusetts', 'MI':'Michigan', 'MN':'Minnesota', 'MS':'Mississippi',
                'MO':'Missouri', 'MT':'Montana', 'NE':'Nebraska', 'NV':'Nevada', 'NH':'New Hampshire', 'NJ':'New Jersey',
                'NM':'New Mexico', 'NY':'New York', 'NC':'North Carolina', 'ND':'North Dakota', 'OH':'Ohio', 'OK':'Oklahoma',
                'OR':'Oregon', 'PA':'Pennsylvania', 'RI':'Rhode Island', 'SC':'South Carolina', 'SD':'South Dakota',
                'TN':'Tennessee', 'TX':'Texas', 'UT':'Utah', 'VT':'Vermont', 'VA':'Virginia', 'WA':'Washington', 
                'WV':'West Virginia', 'WI':'Wisconsin', 'WY':'Wyoming'}
    
    # Map the state abbreviations to their names. This is done so that we can join on this column later and it will 1:1 match
    # the DataFrame we will be joining to
    df['State'] = df['State'].map(state_names)

    # Coerce the state name 'DC' if the city name = 'Washington'
    df['State'] = df.apply(lambda row: 'DC' if 'Washington' in row['City'] else row['State'], axis=1)

    # Fill the rest of the State column with Hawaii, after searing unique values in the city column for NULL values in the state
    # column I have confirmed all cities remaining with a NULL state are in Hawaii 
    fill_column_nulls(df, 'State', 'Hawaii')

    # Create a list of states for each region and save them into the variable associated with the name of the region
    south = ['Texas', 'Oklahoma', 'Arkansas', 'Louisiana', 'Mississippi', 'DC', 'Alabama', 'Georgia', 'Florida', 'South Carolina', 'North Carolina', 'Tennessee', 'Kentucky', 'Virginia', 'West Virginia', 'Delaware', 'Maryland']
    midwest = ['Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin', 'Iowa', 'Kansas', 'Minnesota', 'Missouri', 'Nebraska', 'North Dakota', 'South Dakota']
    northeast = ['Connecticut', 'Maine', 'Massachusetts', 'New Hampshire', 'Rhode Island', 'Vermont', 'New Jersey', 'New York', 'Pennsylvania']
    west = ['Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico', 'Utah', 'Wyoming', 'Alaska', 'California', 'Hawaii', 'Oregon', 'Washington']

    # Create a function to check if a state is in a list of states associated with a region
    def states_to_regions(df):
        if df['State'] in northeast:
            return 'Northeast'
        elif df['State'] in south:
            return 'South'
        elif df['State'] in midwest:
            return 'Midwest'
        elif df['State'] in west:
            return 'West'
        
    # Create a new column in the Dataframe using region values returned from the function
    df['Region'] = df.apply(lambda row: states_to_regions(row), axis=1)

    # I've decided to coerce values with a comma in the 'Race' column to be represented as 'Multiple Races' as opposed to the more unsightly values
    # that are inherent to the column 
    for value in df['Race']:
        if ',' in value:
            df['Race'] = df['Race'].replace(value, 'Multiple Races')

    # I want to convert to integer from float to avoid unsightly decimal places as all decimals are .0 thus mathematically meaningless
    df['Age'] = df['Age'].astype(int)

    # Create a 'year' column pulling the year from the 'date' column
    df['Year'] = pd.DatetimeIndex(df['Date']).year

    # Create a 'month' column pulling the month from the 'date' column and coercing it into the month name
    df['Month'] = pd.DatetimeIndex(df['Date']).month_name()

    # Engineer a quarter column to determine which fiscal quarter an incident occurred in (this is useful for creation of a time series line plot)
    df['Quarter'] = df['Date'].dt.quarter

    # Create age bins so we can assess the age of shooting victims within an age range
    df['Age_Bins'] = pd.cut(x=df['Age'], bins=[-1,1,18,29,40,50,65,80,100], labels=["'0'", "'1-18'", "'19-29'", "'30-40'", "'41-50'", "'51-65'", "'66-80'", "'81-100'"])

    # Convert the Age_Bins column to strings. This was done to prevent a behavior when opening the resultant .csv in Excel where
    # Excel believes 1-18 to be January 18th
    df['Age_Bins'] = df['Age_Bins'].astype('str')

    # Rearrange the columns into a more logical order 

    df = df[['Date', 'Month', 'Quarter', 'Year', 'Name', 'Age', 'Age_Bins', 'Gender', 'Armed', 'Race', 'City', 'State', 'Flee', 'Body_Camera', 
      'Signs_Of_Mental_Illness', 'Police_Departments_Involved', 'Region']]

    return df

data = preprocess(data)

In [913]:
# Inspect the DataFrame to give it a good last look over

data.head(3)

Unnamed: 0,Date,Month,Quarter,Year,Name,Age,Age_Bins,Gender,Armed,Race,City,State,Flee,Body_Camera,Signs_Of_Mental_Illness,Police_Departments_Involved,Region
0,2015-01-02,January,1,2015,Lewis Lee Lembke,47,'41-50',Male,Gun,White,Aloha,Oregon,Not,False,False,"Washington County Sheriff's Office, OR",West
1,2015-01-02,January,1,2015,Tim Elliot,53,'51-65',Male,Gun,Asian,Shelton,Washington,Not,False,True,"Mason County Sheriff's Office, WA",West
2,2015-01-03,January,1,2015,John Paul Quintero,23,'19-29',Male,Unarmed,Hispanic,Wichita,Kansas,Not,False,False,"Wichita Police Department, KS",Midwest


In [914]:
# Read in 2020 US Census data for the purposes of adding a column to show the population of the city in which an offense occurred

pop = pd.read_csv('C:/Users/bourb/Data_Projects/US_Police_Shooting_2015-2024/US_Population_By_City_2020.csv', low_memory=False)

pop.head(3)

Unnamed: 0,Label (Grouping),United States,"Abanda CDP, Alabama","Abbeville city, Alabama","Adamsville city, Alabama","Addison town, Alabama","Akron town, Alabama","Alabaster city, Alabama","Albertville city, Alabama","Alexander City city, Alabama",...,"Villa Hugo II comunidad, Puerto Rico","Villalba zona urbana, Puerto Rico","Villa Quintero comunidad, Puerto Rico","Villas del Sol comunidad, Puerto Rico","Villa Sin Miedo comunidad, Puerto Rico","Voladoras comunidad, Puerto Rico","Yabucoa zona urbana, Puerto Rico","Yauco zona urbana, Puerto Rico","Yaurel comunidad, Puerto Rico","Yeguada comunidad, Puerto Rico"
0,Total:,331449281,133,2358,4366,659,225,33284,22386,14843,...,1355,3215,342,145,117,670,5196,13569,769,1418
1,Population of one race:,297600338,132,2275,4221,643,220,30751,20502,14311,...,678,1663,225,76,64,290,3052,7455,598,871
2,White alone,204277273,95,1165,1741,624,19,22323,13830,8724,...,98,618,37,6,8,136,924,2887,18,229


In [915]:
# Transpose our population dataframe so that the cities will be rows

pop = pd.DataFrame(pop.T)

In [916]:
# This is done to coerce the values in row 1 (index 0) to become the new column labels, then to drop them from the 1st row. We
# Then write over our DataFrame keeping only the total population and removing the racial counts

pop.columns = pop.iloc[0]

pop.drop(pop.index[0], inplace=True)

pop.index = pop.index.str.title()

pop = pd.DataFrame(pop['Total:'])

In [917]:
# Create a DataFrame of our data grouped by City and State with a count of the number of police shooting within that city

grouped = pd.DataFrame(data[['Date', 'City', 'State']].groupby(['City', 'State']).count())

In [918]:
# Display the DataFrame as a sanity check

grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Date
City,State,Unnamed: 2_level_1
Abbeville,Alabama,1
Abbeville,Louisiana,1
Abbeville,South Carolina,1
Aberdeen,North Carolina,1
Aberdeen,Washington,1
...,...,...
Yuma,Arizona,5
Zanesville,Ohio,1
Zebulon,North Carolina,1
Zion,Illinois,3


In [919]:
# Create a new blank DataFrame which we will append rows when we find a match of city and state. We are using .startswith and 
# .endswith to allow for mismatches in between which were observed and deemed to be problematic

temp = pd.DataFrame()

for i, j in grouped.index:
    temp_pop = pop[pop.index.str.startswith(i) & pop.index.str.endswith(j)]
    temp = pd.concat([temp, temp_pop])


In [920]:
# Remove common problematic values that were causing mismatches and resulting in misleadingly low outcomes after the join

temp.index = temp.index.str.replace(' City', '')

temp.index = temp.index.str.replace(' Cdp', '')

temp.index = temp.index.str.replace(' Town', '')

In [921]:
# Title case the index of City/State combos to further aid the matching process

temp.index = temp.index.str.title()

In [922]:
# Back to our original DataFrame we create a column of City/State combos to match our population data removing ' City' which has
# found to prevent a large number of matches

data['City_State'] = data['City'] + ', ' + data['State'].replace(' City', '')

In [923]:
# Join our shooting and population DataFrames to create a column in the master showing the population. There is a match ~ 81%
# of the time. The rest of the time we will have Nan or NULL values in the population column

pop_data = pd.merge(data, temp, left_on='City_State', right_on=temp.index, how='left')

In [924]:
# Rename the population column which had carried 'Total:' over from the population DataFrame for the purposes of human
# readability and interpretability when we will potentially use it for visualization

pop_data.rename(columns={'Total:':'Population'}, inplace=True)

In [925]:
# Convert the population column to an integer datatype so we can calculate an annual per capita

pop_data['Population'] = pop_data['Population'].str.replace(',', '').astype('Int64')

In [926]:
pop_data.head(3)

Unnamed: 0,Date,Month,Quarter,Year,Name,Age,Age_Bins,Gender,Armed,Race,City,State,Flee,Body_Camera,Signs_Of_Mental_Illness,Police_Departments_Involved,Region,City_State,Population
0,2015-01-02,January,1,2015,Lewis Lee Lembke,47,'41-50',Male,Gun,White,Aloha,Oregon,Not,False,False,"Washington County Sheriff's Office, OR",West,"Aloha, Oregon",53828
1,2015-01-02,January,1,2015,Tim Elliot,53,'51-65',Male,Gun,Asian,Shelton,Washington,Not,False,True,"Mason County Sheriff's Office, WA",West,"Shelton, Washington",10371
2,2015-01-03,January,1,2015,John Paul Quintero,23,'19-29',Male,Unarmed,Hispanic,Wichita,Kansas,Not,False,False,"Wichita Police Department, KS",Midwest,"Wichita, Kansas",397532


In [927]:
# Read this data out to a .csv file on our hard drive

pop_data.to_csv('c:/users/bourb/data_projects/US_Police_Shooting_2015-2024/cleaned_data.csv', index=False)

In [928]:
# Create a grouped datasetconverted to a DataFrame showing the # of shootings by city, state, and population

pop_grouped = pd.DataFrame(pop_data[['City', 'State', 'Gender', 'Population']].groupby(['City', 'State', 'Population']).count())

In [929]:
# Reset the index so that city, state and population can be treated as regular columns

pop_grouped.reset_index(inplace=True)

In [930]:
# Rename the column that was counted to Number_of_Shootings

pop_grouped.columns = ['City', 'State', 'Population', 'Number_of_Shootings']

In [931]:
# Rearrange the columns 

pop_grouped = pop_grouped.reindex(columns=['City', 'State', 'Number_of_Shootings', 'Population'])

In [932]:
# Engineer the annual per capita column

pop_grouped['Annual_Shootings_per_100k'] = (pop_grouped['Number_of_Shootings'] / (pop_grouped['Population'] / 100000) / 9).round(2)

In [933]:
# Filter the dataset to only include cities with a population > 100k. This prevents places with small populations from having
# staggeringly high numbers not because shootings are regular, but because their population is small enough adjust to per capita
# makes it look inflated.

pop_sample = pop_grouped[pop_grouped['Population'] > 99999]

# Print the DataFrame sorted to highest per capita shootings

pop_sample.sort_values('Annual_Shootings_per_100k', ascending=False)

Unnamed: 0,City,State,Number_of_Shootings,Population,Annual_Shootings_per_100k
1903,Pueblo,Colorado,25,111876,2.48
1272,Las Cruces,New Mexico,19,111385,1.9
1366,Louisville,Kentucky,34,246161,1.53
208,Billings,Montana,16,117116,1.52
2240,St. Louis,Missouri,38,301578,1.4
...,...,...,...,...,...
901,Glendale,California,1,196543,0.06
367,Cary,North Carolina,1,174721,0.06
2087,Santa Rosa,California,1,178127,0.06
2569,Worcester,Massachusetts,1,206518,0.05


In [934]:
# Save this Dataframe as .csv to our hard drive

pop_sample.to_csv('c:/users/bourb/data_projects/US_Police_Shooting_2015-2024/Annual_Shootings_per_100k.csv', index=False)

In [935]:
pop_sample.sort_values('Annual_Shootings_per_100k', ascending=False).head(25)

Unnamed: 0,City,State,Number_of_Shootings,Population,Annual_Shootings_per_100k
1903,Pueblo,Colorado,25,111876,2.48
1272,Las Cruces,New Mexico,19,111385,1.9
1366,Louisville,Kentucky,34,246161,1.53
208,Billings,Montana,16,117116,1.52
2240,St. Louis,Missouri,38,301578,1.4
2053,San Bernardino,California,28,222101,1.4
32,Albuquerque,New Mexico,68,564559,1.34
1273,Las Vegas,Nevada,77,641903,1.33
753,Evansville,Indiana,14,117298,1.33
2504,Westminster,Colorado,12,116317,1.15


In [936]:
# Create a grouped dataset converted to a DataFrame showing the # of shootings by region, and population

region_grouped = pd.DataFrame(pop_data[['Region', 'Gender']].groupby(['Region']).count())

In [937]:
# Reset the index so that region and population can be treated as regular columns

region_grouped.reset_index(inplace=True)

In [938]:
# Rename the column that was counted to Number_of_Shootings

region_grouped.columns = ['Region', 'Number_of_Shootings']

In [939]:
region_grouped.sort_values('Number_of_Shootings', ascending=False)

Unnamed: 0,Region,Number_of_Shootings
2,South,3978
3,West,3192
0,Midwest,1511
1,Northeast,644
