In [1]:
import pandas as pd
import re

ISO = 'ISO-8859-1'
UTF8 = 'utf8'
PREFIX = '/kaggle/input/fatal-police-shootings-in-the-us/'
ON = ['city', 'geographic area']
FILENAMES = [
    ('ShareRaceByCity.csv', UTF8),
    ('PercentOver25CompletedHighSchool.csv', ISO),
    ('PercentagePeopleBelowPovertyLevel.csv', ISO),
    ('MedianHouseholdIncome2015.csv', ISO),
]
LETHAL_WEAPONS = ['gun', 'knife', 'vehicle', 'machete', 'sword', 'ax', 'gun and knife', 'crossbow', 'box cutter']

def read_csv(file, enc):
    df = pd.read_csv(PREFIX + file, encoding=enc)
    df.columns = [c.lower() for c in df.columns]
    return df

dfs = [read_csv(file, enc) for (file, enc) in FILENAMES]
metrics_df = dfs[0].merge(dfs[1], on=ON).merge(dfs[2], on=ON).merge(dfs[3], on=ON)
metrics_df['city'] = metrics_df['city'].map(lambda c: re.sub(r'\s(CDP|city|town)$', '', c))
metrics_df = metrics_df.rename(columns={'geographic area': 'state'})
metrics_df['city_state'] = metrics_df['city'] + ', ' + metrics_df['state']

killings_df = read_csv('PoliceKillingsUS.csv', ISO)
killings_df['city_state'] = killings_df['city'] + ', ' + killings_df['state']
killings_df['had_gun'] = killings_df['armed'].eq('gun')
killings_df['had_lethal_weapon'] = killings_df['armed'].isin(LETHAL_WEAPONS)

state_pop_df = pd.read_csv('/kaggle/input/us-state-populations-2018/State Populations.csv')
state_abbrev_df = pd.read_csv('/kaggle/input/usa-state-name-code-and-abbreviation/data.csv')
state_df = state_pop_df.merge(state_abbrev_df, on='State')
state_df = state_df.rename(columns={'Code': 'state'})
state_df['pop_100k'] = round(state_df['2018 Population'] / (10 ** 5), 1)

data_df = killings_df.merge(metrics_df, on=['city', 'state', 'city_state'])
data_df = data_df.merge(state_df[['state', 'pop_100k']], on='state')
data_df = data_df.set_index('id')
data_df.head()

Unnamed: 0_level_0,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,...,had_lethal_weapon,share_white,share_black,share_native_american,share_asian,share_hispanic,percent_completed_hs,poverty_rate,median income,pop_100k
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,...,True,78.9,0.8,3.7,1.1,19.2,80.1,28.6,37072,75.3
138,Antonio Zambrano-Montes,10/02/15,shot and Tasered,unarmed,35.0,M,H,Pasco,WA,True,...,False,55.8,1.9,0.5,1.9,55.7,71.9,18.8,55319,75.3
295,Jamison Childress,19/03/15,shot,unarmed,20.0,M,W,Sumas,WA,True,...,False,83.7,1.5,2.4,1.5,15.8,80.5,8.0,55161,75.3
393,Daniel Covarrubias,21/04/15,shot,unarmed,37.0,M,N,Lakewood,WA,True,...,False,59.3,11.8,1.3,9.0,15.3,88.1,20.7,44902,75.3
1296,Marcos Perea,20/02/16,shot,gun,41.0,M,B,Lakewood,WA,False,...,True,59.3,11.8,1.3,9.0,15.3,88.1,20.7,44902,75.3


In [2]:
# Top 10 states sorted by police killing rate

summary_df = data_df.groupby(['state', 'pop_100k']).name.count().to_frame(name='count')
summary_df = summary_df.reset_index()
summary_df['rate'] = round(summary_df['count'] / summary_df['pop_100k'], 2)
summary_df.sort_values('rate', ascending=False)[0:10]

Unnamed: 0,state,pop_100k,count,rate
31,NM,20.9,44,2.11
35,OK,39.4,69,1.75
3,AZ,71.2,118,1.66
49,WY,5.7,8,1.4
0,AK,7.4,10,1.35
32,NV,30.6,41,1.34
5,CO,56.8,70,1.23
48,WV,18.0,22,1.22
40,SD,8.8,10,1.14
17,LA,46.8,49,1.05


In [3]:
# Top 10 states sorted by police killing unarmed rate

summary_df = data_df[data_df.armed == 'unarmed'].groupby(['state', 'pop_100k']).name.count().to_frame(name='count')
summary_df = summary_df.reset_index()
summary_df['rate'] = round(summary_df['count'] / summary_df['pop_100k'], 2)
summary_df.sort_values('rate', ascending=False)[0:10]

Unnamed: 0,state,pop_100k,count,rate
32,VT,6.2,1,0.16
24,OK,39.4,6,0.15
2,AZ,71.2,10,0.14
18,ND,7.6,1,0.13
28,SD,8.8,1,0.11
10,KS,29.2,3,0.1
19,NE,19.3,2,0.1
16,MS,29.8,3,0.1
12,MD,60.8,6,0.1
14,MN,56.3,5,0.09


In [4]:
# Top 10 states sorted by police killing non-gun rate

summary_df = data_df[data_df.had_gun == False].groupby(['state', 'pop_100k']).name.count().to_frame(name='count')
summary_df = summary_df.reset_index()
summary_df['rate'] = round(summary_df['count'] / summary_df['pop_100k'], 2)
summary_df.sort_values('rate', ascending=False)[0:10]

Unnamed: 0,state,pop_100k,count,rate
49,WY,5.7,5,0.88
3,AZ,71.2,56,0.79
35,OK,39.4,29,0.74
4,CA,397.8,244,0.61
31,NM,20.9,12,0.57
32,NV,30.6,17,0.56
48,WV,18.0,9,0.5
17,LA,46.8,23,0.49
5,CO,56.8,26,0.46
46,WA,75.3,32,0.42


In [5]:
# Top 10 states sorted by police killing non-lethal-weapon rate

summary_df = data_df[data_df.had_lethal_weapon == False].groupby(['state', 'pop_100k']).name.count().to_frame(name='count')
summary_df = summary_df.reset_index()
summary_df['rate'] = round(summary_df['count'] / summary_df['pop_100k'], 2)
summary_df.sort_values('rate', ascending=False)[0:10]

Unnamed: 0,state,pop_100k,count,rate
45,WY,5.7,3,0.53
2,AZ,71.2,24,0.34
28,NV,30.6,10,0.33
31,OK,39.4,12,0.3
3,CA,397.8,113,0.28
21,MS,29.8,8,0.27
14,LA,46.8,12,0.26
12,KS,29.2,7,0.24
27,NM,20.9,5,0.24
36,SD,8.8,2,0.23


In [6]:
# Top 10 non-lethal weapons

data_df[data_df['had_lethal_weapon'] == False].armed.value_counts()[0:10]

unarmed           150
undetermined      104
toy weapon         89
unknown weapon     12
Taser               8
hammer              5
blunt object        5
baseball bat        5
metal pipe          4
scissors            3
Name: armed, dtype: int64