# California District Attorney Project

The data used in this project can be viewed [here](https://drive.google.com/open?id=1lTwNX7moD-4BvAV5Fd5HbqL3luheYGc0).

In [1]:
import pandas as pd
import numpy as np
import logging

In [110]:
# Load the two inputs: the police-killings workbook (one named sheet)
# and the California race/demographics table.
police_violence_df = pd.read_excel(
    './data/police_violence_2013_18.xlsx',
    sheet_name='Police_Killings_2013_18',
)
ca_race_df = pd.read_csv(
    './data/ca_race.csv',
)

In [33]:
# Preview the first five rows of the raw police-violence table.
police_violence_df.head(n=5)

Unnamed: 0,Victim's name,Victim's age,Victim's gender,Victim's race,URL of image of victim,Date of Incident (month/day/year),Street Address of Incident,City,State,Zipcode,...,Criminal Charges?,Link to news article or photo of official document,Symptoms of mental illness?,Unarmed,Alleged Weapon (Source: WaPo),Alleged Threat Level (Source: WaPo),Fleeing (Source: WaPo),Body Camera (Source: WaPo),WaPo ID (If included in WaPo database),Unnamed: 24
0,Name withheld by police,Unknown,Male,Unknown race,,2018-05-18,Southwest Pleasant View and Highland drive,Gresham,OR,,...,No Known Charges,http://www.kxl.com/police-shoot-and-kill-man-i...,,,,,,,,
1,Name withheld by police,Unknown,Male,Unknown race,,2018-05-17,Britton Rd. and Kilpatrick Turnpike,Oklahoma City,OK,,...,No Known Charges,http://kfor.com/2018/05/17/emergency-crews-inv...,,Allegedly Armed,gun,attack,Not fleeing,,,
2,Name withheld by police,Unknown,Male,Unknown race,,2018-05-16,Washington Blvd,Belmont,OH,,...,No Known Charges,http://wtov9.com/news/local/new-details-releas...,,,,,,,,
3,David Romansky,34,Male,White,,2018-05-15,,Lake Mary,FL,,...,No Known Charges,http://www.orlandosentinel.com/news/breaking-n...,,,,,,,,
4,Marcus-David L. Peters,24,Male,Black,http://www.fatalencounters.org/wp-content/uplo...,2018-05-14,I-95 and Chamberlayne Avenue,Richmond,VA,23227.0,...,No Known Charges,http://www.nbc12.com/story/38187605/naked-man-...,No,Unarmed,unarmed,attack,Car,No,3696.0,


In [36]:
# Preview the first five rows of the California demographics table.
ca_race_df.head(n=5)

Unnamed: 0,Geography,total_population,his_latino,his_latino_percent,white,white_percent,black,black_percent,american_native,american_native_percent,...,asian_percent,pac_islander,pac_islander_percent,other,other_percent,two_plus,two_plus_percent,Summary_Level,County,Place
0,California,38654206,14903982,38.6,14837242,38.4,2158363,5.6,136582,0.4,...,13.7,138956,0.4,90413,0.2,1107850,2.9,40.0,,
1,Alameda County,1605217,362070,22.6,523797,32.6,176819,11.0,4959,0.3,...,28.0,13223,0.8,4524,0.3,70149,4.4,50.0,1.0,
2,Alameda city,77409,9123,11.8,33269,43.0,5742,7.4,135,0.2,...,30.4,550,0.7,413,0.5,4625,6.0,160.0,1.0,562.0
3,Albany city,19420,2506,12.9,9442,48.6,740,3.8,63,0.3,...,26.4,156,0.8,149,0.8,1246,6.4,160.0,1.0,674.0
4,Ashland CDP,24288,10872,44.8,3146,13.0,4529,18.6,75,0.3,...,19.3,149,0.6,47,0.2,779,3.2,160.0,1.0,2980.0


## CA Police Violence Data

In [119]:
# Keep only the incidents whose State value is 'CA'.
is_california_incident = police_violence_df['State'] == 'CA'
ca_police_violence_df = police_violence_df[is_california_incident]

In [120]:
# Normalise the column names so they are usable as plain identifiers:
#   1. every run of whitespace becomes a single underscore;
#   2. the punctuation characters ' ( ) ? / : are dropped.
# e.g. "Date of Incident (month/day/year)" -> "Date_of_Incident_monthdayyear".
#
# `regex=True` is passed explicitly because the default of `str.replace`
# changed to literal matching in pandas 2.0, which would otherwise leave the
# whitespace untouched. Raw strings avoid the invalid '\s' escape.
ca_police_violence_df.columns = (
    ca_police_violence_df.columns
    .str.replace(r'\s+', '_', regex=True)
    .str.replace(r"['()?/:]", '', regex=True)
)

In [121]:
# Save the California-only police violence dataframe to CSV.
# `index` must be the boolean False: the string 'false' is truthy, so it
# would still write the row index as an extra unnamed leading column.
ca_police_violence_df.to_csv('ca_police_violence.csv', encoding='utf-8', index=False)

In [127]:
# Count incidents per county. The result is a Series indexed by county
# name, despite the `_df` suffix; a later cell turns it into a DataFrame.
ca_counties_df = ca_police_violence_df['County'].value_counts()

In [128]:
# Turn the per-county counts into a two-column DataFrame: the county names
# move out of the index into a regular column, then both columns are renamed.
county_counts = pd.DataFrame(ca_counties_df).reset_index(level=0)
county_counts.columns = ['county', 'police_violence_incidents']
ca_counties_df = county_counts

In [129]:
# Append ' County' to every county name (e.g. 'Alameda' -> 'Alameda County').
county_names = ca_counties_df['county'].astype(str)
ca_counties_df['county'] = county_names + ' County'

In [132]:
# Add a statewide totals row labelled 'California'.
#
# Any existing 'California' row is dropped first so that re-running this cell
# neither duplicates the row nor counts the previous total in the new sum.
# `pd.concat` replaces `DataFrame.append`, which was removed in pandas 2.0.
county_rows = ca_counties_df[ca_counties_df['county'] != 'California']
total = pd.Series({
    'county': 'California',
    'police_violence_incidents': county_rows['police_violence_incidents'].sum(),
})
ca_counties_df = pd.concat(
    [county_rows, pd.DataFrame([total.to_dict()])],
    ignore_index=True,
)

In [133]:
# Preview the first five rows of the per-county incident counts.
ca_counties_df.head(n=5)

Unnamed: 0,county,police_violence_incidents
0,Los Angeles County,283
1,Riverside County,70
2,San Diego County,61
3,San Bernardino County,60
4,Alameda County,48


## Demographic Data

In [39]:
# Keep rows whose Geography contains 'County', plus the statewide
# 'California' row.
geography = ca_race_df['Geography']
mentions_county = geography.str.contains('County')
is_statewide = geography == 'California'
ca_county_race_df = ca_race_df[mentions_county | is_statewide]

In [91]:
# Drop census designated places: any row whose Geography contains "CDP".
cdp = ca_county_race_df['Geography'].str.contains("CDP")
not_cdp = ~cdp
ca_county_race_df = ca_county_race_df[not_cdp]

In [141]:
# Rename 'Geography' to 'county', the key column used by the merge below.
ca_county_race_df = ca_county_race_df.rename(
    {'Geography': 'county'},
    axis='columns',
)

In [142]:
# Preview the first five rows of the county-level demographics.
ca_county_race_df.head(n=5)

Unnamed: 0,county,total_population,his_latino,his_latino_percent,white,white_percent,black,black_percent,american_native,american_native_percent,...,asian_percent,pac_islander,pac_islander_percent,other,other_percent,two_plus,two_plus_percent,Summary_Level,County,Place
0,California,38654206,14903982,38.6,14837242,38.4,2158363,5.6,136582,0.4,...,13.7,138956,0.4,90413,0.2,1107850,2.9,40.0,,
1,Alameda County,1605217,362070,22.6,523797,32.6,176819,11.0,4959,0.3,...,28.0,13223,0.8,4524,0.3,70149,4.4,50.0,1.0,
22,Alpine County,1184,92,7.8,804,67.9,10,0.8,224,18.9,...,0.8,0,0.0,0,0.0,45,3.8,50.0,3.0,
28,Amador County,36963,4822,13.0,29436,79.6,860,2.3,458,1.2,...,1.4,72,0.2,4,0.0,790,2.1,50.0,5.0,
47,Butte County,223877,34503,15.4,164398,73.4,3279,1.5,1553,0.7,...,4.2,380,0.2,340,0.2,9927,4.4,50.0,7.0,


## Merging

In [143]:
# Join incident counts to demographics on the shared 'county' column.
# The inner join keeps only counties present in both frames.
merged_df = pd.merge(
    ca_counties_df,
    ca_county_race_df,
    on='county',
    how='inner',
)

In [145]:
# Preview the first five rows of the merged table.
merged_df.head(n=5)

Unnamed: 0,county,police_violence_incidents,total_population,his_latino,his_latino_percent,white,white_percent,black,black_percent,american_native,...,asian_percent,pac_islander,pac_islander_percent,other,other_percent,two_plus,two_plus_percent,Summary_Level,County,Place
0,Los Angeles County,283,10057155,4861648,48.3,2687787,26.7,801182,8.0,18765,...,14.1,24439,0.2,29351,0.3,220878,2.2,50.0,37.0,
1,Riverside County,70,2323892,1102968,47.5,865631,37.2,137779,5.9,9407,...,6.0,6262,0.3,3749,0.2,58988,2.5,50.0,65.0,
2,San Diego County,61,3253356,1076319,33.1,1519704,46.7,154251,4.7,11833,...,11.3,14043,0.4,5543,0.2,102611,3.2,50.0,73.0,
3,San Bernardino County,60,2106754,1089104,51.7,642786,30.5,170376,8.1,6840,...,6.6,6368,0.3,4417,0.2,48112,2.3,50.0,71.0,
4,Alameda County,48,1605217,362070,22.6,523797,32.6,176819,11.0,4959,...,28.0,13223,0.8,4524,0.3,70149,4.4,50.0,1.0,


In [146]:
# Write the merged table to CSV without the row index.
merged_df.to_csv(
    'ca_county_pd_incidents_and_demographics.csv',
    index=False,
    encoding='utf-8',
)