# Exploratory Data Analysis

### Import Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
# Read the cleaned Terry-stop CSV and discard its 'Unnamed: 0' column in one step.
df = pd.read_csv('cleaned_terry_stop_for_eda.csv').drop(columns='Unnamed: 0')
df.head()

Unnamed: 0,subject_age_group,subject_id,go_sc_num,terry_stop_id,weapon,officer_gender,officer_race,subject_perceived_race,subject_perceived_gender,initial_call_type,...,incident_month,officer_age,call_made,dif_race,dif_gender,dif_race_gender,12am_6am,7am_12pm,1pm_6pm,7pm_11pm
0,Unknown,unassigned,20140000120677,92317,,M,Black or African American,Asian,M,Unknown,...,10,31,0,Y,N,Y,0,1,0,0
1,Unknown,unassigned,20150000002351,45252,,M,Hispanic,Other,F,Unknown,...,5,30,0,Y,Y,Y,0,0,0,1
2,Unknown,unassigned,20150000002363,45182,,M,Hispanic,White,M,Unknown,...,5,30,0,Y,N,Y,0,0,0,1
3,Unknown,unassigned,20150000002392,45365,,M,White,White,F,Unknown,...,5,29,0,N,Y,Y,0,1,0,0
4,Unknown,unassigned,20150000002451,46430,,M,Hispanic,Not Specified,Unable to Determine,Unknown,...,5,30,0,Y,Y,Y,1,0,0,0


In [3]:
# List every column of the loaded frame.
df.columns

Index(['subject_age_group', 'subject_id', 'go_sc_num', 'terry_stop_id',
       'weapon', 'officer_gender', 'officer_race', 'subject_perceived_race',
       'subject_perceived_gender', 'initial_call_type', 'final_call_type',
       'officer_squad', 'frisk_flag', 'precinct', 'sector', 'beat',
       'repeat_offenders', 'arrest_made', 'incident_year', 'incident_month',
       'officer_age', 'call_made', 'dif_race', 'dif_gender', 'dif_race_gender',
       '12am_6am', '7am_12pm', '1pm_6pm', '7pm_11pm'],
      dtype='object')

In [4]:
pip install pandas-profiling



Note: you may need to restart the kernel to use updated packages.


In [7]:
from pandas_profiling import ProfileReport

In [8]:
# Build the profiling report for `df`; the bare name on the last line renders it inline.
profile = ProfileReport(
    df,
    title='Pandas Profiling Report',
    html={'style': {'full_width': True}},
)
profile

HBox(children=(HTML(value='Summarize dataset'), FloatProgress(value=0.0, max=43.0), HTML(value='')))




HBox(children=(HTML(value='Generate report structure'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value='Render HTML'), FloatProgress(value=0.0, max=1.0), HTML(value='')))






### 1. Is there a relationship between age and being stopped?

In [None]:
# Number of rows in each subject age group.
df['subject_age_group'].value_counts()

In [None]:
# Tally rows per age group, leaving out rows whose age group is '-'.
df_filter_age = df.loc[df['subject_age_group'] != '-']
age_counts = df_filter_age['subject_age_group'].value_counts()

# Draw the groups from youngest to oldest.
age_order = ['1 - 17', '18 - 25', '26 - 35', '36 - 45', '46 - 55', '56 and Above']

fig, ax = plt.subplots(figsize=(15, 7))
graph_age = sns.barplot(x=age_counts.index, y=age_counts, order=age_order, ax=ax)
graph_age.set(
    title='Stops by Age',
    xlabel='Age Range',
    ylabel='Quantity of Stopped People',
)
plt.show()

### Does The Difference In Race Between Officer and Subject Play A Role?

In [15]:
# Number of rows per officer race.
df['officer_race'].value_counts()

White                        33668
Hispanic                      2523
Multi-Racial                  2464
Asian                         1833
Black or African American     1770
Not Specified                 1212
Other                          432
Native American                301
Name: officer_race, dtype: int64

In [16]:
# Number of rows per perceived subject race.
df['subject_perceived_race'].value_counts()

White                        21601
Black or African American    13167
Not Specified                 4084
Hispanic                      1671
Asian                         1399
Native American               1289
Multi-Racial                   799
Other                          193
Name: subject_perceived_race, dtype: int64

In [48]:
# Select every row where the officer is White and the subject is perceived as
# Black or African American (all such rows, whether or not an arrest was made).
white_officer = df['officer_race'] == 'White'
black_subject = df['subject_perceived_race'] == 'Black or African American'
df.loc[white_officer & black_subject]


Unnamed: 0,subject_age_group,subject_id,go_sc_num,terry_stop_id,weapon,officer_gender,officer_race,subject_perceived_race,subject_perceived_gender,initial_call_type,...,incident_month,officer_age,call_made,dif_race,dif_gender,dif_race_gender,12am_6am,7am_12pm,1pm_6pm,7pm_11pm
11,Unknown,unassigned,20150000002678,49779,,M,White,Black or African American,M,Unknown,...,6,56,0,Y,N,Y,0,1,0,0
37,Unknown,unassigned,20150000003506,61980,,M,White,Black or African American,M,Unknown,...,7,29,0,Y,N,Y,0,1,0,0
40,Unknown,unassigned,20150000003561,63097,,M,White,Black or African American,M,Unknown,...,7,29,0,Y,N,Y,1,0,0,0
72,Unknown,unassigned,20150000005064,87375,,M,White,Black or African American,M,Unknown,...,10,24,0,Y,N,Y,1,0,0,0
74,Unknown,unassigned,20150000005064,87377,,M,White,Black or African American,F,Unknown,...,10,24,0,Y,Y,Y,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44178,56 and Above,12809788634,20200000117523,12809778581,Unknown,M,White,Black or African American,M,SUSPICIOUS STOP - OFFICER INITIATED ONVIEW,...,4,24,1,Y,N,Y,0,0,0,1
44189,56 and Above,13104767847,20200000143964,13104760486,Unknown,F,White,Black or African American,M,SUSPICIOUS STOP - OFFICER INITIATED ONVIEW,...,5,44,1,Y,Y,Y,1,0,0,0
44190,56 and Above,13116341881,20200000151443,13116539265,Unknown,M,White,Black or African American,M,"SUSPICIOUS PERSON, VEHICLE OR INCIDENT",...,5,52,1,Y,N,Y,0,0,1,0
44192,56 and Above,13267123297,20200000175347,13267123866,Unknown,M,White,Black or African American,M,"WEAPN-IP/JO-GUN,DEADLY WPN (NO THRT/ASLT/DIST)",...,5,28,1,Y,N,Y,0,0,0,1


  This tells us that 23% of stops were made by White officers with Black/African American subjects, even though Black/African American people make up 6.8% of the population.

In [65]:
# Narrow the frame to the three columns used in the comparison that follows.
treatment_cols = ['arrest_made', 'frisk_flag', 'dif_race']
treatment = df[treatment_cols]
treatment.head()

Unnamed: 0,arrest_made,frisk_flag,dif_race
0,Y,N,Y
1,N,N,Y
2,N,N,Y
3,N,N,N
4,N,N,Y


Let's make 2 datasets: One where officer and subject are the same race and another where they are not.

In [None]:
# Split the rows by the dif_race flag: 'N' rows go to `same`, 'Y' rows to `diff`.
same = treatment[treatment['dif_race'] == 'N']
diff = treatment[treatment['dif_race'] == 'Y']

# info() writes its summary itself and returns None, so it is called directly
# rather than wrapped in print(), which would add a stray "None" line.
same.info()
print(diff.shape)

In [None]:
# Number of rows per arrest_made value.
df['arrest_made'].value_counts()

In [None]:
# Number of rows per frisk_flag value.
df['frisk_flag'].value_counts()

In [None]:
# Number of rows per dif_race value.
df['dif_race'].value_counts()

In [None]:

# Side-by-side 'Y' / 'N' counts for each of the three flag columns.
# The counts are read from `df` instead of being typed in by hand, so the
# chart always matches the data that was loaded.
flag_labels = {
    'arrest_made': 'Arrest Made',
    'frisk_flag': 'Frisk Flag',
    'dif_race': 'Different Race',
}

# set height of bar (values other than 'Y' / 'N' are not counted)
bars1 = [int((df[col] == 'Y').sum()) for col in flag_labels]
bars2 = [int((df[col] == 'N').sum()) for col in flag_labels]

# set width of bar
barWidth = 0.25

# Set position of bar on X axis
r1 = np.arange(len(bars1))
r2 = r1 + barWidth

# Make the plot
fig, ax = plt.subplots()
ax.bar(r1, bars1, color='#7f6d5f', width=barWidth, edgecolor='white', label='Y')
ax.bar(r2, bars2, color='#557f2d', width=barWidth, edgecolor='white', label='N')

# One tick per group, centred between its two bars, with exactly one label per tick
ax.set_xlabel('group', fontweight='bold')
ax.set_xticks(r1 + barWidth / 2)
ax.set_xticklabels(list(flag_labels.values()))
ax.set_ylabel('Number of records')

# Create legend & Show graphic
ax.legend()
plt.show()


Now let's separate our data into categories we can compare: Same Race Arrests and Frisks versus Different Race Arrests and Frisks.

In [None]:
# Count the 'Y' values of arrest_made and frisk_flag within each group.
# Comparing whole columns counts every row of each frame, regardless of
# whether `same` and `diff` have the same number of rows.
same_arrest = int((same['arrest_made'] == 'Y').sum())
same_frisk = int((same['frisk_flag'] == 'Y').sum())
print(f'Same Race: Arrests = {same_arrest}, Frisk Searches = {same_frisk}')

diff_arrest = int((diff['arrest_made'] == 'Y').sum())
diff_frisk = int((diff['frisk_flag'] == 'Y').sum())
print(f'Different Race: Arrests = {diff_arrest}, Frisk Searches = {diff_frisk}')

In [None]:
# Let's look at these in Ratios
arrest_same_ratio = same_arrest/len(same)
frisk_same_ratio = same_frisk/len(same)
arrest_diff_ratio = diff_arrest/len(diff)
frisk_diff_ratio = diff_frisk/len(diff)

In [None]:
# Pair each label with its same-race rate and hold the result as a one-row frame.
keys = ['Same Race: Arrests', 'Same Race: Frisk Searches']
vals = [arrest_same_ratio, frisk_same_ratio]

# zip() pairs labels and rates positionally without modifying either list.
same_race = dict(zip(keys, vals))

same_race_df = pd.DataFrame(same_race, index=[0])
same_race_df

In [None]:
# Pair each label with its different-race rate and hold the result as a one-row frame.
keys = ['Different Race: Arrests', 'Different Race: Frisk Searches']
vals = [arrest_diff_ratio, frisk_diff_ratio]

# zip() pairs labels and rates positionally without modifying either list.
# `diff_race` is bound once, directly to the DataFrame that later cells read.
diff_race = pd.DataFrame(dict(zip(keys, vals)), index=[0])
diff_race

Let us show these together in a graph

In [None]:
# Place the same-race and different-race rates next to each other in one row.
race_rel = pd.concat(
    objs=[same_race_df, diff_race],
    axis='columns',
)
race_rel

In [None]:
# Plot the values held in the single row of `race_rel`, one wedge per column.
rates = race_rel.iloc[0]

fig, ax = plt.subplots()
fig.set_size_inches(10, 10)

# NOTE: a pie shows shares of a whole, so each wedge (and its autopct figure)
# is that rate's share of all the rates combined, not the rate itself.
# Scaling explicitly keeps the result independent of pie()'s own
# normalisation default.
graph_stop_resolution = ax.pie(x=rates / rates.sum(),
                               labels=list(race_rel.columns),
                               autopct='%1.1f%%')

ax.set_title('Officer and Subject Treatment')

plt.show()