In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline

%matplotlib notebook



Pros/Cons:

![image.png](attachment:image.png)

In [2]:
#load shooting data
median_household_income = pd.read_csv('MedianHouseholdIncome2015.csv', encoding="windows-1252")
police_killings = pd.read_csv('PoliceKillingsUS.csv', encoding="windows-1252")
city_population_2015 = pd.read_csv('city_populations_2015.csv')

#Group killings by city and add count (death_count)
df_killings_grouped_city_state = police_killings.groupby(['city', 'state']).size().reset_index(name='death_count').sort_values(['death_count'], ascending=[False])

#Replace the word ' city' with empty string since each city contains this string at the end
median_household_income['city'] = median_household_income['city'].str.replace(' city','')

#Join grouped death count by city with median income by city, and then with city population
df_killings_grouped_city_state_median = pd.merge(df_killings_grouped_city_state, median_household_income, on=['city','state'], how='outer')
df_killings_grouped_city_state_median_population = pd.merge(df_killings_grouped_city_state_median, city_population_2015, on=['city','state'], how='outer')

#Where cities aren't found, drop the record
df_killings_grouped_city_state_median_population = df_killings_grouped_city_state_median_population.dropna(subset=['median_income'])
df_killings_grouped_city_state_median_population = df_killings_grouped_city_state_median_population.dropna(subset=['pop'])
df_killings_grouped_city_state_median_population = df_killings_grouped_city_state_median_population.dropna(subset=['death_count'])

#median income = 48617.500000
df_killings_grouped_city_state_median_population['income_level'] = 'other'
df_killings_grouped_city_state_median_population.loc[df_killings_grouped_city_state_median_population['median_income'] <= 48617.500000, 'income_level'] = 'low'
df_killings_grouped_city_state_median_population.loc[df_killings_grouped_city_state_median_population['median_income'] > 48617.500000, 'income_level'] = 'high'
df_killings_grouped_city_state_median_population['death_count_per_million_per_year'] = ((df_killings_grouped_city_state_median_population['death_count']/df_killings_grouped_city_state_median_population['pop'])*1000000)/3


#median population = 1.051500e+05
df_killings_grouped_city_state_median_population['population_size'] = 'other'
df_killings_grouped_city_state_median_population.loc[df_killings_grouped_city_state_median_population['pop'] <= 1.051500e+05, 'population_size'] = 'low'
df_killings_grouped_city_state_median_population.loc[df_killings_grouped_city_state_median_population['pop'] > 1.051500e+05, 'population_size'] = 'high'


#median death count per million per year = 5.021565
df_killings_grouped_city_state_median_population['death_count'] = 'other'
df_killings_grouped_city_state_median_population.loc[df_killings_grouped_city_state_median_population['death_count_per_million_per_year'] <= 5.021565, 'death_count'] = 'low'
df_killings_grouped_city_state_median_population.loc[df_killings_grouped_city_state_median_population['death_count_per_million_per_year'] > 5.021565, 'death_count'] = 'high'

#Create dataframe for top 15 cities with most police fatalities using normalized data
df_top_normalized = df_killings_grouped_city_state_median_population.sort_values(['death_count_per_million_per_year'], ascending=[False]).head(15)


#Determine median income level to assign low/high income
df_killings_grouped_city_state_median_population.describe()
df_killings_grouped_city_state_median_population.head()


Unnamed: 0,city,state,death_count,median_income,pop,income_level,death_count_per_million_per_year,population_size
0,Los Angeles,CA,low,50205.0,3971883.0,high,3.273007,high
1,Phoenix,AZ,high,47326.0,1563025.0,low,6.611112,high
2,Houston,TX,low,46187.0,2296224.0,low,3.774312,high
3,Chicago,IL,low,48522.0,2720546.0,low,3.063111,high
4,Las Vegas,NV,high,50202.0,623747.0,high,11.222499,high


In [67]:
#Question 1:  Choose one variable and plot that variable four different ways.
#top 15 counts
#plot 1
plt.figure(figsize=(10, 5))
sns.barplot(x="city", y="death_count_per_million_per_year", data=df_top_normalized)
plt.title('Plot 1:  Barplot - Top 15 Police Fatality Count Cities in the United States annually based on data from 2015 - 2017')
plt.xticks(rotation=90)
sns.set(style="darkgrid")

g = sns.factorplot(x="gender", y="age", data=police_killings,
                   size=6, kind="bar", palette="pastel", ci=95)
g.despine(left=True)
g.set_ylabels("Age")
g.set_xlabels("")
plt.title('Plot 1A:  Barplot - Age by Gender')
plt.show()



#plot 2
fig, ax = plt.subplots()
fig.set_size_inches(10,5)
plt.hist(df_killings_grouped_city_state_median_population['death_count_per_million_per_year'], color='red',label='Death coiunt')  # alpha just controls the opacity
plt.xlabel('Death count per million per year')
plt.legend(loc='upper right')
plt.title('Plot 2: Histogram - Death count per million per year')
plt.show()

#plot 3
# Setting the overall aesthetic.
fig, ax = plt.subplots()
fig.set_size_inches(10,5)
sns.set(style="whitegrid")
ax = sns.boxplot(x='death_count_per_million_per_year',data=df_killings_grouped_city_state_median_population,palette='pastel')  
plt.title('Plot 3: Boxplot - Death count per million per year')
sns.despine(offset=10, trim=True)
ax.set(xlabel='Death count per million per year')
plt.show()

#plot 4
# Setting the overall aesthetic.
sns.set(style="whitegrid")
g = sns.factorplot(y="death_count_per_million_per_year", data=df_killings_grouped_city_state_median_population,
                   size=6, kind="point", palette="pastel",ci=95,dodge=True,join=False)
#g.despine(left=True)
g.set(xlabel='Death count per million per year')
plt.title('Plot 4:  Pointplot - Death count per million per year')
plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
df_killings_grouped_city_state_median_population.head()

Unnamed: 0,city,state,death_count,median_income,pop,income_level,death_count_per_million_per_year,population_size
0,Los Angeles,CA,low,50205.0,3971883.0,high,3.273007,high
1,Phoenix,AZ,high,47326.0,1563025.0,low,6.611112,high
2,Houston,TX,low,46187.0,2296224.0,low,3.774312,high
3,Chicago,IL,low,48522.0,2720546.0,low,3.063111,high
4,Las Vegas,NV,high,50202.0,623747.0,high,11.222499,high


In [5]:
#variable = age of men that were shot
menshotage = police_killings.loc[(police_killings['gender']=='M')&(police_killings['manner_of_death']=='shot'),'age']
plt.figure(figsize=(10, 5))

#Plot 1
plt.hist(menshotage.dropna(), color='red',label='Men shot')  # alpha just controls the opacity
plt.xlabel('Age')
plt.legend(loc='upper right')
plt.title('Plot 1: Age of Men Shot by Police')
plt.show()

#Plot 2
# Showing the  information in a seaborn facet grid.
sns.set(style="ticks")  #Setting the overall aesthetic


# Tell seaborn about the structure of our data.
ax = sns.FacetGrid(police_killings, row="gender", col="manner_of_death", size=5)
# Name the plot type and the variable to be plotted using the structure.
ax.map(plt.hist, "age", color="steelblue",  lw=0)
# Moving the plots apart to make room for our titles.
plt.subplots_adjust(top=0.9)
# Making a more informative axis name.
ax.set_axis_labels('Age')
plt.suptitle('Plot 2: Age by gender and manner of shooting')
# Removing excess lines around the plot.
sns.despine(trim=True)
plt.show()

#Plot 3
# Comparing groups using boxplots.
fig, ax = plt.subplots()
fig.set_size_inches(10,5)
ax = sns.boxplot(x='manner_of_death',y='age',hue='gender',data=police_killings)  
plt.title('Plot 3: Age by Gender and Manner of death')
sns.despine(offset=10, trim=True)
ax.set(xlabel='', ylabel='Age')
plt.show()

#Plot 4
# Setting the overall aesthetic.
fig, ax = plt.subplots()
fig.set_size_inches(10,5)
sns.set(style="whitegrid")
ax = sns.boxplot(x='gender',y='age',hue='manner_of_death',data=police_killings,palette='pastel')  
plt.title('Plot 4: Age by Gender and Manner of death')
sns.despine(offset=10, trim=True)
ax.set(xlabel='', ylabel='Age')
plt.show()

#Plot 5
# Setting the overall aesthetic.
sns.set(style="darkgrid")

g = sns.factorplot(x="gender", y="age", hue="manner_of_death", data=police_killings,
                   size=6, kind="bar", palette="pastel", ci=95)
g.despine(left=True)
g.set_ylabels("Age")
g.set_xlabels("")
plt.title('Plot 5:  Barplot - Age by Gender and Manner of death')
plt.show()


#Plot 6
# Setting the overall aesthetic.
sns.set(style="whitegrid")

g = sns.factorplot(x="gender", y="age", hue="manner_of_death", data=police_killings,
                   size=6, kind="point", palette="pastel",ci=95,dodge=True,join=False)
g.despine(left=True)
g.set_ylabels("Age")
g.set_xlabels("")
plt.title('Plot 6:  Pointplot - Age by Gender and Manner of death')
plt.show()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [6]:
g = sns.lmplot(y='death_count_per_million_per_year', # Variable 1.
               x='median_income', # Variable 2.
               data=df_killings_grouped_city_state_median_population, # Data
               fit_reg=False, # If set to true, pldf_killings_grouped_city_state_median_populationots a regression line.
               scatter_kws={'alpha':0.4}) # Set points to semi-transparent to see overlaping points.
g.set_ylabels("Deaths per year")
g.set_xlabels("City Median income")
plt.title('Scatterplot: Death count by median income')
plt.show()

g = sns.lmplot(y='death_count_per_million_per_year', 
               x='median_income',
               data=df_killings_grouped_city_state_median_population, # Data.
               fit_reg=True, # The regression line also includes a 95% confidence envelope.
               scatter_kws={'alpha':0.4})
g.set_ylabels("Deaths per year")
g.set_xlabels("City Median income")
plt.title('Scatterplot with regression line: Death count by median income')
plt.show()


#size marker for scatter plot
z= df_killings_grouped_city_state_median_population['pop']
#s = lambda z : (((z-z.min())/float(z.max()-z.min())+1)*8)**2
s = [(((x-z.min())/float(z.max()-z.min())+1)*8)**2 for x in z]

g = sns.lmplot(y='death_count_per_million_per_year', # Variable 1.
               x='median_income', # Variable 2.
               data=df_killings_grouped_city_state_median_population, # Data
               fit_reg=False, # If set to true, pldf_killings_grouped_city_state_median_populationots a regression line.
               scatter_kws={'s':s}) # Set points to semi-transparent to see overlaping points.
g.set_ylabels("Deaths per year")
g.set_xlabels("City Median income")
plt.title('Scatterplot using markers with weights: Death count by median income')
plt.show()






<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:

sample_string = '.78'
print(sample_string.isdigit())
print(sample_string.isalpha())
print(sample_string.isspace())
print(sample_string.isalnum())
print(sample_string.isnumeric())

# Experiment on the sample string below using the other methods listed above.
# What does each do? When would they be useful? What hapens when you try them
# on different slices of the sample string, like sample_string[:4]?


False
False
False
False
False


In [23]:
# Create a series of dirty, annoying values.
money = pd.Series([400, 111, '$20', 57, 'Lots'])

# Running `money.isdigit()` throws an error because .isdigit() is a string
# attribute, _not_ a series attribute. Uncomment the line below to see.

# print(money.isdigit())

# Instead, let's define a new function that takes a string as an argument
# and returns True if the string is all digits, otherwise False.

def is_a_string(x):
    # First make sure we're operating on a string, then use our string method.
    return str(x).isdigit()

# Now let's apply our custom function to each element in our series.
c = money.apply(is_a_string)
money.head()

0     400
1     111
2     $20
3      57
4    Lots
dtype: object

In [24]:
# Dirty list
money = pd.Series([400, 111, '$20', 57, 'Lots'])

# Here's a lambda function that mirrors the is_a_digit function above.
# Read this print statement carefully and compare to the previous one.
print(money.apply(lambda x: str(x).isdigit()))

0     True
1     True
2    False
3     True
4    False
dtype: bool


In [33]:
# Dirty list
money = pd.Series([400, 111, '$20', 57, 'Lots'])

# Here's a lambda function that mirrors the is_a_digit function above.
# Read this print statement carefully and compare to the previous one.
print(filter([str(x).isdigit() for x in money],money))
print([str(x).isdigit() for x in money])
#c= filter([str(x).isdigit() for x in money],money)
#c

<filter object at 0x0DD54290>
[True, True, False, True, False]


In [26]:
# We're using list() on the result because filter() returns an iterator.

print('Filtering the whole series:')
print(list(filter(lambda x: str(x).isdigit(), money)))

print('\nApplying filter() to each value in the series:')
print(money.apply(lambda x: ''.join(list(filter(str.isdigit, str(x))))))

Filtering the whole series:
[400, 111, 57]

Applying filter() to each value in the series:
0    400
1    111
2     20
3     57
4       
dtype: object


In [64]:
# Create a series of dirty, annoying strings.
words = pd.Series([
    'MollymaLone$molmal@gmail.com',
    'JeffreyJones$jefjo@hotmail.com',
    'DeadParrot$fjords@gmail.com'
])

# Split on '$'. We'll use the Pandas split method.
word_split = words.str.split('$', expand=True)
names = word_split[0]
emails = word_split[1]
print(names, '\n')
print(emails)

0     MollymaLone
1    JeffreyJones
2      DeadParrot
Name: 0, dtype: object 

0     molmal@gmail.com
1    jefjo@hotmail.com
2     fjords@gmail.com
Name: 1, dtype: object


In [65]:
# Splitting on capital letters.
# Just because we can doesn't mean we should:
print(names.str.split('[A-Z]', expand=True))

  0       1      2
0    ollyma    one
1    effrey   ones
2       ead  arrot


In [66]:
import re

# We expect the first name to follow the first capital letter.
firstname = names.apply(lambda x: re.findall('[A-Z][a-z]*', x)[0])

# We expect the last name to follow the second capital letter.
lastname = names.apply(lambda x: re.findall('[A-Z][a-z]*', x)[1])

print(firstname, '\n')
print(lastname)

0    Mollyma
1    Jeffrey
2       Dead
Name: 0, dtype: object 

0      Lone
1     Jones
2    Parrot
Name: 0, dtype: object


In [68]:
np.corrcoef(df_killings_grouped_city_state_median_population['median_income'], df_killings_grouped_city_state_median_population['death_count_per_million_per_year'])

array([[ 1.        , -0.18256621],
       [-0.18256621,  1.        ]])