This notebook contains sample code to load the scraped data and create visualizations.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Race query codes (as strings) mapped to the race label used in file names and group keys.
race_q_dict = {
    '0': 'All Races',
    '1': 'White',
    '2': 'Black',
    '3': 'Am IndianAK Native',
    '4': 'AsianPac Islander',
}

In [None]:
# Ethnicity query codes (as strings) mapped to the ethnicity label used in file names and group keys.
eth_q_dict = {
    '0': 'All Ethnicity',
    '1': 'Non-Hispanic',
    '2': 'Hispanic',
    '3': 'Unknown',
}

In [None]:
# Sex query codes (as strings) mapped to the sex label used in file names and group keys.
sex_q_dict = {
    '0': 'Both Sexes',
    '1': 'Males',
    '2': 'Females',
}

In [None]:
# State selection label; it is embedded in the name of every CSV file loaded below.

state_name = 'All funded states (For years selected)'

In [None]:
def get_dict_key(eth_name, race_name, sex_name):
    """Return the group key "<ethnicity> <race> <sex>" (single-space separated)."""
    return ' '.join((eth_name, race_name, sex_name))

In [None]:
# Tables for the groups analysed in this notebook, keyed by "<ethnicity> <race> <sex>".
df_dict = {}

# Only these (ethnicity, race) pairs are charted. Every later cell assumes exactly
# these ten groups: `total_pop_counts` has one entry per group, the `races` tick
# labels list five ethnicity/race pairs in this order, and the per-sex lists are
# read as [0] = Males, [1] = Females. Loading the full cross-product of the query
# dictionaries (including 'All Ethnicity', 'Unknown' and 'Both Sexes') breaks all
# of those assumptions.
analysis_groups = [
    (eth_q_dict['1'], race_q_dict['1']),  # Non-Hispanic, White
    (eth_q_dict['1'], race_q_dict['2']),  # Non-Hispanic, Black
    (eth_q_dict['1'], race_q_dict['3']),  # Non-Hispanic, Am IndianAK Native
    (eth_q_dict['1'], race_q_dict['4']),  # Non-Hispanic, AsianPac Islander
    (eth_q_dict['2'], race_q_dict['0']),  # Hispanic, All Races
]
analysis_sexes = [sex_q_dict['1'], sex_q_dict['2']]  # Males first, then Females

for eth_name, race_name in analysis_groups:
    for sex_name in analysis_sexes:
        csv_name = (
            'data/firearmhomicide_' + state_name + '_' + eth_name + '_'
            + race_name + '_' + sex_name + '_.csv'
        )

        # Load the CSV as a pandas dataframe.
        df_dict[get_dict_key(eth_name, race_name, sex_name)] = pd.read_csv(csv_name)


In [None]:
# Show the group keys of the loaded tables (one key per CSV read above).
df_dict.keys()

In [None]:
# For each group, keep the 'data_1' value of the second row of its table.
# NOTE(review): presumably that row/column is the percentage of homicides that
# happened at home -- confirm against the CSV layout, since the row is picked by position.
at_home = {}

for name, df in df_dict.items():
    print(df['row_name'])
    at_home[name] = df.iloc[1]['data_1']


In [None]:
# Races, then genders

# https://stackoverflow.com/questions/14270391/python-matplotlib-multiple-bars

# Group the per-group values by "<ethnicity> <race>" (the key with its last word,
# the sex, removed); each list keeps the values in df_dict order.
x = {}

for r, value in at_home.items():
    print(r)
    race = r.rsplit(' ', 1)[0]
    x.setdefault(race, []).append(float(value))

In [None]:
# Split each group's list into one series per sex: position 0 is read as Male,
# position 1 as Female.
gender_x = {'Female': [], 'Male': []}

for per_sex_values in x.values():
    gender_x['Female'].append(per_sex_values[1])
    gender_x['Male'].append(per_sex_values[0])

In [None]:
# Tick labels for the five ethnicity/race groups, in plotting order.
races = [
    'Non-Hispanic \nWhite',
    'Non-Hispanic \nBlack',
    'Non-Hispanic \nAm Indian, AK Native',
    'Non-Hispanic \n Asian, Pac Islander',
    'Hispanic \nAll Races',
]

In [None]:
from matplotlib import pyplot as plt


def bar_plot(ax, data, title, colors=None, total_width=0.8, single_width=1, legend=True,
             tick_labels=None, label_offset=2):
    """Draws a bar plot with multiple bars per data point on the given axis.

    Every bar is annotated with its value, and the x ticks, tick labels and
    title are set on ``ax`` itself (not on whatever axis pyplot considers
    current).

    Parameters
    ----------
    ax : matplotlib.pyplot.axis
        The axis we want to draw our plot on.

    data: dictionary
        A dictionary containing the data we want to plot. Keys are the names of the
        data, the items is a list of the values.

        Example:
        data = {
            "x":[1,2,3],
            "y":[1,2,3],
            "z":[1,2,3],
        }

    title : str
        Title of the axis.

    colors : array-like, optional
        A list of colors which are used for the bars. If None, the colors
        will be the standard matplotlib color cycle. (default: None)

    total_width : float, optional, default: 0.8
        The width of a bar group. 0.8 means that 80% of the x-axis is covered
        by bars and 20% will be spaces between the bars.

    single_width: float, optional, default: 1
        The relative width of a single bar within a group. 1 means the bars
        will touch each other within a group, values less than 1 will make
        these bars thinner.

    legend: bool, optional, default: True
        If this is set to true, a legend will be added to the axis.

    tick_labels : list of str, optional
        One label per bar group, placed at x = 0, 1, 2, ... If None, the
        notebook-level ``races`` list is used. (default: None)

    label_offset : float, optional, default: 2
        Vertical distance, in data units, between the top of a bar and its
        value label.
    """

    # Check if colors were provided, otherwise use the default color cycle
    if colors is None:
        colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

    # Fall back to the notebook-level group labels
    if tick_labels is None:
        tick_labels = races

    # Number of bars per group
    n_bars = len(data)

    # Handles (and matching names) of the drawn series, used for the legend
    handles = []
    handle_names = []

    # Guard against empty data, which would otherwise divide by zero below
    if n_bars:
        # The width of a single bar
        bar_width = total_width / n_bars

        for i, (name, values) in enumerate(data.items()):
            # The offset in x direction of that series within each group
            x_offset = (i - n_bars / 2) * bar_width + bar_width / 2

            # Draw (and annotate) a bar for every value of that series
            bar = None
            for x, y in enumerate(values):
                bar = ax.bar(x + x_offset, y, width=bar_width * single_width, color=colors[i % len(colors)])
                ax.text(x + x_offset, y + label_offset, s=f'{y}', ha='center', va='bottom')

            # A series without values draws nothing and so has no legend handle
            if bar is not None:
                handles.append(bar[0])
                handle_names.append(name)

    # Draw legend if we need
    if legend and handles:
        ax.legend(handles, handle_names)

    # One tick per group label, set on the axis that was passed in
    ax.set_xticks(list(range(len(tick_labels))))
    ax.set_xticklabels(tick_labels, fontsize=10)
    ax.set_title(title)



In [None]:
# Grouped bar chart of the at-home values per group and sex, saved as a PNG.
fig, ax = plt.subplots(figsize=(10, 4))
bar_plot(
    ax,
    gender_x,
    'Percentage of firearm homicides that occur in the home',
    total_width=.8,
    single_width=.9,
)
fig.savefig('charts/percentage_firearmhomicides_home.png')
plt.show()



# Percentage of homicides involving a firearm

In [None]:
homicide_prop = {}
substr = 'Total number'

# For each group, calculate the proportion (0-1; it is scaled to a percentage later)
# of homicides that involved a firearm.
# NOTE(review): `all_df_dict` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. It presumably holds the all-homicide tables
# under the same keys as df_dict -- TODO confirm and add the cell that loads it.

for name, df in df_dict.items():
    
    # 'data_0' of the first row whose 'row_name' contains "Total number" (firearm table).
    homicide_count = df[df['row_name'].str.contains(substr)].reset_index().iloc[0]['data_0']
    print(df[df['row_name'].str.contains(substr)].reset_index())

    # Same lookup in the matching table from all_df_dict.
    all_df = all_df_dict[name]
    print(all_df[all_df['row_name'].str.contains(substr)].reset_index())
    total_death_count = all_df[all_df['row_name'].str.contains(substr)].reset_index().iloc[0]['data_0']
    
    
    # If either value contains '-' (presumably a placeholder for a missing or suppressed
    # count -- verify), record the proportion as 0 instead of failing in float().
    if '-' in str(homicide_count) or '-' in str(total_death_count):
        homicide_count = 0.
        total_death_count = 1.
    
    homicide_prop[name] = float(homicide_count) / float(total_death_count)

In [None]:
# Races, then genders

# https://stackoverflow.com/questions/14270391/python-matplotlib-multiple-bars

# Group the firearm proportions by "<ethnicity> <race>" (the key with its last word,
# the sex, removed); each list keeps the values in homicide_prop order.
x = {}

for r, proportion in homicide_prop.items():
    print(r)
    race = r.rsplit(' ', 1)[0]
    x.setdefault(race, []).append(float(proportion))

In [None]:
# One series per sex, converted from proportions to percentages with two decimals:
# position 0 of each group's list is read as Male, position 1 as Female.
gender_x = {'Female': [], 'Male': []}

for per_sex_values in x.values():
    gender_x['Female'].append(round(100 * per_sex_values[1], 2))
    gender_x['Male'].append(round(100 * per_sex_values[0], 2))

In [None]:
# Tick labels for the five ethnicity/race groups, in plotting order.
races = [
    'Non-Hispanic \nWhite',
    'Non-Hispanic \nBlack',
    'Non-Hispanic \nAm Indian, AK Native',
    'Non-Hispanic \n Asian, Pac Islander',
    'Hispanic \nAll Races',
]

In [None]:
# Grouped bar chart of the firearm percentages per group and sex, saved as a PNG.
fig, ax = plt.subplots(figsize=(10, 4))
bar_plot(
    ax,
    gender_x,
    'Percentage of homicides that involved a firearm',
    total_width=.8,
    single_width=.9,
)
fig.savefig('charts/percentage_homicides_firearm.png')
plt.show()


# Count the population of each sex, race, and ethnicity in the included states.

In [None]:
# Markup of the state <select> element, one <option> per state with the years it
# covers (it appears to be copied from the scraped query form). Written as adjacent
# string literals, which Python joins into a single string with no line breaks; a
# raw line break inside a single-quoted literal is a syntax error.
# Note: value "36" (New York) is listed twice.
state_html_str = (
    '<select multiple="selected" name="q6s3" size="4" onchange="q6s3SelectBox(document.nvdrsForm,this)" id="myStateList">'
    '<option value="0">All funded states (For years selected)</option>'
    '<option value="01">Alabama ( 2018 )</option>'
    '<option value="02">Alaska ( 2003 - 2018 )</option>'
    '<option value="04">Arizona ( 2015 - 2018 )</option>'
    '<option value="06">California ( 2017 - 2018 )</option>'
    '<option value="08">Colorado ( 2004 - 2018 )</option>'
    '<option value="09">Connecticut ( 2015 - 2018 )</option>'
    '<option value="10">Delaware ( 2017 - 2018 )</option>'
    '<option value="11">District of Columbia ( 2017 - 2018 )</option>'
    '<option value="13">Georgia ( 2004 - 2018 )</option>'
    '<option value="15">Hawaii ( 2015 - 2016 )</option>'
    '<option value="17">Illinois ( 2016 - 2018 )</option>'
    '<option value="18">Indiana ( 2016 - 2018 )</option>'
    '<option value="19">Iowa ( 2016 - 2018 )</option>'
    '<option value="20">Kansas ( 2015 - 2018 )</option>'
    '<option value="21">Kentucky ( 2005 - 2018 )</option>'
    '<option value="22">Louisiana ( 2018 )</option>'
    '<option value="23">Maine ( 2015 - 2018 )</option>'
    '<option value="24">Maryland ( 2003 - 2018 )</option>'
    '<option value="25">Massachusetts ( 2003 - 2018 )</option>'
    '<option value="26">Michigan ( 2014 - 2018 )</option>'
    '<option value="27">Minnesota ( 2015 - 2018 )</option>'
    '<option value="29">Missouri ( 2018 )</option>'
    '<option value="31">Nebraska ( 2018 )</option>'
    '<option value="32">Nevada ( 2017 - 2018 )</option>'
    '<option value="33">New Hampshire ( 2015 - 2018 )</option>'
    '<option value="34">New Jersey ( 2003 - 2018 )</option>'
    '<option value="35">New Mexico ( 2005 - 2018 )</option>'
    '<option value="36">New York ( 2015 - 2018 )</option>'
    '<option value="37">North Carolina ( 2004 - 2018 )</option>'
    '<option value="39">Ohio ( 2011 - 2018 )</option>'
    '<option value="36">New York ( 2015 - 2018 )</option>'
    '<option value="40">Oklahoma ( 2004 - 2018 )</option>'
    '<option value="41">Oregon ( 2003 - 2018 )</option>'
    '<option value="42">Pennsylvania ( 2016 - 2018 )</option>'
    '<option value="44">Rhode Island ( 2004 - 2018 )</option>'
    '<option value="45">South Carolina ( 2003 - 2018 )</option>'
    '<option value="49">Utah ( 2005 - 2018 )</option>'
    '<option value="50">Vermont ( 2015 - 2018 )</option>'
    '<option value="51">Virginia ( 2003 - 2018 )</option>'
    '<option value="53">Washington ( 2016 - 2018 )</option>'
    '<option value="54">West Virginia ( 2017 - 2018 )</option>'
    '<option value="55">Wisconsin ( 2004 - 2018 )</option>'
    '</select>'
)

In [None]:
# Transform the state HTML string into a dictionary of {option value: state name},
# keeping only the options whose text mentions 2018.

state_q_name = 'q6s3'
value_substr = 'value="'

state_q_dict = {}

# Splitting on the word 'option' leaves one fragment per <option ...> opening tag
# (plus fragments without a value attribute, which are skipped).
for option in state_html_str.split('option'):
    if value_substr not in option:
        continue

    # The value attribute is the key.
    d_key = option.split(value_substr)[1].split('"')[0]

    # The text between '>' and the next '<' is the option label.
    d_val = option.split('<')[0].split('>')[1]

    if '2018' in d_val:
        # Drop the "( years )" suffix and the single space that precedes it.
        state_q_dict[d_key] = d_val.split('(')[0][:-1]

In [None]:
# Load the census table from the local data directory.
census_df = pd.read_csv('data/sc-est2019-alldata6.csv')

In [None]:
# Columns of census_df that are kept for the population counts.
cols = ['NAME', 'SEX', 'RACE', 'ORIGIN', 'POPESTIMATE2018', 'AGE']

In [None]:
# Only include states that were included in 2018.

In [None]:
# State names collected in state_q_dict (the options whose text mentions 2018).
states_to_keep = list(state_q_dict.values())

In [None]:
# The ten "<ethnicity> <race> <sex>" group names used as population-count keys.
race_eths = [
    'Non-Hispanic White Males',
    'Non-Hispanic White Females',
    'Non-Hispanic Black Males',
    'Non-Hispanic Black Females',
    'Non-Hispanic Am IndianAK Native Males',
    'Non-Hispanic Am IndianAK Native Females',
    'Non-Hispanic AsianPac Islander Males',
    'Non-Hispanic AsianPac Islander Females',
    'Hispanic All Races Males',
    'Hispanic All Races Females',
]

# Filled in further down with one population total per group.
total_pop_counts = dict()

In [None]:
# Restrict census_df to the columns listed in cols.
census_df = census_df[cols]

In [None]:
# Only keep the census rows whose NAME is in states_to_keep.

census_df = census_df[census_df['NAME'].isin(states_to_keep)]

In [None]:
# Sanity check: number of distinct NAME values left after the filter, shown as a 1-tuple.
census_df['NAME'].value_counts().shape

In [None]:
def return_pop_count(race_num, eth_num, sex_num):
    """Sum POPESTIMATE2018 over the census_df rows that match one group.

    Parameters
    ----------
    race_num : value compared against the RACE column.
    eth_num : value compared against the ORIGIN column.
    sex_num : value compared against the SEX column.

    Returns
    -------
    The total of POPESTIMATE2018 over the rows matching all three codes
    (0 when no row matches).
    """
    matches = (
        (census_df['RACE'] == race_num)
        & (census_df['ORIGIN'] == eth_num)
        & (census_df['SEX'] == sex_num)
    )
    return np.sum(census_df.loc[matches, 'POPESTIMATE2018'])

The key for SEX is as follows:
0 = Total
1 = Male
2 = Female

The key for ORIGIN is as follows:
0 = Total
1 = Not Hispanic
2 = Hispanic


The key for RACE is as follows:
1 = White Alone
2 = Black or African American Alone
3 = American Indian or Alaska Native Alone
4 = Asian Alone
5 = Native Hawaiian and Other Pacific Islander Alone
6 = Two or more races

race_eths = ['Non-Hispanic White Males', 'Non-Hispanic White Females', 'Non-Hispanic Black Males', 'Non-Hispanic Black Females', 'Non-Hispanic Am IndianAK Native Males', 'Non-Hispanic Am IndianAK Native Females', 'Non-Hispanic AsianPac Islander Males', 'Non-Hispanic AsianPac Islander Females', 'Hispanic All Races Males', 'Hispanic All Races Females']


In [None]:
# Add across states, for each race.

# (group label, census RACE codes summed into the group, census ORIGIN code).
# Asian (4) and Pacific Islander (5) are combined; Hispanic covers every race code 1-6.
pop_groups = [
    ('Non-Hispanic White', [1], 1),
    ('Non-Hispanic Black', [2], 1),
    ('Non-Hispanic Am IndianAK Native', [3], 1),
    ('Non-Hispanic AsianPac Islander', [4, 5], 1),
    ('Hispanic All Races', list(range(1, 7)), 2),
]

for group_label, race_codes, origin_code in pop_groups:
    for sex_label, sex_code in (('Males', 1), ('Females', 2)):
        total_pop_counts[group_label + ' ' + sex_label] = np.sum(
            [return_pop_count(race_code, origin_code, sex_code) for race_code in race_codes]
        )


In [None]:
import pickle

In [None]:
# Save the population totals; the context manager closes the file, so the
# pickle is fully written to disk before the next cell reads it back.
with open("data/pop_counts.p", "wb") as pop_counts_file:
    pickle.dump(total_pop_counts, pop_counts_file)

# Vulnerability to firearm death in home

In [None]:
import pickle

In [None]:
# Reload the population totals written earlier in this notebook; the context
# manager closes the file handle once the data is read.
with open("data/pop_counts.p", "rb") as pop_counts_file:
    total_pop_counts = pickle.load(pop_counts_file)

In [None]:
# For each group: 'data_0' of the second table row divided by the group's population.
# NOTE(review): presumably that cell is the number of firearm homicides in the home --
# confirm against the CSV layout, since the row is picked by position.
home_prop = {}
substr = 'Total number'

for name, df in df_dict.items():
    home_count = df.iloc[1]['data_0']
    group_population = total_pop_counts[name]
    home_prop[name] = float(home_count) / group_population

In [None]:
# Display the per-group ratios computed above.
home_prop

In [None]:
# Group the ratios by "<ethnicity> <race>" (the key with its last word, the sex,
# removed); each list keeps the values in home_prop order.
x = {}

for r, ratio in home_prop.items():
    print(r)
    race = r.rsplit(' ', 1)[0]
    x.setdefault(race, []).append(float(ratio))

In [None]:
# Rebuild the per-sex series from this section's `x`. Without this, `gender_x`
# still holds the firearm percentages of the previous section and the chart would
# show them under the wrong title. Position 0 of each list is Male, position 1 is
# Female; values are scaled from deaths per person to deaths per 1,000, as stated
# in the title.
gender_x = {
    'Female': [round(1000 * rates[1], 3) for rates in x.values()],
    'Male': [round(1000 * rates[0], 3) for rates in x.values()],
}

fig, ax = plt.subplots(figsize=(10,4))
bar_plot(ax, gender_x, 'Vulnerability to firearm homicide in the home, number of deaths per 1,000', total_width=.8, single_width=.9)
plt.savefig('charts/vul_home_homicide_firearm.png')
plt.show()