In [1]:
import pandas as pd
import numpy as np
import random
import calendar
import math
from vega_datasets import data
from scipy.stats import skew
import missingno as msno
from datetime import datetime as dt
from dateutil import rrule
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import altair as alt
alt.renderers.enable('notebook')

ModuleNotFoundError: No module named 'chardet'

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings
# that could point at real problems further down the notebook.
warnings.filterwarnings("ignore")

In [None]:
# Widen pandas' display limits so the transposed summary tables below are shown in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

In [None]:
# Read the three tables used throughout the notebook; each file carries an
# "Unnamed: 0" column that is discarded straight after loading.
incidents = pd.read_csv("incidents_vis.csv").drop(columns="Unnamed: 0")
participants = pd.read_csv("participants_vis.csv").drop(columns="Unnamed: 0")
guns = pd.read_csv("guns_vis.csv").drop(columns="Unnamed: 0")

In [None]:
# Parse the date column once, then derive the calendar year with the
# vectorised .dt accessor rather than a per-row Python lambda.
incidents["date"] = pd.to_datetime(incidents["date"])
incidents["year"] = incidents["date"].dt.year

In [None]:
# (rows, columns) of the incidents table.
incidents.shape

In [None]:
# (rows, columns) of the participants table.
participants.shape

In [None]:
# (rows, columns) of the guns table.
guns.shape

# Data Exploration

The number of casualties ranges from 0 to 59 for deaths and from 0 to 441 for injuries. Both values show a relatively high positive skewness ($\gamma_1$ = 11 and $\gamma_1$ = 230 respectively), revealing that incidents with few casualties are predominant in both cases. This can also be seen in the table below, where the upper quartile (75th percentile) of **n_killed** is 0, and that of **n_injured** is 1. A plot of the two values needs to be constructed to better assess the distribution of these two variables. The dataset description also shows that incidents may involve from 1 to 400 guns, although the analysis suggests that the number of guns involved is highly skewed in favour of low numbers of guns ($\gamma_1$ = 51). As depicted in the description table, the upper quartile is 1, meaning that at least 75% of the incidents recorded involved only one gun. The true distribution of the data will be visualised later in this notebook.

In [None]:
# Summary statistics of the three count columns, transposed so each variable is a row.
incidents[["n_killed", "n_injured", "n_guns_involved"]].describe(include='all').T

In [None]:
# Summary of the object-typed participant columns (identifier excluded).
participants.drop("incident_id", axis=1).describe(include=object).T

In [None]:
# Summary of the float-typed participant columns (identifier excluded).
participants.drop("incident_id", axis=1).describe(include=float).T

In [None]:
# Skewness of the numeric incident columns (identifier and coordinates excluded).
# numeric_only=True stops date/text columns from raising on pandas >= 2.0, where
# non-numeric columns are no longer dropped silently.
pd.DataFrame(incidents.drop(["incident_id", "latitude", "longitude"], axis=1).skew(numeric_only=True), columns=["Skewness"]).T

In [None]:
# Kurtosis of the numeric incident columns (identifier and coordinates excluded).
# numeric_only=True stops date/text columns from raising on pandas >= 2.0, where
# non-numeric columns are no longer dropped silently.
pd.DataFrame(incidents.drop(["incident_id", "latitude", "longitude"], axis=1).kurt(numeric_only=True), columns=["Kurtosis"]).T

In [None]:
# Skewness of the numeric participant columns (identifier excluded).
# numeric_only=True stops text columns from raising on pandas >= 2.0.
pd.DataFrame(participants.drop("incident_id", axis=1).skew(numeric_only=True), columns=["Skewness"]).T

In [None]:
# Kurtosis of the numeric participant columns (identifier excluded).
# numeric_only=True stops text columns from raising on pandas >= 2.0.
pd.DataFrame(participants.drop("incident_id", axis=1).kurt(numeric_only=True), columns=["Kurtosis"]).T

## Missing data inspection

The data presents a varying number of missing values for each feature, most notably for **participant_relationship** and **location_description**, where 93.4% and 82.4% of the values are missing respectively. The other features have 51% or less missing fields. 8 of the features, on the other hand, contain no missing values, 5 of which are critical for analysis: **date**, **n_killed**, **n_injured**, **state**, and **city_or_county**. Three other features have an insignificant number of missing instances (<25%), including **incidents_characteristics**. Despite 6.88% of the address fields being empty, only 3.31% of latitude/longitude information is unrecorded, suggesting that an accurate estimation of addresses could be extracted from the coordinates. Lastly, despite being absent for more than 41% of the instances, information about guns' origin, type, and number of guns involved seems to be available for later incidents.

In terms of uniqueness, what stands out is that only 96.65% of the names are unique, meaning that some participants could potentially be involved in more than one incident. On top of that, only 88.73% of the crimes have a unique address, which suggests that some locations might have a history of gun violence incidents that is worth analysing. Moreover, the statistics suggest that the recorded incidents happened in 12,898 distinct cities or counties.

In [None]:
def missing_values_table(df):
    """Summarise missing data per column of ``df``.

    Returns a frame with two rows, 'Missing' and 'Percentage', and one column
    per column of ``df``: the number of null cells (formatted as a string with
    thousands separators) and the share of null cells in percent, rounded to
    two decimals. Columns follow the descending order of the null counts.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    n_missing = null_counts.sort_values(ascending=False)
    pct_missing = (null_counts / null_mask.count() * 100).round(2).sort_values(ascending=False)
    summary = pd.concat([n_missing, pct_missing], axis=1, keys=['Missing', 'Percentage'])
    # Format the counts only after concatenation so the sort above stays numeric.
    summary['Missing'] = summary['Missing'].map(lambda n: '{:,.0f}'.format(n))
    return summary.T

In [None]:
# Missing-value counts and percentages per incidents column.
missing_values_table(incidents)

In [None]:
# Missing-value counts and percentages per guns column.
missing_values_table(guns)

In [None]:
# Missing-value counts and percentages per participants column.
missing_values_table(participants)

In [None]:
# missingno matrix plot of the incidents table.
msno.matrix(incidents)

In [None]:
# missingno matrix plot of the participants table.
msno.matrix(participants)

In [None]:
def unique_values_table(df):
    """Summarise distinct values per column of ``df``.

    Returns a frame with two rows, 'Unique' and 'Percentage', and one column
    per column of ``df``: the number of distinct non-null values (formatted as
    a string with thousands separators) and that number as a percentage of the
    non-null cells, rounded to two decimals. Columns follow the descending
    order of the distinct counts.
    """
    distinct_counts = df.nunique()
    n_unique = distinct_counts.sort_values(ascending=False)
    pct_unique = (distinct_counts / df.count() * 100).round(2).sort_values(ascending=False)
    summary = pd.concat([n_unique, pct_unique], axis=1, keys=['Unique', 'Percentage'])
    # Format the counts only after concatenation so the sort above stays numeric.
    summary['Unique'] = summary['Unique'].map(lambda n: '{:,.0f}'.format(n))
    return summary.T

In [None]:
# Distinct-value counts and percentages per incidents column.
unique_values_table(incidents)

In [None]:
# Spearman's rank correlation between the number of deaths and the number of injuries per incident
incidents[["n_killed", "n_injured"]].corr(method="spearman")

In [None]:
# Calendar days between 2014-01-01 and 2018-03-31 for which no incident is recorded
pd.date_range(start = '2014-01-01', end = '2018-03-31' ).difference(incidents.date)

# Data Visualisation

In [None]:
# Frequency table: number of incidents recorded for each death toll.
incidents_per_death_toll = (
    incidents.groupby("n_killed")["date"].count()
    .reset_index()
    .rename(columns={"date": "count"})
)

# Scatter of incident counts against death toll (symlog x axis, log y axis).
alt.Chart(incidents_per_death_toll).mark_point(size=40).encode(
    x=alt.X("n_killed:Q", scale=alt.Scale(type="symlog"), title="Number of deaths"),
    y=alt.Y("count:Q", scale=alt.Scale(type="log"), title="Number of incidents"),
    tooltip=[
        alt.Tooltip('n_killed:Q', title="Number of deaths"),
        alt.Tooltip('count:Q', title="Number of incidents"),
    ],
).interactive().properties(width=800, height=400, title='Number of incidents by total deaths')

In [None]:
# Frequency table: number of incidents recorded for each injury count.
incidents_per_injury_count = (
    incidents.groupby("n_injured")["date"].count()
    .reset_index()
    .rename(columns={"date": "count"})
)

# Scatter of incident counts against injuries (symlog x axis, log y axis).
# The title now names injuries, which is what the x axis actually shows.
alt.Chart(incidents_per_injury_count).mark_point(size=40).encode(
    x=alt.X("n_injured:Q", scale=alt.Scale(type="symlog"), title="Number of injuries"),
    y=alt.Y("count:Q", scale=alt.Scale(type="log"), title="Number of incidents"),
    tooltip=[
        alt.Tooltip('n_injured:Q', title="Number of injuries"),
        alt.Tooltip('count:Q', title="Number of incidents"),
    ],
).interactive().properties(width=800, height=400, title='Number of incidents by total injuries')

In [None]:
# Frequency table: number of incidents recorded for each gun count.
incidents_per_gun_count = (
    incidents.groupby("n_guns_involved")["date"].count()
    .reset_index()
    .rename(columns={"date": "count"})
)

# Scatter of incident counts against guns involved (symlog x axis, log y axis).
alt.Chart(incidents_per_gun_count).mark_point(size=40).encode(
    x=alt.X("n_guns_involved", scale=alt.Scale(type="symlog"), title="Number of guns involved"),
    y=alt.Y("count", scale=alt.Scale(type="log"), title="Number of incidents"),
    tooltip=[
        alt.Tooltip('n_guns_involved:Q', title="Number of guns"),
        alt.Tooltip('count:Q', title="Number of incidents"),
    ],
).interactive().properties(width=800, height=400, title='Number of incidents by total guns involved')

In [None]:
def _flag_totals_by_gun_type(flag):
    """Sum casualties and the incident column ``flag`` over gun records, per gun type."""
    wanted = ["n_killed", "n_injured", flag]
    joined = guns.set_index('incident_id').join(incidents.set_index('incident_id')[wanted])
    # Select the needed columns before aggregating so no other joined column is
    # summed (pandas >= 2.0 no longer drops non-numeric columns silently).
    return joined.groupby("type")[wanted].sum().reset_index()


def _top_gun_types_chart(table, field, title, y_title=""):
    """Bar chart of the five gun types with the largest ``field`` in ``table``."""
    return alt.Chart(table.nlargest(5, field)).mark_bar(width=8).encode(
        alt.X("type:N", sort=alt.EncodingSortField(field=field, order='descending'), title="Gun type"),
        alt.Y(field + ":Q", scale=alt.Scale(type="linear"), title=y_title),
        alt.Tooltip(field + ":Q")
    ).interactive().properties(width=90, height=350, title=title)


# Rows of the guns table per gun type (non-null incident_id values).
gun_records_by_type = guns.groupby("type")["incident_id"].count().reset_index().rename(columns={"incident_id": "n_incidents"})

normal = _top_gun_types_chart(gun_records_by_type, "n_incidents", "Incidents", y_title="Number of incidents")
mass = _top_gun_types_chart(_flag_totals_by_gun_type("mass_shooting"), "mass_shooting", "Mass Shootings")
school = _top_gun_types_chart(_flag_totals_by_gun_type("school_shooting"), "school_shooting", "School Shootings")
defensive = _top_gun_types_chart(_flag_totals_by_gun_type("defensive"), "defensive", "Defensive use incidents")
accidental = _top_gun_types_chart(_flag_totals_by_gun_type("accidental"), "accidental", "Accidental Incidents")
child = _top_gun_types_chart(_flag_totals_by_gun_type("child_involved"), "child_involved", "Child Involved Incidents")

alt.hconcat(
    normal,
    defensive,
    mass,
    school,
    accidental,
    child
)

In [None]:
# Casualties attached to every gun record, summed per gun type.
gun_type_effectiveness = (
    guns.set_index('incident_id')
    .join(incidents.set_index('incident_id')[["n_killed", "n_injured"]])
    .groupby("type")[["n_killed", "n_injured"]]
    .sum()
    .reset_index()
)

# Rows of the guns table per gun type (non-null incident_id values).
gun_records_by_type = guns.groupby("type")["incident_id"].count()

# Deaths per 100 gun records of each type. The division aligns on the gun-type
# index instead of row position and stays in float64: float16 overflows to inf
# above 65,504 and keeps only about three significant digits.
deaths_by_incident_rate = (
    (gun_type_effectiveness.set_index("type")["n_killed"] * 100 / gun_records_by_type)
    .rename("rate")
    .rename_axis("type")
    .reset_index()
)

In [None]:
# Horizontal bars of the death rate per gun type, highest rate first.
alt.Chart(deaths_by_incident_rate).mark_bar(height=8).encode(
    y=alt.Y("type:N", sort=alt.EncodingSortField(field="rate", order='descending'), title="Gun type"),
    x=alt.X("rate:Q", scale=alt.Scale(type="linear"), title="Death rate"),
    tooltip=alt.Tooltip("rate"),
).interactive().properties(width=800, height=350, title="Rate of deaths per incident by gun type")

In [None]:
def _participants_by_age(role, gender, count_name):
    """Count participant rows per age for one role/gender, in a column named ``count_name``."""
    selected = participants[(participants["type"] == role) & (participants["gender"] == gender)]
    per_age = selected.groupby("age").count().reset_index().loc[:, ["age", "incident_id"]]
    return per_age.rename(columns={"incident_id": count_name})


victims_by_age_f = _participants_by_age("Victim", "Female", "n_victims")
victims_by_age_m = _participants_by_age("Victim", "Male", "n_victims")
suspects_by_age_f = _participants_by_age("Subject-Suspect", "Female", "n_suspects")
suspects_by_age_m = _participants_by_age("Subject-Suspect", "Male", "n_suspects")

In [None]:
# Both layers share the same channels: age on x, victim count on a sqrt-scaled y.
victim_channels = dict(
    x=alt.X("age:N", title="Age"),
    y=alt.Y("n_victims:Q", scale=alt.Scale(type="sqrt"), title="Number of victims"),
    tooltip=alt.Tooltip("n_victims"),
)

# Male victims (default colour).
plot_m = alt.Chart(victims_by_age_m).mark_bar(size=5).encode(**victim_channels).interactive().properties(width=880, height=350)

# Female victims (pink); this layer also carries the chart title.
plot_f = alt.Chart(victims_by_age_f).mark_bar(size=5, color="pink").encode(**victim_channels).interactive().properties(width=880, height=350, title="Number of victims by age and gender")

plot_m + plot_f

In [None]:
# Both layers share the same channels: age on x, suspect count on a sqrt-scaled y.
suspect_channels = dict(
    x=alt.X("age:N", title="Age"),
    y=alt.Y("n_suspects:Q", scale=alt.Scale(type="sqrt"), title="Number of suspects"),
    tooltip=alt.Tooltip("n_suspects"),
)

# Male suspects (default colour).
plot_m = alt.Chart(suspects_by_age_m).mark_bar(size=5).encode(**suspect_channels).interactive().properties(width=880, height=350)

# Female suspects (pink); this layer also carries the chart title.
plot_f = alt.Chart(suspects_by_age_f).mark_bar(size=5, color='pink').encode(**suspect_channels).interactive().properties(width=880, height=350, title="Number of suspects by age and gender")

plot_m + plot_f

In [None]:
# Labels, values and colours must share one order. The values are selected as
# (Male, Female), so the labels follow that order too; the previous
# ["Female", "Male"] labels attached each count to the wrong gender.
gender_order = ["Male", "Female"]
gender_colors = ["#1f77b4", "pink"]  # first colour goes to Male, second to Female

fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
for col, (role, trace_name) in enumerate([("Victim", "Victims"), ("Subject-Suspect", "Suspects")], start=1):
    # Participant rows of this role per gender, in gender_order.
    counts = participants[participants["type"] == role].groupby("gender").count().loc[gender_order, "incident_id"]
    fig.add_trace(go.Pie(labels=gender_order, values=counts, name=trace_name, marker_colors=gender_colors), 1, col)
fig.update_traces(hole=.5, hoverinfo="label+percent+name")

fig.update_layout(
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='Victims', x=0.18, y=0.5, font_size=20, showarrow=False),
                 dict(text='Suspects', x=0.835, y=0.5, font_size=20, showarrow=False)])
fig.show()

In [None]:
# Participant rows per recorded relationship.
participants_per_relationship = (
    participants.groupby("relationship")["incident_id"].count()
    .reset_index()
    .rename(columns={"incident_id": "n_incidents"})
)

# Horizontal bars on a log-scaled count axis, most frequent relationship first.
alt.Chart(participants_per_relationship).mark_bar(height=8).encode(
    y=alt.Y("relationship:N", sort=alt.EncodingSortField(field="n_incidents", order='descending')),
    x=alt.X("n_incidents:Q", scale=alt.Scale(type="log")),
    tooltip=alt.Tooltip("n_incidents"),
).interactive().properties(width=600, height=300)

In [None]:
# Group once by calendar year and aggregate only the columns that are needed;
# summing every column (dates, text) fails on pandas >= 2.0.
incidents_grouped_by_year = incidents.groupby(incidents.date.dt.year)
n_incidents_by_year = incidents_grouped_by_year["date"].count().to_frame("n_incidents")
n_casualties_by_year = incidents_grouped_by_year[["n_killed", "n_injured"]].sum()

# One row per year with columns: year, n_killed, n_injured, n_incidents.
stats_by_year = pd.concat([n_casualties_by_year, n_incidents_by_year], axis=1)
stats_by_year.index.name = "year"
stats_by_year = stats_by_year.reset_index()

In [None]:
# Long format: one row per (year, variable) pair.
stats_by_year_long = stats_by_year.melt("year")

# One panel per year with a bar for each of n_killed, n_injured and n_incidents.
alt.Chart(stats_by_year_long).mark_bar(size=30).encode(
    x=alt.X("variable:N", axis=alt.Axis(title=''), sort=alt.EncodingSortField(field="variable", order='descending')),
    y=alt.Y("value:Q", axis=alt.Axis(title='')),
    tooltip=alt.Tooltip('value:Q'),
    color=alt.Color("variable:N"),
    column=alt.Column("year", title="Year (Jan 2014 - Mar 2018)"),
).properties(width=140)

In [None]:
# Incidents recorded in every (year, month) present in the data.
monthly_totals = incidents.groupby([incidents.date.dt.year, incidents.date.dt.month])["date"].count()

# Divide each month's total by its number of days so months of different length
# are comparable. This replaces the chained assignment s[year][month] = ...,
# which may write to a temporary copy and stored floats in an integer Series.
days_in_month = pd.Series(
    [calendar.monthrange(int(year), int(month))[1] for year, month in monthly_totals.index],
    index=monthly_totals.index,
)
n_incidents_by_month = (monthly_totals / days_in_month).rename("n_incidents")

# Average the daily rate of each calendar month over all years. A true mean
# (rounded to 2 decimals) is used; the previous floor division truncated it.
incidents_by_month_ave = n_incidents_by_month.groupby(level=1).mean().round(2).to_frame()

In [None]:
# Turn the (year, month)-indexed values into a flat frame with the columns
# year, month and n_incidents.
monthly_frame = pd.DataFrame(n_incidents_by_month)
monthly_frame.columns = ["n_incidents"]
n_incidents_by_month = monthly_frame.rename_axis(["year", "month"]).reset_index()

In [None]:
# One line panel per year: monthly value against month number.
alt.Chart(n_incidents_by_month).mark_line().encode(
    x=alt.X("month"),
    y=alt.Y("n_incidents", scale=alt.Scale(zero=False), title="Number of incidents"),
    column=alt.Column("year", title="Year (Jan 2014 - Mar 2018)"),
    tooltip=alt.Tooltip('n_incidents:Q'),
).properties(width=140)

In [None]:
# English month names in calendar order, used as x-axis labels below.
months = (
    "January February March April May June "
    "July August September October November December"
).split()

In [None]:
# Replace the month numbers with month names, then flatten twice so the frame has
# a positional "order" column (used to keep calendar order in the chart), the
# month name and the averaged value.
# NOTE(review): this cell rebinds its own input, so it cannot be re-run without
# re-running the cell that builds incidents_by_month_ave.
incidents_by_month_ave.index = months
incidents_by_month_ave = incidents_by_month_ave.reset_index().reset_index()
incidents_by_month_ave.columns = ["order", "month", "n_incidents"]

In [None]:
# Line through the twelve monthly averages; "order" keeps the points in calendar order.
month_chart = alt.Chart(incidents_by_month_ave).mark_line().encode(
    x=alt.X("month:N", sort=None),
    y=alt.Y("n_incidents:Q", scale=alt.Scale(zero=False)),
    tooltip=alt.Tooltip('n_incidents:Q'),
    order=alt.Order("order"),
).properties(width=600, height=300)

# Overlay a circle on every data point of the line.
month_chart + month_chart.mark_circle()

In [None]:
def format_year(date):
    """Return ``date`` with its year replaced by 2000; all other fields are kept."""
    return date.replace(year=2000)

In [None]:
# Total deaths and injuries per calendar date. The two columns are selected
# before summing so text columns are never aggregated (fails on pandas >= 2.0).
casualties = incidents.groupby("date")[["n_killed", "n_injured"]].sum().reset_index()

In [None]:
# Map every date onto the year 2000 so the same day of different years shares a
# key, take the median of the daily totals per day of year, then reshape to long
# format with one row per (date, casualty kind).
casualties_by_date = (
    casualties
    .assign(date=casualties["date"].apply(format_year))
    .groupby("date")
    .median()
    .reset_index()
    .melt('date', var_name='casualties', value_name='n_casualties')
)

In [None]:
# Create a selection that chooses the nearest point & selects based on x-value
nearest = alt.selection(type='single', nearest=True, on='mouseover',
                        fields=['date'], empty='none')
# The basic line
line = alt.Chart(casualties_by_date).mark_line(strokeWidth=1.3).encode(
    x=alt.X('date:T'),
    y=alt.Y('n_casualties:Q', scale=alt.Scale(domain=(0, 200))),
    color='casualties:N'
)
# Small point marks on the same data; this is the layer that carries .interactive()
point = alt.Chart(casualties_by_date).mark_point(size=5).encode(
    x=alt.X('date:T'),
    y=alt.Y('n_casualties:Q', scale=alt.Scale(domain=(0, 200))),
    color='casualties:N'
).interactive()
# Transparent selectors across the chart. This is what tells us
# the x-value of the cursor
selectors = alt.Chart(casualties_by_date).mark_point().encode(
    x='date:T',
    opacity=alt.value(0),
).add_selection(
    nearest
)
# Draw points on the line, and highlight based on selection
points = line.mark_point().encode(
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
)
# Draw text labels near the points, and highlight based on selection
text = line.mark_text(align='left', dx=5, dy=-5).encode(
    text=alt.condition(nearest, 'n_casualties:Q', alt.value(' '))
)
# Draw a rule at the location of the selection
rules = alt.Chart(casualties_by_date).mark_rule(color='gray').encode(
    x='date:T',
).transform_filter(
    nearest
)

# Put the six layers into a chart and bind the data
alt.layer(
    line, point, selectors, points, rules, text
).properties(
    width=820, height=500
)

In [None]:
# Per-date casualty totals in long format, tagged with their calendar year.
casualties_by_year = (
    casualties
    .groupby("date")
    .median()
    .reset_index()
    .melt('date', var_name='casualties', value_name='n_casualties')
)
casualties_by_year["year"] = casualties_by_year["date"].dt.year

In [None]:
# One panel per year: count of rows (x) at each casualty value (y), split by
# casualty kind. Values outside the 20-200 domain are clamped to its edges.
alt.Chart(casualties_by_year).mark_bar(opacity=.4).encode(
    x=alt.X('count()', title=""),
    y=alt.Y('n_casualties:Q', scale=alt.Scale(domain=(20, 200), clamp=True), title="Number of casualties"),
    column=alt.Column("year", title="Density of casualties per incident"),
    color=alt.Color('casualties:N'),
).properties(height=300, width=145)

In [None]:
# Incidents per calendar date, re-keyed to the year 2000 and averaged per day of
# year; the result is indexed by that year-2000 date.
incidents_per_date = incidents.groupby("date")["incident_id"].count().reset_index()
inc = (
    incidents_per_date
    .assign(date=incidents_per_date["date"].apply(format_year))
    .groupby("date")
    .mean()
)

In [None]:
# Incidents per calendar date and their overall daily mean, computed once and
# reused by every filter, baseline and rule below.
daily_counts = incidents.groupby("date")["incident_id"].count()
daily_mean = daily_counts.mean()
threshold = pd.DataFrame([{"threshold": daily_mean}])

top_dates = daily_counts.reset_index().nlargest(10, "incident_id")
top_days_of_year = inc.nlargest(10, "incident_id").reset_index()

# Left panel: the ten individual dates with most incidents. The bars are sorted
# on "incident_id", the column that actually holds the counts (the data has no
# "n_incidents" field), in descending order like the right-hand panel.
bars1 = alt.Chart(top_dates).mark_bar(width=20).encode(
    alt.X("date:N", sort=alt.EncodingSortField(field="incident_id", order='descending'), timeUnit='yearmonthdate', title="Date"),
    alt.Y("incident_id:Q", scale=alt.Scale(domain=(0, 350)), title="Number of incidents relative to the mean"),
    tooltip = alt.Tooltip('incident_id:Q', title="Number of incidents")
).interactive().properties(width=400, height=400, title='Top 10 dates with most gun violence incidents')

# Red overlay from the daily mean up to the bar top, for bars above the mean.
highlight1 = alt.Chart(top_dates).mark_bar(color="#e45755", width=20).encode(
    x=alt.X('date:N', sort=alt.EncodingSortField(field="incident_id", order='descending'), timeUnit='yearmonthdate', title="Date"),
    y='baseline:Q',
    y2='incident_id:Q',
    tooltip = alt.Tooltip('incident_id:Q', title="Number of incidents relative to the mean")
).transform_filter(
    alt.datum.incident_id > daily_mean
).transform_calculate("baseline", str(daily_mean))

# Horizontal rule at the daily mean.
rule1 = alt.Chart(threshold).mark_rule().encode(
    y='threshold:Q'
)

# Right panel: the ten days of the year with the highest average count.
bars2 = alt.Chart(top_days_of_year).mark_bar(width=20).encode(
    alt.X("date:N", sort=alt.EncodingSortField(field="incident_id", order='descending'), timeUnit='monthdate', title="Day of the year"),
    alt.Y("incident_id:Q", scale=alt.Scale(domain=(0, 350)), title=""),
    tooltip = alt.Tooltip('incident_id:Q', title="Number of incidents")
).interactive().properties(width=400, height=400, title='Top 10 days of the year with most gun violence incidents')

highlight2 = alt.Chart(top_days_of_year).mark_bar(color="#e45755", width=20).encode(
    x=alt.X('date:N', sort=alt.EncodingSortField(field="incident_id", order='descending'), timeUnit='monthdate', title="Day of the year"),
    y='baseline:Q',
    y2='incident_id:Q',
    tooltip = alt.Tooltip('incident_id:Q', title="")
).transform_filter(
    alt.datum.incident_id > daily_mean
).transform_calculate("baseline", str(daily_mean))

rule2 = alt.Chart(threshold).mark_rule().encode(
    y='threshold:Q'
)

alt.hconcat(
    bars1 + highlight1 + rule1,
    bars2 + highlight2 + rule2
)

In [None]:
# US federal holidays keyed by name, each mapped to a month/day in the year 2000
# (the reference year that format_year assigns to every date).
# The mis-encoded apostrophe in "George Washington’s Birthday" is repaired because
# the key is shown as a chart label.
# NOTE(review): the floating holidays (MLK Day, Memorial Day, Labor Day, ...) use
# a single fixed month/day; the values look like one specific calendar year
# (e.g. 2019) - confirm this approximation is acceptable.
holidays = {
    "New Year's Day": dt(2000, 1, 1),
    "Martin Luther King, Jr. Day": dt(2000, 1, 21),
    "George Washington’s Birthday": dt(2000, 2, 18),
    "Memorial Day": dt(2000, 5, 27),
    "Independence Day": dt(2000, 7, 4),
    "Labor Day": dt(2000, 9, 2),
    "Columbus Day": dt(2000, 10, 14),
    "Veterans Day": dt(2000, 11, 11),
    "Thanksgiving Day": dt(2000, 11, 28),
    "Christmas Day": dt(2000, 12, 25),
}

In [None]:
# Incidents per calendar date and their overall daily mean (baseline of the highlights).
daily_counts = incidents.groupby("date")["incident_id"].count()
daily_mean = daily_counts.mean()

# Average count on each holiday's day of year, labelled with the holiday name.
holiday_counts = inc.loc[list(holidays.values())].rename(index={v: k for k, v in holidays.items()}).reset_index()

# Average count per weekday, taken from the real incident dates. The previous
# version read the weekday from dates re-based to the year 2000 and labelled
# pandas' Monday=0 numbering as if 0 were Sunday, so both the grouping and the
# labels were wrong. `inc` is no longer modified, so this cell can be re-run.
weekday_names = {0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday", 4: "Friday", 5: "Saturday", 6: "Sunday"}
weekday_counts = (
    daily_counts.groupby(daily_counts.index.weekday).mean()
    .rename(index=weekday_names)
    .rename_axis("day")
    .reset_index()
)

bars1 = alt.Chart(holiday_counts).mark_bar(width=20).encode(
    alt.X("date:N", sort=alt.EncodingSortField(field="incident_id", order='descending'), title="Holiday"),
    alt.Y("incident_id:Q", scale=alt.Scale(domain=(0, 350)), title="Number of incidents relative to the mean"),
    tooltip = alt.Tooltip('incident_id:Q', title="Number of incidents")
).interactive().properties(width=400, height=400, title='Number of gun violence incidents per federal holiday')

# Red overlay from the daily mean up to the bar top, for bars above the mean.
highlight1 = alt.Chart(holiday_counts).mark_bar(color="#e45755", width=20).encode(
    x=alt.X('date:N', sort=alt.EncodingSortField(field="incident_id", order='descending'), title="Holiday"),
    y='baseline:Q',
    y2='incident_id:Q',
    tooltip = alt.Tooltip('incident_id:Q', title="Number of incidents relative to the mean")
).transform_filter(
    alt.datum.incident_id > daily_mean
).transform_calculate("baseline", str(daily_mean))

# `threshold` is the one-row frame holding the daily mean, built in an earlier cell.
rule1 = alt.Chart(threshold).mark_rule().encode(
    y='threshold:Q'
)

bars2 = alt.Chart(weekday_counts).mark_bar(width=20).encode(
    alt.X("day:N", sort=alt.EncodingSortField(field="incident_id", order='descending'), title="Weekday"),
    alt.Y("incident_id:Q", scale=alt.Scale(domain=(0, 350)), title="Number of incidents relative to the mean"),
    tooltip = alt.Tooltip('incident_id:Q', title="Number of incidents")
).interactive().properties(width=400, height=400, title='Number of gun violence incidents per week day')

highlight2 = alt.Chart(weekday_counts).mark_bar(color="#e45755", width=20).encode(
    x=alt.X('day:N', sort=alt.EncodingSortField(field="incident_id", order='descending'), title="Weekday"),
    y='baseline:Q',
    y2='incident_id:Q',
    tooltip = alt.Tooltip('incident_id:Q', title="Number of incidents relative to the mean")
).transform_filter(
    alt.datum.incident_id > daily_mean
).transform_calculate("baseline", str(daily_mean))

rule2 = alt.Chart(threshold).mark_rule().encode(
    y='threshold:Q'
)

alt.hconcat(
    bars1 + highlight1 + rule1,
    bars2 + highlight2 + rule2
)

## Cities

### Cities with most gun violence incidents

In [None]:
# The 25 cities/counties with the most recorded incidents.
top_cities = incidents.groupby("city_or_county")["incident_id"].count().reset_index().nlargest(25, "incident_id")

# Sort on "incident_id", the column that holds the counts (the data has no
# "n_incidents" field). The y axis shows raw counts, so its title says so.
alt.Chart(top_cities).mark_bar(size=20).encode(
    alt.X("city_or_county:N", sort=alt.EncodingSortField(field="incident_id", order='descending'), title="City"),
    alt.Y("incident_id:Q", title="Number of incidents"),
    tooltip = alt.Tooltip('incident_id:Q', title="Number of incidents")
).interactive().properties(width=800, height=400, title='Cities with most gun violence incidents (Jan 2014 - Mar 2018)')

In [None]:
# Incidents per (year, month, city_or_county), flattened to one row per combination.
city_comparison = (
    incidents
    .groupby([incidents.date.dt.year, incidents.date.dt.month, incidents.city_or_county])
    .count()["incident_id"]
    .rename_axis(["year", "month", "city_or_county"])
    .reset_index()
)

In [None]:
# Monthly incident counts of four selected cities, one panel per year.
selected_cities = ["Chicago", "Washington", "Los Angeles", "Philadelphia"]
selected_city_rows = city_comparison[city_comparison["city_or_county"].isin(selected_cities)]

alt.Chart(selected_city_rows).mark_line().encode(
    x=alt.X("month", title="Month"),
    y=alt.Y("incident_id:Q", scale=alt.Scale(zero=False), title="Number of incidents"),
    column=alt.Column("year", title="Year (Jan 2014 - Mar 2018)"),
    color=alt.Color("city_or_county", title="City"),
).properties(height=500, width=140)

In [None]:
# The 25 states with the most recorded incidents.
top_states = incidents.groupby("state")["incident_id"].count().reset_index().nlargest(25, "incident_id")

# Sort on "incident_id", the column that holds the counts (the data has no
# "n_incidents" field). The y axis shows raw counts, so its title says so.
alt.Chart(top_states).mark_bar(size=20).encode(
    alt.X("state:N", sort=alt.EncodingSortField(field="incident_id", order='descending'), title="State"),
    alt.Y("incident_id:Q", title="Number of incidents"),
    tooltip = alt.Tooltip('incident_id:Q', title="Number of incidents")
).interactive().properties(width=800, height=400, title='States with most gun violence incidents (Jan 2014 - Mar 2018)')

In [None]:
# Incidents per (year, month, state), flattened to one row per combination.
state_comparison = (
    incidents
    .groupby([incidents.date.dt.year, incidents.date.dt.month, incidents.state])
    .count()["incident_id"]
    .rename_axis(["year", "month", "state"])
    .reset_index()
)

In [None]:
# Monthly incident counts of three selected states, one panel per year.
selected_states = ["California", "Illinois", "Florida"]
selected_state_rows = state_comparison[state_comparison["state"].isin(selected_states)]

alt.Chart(selected_state_rows).mark_line().encode(
    x=alt.X("month", title="Month"),
    y=alt.Y("incident_id:Q", scale=alt.Scale(zero=False), title="Number of incidents"),
    column=alt.Column("year", title="Year (Jan 2014 - Mar 2018)"),
    color=alt.Color("state:N", title="State"),
).properties(height=500, width=140)

In [None]:
# Resident population per state (plus the District of Columbia), used as the
# denominator of the per-capita rates below.
# The Delaware and District of Columbia figures were swapped: Delaware has
# roughly 0.94 million residents and the District of Columbia roughly 0.67 million.
# NOTE(review): source and reference year of these figures are not recorded in
# the notebook - add provenance.
state_population = {
    'Alabama': 4853193, 'Alaska': 738438, 'Arizona': 6839353, 'Arkansas': 2979818,
    'California': 38893516, 'Colorado': 5446126, 'Connecticut': 3585952,
    'Delaware': 940788, 'District of Columbia': 674093, 'Florida': 20250908,
    'Georgia': 10188251, 'Hawaii': 1419621, 'Idaho': 1659273, 'Illinois': 12852933,
    'Indiana': 6612724, 'Iowa': 3119893, 'Kansas': 2905172, 'Kentucky': 4427480,
    'Louisiana': 4656533, 'Maine': 1330774, 'Maryland': 5979634, 'Massachusetts': 6792551,
    'Michigan': 9940970, 'Minnesota': 5487856, 'Mississippi': 2989215, 'Missouri': 6072902,
    'Montana': 1031982, 'Nebraska': 1891988, 'Nevada': 2871365, 'New Hampshire': 1337613,
    'New Jersey': 8871814, 'New Mexico': 2091906, 'New York': 19635618, 'North Carolina': 10047620,
    'North Dakota': 744586, 'Ohio': 11619306, 'Oklahoma': 3900162, 'Oregon': 4028386,
    'Pennsylvania': 12785093, 'Rhode Island': 1056172, 'South Carolina': 4891931, 'South Dakota': 856293,
    'Tennessee': 6595774, 'Texas': 27442726, 'Utah': 2992711, 'Vermont': 624959,
    'Virginia': 8360838, 'Washington': 7179800, 'West Virginia': 1838663, 'Wisconsin': 5763068,
    'Wyoming': 582713,
}

In [None]:
def calculate_per_capita(n_incidents, state, population=None):
    """Return the number of incidents per 100,000 residents of ``state``.

    Parameters
    ----------
    n_incidents : number
        Incident count to normalise.
    state : str
        Key to look up in the population table.
    population : mapping, optional
        Maps state name to resident count. Defaults to the notebook-level
        ``state_population`` table.

    Returns
    -------
    float
        ``n_incidents * 100000 / population[state]`` rounded to 3 decimals.

    Raises
    ------
    KeyError
        If ``state`` is not a key of the population table.
    """
    if population is None:
        population = state_population
    return round(n_incidents * 100000 / population[state], 3)

In [None]:
# Incidents per (state, year), flattened with the count in "n_incidents".
incidents_by_state = (
    incidents
    .groupby(["state", "year"])["date"]
    .count()
    .reset_index()
    .rename(columns={"date": "n_incidents"})
)

In [None]:
# Convert each (state, year) count to incidents per 100,000 residents, then add
# the yearly rates up per state.
incidents_per_capita = incidents_by_state.copy()
incidents_per_capita["n_incidents"] = np.vectorize(calculate_per_capita)(incidents_per_capita["n_incidents"], incidents_per_capita["state"])
incidents_per_capita = incidents_per_capita.groupby("state").sum()["n_incidents"].reset_index()

# The values are per-capita rates, so the axis, tooltip and chart titles say so
# (they previously described raw incident counts).
alt.Chart(incidents_per_capita.nlargest(25, "n_incidents")).mark_bar(size=20).encode(
    alt.X("state:N", sort=alt.EncodingSortField(field="n_incidents", order='descending'), title="State"),
    alt.Y("n_incidents:Q", scale=alt.Scale(domain=(0, 400)), title="Incidents per 100,000 residents"),
    tooltip = alt.Tooltip('n_incidents:Q', title="Incidents per 100,000 residents")
).interactive().properties(width=800, height=400, title='States with most gun violence incidents per capita (Jan 2014 - Mar 2018)')

In [None]:
# Incidents per (year, month, state), converted to incidents per 100,000 residents.
state_per_c = (
    incidents
    .groupby([incidents.date.dt.year, incidents.date.dt.month, incidents.state])
    .count()["incident_id"]
    .rename_axis(["year", "month", "state"])
    .reset_index()
)
per_capita = np.vectorize(calculate_per_capita)
state_per_c["incident_id"] = per_capita(state_per_c["incident_id"], state_per_c["state"])

In [None]:
# Monthly per-capita rates of three selected states, one panel per year. The y
# axis holds incidents per 100,000 residents, so its title says so (it previously
# read "Number of incidents").
selected_per_capita_states = ["District of Columbia", "Delaware", "Alaska"]

alt.Chart(state_per_c[state_per_c["state"].isin(selected_per_capita_states)]).mark_line().encode(
    alt.X("month", title="Month"),
    alt.Y("incident_id:Q", scale=alt.Scale(zero=False), title="Incidents per 100,000 residents"),
    alt.Column("year", title="Year (Jan 2014 - Mar 2018)"),
    alt.Color("state:N", title="State")
).properties(height=500, width=140)

### Cities with most gun violence incidents per capita

## Nr killed/injured

### Pie chart with number of people killed/injured/casualties

### Maybe number of casualties over time

### Maybe incident characteristics/location description too

### Maybe gun type too