## Import Libraries

Import the required libraries that will be used later on.

In [1]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import geopy
import folium

## Load Dataset

Edit the 'file' variable so that it contains the file path to the gun violence dataset. Then execute the cell to load the dataset.

In [2]:
# Location of the gun violence CSV; edit this to point at your copy of the dataset.
file = 'gun_violence.csv' # file path to shootings dataset
# Read the whole dataset into a DataFrame; the analysis cell filters it by city and state.
df_guns = pd.read_csv(file)

## Helper Functions

Execute the cell below to load all helper functions.

In [8]:
def clean_participants_type(row_value):
    """Count participants by role in one raw 'participant_type' cell.

    Two encodings are accepted:
    '0::Victim||1::Subject-Suspect' (people separated by '||', fields by '::')
    and '0:Victim|1:Subject-Suspect' (people separated by '|', fields by ':').

    Parameters
    ----------
    row_value : str or missing value
        Raw cell content. Any missing value (NaN, None, pd.NA) is treated as
        "no participants recorded".

    Returns
    -------
    dict
        Mapping of role label to count. The keys 'Victim' and
        'Subject-Suspect' are always present (0 when absent); any other label
        found in the cell is counted under its own key.
    """
    counts = {'Victim': 0, 'Subject-Suspect': 0}

    # pd.isna recognises every missing-value flavour; an identity check
    # against a NaN singleton misses NaN floats created elsewhere and None.
    if pd.isna(row_value):
        return counts

    if '::' in row_value:
        person_sep, field_sep = '||', '::'
    else:
        person_sep, field_sep = '|', ':'

    for person in row_value.split(person_sep):
        fields = person.split(field_sep)
        if len(fields) < 2:
            # Malformed entry without a role label (e.g. an empty string): skip it.
            continue
        label = fields[1]
        counts[label] = counts.get(label, 0) + 1

    return counts

def assign_shooting_size(size):
    """Map a participant count to the value used as a circle radius on the map.

    Returns 400 for more than 3 participants, 250 for more than 1 and up to 3,
    and 200 otherwise (which includes values such as NaN that fail every
    comparison).
    """
    if size > 3:
        return 400
    if 1 < size <= 3:
        return 250
    # Covers size <= 1 as well as non-comparable values such as NaN.
    return 200
    
def assign_deadliness(n_injured, n_killed):
    """Return the fill-opacity weight for an incident from its casualty counts.

    Weights by case:
      - more than one killed ................. 0.50
      - exactly one killed, someone injured .. 0.40
      - exactly one killed, nobody injured ... 0.35
      - nobody killed, more than 2 injured ... 0.10
      - nobody killed, at most 2 injured ..... 0.05
    Any combination not listed (for example NaN counts) yields 0.10.
    """
    if n_killed > 1:
        return 0.50

    if n_killed == 0:
        # A NaN injury count fails the comparison and lands on 0.10.
        return 0.05 if n_injured <= 2 else 0.10

    if n_killed == 1:
        if n_injured == 0:
            return 0.35
        if n_injured > 0:
            return 0.40

    # Fallback for anything the cases above did not match.
    return 0.10
    
def add_circle(map_, coordinates, color, fill_opac, radius=300):
    """Draw one filled circle on a Folium map.

    Parameters
    ----------
    map_ : object the circle is attached to via ``add_to`` (a folium map here)
    coordinates : passed as the circle's ``location``
    color : passed as the circle's ``fill_color``
    fill_opac : passed as the circle's ``fill_opacity``
    radius : passed as the circle's ``radius`` (default 300)

    The circle is created with ``weight=0`` and ``fill=True``. Nothing is
    returned; the map is modified in place.
    """
    circle_options = {
        'location': coordinates,
        'radius': radius,
        'weight': 0,
        'fill': True,
        'fill_color': color,
        'fill_opacity': fill_opac,
    }
    marker = folium.Circle(**circle_options)
    marker.add_to(map_)

## Analysis Generalized to Other Cities

In the first code cell below, set the 'city' and 'state' variables to the city and state that you wish to view. Then execute the cell after it to display the visualization. The 10 cities with the most incidents in the dataset are listed here for reference.

#### Top 10 Cities and Total Incidents
1. Chicago, Illinois ............... 10,814
2. Baltimore, Maryland ............... 3,943
3. Washington, District of Columbia ............... 3,279
4. New Orleans, Louisiana ............... 3,071
5. Philadelphia, Pennsylvania ............... 2,963
6. Houston, Texas ............... 2,501
7. Saint Louis, Missouri ............... 2,501
8. Milwaukee, Wisconsin ............... 2,487
9. Jacksonville, Florida ............... 2,448
10. Memphis, Tennessee ............... 2,386

In [28]:
# City name: matched against the 'city_or_county' column with str.contains (so partial
# matches count) and also used when geocoding the map centre.
city = 'Chicago'
# State name: compared for exact equality against the 'state' column.
state = 'Illinois'

In [29]:
# Filter data frame to desired city and state.
# regex=False matches the city name literally, and na=False treats rows with a
# missing city name as non-matches instead of leaving NaN in the boolean mask.
city_mask = df_guns['city_or_county'].str.contains(city, regex=False, na=False)
state_mask = df_guns['state'] == state
# KMeans and the map both need complete coordinates, so drop rows missing either one.
df_city = df_guns.loc[city_mask & state_mask].dropna(subset=['latitude', 'longitude'])
if df_city.empty:
    raise ValueError('No incidents with coordinates found for {}, {}'.format(city, state))

# percent change threshold for KMeans algorithm
pct_change = 10
# Fixed seed so cluster assignments (and therefore map colors) are reproducible across runs
random_state = 0

# Feature matrix shared by every KMeans fit below (same column order for all fits)
df_coords = df_city[['longitude', 'latitude']]

# KMeans implementation - test different number of clusters and record each's within-cluster-sum-of-squares.
# KMeans cannot fit more clusters than there are rows, so cap the search for small cities.
max_k = min(20, len(df_city))
K = np.arange(1, max_k + 1)
inertias = np.array([
    KMeans(n_clusters=int(k), n_init=10, random_state=random_state).fit(df_coords).inertia_
    for k in K
])

# KMeans implementation - determine optimal number of clusters according to elbow method.
# Inertias from k=2 onward are expressed relative to the k=1 inertia; pct_drops[i] is the
# drop (in percent of the k=1 inertia) when going from k=i+2 to k=i+3 clusters.
pct_drops = -100 * np.diff((inertias[1:] - inertias[0]) / inertias[0])
below_threshold = np.flatnonzero(pct_drops < pct_change)
if below_threshold.size:
    # Index i of pct_drops corresponds to k = i + 3.
    n_clusters = int(below_threshold[0]) + 3
else:
    # No drop fell below the threshold (or too few rows to compare): use the largest k tested.
    n_clusters = int(max_k)

# Assign clusters to data frame and extract relevant features
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state).fit(df_coords)
df_plot = (
    df_city[['latitude', 'longitude', 'n_injured', 'n_killed', 'participant_type']]
    .reset_index(drop=True)
    .assign(cluster_assignment=kmeans.labels_)
)

# Clean relevant features, and assign deadliness factor and shooting size weight.
# Each participant_type cell is parsed once and both counts are read from the result.
participant_counts = df_plot['participant_type'].map(clean_participants_type)
df_plot['n_victims'] = participant_counts.map(lambda counts: counts['Victim'])
df_plot['n_suspects'] = participant_counts.map(lambda counts: counts['Subject-Suspect'])
df_plot['n_participants'] = df_plot['n_victims'] + df_plot['n_suspects']
df_plot['shooting_size_weight'] = df_plot['n_participants'].map(assign_shooting_size)
df_plot['deadliness_factor'] = [
    assign_deadliness(injured, killed)
    for injured, killed in zip(df_plot['n_injured'], df_plot['n_killed'])
]

# Use GeoPy to retrieve coordinates of desired city.
# The state is included in the query so that same-named cities in other states are not picked.
locator = geopy.geocoders.Nominatim(user_agent="MyCoder")
try:
    geocoded = locator.geocode('{}, {}'.format(city, state))
except geopy.exc.GeopyError as err:
    print('Geocoding failed ({}); centering the map on the incident data instead.'.format(err))
    geocoded = None
if geocoded is None:
    # geocode() returns None when nothing matches; fall back to the mean incident position.
    location = [float(df_plot['latitude'].mean()), float(df_plot['longitude'].mean())]
else:
    location = [geocoded.latitude, geocoded.longitude]

# Create Folium map object at city's location
m = folium.Map(location=location, tiles="cartodbpositron", zoom_start=11)
colors = ['#db3d13', '#3399ff', '#027f45', '#040d22', '#ff61b1', '#6a329f', '#f1c232']

# Iterate through all clusters, and plot each shooting within each cluster.
# groupby visits clusters in ascending label order and keeps row order within each cluster.
for cluster_id, df_cluster in df_plot.groupby('cluster_assignment'):
    # The palette has fewer entries than the maximum cluster count, so cycle through it.
    color = colors[int(cluster_id) % len(colors)]
    for incident in df_cluster.itertuples(index=False):
        add_circle(
            m,
            [incident.latitude, incident.longitude],
            color,
            incident.deadliness_factor,
            int(incident.shooting_size_weight),
        )

# Display map object
m