# Examining Gun Violence

In [8]:
from IPython.display import display
from time import time
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer

## Import file

In [9]:
# Path of the incident export, relative to the notebook's working directory.
filename = 'gun-violence-data_01-2013_03-2018.csv'

# Read the CSV into the raw frame that the later cells slice columns from.
df_raw = pd.read_csv(filepath_or_buffer=filename)

# Quick sanity check on the load: prints the (rows, columns) tuple.
print(df_raw.shape)

## Cleaning and formatting

In [35]:
# Parse the 'YYYY-MM-DD' date strings into datetime values. The conversion is
# written back to df_raw so that every frame sliced from it below carries
# real timestamps.
df_raw['date'] = pd.to_datetime(df_raw['date'], format='%Y-%m-%d')

#df_mid = df_raw.drop(['incident_id', 'address', 'incident_url', 'source_url', 'incident_url_fields_missing', 'participant_name', 'sources'], axis=1)

# Descriptive text columns that feed the bag-of-words step.
df_desc = df_raw.loc[:, ['incident_characteristics', 'location_description', 'notes', 'participant_age_group', 'participant_gender', 'participant_relationship', 'participant_status', 'participant_type']]

# Normalise each text column: lower-case, turn ':' into a space, collapse every
# run of characters that are not letters or spaces into a single space, and
# replace missing values with the empty string.
# `regex=` is passed explicitly: pandas >= 2.0 defaults to regex=False, which
# would treat the character-class pattern as a literal string and strip nothing.
df_desc = df_desc.apply(
    lambda x: x.str.lower()
               .str.replace(':', ' ', regex=False)
               .str.replace(r'[^a-zA-Z ]+', ' ', regex=True)
               .fillna('')
)

# Structured columns kept as the base of the modelling / analysis frame.
df = df_raw.loc[:, ['date', 'state', 'city_or_county', 'n_killed', 'n_injured']]

#df_states_sum = df.groupby('state').sum()
#df_date_sum = df.groupby('date').sum()
#df_states_mean = df.groupby('state').mean()
#df_date_mean = df.groupby('date').mean()

## Bags of Words

In [36]:
# Build one bag-of-words count frame per descriptive column (top 100 terms,
# English stop words removed), then attach all of them to df in a single step.
bag_frames = []
for col in df_desc.columns:
    vectorizer = CountVectorizer(max_features=100, stop_words='english')
    X = vectorizer.fit_transform(df_desc[col])
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on older releases that lack it.
    try:
        vocab = vectorizer.get_feature_names_out()
    except AttributeError:
        vocab = vectorizer.get_feature_names()
    # Prefix each term with its source column so names stay unique across
    # columns, and reuse df_desc's index so rows align by label with df.
    bag_of_words = pd.DataFrame(
        X.toarray(),
        index=df_desc.index,
        columns=[col + '.' + s for s in vocab],
    )
    bag_frames.append(bag_of_words)

if bag_frames:
    bags_all = pd.concat(bag_frames, axis=1)
    # Drop any bag columns left over from a previous run of this cell so that
    # re-running it does not raise on overlapping column names.
    df = df.drop(columns=bags_all.columns, errors='ignore').join(bags_all)

In [37]:
# Render df (base columns plus the joined bag-of-words count columns) via the notebook's rich display.
df

Unnamed: 0,date,state,city_or_county,n_killed,n_injured,incident_characteristics.abductions,incident_characteristics.accidental,incident_characteristics.action,incident_characteristics.aggression,incident_characteristics.alcohol,incident_characteristics.applies,incident_characteristics.armed,incident_characteristics.arrest,incident_characteristics.atf,incident_characteristics.attempt,incident_characteristics.bar,incident_characteristics.brandishing,incident_characteristics.business,incident_characteristics.car,incident_characteristics.carry,incident_characteristics.child,incident_characteristics.club,incident_characteristics.commission,incident_characteristics.confiscation,incident_characteristics.crime,incident_characteristics.crimes,incident_characteristics.dead,incident_characteristics.death,incident_characteristics.defensive,incident_characteristics.dgu,incident_characteristics.diagram,incident_characteristics.discharge,incident_characteristics.domestic,incident_characteristics.drive,incident_characteristics.drug,incident_characteristics.drugs,incident_characteristics.establishment,incident_characteristics.evidence,incident_characteristics.felon,incident_characteristics.fired,incident_characteristics.flourishing,incident_characteristics.gang,incident_characteristics.group,incident_characteristics.gun,incident_characteristics.home,incident_characteristics.hostage,incident_characteristics.illegally,incident_characteristics.incident,incident_characteristics.influence,incident_characteristics.injured,...,notes.street,notes.suicide,notes.suspect,notes.suspects,notes.teen,notes.threat,notes.times,notes.traffic,notes.unclear,notes.vehicle,notes.vic,notes.victim,notes.wife,notes.woman,notes.wounded,participant_age_group.adult,participant_age_group.child,participant_age_group.teen,participant_gender.female,participant_gender.male,participant_relationship.aquaintance,participant_relationship.armed,participant_relationship.current,participant_relationship.does,participant_
relationship.drive,participant_relationship.family,participant_relationship.friends,participant_relationship.gang,participant_relationship.home,participant_relationship.invasion,participant_relationship.know,participant_relationship.knows,participant_relationship.mass,participant_relationship.neighbor,participant_relationship.perp,participant_relationship.random,participant_relationship.robbery,participant_relationship.shooting,participant_relationship.significant,participant_relationship.victim,participant_relationship.victims,participant_relationship.vs,participant_relationship.worker,participant_status.arrested,participant_status.injured,participant_status.killed,participant_status.unharmed,participant_type.subject,participant_type.suspect,participant_type.victim
0,2013-01-01,Pennsylvania,Mckeesport,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4,0,0,1,1,4
1,2013-01-01,California,Hawthorne,1,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,1,0,1,1,4
2,2013-01-01,Ohio,Lorain,1,3,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,1,2,2,2,3
3,2013-01-05,Colorado,Aurora,4,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,1,1,3
4,2013-01-07,North Carolina,Greensboro,2,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,1,2,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,0,1,1,3
5,2013-01-07,Oklahoma,Tulsa,4,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,4,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,4,2,2,2,4
6,2013-01-19,New Mexico,Albuquerque,5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,1,3,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,5,1,1,1,5
7,2013-01-21,Louisiana,New Orleans,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,1,1,5
8,2013-01-21,California,Brentwood,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,3,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,1,1,1,4
9,2013-01-23,Maryland,Baltimore,1,6,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,6,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,1,0,0,0,7


## Exploratory Data Analysis

In [23]:
# Print the column labels of df.
# NOTE(review): the saved output below lists columns (e.g. 'gun_type', 'latitude')
# that the five-column selection building df in "Cleaning and formatting" never
# includes, and this cell's execution count is out of order -- the output looks
# stale; confirm by re-running with Restart Kernel -> Run All.
print(df.columns)

Index(['date', 'state', 'city_or_county', 'n_killed', 'n_injured',
       'congressional_district', 'gun_stolen', 'gun_type',
       'incident_characteristics', 'latitude', 'location_description',
       'longitude', 'n_guns_involved', 'notes', 'participant_age',
       'participant_age_group', 'participant_gender',
       'participant_relationship', 'participant_status', 'participant_type',
       'state_house_district', 'state_senate_district'],
      dtype='object')


In [14]:
# Print the labels of the descriptive text columns held in df_desc.
print(df_desc.columns)

Index(['incident_characteristics', 'location_description', 'notes',
       'participant_age_group', 'participant_gender',
       'participant_relationship', 'participant_status', 'participant_type'],
      dtype='object')


In [21]:
# Render df_desc to inspect the normalised text (lower-cased, non-letter characters replaced by spaces, missing values as '').
df_desc

Unnamed: 0,incident_characteristics,location_description,notes,participant_age_group,participant_gender,participant_relationship,participant_status,participant_type
0,shot wounded injured mass shooting victims...,,julian sims under investigation four shot and...,adult adult adult adult adult,male male male female,,arrested injured injured injured in...,victim victim victim victim subject...
1,shot wounded injured shot dead murder ac...,,four shot one killed unidentified shooter in...,adult adult adult adult,male,,killed injured injured injured,victim victim victim victim subject...
2,shot wounded injured shot dead murder ac...,cotton club,,adult adult adult adult adult,male male male male male,,injured unharmed arrested unharmed arr...,subject suspect subject suspect victim ...
3,shot dead murder accidental suicide offic...,,,adult adult adult adult,female male male male,,killed killed killed killed,victim victim victim subject suspect
4,shot wounded injured shot dead murder ac...,,two firearms recovered attempted murder sui...,adult adult teen adult,female male male female,family,injured injured killed killed,victim victim victim subject suspect
5,shot dead murder accidental suicide home ...,fairmont terrace,,adult adult adult adult adult ...,female female female female male ...,,killed killed killed killed unharme...,victim victim victim victim subject...
6,shot dead murder accidental suicide mass ...,,,adult adult child child child ...,male female male female female male,family,killed killed killed killed killed ...,victim victim victim victim victim ...
7,shot wounded injured drive by car to street...,,unprovoked drive by results in multiple teens ...,,male male male male male,,injured injured injured injured inj...,victim victim victim victim victim ...
8,shot wounded injured drive by car to street...,,perps were likely motivated by gang affliations,teen teen teen adult,male male male male male,,injured injured injured injured unh...,victim victim victim victim subject...
9,shot wounded injured shot dead murder ac...,,shooting occurred over illegal dice game vict...,teen adult adult adult adult ...,male,,killed injured injured injured inju...,victim victim victim victim victim ...


In [15]:
# Per-column data-quality summary of df: how many values are missing and how
# many distinct non-null values each column holds.
summation = pd.DataFrame({
    'null_count': df.isnull().sum(),
    'unique': df.nunique(),
})

# Show the summary ordered by ascending missing-value count.
display(summation.sort_values(by='null_count'))

Unnamed: 0,null_count,unique
date,0,1725
state,0,51
city_or_county,0,12898
n_killed,0,16
n_injured,0,23
incident_characteristics,326,18126
latitude,7923,101240
longitude,7923,112347
congressional_district,11944,54
participant_type,24863,259


In [22]:
# NOTE(review): this cell's code is entirely commented out, so the term/count
# listing saved below it comes from an earlier run. The snippet also reads a
# frame named `test` that is not defined in the visible part of the notebook --
# either restore it against a defined frame or remove the cell and its output.
#for f in vectorizer.get_feature_names():
#    print('{} {}'.format(f, test.loc[test['char'].str.find(f) != -1, 'char'].value_counts().sum()))
# display(test.loc[test['relationship'].str.find('armed') != -1])

abductions 2677
accident 59855
accidental 59855
act 9349
action 7209
aggression 3050
ak 2132
alcohol 3494
alert 1478
animal 723
applies 3494
ar 86091
armed 20047
arrest 21528
assault 2052
atf 17991
attempt 2810
attempted 469
bar 4590
bb 21878
brandishing 19017
burglary 597
business 11352
buy 131
bystander 169
car 36815
carry 19938
castle 1392
child 2123
cleaning 501
club 4590
college 431
commission 30863
concealed 1170
confiscation 17991
cop 148
crime 35622
crimes 30863
criminal 2146
dead 53409
death 28769
deceased 98
defensive 7391
defined 2052
dgu 19723
diagram 2331
discharge 6452
disgruntled 175
doctrine 1392
domestic 10841
drive 13655
drug 19996
drugs 3494
elementary 2244
employee 175
enforcement 2253
established 1392
establishment 4590
evidence 21703
excluding 1641
family 80
felon 17165
firearm 654
fired 41996
flourishing 19017
friend 80
gang 5677
ghost 81
good 538
ground 1392
group 11046
gun 51596
guns 233
gv 1478
hate 102
home 10641
hostage 2677
house 767
hunting 504
id 138665
i