# Awesome public data analysis
Source: https://en.wikipedia.org/wiki/List_of_school_shootings_in_the_United_States

In [2]:
import wikipedia
from bs4 import BeautifulSoup
import pandas as pd
from geopy.geocoders import Nominatim

In [4]:
# Title of the Wikipedia article that holds the incident tables.
ARTICLE_TITLE = "List_of_school_shootings_in_the_United_States"
page = wikipedia.page(ARTICLE_TITLE)

In [5]:
# Render the article to HTML, then encode the markup as UTF-8 bytes.
rendered_markup = page.html()
html = rendered_markup.encode("UTF-8")

In [6]:
# Peek at the first 1000 characters of the encoded markup.
# Parenthesised form runs on both Python 2 and Python 3 (the bare
# `print x` statement is a SyntaxError on Python 3).
print(html[:1000])

<div class="mw-parser-output"><p>
</p>
<div role="note" class="hatnote navigation-not-searchable plainlinks">This list is <a href="/wiki/Wikipedia:WikiProject_Lists#Incomplete_lists" title="Wikipedia:WikiProject Lists">incomplete</a>; you can help by <a class="external text" href="//en.wikipedia.org/w/index.php?title=List_of_school_shootings_in_the_United_States&amp;action=edit">expanding it</a>.</div>
<div role="note" class="hatnote navigation-not-searchable">For a broader coverage related to  this topic, see <a href="/wiki/Gun_violence_in_the_United_States" title="Gun violence in the United States">Gun violence in the United States</a>.</div>
<p>

</p><p><b>List of school shootings in the United States</b> lists in chronology and provides additional details of incidents in which a firearm was discharged at a school infrastructure or campus, including incidents of shootings on a <a href="/wiki/School_bus" title="School bus">school bus</a>. This list contains <a href="/wiki/School_shoo

In [7]:
# Parse the markup with the parser bundled with the standard library.
soup = BeautifulSoup(markup=html, features='html.parser')

In [10]:
# Position of the wanted <table> among all tables on the page.
# NOTE(review): this is a positional lookup, so edits to the article can
# shift it — confirm the index still points at the intended table.
INCIDENT_TABLE_INDEX = 19
all_tables = soup.find_all('table')
table = all_tables[INCIDENT_TABLE_INDEX]

In [16]:
# Remove every <span class="sortkey"> from the table before it is parsed.
# NOTE(review): presumably these carry hidden sort keys that would otherwise
# leak into the cell text — confirm against the page markup.
sortkey_spans = table.select("span.sortkey")
for span in sortkey_spans:
    span.decompose()

In [17]:
# Let pandas parse the cleaned table; the first row supplies the header.
# read_html returns a list of frames, and only the first one is kept.
parsed_frames = pd.read_html(str(table), header=0)
df = parsed_frames[0]

In [19]:
# Preview the first three parsed rows.
df.head(3)

Unnamed: 0,Date,Location,Deaths,Injuries,Description
0,"January 15, 2015","Milwaukee, Wisconsin",0,3,"A 15-year-old boy, a student's father, and a t..."
1,"January 16, 2015","Ocala, Florida",0,2,Two were injured in gunfire that occurred afte...
2,"February 4, 2015","Frederick, Maryland",0,2,Two students were shot near the gymnasium of F...


In [20]:
# Summary statistics for the freshly parsed frame.
df.describe()

Unnamed: 0,Deaths,Injuries
count,57.0,57.0
mean,1.263158,2.403509
std,2.682791,3.83992
min,0.0,0.0
25%,0.0,0.0
50%,1.0,1.0
75%,1.0,3.0
max,17.0,18.0


In [21]:
import nltk

In [22]:
# Tokenise each incident description into a list of word tokens.
descriptions = df['Description']
df['tokenized'] = descriptions.apply(nltk.word_tokenize)

In [23]:
# Check that the new 'tokenized' column looks sensible.
df.head(3)

Unnamed: 0,Date,Location,Deaths,Injuries,Description,tokenized
0,"January 15, 2015","Milwaukee, Wisconsin",0,3,"A 15-year-old boy, a student's father, and a t...","[A, 15-year-old, boy, ,, a, student, 's, fathe..."
1,"January 16, 2015","Ocala, Florida",0,2,Two were injured in gunfire that occurred afte...,"[Two, were, injured, in, gunfire, that, occurr..."
2,"February 4, 2015","Frederick, Maryland",0,2,Two students were shot near the gymnasium of F...,"[Two, students, were, shot, near, the, gymnasi..."


In [28]:
def return_coordinates(local, geocoder=None):
    """Geocode a place description into a latitude/longitude pair.

    Parameters
    ----------
    local : str
        Free-form place description, e.g. "Ocala, Florida".
    geocoder : object, optional
        Any object exposing ``geocode(query)``. When omitted, the
        notebook-level ``geolocator`` is used.

    Returns
    -------
    pandas.Series
        Indexed by 'latitude' and 'longitude'. Both entries are NaN when
        the geocoder reports no match, so one unresolvable location does
        not abort a whole-DataFrame ``apply``.
    """
    if geocoder is None:
        geocoder = geolocator
    location = geocoder.geocode(local)
    if location is None:
        # The geocoder returns None for queries it cannot resolve.
        missing = float('nan')
        return pd.Series({'latitude': missing, "longitude": missing})
    return pd.Series({'latitude': location.latitude, "longitude": location.longitude})

In [29]:
# Nominatim's usage policy requires each client to identify itself, and
# geopy rejects (or warns about) the library-default user agent, so an
# explicit application name is passed.
geolocator = Nominatim(user_agent="school-shootings-notebook")

In [33]:
# Smoke-test the geocoding helper on a well-known place.
# Parenthesised print works on both Python 2 and Python 3.
print(return_coordinates("New York City, New York"))

latitude     40.730646
longitude   -73.986614
dtype: float64


In [34]:
# Confirm how pandas typed the Location column before geocoding it.
# Parenthesised print works on both Python 2 and Python 3.
print(df['Location'].dtype)

object


In [35]:
def _coordinates_for_row(row):
    """Geocode the stringified Location of one DataFrame row."""
    return return_coordinates(str(row['Location']))


# One geocoder lookup per row; the resulting two-column frame becomes the
# new latitude/longitude columns.
row_coordinates = df.apply(_coordinates_for_row, axis=1)
df[['latitude', 'longitude']] = row_coordinates

In [36]:
# Inspect the first five rows with their new coordinates.
df.head(5)

Unnamed: 0,Date,Location,Deaths,Injuries,Description,tokenized,latitude,longitude
0,"January 15, 2015","Milwaukee, Wisconsin",0,3,"A 15-year-old boy, a student's father, and a t...","[A, 15-year-old, boy, ,, a, student, 's, fathe...",43.034993,-87.922497
1,"January 16, 2015","Ocala, Florida",0,2,Two were injured in gunfire that occurred afte...,"[Two, were, injured, in, gunfire, that, occurr...",29.187199,-82.140092
2,"February 4, 2015","Frederick, Maryland",0,2,Two students were shot near the gymnasium of F...,"[Two, students, were, shot, near, the, gymnasi...",39.414219,-77.410927
3,"February 14, 2015","Merced, California",1,0,A teenager was found killed in the parking lot...,"[A, teenager, was, found, killed, in, the, par...",37.302957,-120.484327
4,"February 23, 2015","Daytona Beach, Florida",0,3,Two students argued outside the music building...,"[Two, students, argued, outside, the, music, b...",29.210815,-81.022833


In [41]:
def split_and_strip(row):
    """Return the trailing comma-separated component of ``row["Location"]``.

    The location is split on commas and each piece is stripped of
    surrounding whitespace. The last piece is returned, so "City, State"
    yields "State", "City, County, State" also yields "State", and a
    location without any comma is returned whole instead of raising
    IndexError.

    Parameters
    ----------
    row : mapping
        Anything indexable by the key "Location" whose value is a string
        (a DataFrame row or a plain dict).

    Returns
    -------
    str
        The stripped final component of the location.
    """
    pieces = [piece.strip() for piece in row["Location"].split(',')]
    return pieces[-1]

In [42]:
# Derive the state for every row; the helper is passed directly because it
# already takes a row as its only argument.
df['state'] = df.apply(split_and_strip, axis=1)

In [43]:
# Spot-check the derived 'state' column on the first two rows.
df.head(2)

Unnamed: 0,Date,Location,Deaths,Injuries,Description,tokenized,latitude,longitude,state
0,"January 15, 2015","Milwaukee, Wisconsin",0,3,"A 15-year-old boy, a student's father, and a t...","[A, 15-year-old, boy, ,, a, student, 's, fathe...",43.034993,-87.922497,Wisconsin
1,"January 16, 2015","Ocala, Florida",0,2,Two were injured in gunfire that occurred afte...,"[Two, were, injured, in, gunfire, that, occurr...",29.187199,-82.140092,Florida


In [44]:
# Total casualties per incident: deaths plus injuries, row by row.
deaths = df['Deaths']
injuries = df['Injuries']
df['deaths_and_injuries'] = deaths + injuries

In [45]:
# Summary statistics again, now including the coordinate and casualty columns.
df.describe()

Unnamed: 0,Deaths,Injuries,latitude,longitude,deaths_and_injuries
count,57.0,57.0,57.0,57.0,57.0
mean,1.263158,2.403509,36.938559,-93.04302,3.666667
std,2.682791,3.83992,6.440343,21.448544,5.813572
min,0.0,0.0,8.654539,-123.341738,0.0
25%,0.0,0.0,33.558997,-111.83345,1.0
50%,1.0,1.0,36.862431,-88.326464,2.0
75%,1.0,3.0,40.572851,-82.140092,4.0
max,17.0,18.0,47.603832,12.214805,31.0


# Now it's time to map

In [50]:
import bokeh
from bokeh.sampledata.us_states import data as statesData
from bokeh.plotting import figure, show, output_file

In [51]:
# Remove Hawaii and Alaska from the state outlines.
# pop() with a default keeps this cell safe to re-run: a second `del` on an
# already-removed key would raise KeyError.
for abbr in ("HI", "AK"):
    statesData.pop(abbr, None)

In [52]:
# Reminder of the frame's columns before aggregating by state.
df.head(3)

Unnamed: 0,Date,Location,Deaths,Injuries,Description,tokenized,latitude,longitude,state,deaths_and_injuries
0,"January 15, 2015","Milwaukee, Wisconsin",0,3,"A 15-year-old boy, a student's father, and a t...","[A, 15-year-old, boy, ,, a, student, 's, fathe...",43.034993,-87.922497,Wisconsin,3
1,"January 16, 2015","Ocala, Florida",0,2,Two were injured in gunfire that occurred afte...,"[Two, were, injured, in, gunfire, that, occurr...",29.187199,-82.140092,Florida,2
2,"February 4, 2015","Frederick, Maryland",0,2,Two students were shot near the gymnasium of F...,"[Two, students, were, shot, near, the, gymnasi...",39.414219,-77.410927,Maryland,2


In [53]:
# Sum casualties per state; the result is a Series indexed by state name.
casualties_by_state = df.groupby('state')['deaths_and_injuries']
state_df = casualties_by_state.sum()

In [54]:
# First five per-state totals.
state_df.head(5)

state
Alabama        7
Arizona        6
California    44
Florida       38
Georgia        1
Name: deaths_and_injuries, dtype: int64

In [81]:
# Reverse the five-step OrRd palette so it runs from light to dark.
orrd_palette = bokeh.palettes.OrRd5
colors = orrd_palette[::-1]

In [82]:
# Display the reversed palette to confirm its order.
colors

['#fef0d9', '#fdcc8a', '#fc8d59', '#e34a33', '#b30000']

In [83]:
# Linear colour mapper over the reversed palette; no explicit low/high
# bounds are passed here.
color_mapper = bokeh.models.mappers.LinearColorMapper(palette=colors)

In [63]:
# Inspect which fields a single state record carries.
statesData['WA'].keys()

[u'region', u'lons', u'name', u'lats']

In [64]:
# Plain dict of state name -> summed deaths_and_injuries, for lookups.
state_dict = state_df.to_dict()

In [67]:
# State level information: parallel lists, one entry per state outline.
new_state_xs = []
new_state_ys = []
state_name = []
state_count = []

# Only the records are needed (the abbreviation key was unused), and
# .values() exists on both Python 2 and Python 3, unlike .iteritems().
for state in statesData.values():
    new_state_xs.append(state['lons'])
    new_state_ys.append(state['lats'])
    state_name.append(state['name'])
    # States with no recorded incidents get a count of 0.
    state_count.append(state_dict.get(state['name'], 0))

In [70]:
# Wrap the incident DataFrame as a Bokeh data source.
incident_data_source = bokeh.models.sources.ColumnDataSource(df)

In [76]:
# Columns for the state layer: outline coordinates plus the per-state
# casualty total that drives the fill colour.
state_columns = {
    'x': new_state_xs,
    'y': new_state_ys,
    'color': state_count,
}
state_data_source = bokeh.models.sources.ColumnDataSource(data=state_columns)

In [77]:
# Canvas for the map: fixed 800x500 figure with a descriptive title.
figure_options = dict(
    title="School shootings, 2015 - present",
    plot_width=800,
    plot_height=500,
)
plot = figure(**figure_options)

In [94]:
# State layer: one patch per state, filled by passing the 'color' column
# through the colour mapper, with thin white borders.
state_fill = {'field': 'color', 'transform': color_mapper}
plot.patches('x', 'y',
             source=state_data_source,
             color=state_fill,
             line_color="white",
             line_width=0.5)

# Incident layer, added after the patches: one circle per incident, placed
# at its coordinates and sized by its casualty total.
circle = bokeh.models.markers.Circle(
    x='longitude',
    y='latitude',
    size='deaths_and_injuries',
)
plot.add_glyph(incident_data_source, circle)

In [95]:
# Render the assembled figure.
show(plot)