# Scraping murder data

#### Author: Lydia Kim
#### Description: Scrapes murder totals, estimated totals, and rates per 100,000 inhabitants by metropolitan statistical area (MSA) for 2006-2016 from the FBI website
#### Output: raw data/murder_data.csv

In [6]:
import numpy as np
import scipy as sp
import pandas as pd
import time
import requests 
from bs4 import BeautifulSoup 
import json
requests.packages.urllib3.disable_warnings()

In [7]:
def getHTML(the_year, url):
    """Download one year's table page and cache its HTML in ``yearstext``.

    ``url`` is a ``str.format`` template; ``the_year`` is supplied twice as
    positional arguments so templates with two ``{}`` placeholders are
    filled with the same year. The response body is stored in the
    module-level ``yearstext`` dict under ``the_year``; nothing is returned.
    """
    target = url.format(the_year, the_year)
    print(the_year)

    # TLS certificate verification is switched off for this request.
    response = requests.get(target, timeout=20, verify=False)

    # Wait two seconds after every request (presumably to avoid hammering
    # the server - confirm before changing).
    time.sleep(2)

    yearstext[the_year] = response.text

In [8]:
# Raw HTML of each year's table page, keyed by year (int); filled by getHTML.
yearstext = dict()


def range1(start, end):
    """Return ``range(start, end + 1)``, i.e. a range that includes ``end``."""
    return range(start, end + 1)


# (first year, last year, URL template) for every span of years that shares
# one URL layout. getHTML passes the year to ``str.format`` twice; the
# 2006-2009 template has a single placeholder and the surplus positional
# argument is simply ignored, so every span can go through the same helper.
table_sources = [
    (2006, 2009, "https://www2.fbi.gov/ucr/cius{}/data/table_06.html"),
    (2010, 2011, "https://ucr.fbi.gov/crime-in-the-u.s/{}/crime-in-the-u.s.-{}/tables/table-6"),
    (2012, 2013, "https://ucr.fbi.gov/crime-in-the-u.s/{}/crime-in-the-u.s.-{}/tables/6tabledatadecpdf"),
    (2014, 2015, "https://ucr.fbi.gov/crime-in-the-u.s/{}/crime-in-the-u.s.-{}/tables/table-6"),
    (2016, 2016, "https://ucr.fbi.gov/crime-in-the-u.s/{}/crime-in-the-u.s.-{}/tables/table-4"),
]

# Download the pages in chronological order, one request per year.
for first_year, last_year, template in table_sources:
    for r in range1(first_year, last_year):
        getHTML(r, template)

2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016


In [9]:
def parse_year(the_year):
    """Extract per-MSA figures for one year from the cached table HTML.

    Reads the page stored under ``the_year`` in the module-level
    ``yearstext`` dict and uses the first ``<table class="data">`` in it.
    Every ``<th>`` whose ``rowspan`` is greater than 2 is treated as the
    name cell of one MSA; for each row it spans (after the row holding the
    name cell itself) the text of the first cell is used as a label and the
    text of the fourth cell as its value (presumably the murder column -
    confirm against the table layout).

    Returns a list with one dict per MSA, with keys ``'MSA'``, ``'Total'``,
    ``'Rate'``, ``'Estimated'`` and ``'Year'``. Values are the raw cell
    text. ``'Total'`` and ``'Rate'`` are the string ``'NaN'`` when the
    corresponding row is absent; ``'Estimated'`` falls back to the
    ``'Total'`` value when there is no 'Estimated total' row.

    Raises ``IndexError`` if the page contains no ``<table class="data">``
    or if a spanned row has fewer than four cells.
    """
    # Default to empty markup (a string) when the year was never downloaded.
    soup = BeautifulSoup(yearstext.get(the_year, ""), 'html.parser')

    # Extract table
    tables = soup.find_all("table", attrs={"class": "data"})
    if not tables:
        raise IndexError(
            'no <table class="data"> found for year {}'.format(the_year)
        )
    # The first <tr> is skipped (presumably the column-header row).
    rows = tables[0].find_all("tr")[1:]

    # Collect one {'MSA': name, label: value, ...} dict per MSA.
    dataset = []
    for no, tr in enumerate(rows):
        for header in tr.find_all('th'):
            if not header.has_attr("rowspan"):
                continue
            spanrows = int(header["rowspan"])
            if spanrows <= 2:
                continue

            # Keep only the text before the 'M.S.A.' suffix.
            obs = {'MSA': header.get_text().split('M.S.A.', 1)[0].replace('\n', '')}

            # Rows no+1 .. no+spanrows-1 belong to this MSA. Slicing means a
            # rowspan that runs past the end of the table is truncated.
            for row in rows[no + 1:no + spanrows]:
                cells = row.find_all(["th", "td"])
                label = cells[0].get_text().replace('\n', '')
                obs[label] = cells[3].get_text().replace('\n', '')

            dataset.append(obs)

    # Reduce each MSA to the figures of interest.
    newdata = []
    for x in dataset:
        total = x.get('Total area actually reporting', 'NaN')
        newdata.append({
            'MSA': x['MSA'],
            'Total': total,
            'Rate': x.get('Rate per 100,000 inhabitants', 'NaN'),
            # Use the reported total when no estimate is published; this is
            # 'NaN' (not a KeyError) when that row is missing as well.
            'Estimated': x.get('Estimated total', total),
            'Year': the_year,
        })

    return newdata

In [10]:
# Parse every downloaded year, 2006 through 2016, into its list of
# per-MSA records, keyed by year.
yearinfo = {year: parse_year(year) for year in range(2006, 2017)}

In [11]:
# Save the parsed records to disk. The context manager closes the file even
# if serialisation fails. JSON object keys are always strings, so the
# integer year keys become strings in the file.
with open("data/yearinfo.json", "w") as fd:
    json.dump(yearinfo, fd)

# Drop the in-memory copy; it is read back from the file afterwards.
del yearinfo

In [12]:
# Read the saved per-year records back from disk.
with open("data/yearinfo.json") as saved:
    yearinfo = json.loads(saved.read())
    

In [13]:
# Build one DataFrame per year and stack them into a single table.
yearly_frames = [
    pd.DataFrame.from_dict(records, orient='columns')
    for records in yearinfo.values()
]
combined = pd.concat(yearly_frames, ignore_index=True)

# Order rows by MSA then year, renumber the index, and fix the column order.
df = combined.sort_values(['MSA', 'Year']).reset_index(drop=True)
df = df[['MSA', 'Year', 'Total', 'Estimated', 'Rate']]
df

Unnamed: 0,MSA,Year,Total,Estimated,Rate
0,"Abilene, TX",2006,6,6,3.7
1,"Abilene, TX",2007,10,10,6.3
2,"Abilene, TX",2008,7,7,4.4
3,"Abilene, TX",2009,7,7,4.4
4,"Abilene, TX",2010,5,5,3.1
5,"Abilene, TX",2011,5,5,3.0
6,"Abilene, TX",2012,3,3,1.8
7,"Abilene, TX",2013,1,1,0.6
8,"Abilene, TX",2014,10,10,5.9
9,"Abilene, TX",2015,9,9,5.3


In [14]:
# Write the combined table to disk. No `index` argument is given, so pandas'
# default applies and the row index is written as an unnamed first column.
df.to_csv('raw data/murder_data.csv')