# Scraping Mass_Shooting_2017 Data

In [31]:
import os
import gzip
import json
import lxml.html
import dateutil
import numpy as np
import time
import requests


import pandas as pd
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

In [32]:
response = requests.get('http://www.gunviolencearchive.org/reports/mass-shooting',
    headers = {'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12) AppleWebKit/602.1.43 (KHTML, like Gecko) Version/10.0 Safari/602.1.43'})

In [33]:
response

<Response [200]>

In [34]:
response.status_code

200

In [35]:
response.headers['content-type']

'text/html; charset=utf-8'

In [36]:
response.encoding

'utf-8'

In [37]:
response.content

'<!DOCTYPE html>\n<html xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/terms/" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:og="http://ogp.me/ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:sioc="http://rdfs.org/sioc/ns#" xmlns:sioct="http://rdfs.org/sioc/types#" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#">\n<head profile="http://www.w3.org/1999/xhtml/vocab">\n<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n<link rel="shortcut icon" href="http://www.gunviolencearchive.org/sites/default/files/favicon.png" type="image/png"/>\n<meta name="Generator" content="Drupal 7 (http://drupal.org)"/>\n<meta property="og:description" content="Gun Violence Archive (GVA) is a not for profit corporation formed in 2013 to provide free online public access to accurate informat

In [38]:
reviews = {}

In [39]:
def scrape_page(page_number):
    return requests.get('http://www.gunviolencearchive.org/reports/mass-shooting',
                        headers = {'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12) AppleWebKit/602.1.43 (KHTML, like Gecko) Version/10.0 Safari/602.1.43'},
                        params = {'page': page_number})

In [40]:
def scrape_reviews():
    for page_number in range(1, 10):
        if (page_number in reviews) and (reviews[page_number]['status_code'] == 200):
            continue

        page = scrape_page(page_number)

        print 'page {}: {}'.format(page_number, page.status_code)

        reviews[page_number] = {
            'status_code': page.status_code,
            'content': page.content,
        }

        # Wait for a random interval between page requests (exponential distribution)
        time.sleep(np.random.exponential(10))

In [41]:
scrape_reviews()

page 1: 200
page 2: 200
page 3: 200
page 4: 200
page 5: 200
page 6: 200
page 7: 200
page 8: 200
page 9: 200


In [42]:
for page_number in reviews:
    if reviews[page_number]['status_code'] == 200:
        continue

    print '{}: {}'.format(page_number, reviews[page_number]['status_code'])

In [43]:
with gzip.open(os.path.join('Mass_Shooting_2017.json.gz'), 'wb') as f:
        f.write(json.dumps(reviews, ensure_ascii = False, indent = 4, sort_keys = True))

# Reading Scrapped Pages

In [44]:
with gzip.open(os.path.join('Mass_Shooting_2017.json.gz'), 'rb') as f:
    pages = json.loads(f.read())

In [45]:
len(pages)

9

In [46]:
page = pages['1']['content']

In [47]:
page

u'<!DOCTYPE html>\n<html xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dc="http://purl.org/dc/terms/" xmlns:foaf="http://xmlns.com/foaf/0.1/" xmlns:og="http://ogp.me/ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:sioc="http://rdfs.org/sioc/ns#" xmlns:sioct="http://rdfs.org/sioc/types#" xmlns:skos="http://www.w3.org/2004/02/skos/core#" xmlns:xsd="http://www.w3.org/2001/XMLSchema#">\n<head profile="http://www.w3.org/1999/xhtml/vocab">\n<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no">\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n<link rel="shortcut icon" href="http://www.gunviolencearchive.org/sites/default/files/favicon.png" type="image/png"/>\n<meta name="Generator" content="Drupal 7 (http://drupal.org)"/>\n<meta property="og:description" content="Gun Violence Archive (GVA) is a not for profit corporation formed in 2013 to provide free online public access to accurate informa

In [48]:
document = lxml.html.fromstring(page)

In [49]:
type(document)

lxml.html.HtmlElement

In [50]:
reviews_class = document.xpath('//tr[@class="Odd"]')

In [51]:
reviews_class

[]

In [52]:
reviews_row = document.xpath('//tr[@td]')

In [53]:
reviews_col = document.xpath('//td[@style]')
reviews_col

[]

In [12]:
reviews_table = document.xpath('//*[@id="block-system-main"]/section/div')

In [13]:
//*[@id="block-system-main"]/section/div/div/div[1]/table

SyntaxError: invalid syntax (<ipython-input-13-02120bfafded>, line 1)

In [15]:
reviews_table = document.xpath('//*[@column="Base.IncidentDate"]/div')

In [16]:
reviews_table

[]

In [20]:
import lxml.html as LH

In [30]:
root = LH.fromstring(response.content)

for table in root.xpath('//table[@class="sticky-header"]'):
    header = [text(th) for th in table.xpath('//th')]        # 1
    data = [[text(td) for td in tr.xpath('td')]  
            for tr in table.xpath('//tr')]                   # 2
    data = [row for row in data if len(row)==len(header)]    # 3 
    data = pd.DataFrame(data, columns=header)                # 4
    print(data)

## process scraped data with beautisoup
https://chrisalbon.com/python/beautiful_soup_scrape_table.html

In [57]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [58]:
# Create a variable with the URL
url = 'http://www.gunviolencearchive.org/reports/mass-shooting'

# Scrape the HTML at the url
r = requests.get(url)

# Turn the HTML into a Beautiful Soup object
soup = BeautifulSoup(r.text, 'lxml')


In [59]:
Date = []
State = []
City = []
Address = []
Number_Killed = []
Number_Injured = []
Operation = []

In [60]:

# Create an object of the first object that is class=dataframe
table = soup.find(class_='table-wrapper')

# Find all the <tr> tag pairs, skip the first one, then for each.
for row in table.find_all('tr')[1:]:
    # Create a variable of all the <td> tag pairs in each <tr> tag pair,
    col = row.find_all('td')

    # Create a variable of the string inside 1st <td> tag pair,
    column_1 = col[0].string.strip()
    # and append it to first_name variable
    Date.append(column_1)

    # Create a variable of the string inside 2nd <td> tag pair,
    column_2 = col[1].string.strip()
    # and append it to last_name variable
    Sate.append(column_2)

    # Create a variable of the string inside 3rd <td> tag pair,
    column_3 = col[2].string.strip()
    # and append it to age variable
    City.append(column_3)

    # Create a variable of the string inside 4th <td> tag pair,
    column_4 = col[3].string.strip()
    # and append it to preTestScore variable
    Address.append(column_4)

    # Create a variable of the string inside 5th <td> tag pair,
    column_5 = col[4].string.strip()
    # and append it to postTestScore variable
    Number_Killed.append(column_5)

    
    # Create a variable of the string inside 6th <td> tag pair,
    column_5 = col[4].string.strip()
    # and append it to postTestScore variable
    Number_Injured.append(column_6)
    
    
    # Create a variable of the string inside 7th <td> tag pair,
    column_5 = col[4].string.strip()
    # and append it to postTestScore variable
    Operations.append(column_7)
    
    
# Create a variable of the value of the columns
columns = {'Incident Date': Date, 'State': State, 'City or County': City, 'Address': Address, '# Killed': Number_Killed, '# Injured': Number_Injured, 'Operations': Operation}

# Create a dataframe from the columns variable
df = pd.DataFrame(columns)

AttributeError: 'NoneType' object has no attribute 'find_all'