Commit: copied across code from book
paulbradshaw committed Jan 17, 2017
1 parent 30be534 commit 13c4376
Showing 1 changed file with 124 additions and 22 deletions.
scraper.py (146 changes: 124 additions & 22 deletions)
@@ -1,24 +1,126 @@
# This is a template for a Python scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful
#import the libraries we'll need
import scraperwiki
import urllib2
import lxml.etree
import lxml.html

# import scraperwiki
# import lxml.html
#
# # Read in a page
# html = scraperwiki.scrape("http://foo.com")
#
# # Find something on the page using css selectors
# root = lxml.html.fromstring(html)
# root.cssselect("div[align='left']")
#
# # Write out to the sqlite database using scraperwiki library
# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
#
# # An arbitrary query against the database
# scraperwiki.sql.select("* from data where 'name'='peter'")
#HTML we need to grab from: <table summary="Corporate level publications: What our priorities are and how we are doing: Knife Crime Summaries" class="foidocuments">
#This creates a new function to find the part of the page we want, scrape bits, and follow links in it.
def scrapetable(root):
    #create an empty variable 'record', which is a dictionary
    record = {}
    #create another variable, 'uniqueid', set to zero, which will be added to later on.
    uniqueid = 0
    #Grab any bits of 'root' (passed into 'scrapetable' as a parameter above)...
    #...that sit inside a <table> tag whose summary="..." attribute contains 'Knife'...
    #...specifically the contents of each <tr> row within it.
    #Put the results in a variable called 'rows'
    rows = root.xpath(".//table[contains(@summary, 'Knife')]//tr")

# You don't have to do things with the ScraperWiki and lxml libraries.
# You can use whatever libraries you want: https://morph.io/documentation/python
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
    #That will be a list, so we start a for loop to go through each item, calling it 'row'
    for row in rows:
        #show us the text content of that item
        print row.text_content()
        #now grab all the <td><a ...> tags within that 'row' object
        #and put them in the variable 'report'
        report = row.cssselect("td a")

        #if that list is not empty...
        if report:
            #get the value of the first (index 0) link's 'href=' attribute
            #and put it in the 'pdfurl' variable
            pdfurl = report[0].attrib.get('href')
            #store the value of the first link's 'title=' and 'href=' attributes
            #under the labels 'Date' and 'URL' in the 'record' dictionary
            record["Date"] = report[0].attrib.get('title')
            record["URL"] = report[0].attrib.get('href')

            #if the 'pdfurl' variable was indeed created...
            if pdfurl:
                #Start running a PDF scraper on that,
                #firstly 'opening' the PDF URL with the urllib2 library's urlopen function
                #and 'reading' the PDF into a variable called 'pdfdata'...
                pdfdata = urllib2.urlopen(baseurl + pdfurl).read()
                #...then using pdftoxml to convert that into a variable called 'xmldata'
                xmldata = scraperwiki.pdftoxml(pdfdata)
                #...then using .fromstring to convert that into a variable called 'pdfxml'
                pdfxml = lxml.etree.fromstring(xmldata)
                print xmldata
                #Use .xpath again to find <text ... top="191"> tags,
                #and <b> tags within those
                boldtags1 = pdfxml.xpath('.//text[contains(@top, "191")]//b')
                #Then store the first [0] result's text in 'Date2'
                record["Date2"] = boldtags1[0].text
                boldtags = pdfxml.xpath('.//text[contains(@top, "386")]//b')
                #This is the kind of tag that the line above is looking for:
                #<text top="386" left="464" width="75" height="21" font="0"><b>04/09/2012</b></text>
                #Then store the second [1] result's text in 'Review Date'
                record["Review Date"] = boldtags[1].text

                print record
                #Now we grab all the <text ...> tags,
                #and in the next line loop through them
                texttags = pdfxml.xpath('.//text')
                for text in texttags:
                    left = text.attrib.get('left')
                    #convert the attribute from a string into an integer:
                    leftinteger = int(left)
                    #If leftinteger is between 96 and 99...
                    #see other options at http://stackoverflow.com/questions/618093/how-to-find-whether-a-number-belongs-to-a-particular-range-in-python
                    #Literally: if 96 is smaller than leftinteger, AND leftinteger is smaller than 99:
                    if 96 < leftinteger < 99:
                        #Record the text of 'text' (sorry)
                        record["BOCUname"] = text.text
                        print record
                    #All the 'if' tests from here on do similar things:
                    #store data under a particular label based on its position on the page
                    if 324 < leftinteger < 327:
                        record["Offences"] = text.text
                        print record
                    if 405 < leftinteger < 408:
                        record["Sanction_detentions"] = text.text
                        print record
                    if 481 < leftinteger < 484:
                        record["Sanction_detention_rate"] = text.text
                        print record
                    if 587 < leftinteger < 590:
                        record["Offences FYTD_2011_12"] = text.text
                        print record
                    if 661 < leftinteger < 664:
                        record["Offences FYTD_2012_13"] = text.text
                        print record
                    if 713 < leftinteger < 716:
                        record["Offences percentage change"] = text.text
                        print record
                    if 812 < leftinteger < 815:
                        record["Sanction detections FYTD 2011_12"] = text.text
                    if 887 < leftinteger < 890:
                        record["Sanction Detections FYTD_2012_13"] = text.text
                    if 943 < leftinteger < 946:
                        record["Sanction Detection rate FYTD 2011_12"] = text.text
                    if 1021 < leftinteger < 1024:
                        record["Sanction Detections rate FYTD 2012_13"] = text.text
                #once the whole PDF has been looped through, add 1 to the unique ID,
                #store it in the record, and save the record to the database
                uniqueid = uniqueid + 1
                record["uniqueid"] = uniqueid
                print record
                scraperwiki.sqlite.save(["uniqueid", "Date"], record)
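#A note on the save call above, not part of the original scraper: the first argument
#to scraperwiki.sqlite.save is the list of unique key columns, so a record whose
#'uniqueid' and 'Date' match an already-saved row updates that row rather than adding
#a duplicate. Written with the keyword arguments used in the template snippet near the
#top of this file, the same call would look like the sketch below (the data ends up in
#the 'data' table of data.sqlite, which is what morph.io reads, per those comments):
#scraperwiki.sqlite.save(unique_keys=["uniqueid", "Date"], data=record)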

#This creates a new function to scrape the initial page so we can grab report titles and the links
def scrape_and_look_for_next_link(url):
    #scrapes the page and puts it in 'html'
    html = scraperwiki.scrape(url)
    print html
    #turns html from a string into an lxml object called 'root'
    root = lxml.html.fromstring(html)
    #runs another function - created earlier - on 'root'
    scrapetable(root)
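    #The function name mentions looking for a next link, but as written it only scrapes
    #the single page passed to it. If later pages were linked from a 'Next' anchor,
    #something like the commented sketch below could follow the scrapetable call.
    #This is an assumption about the page, not part of the original scraper: the link
    #text 'Next' and the recursive call are illustrative only.
    #for link in root.cssselect("a"):
    #    if link.text_content().strip() == "Next":
    #        scrape_and_look_for_next_link(baseurl + link.attrib.get('href'))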

#This will be used for relative links in later pages
baseurl = "http://www.met.police.uk/foi/"

#When added to the baseurl, this is our starting page
startingurl = "c_priorities_and_how_we_are_doing.htm"

#Run the function created above on that URL
scrape_and_look_for_next_link(baseurl+startingurl)
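#Once the scrape has run, the saved rows can be checked with an arbitrary query, as in
#the template snippet near the top of this file. A sketch only, assuming the rows have
#been saved to the default 'data' table in data.sqlite:
#for saved_row in scraperwiki.sql.select("* from data order by uniqueid"):
#    print saved_row["Date"], saved_row.get("Offences")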
