diff --git a/scraper.py b/scraper.py
index 5b5c0cd..b8ed75f 100644
--- a/scraper.py
+++ b/scraper.py
@@ -9,54 +9,57 @@
 #html = scraperwiki.scrape("https://reports.ofsted.gov.uk/provider/files/2631211/urn/103980.pdf")
 url = "https://reports.ofsted.gov.uk/provider/files/2631211/urn/103980.pdf"
-#This function will contain all the lines below, later
-#def scrapepdf(url):
-
-pdfdata = urllib2.urlopen(url).read()
-print "The pdf file has %d bytes" % len(pdfdata)
-
-xmldata = scraperwiki.pdftoxml(pdfdata)
-print "After converting to xml it has %d bytes" % len(xmldata)
-print "The first 2000 characters are: ", xmldata[:2000]
-
-
-# turn 'xmldata' into an lxml object called 'pdfroot'
-pdfroot = lxml.etree.fromstring(xmldata)
-#find all tags and put in list variable 'lines'
-lines = pdfroot.findall('.//text')
-# create new 'linenummber' variable, set at 0
-linenumber = 0
-# create empty dictionary object which we'll fill with data as we go, then store
-record = {}
-
-#school name is in
-#We try to identify lines with font="4"
-schoolname = pdfroot.findall('.//text[@font="4"]')
-for name in schoolname:
-    #This line tests how many matches we get
-    print 'SCHOOL NAME? ', name.text.encode('ascii', 'ignore')
-#There's only one when tested, so let's store the first and only match
-record['schoolname'] = schoolname[0].text.encode('ascii', 'ignore')
-
-#Now the date, which is in tags and put in list variable 'lines'
+    lines = pdfroot.findall('.//text')
+    # create new 'linenummber' variable, set at 0
+    linenumber = 0
+    # create empty dictionary object which we'll fill with data as we go, then store
+    record = {}
+
+    #school name is in
+    #We try to identify lines with font="4"
+    schoolname = pdfroot.findall('.//text[@font="4"]')
+    for name in schoolname:
+        #This line tests how many matches we get
+        print 'SCHOOL NAME? ', name.text.encode('ascii', 'ignore')
+    #There's only one when tested, so let's store the first and only match
+    record['schoolname'] = schoolname[0].text.encode('ascii', 'ignore')
+
+    #Now the date, which is in