Skip to content


Fork of code from ScraperWiki at…
Browse files Browse the repository at this point in the history
  • Loading branch information
paulbradshaw committed Apr 8, 2017
0 parents commit 3518049
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
# Ignore output of scraper
106 changes: 106 additions & 0 deletions
@@ -0,0 +1,106 @@
#This grabs all links to Excel spreadsheets from one page, then scrapes each one
#To test without taking too much time, the ranges are limited to a few spreadsheets, a few sheets, and a few rows

#Key points to remember: there need to be two unique keys for each record: the SHA and the URL
#Also, keys with slashes - / - need to be cleaned with .replace before being used


import scraperwiki
import xlrd
import lxml.html

import datetime

def cellval(cell, datemode):
    """Convert an xlrd cell into a plain Python value.

    cell     -- an xlrd cell object; its ``ctype`` and ``value`` are read
    datemode -- date mode passed straight through to ``xlrd.xldate_as_tuple``

    Returns:
      * ``datetime.time`` for a date cell with no date part (time-only value)
      * ``datetime.date`` for a date cell whose time part is 00:00:00
      * ``datetime.datetime`` for any other date cell
      * ``None`` for an empty cell
      * ``True``/``False`` for a boolean cell
      * ``cell.value`` unchanged for every other cell type
    """
    if cell.ctype == xlrd.XL_CELL_DATE:
        datetuple = xlrd.xldate_as_tuple(cell.value, datemode)
        # xlrd reports time-only values as (0, 0, 0, h, m, s); year 0 is not
        # a valid date/datetime, so hand those back as a plain time instead
        if datetuple[:3] == (0, 0, 0):
            return datetime.time(*datetuple[3:])
        # midnight exactly: treat as a date with no time component
        if datetuple[3:] == (0, 0, 0):
            return datetime.date(*datetuple[:3])
        return datetime.datetime(*datetuple)
    if cell.ctype == xlrd.XL_CELL_EMPTY:
        return None
    if cell.ctype == xlrd.XL_CELL_BOOLEAN:
        # boolean cells hold 1 or 0 in cell.value
        return cell.value == 1
    return cell.value

#presumably the address of the page that lists the spreadsheet links
#NOTE(review): this is an empty string here and no visible code passes it to grabexcellinks() - fill in and confirm
URL = ''
#set a variable for the spreadsheet location
#NOTE(review): also an empty string here; scrapespreadsheet() receives its own XLS argument, so this module-level value looks unused - confirm
XLS = ''
#use the scrape function on that spreadsheet to create a new variable

def _clean_key(value):
    """Turn a header cell value into text usable as a record key."""
    # NOTE(review): the exact cleaning rule is not visible in this file; the
    # header comments only say slashes must be removed with .replace and the
    # commented-out code strips apostrophes - confirm against the real data
    return (u"%s" % (value,)).replace("/", "").replace("'", "")


def scrapespreadsheet(XLS):
    """Download the spreadsheet at XLS and save a sample of its rows.

    XLS -- location of the Excel file, handed to ``scraperwiki.scrape``

    For each sheet in the (deliberately limited) test range, the 15th row
    supplies the column headings and each following row in the test range is
    saved through ``scraperwiki.sqlite.save`` with the third heading and
    'URL' as the unique keys. Progress is printed along the way.
    Returns None.
    """
    #use the scrape function on that spreadsheet, then open the result as a workbook
    xlbin = scraperwiki.scrape(XLS)
    book = xlrd.open_workbook(file_contents=xlbin)

    #the .nsheets attribute tells us how many sheets 'book' has
    print("nsheets result: %s" % book.nsheets)
    #test limit: only the fourth sheet (index 3); for a full run use range(0, book.nsheets)
    #the upper bound is clamped so a workbook with fewer sheets is skipped rather than crashing
    sheetsrange = range(3, min(4, book.nsheets))
    print("sheetsrange: %s" % (list(sheetsrange),))

    for sheetnum in sheetsrange:
        print("scraping sheet %s" % sheetnum)
        sheet = book.sheet_by_index(sheetnum)
        #the code below reads the 2nd row (title), the 15th row (headings) and the third column
        if sheet.nrows < 15 or sheet.ncols < 3:
            print("skipping sheet %s: too few rows or columns" % sheetnum)
            continue
        #grab the second row of 'sheet'; its third cell is used as the title
        title = sheet.row_values(1)
        print("Title: %s" % (title[2],))
        #put cleaned cells from the 15th row into 'keys'
        keys = [_clean_key(cell.value) for cell in sheet.row(14)]
        print("keys: %s" % (keys,))
        print("keys[2] %s" % (keys[2],))
        #test limit: rows 16-25 only; for a full run use range(15, sheet.nrows)
        for rownumber in range(15, min(25, sheet.nrows)):
            print("Scraping row %s" % rownumber)
            #read the row once instead of once per column
            rowvalues = sheet.row_values(rownumber)
            #fresh record per row so nothing leaks over from the previous one; the first column is skipped
            record = dict(zip(keys[1:], rowvalues[1:]))
            record['title'] = title[2]
            record['URL'] = str(XLS)
            print("RECORD SO FAR: %s" % (record,))
            #two unique keys per record: the third heading and the URL; one table per sheet index
            scraperwiki.sqlite.save([keys[2], 'URL'], record, table_name=sheetnum)

def grabexcellinks(URL):
    """Fetch the page at URL and scrape the spreadsheets it links to.

    URL -- address of the page, handed to ``scraperwiki.scrape``

    Prints the page, then for a limited test slice of the <a> tags found
    inside <p> tags prints the link text and href and passes the href to
    ``scrapespreadsheet``. Links without an href are skipped. Returns None.
    """
    #Use Scraperwiki's scrape function on 'URL', put results in new variable 'html'
    html = scraperwiki.scrape(URL)
    #and show it us
    print(html)
    #Use lxml.html's fromstring function on 'html', put results in new variable 'root'
    root = lxml.html.fromstring(html)
    #use cssselect method on 'root' to grab all <a> tags within a <p> tag
    links = root.cssselect('p a')
    #test limit: only the second to fourth links [1:4]
    for link in links[1:4]:
        print("link text: %s" % (link.text_content(),))
        #grab the href= attribute of the HTML; this is None when the tag has no href
        linkurl = link.attrib.get('href')
        print(linkurl)
        if not linkurl:
            #nothing to download for an anchor without an address
            continue
        #run the function scrapespreadsheet, using that variable as the parameter
        scrapespreadsheet(linkurl)



0 comments on commit 3518049

Please sign in to comment.