scraper.py
# Helping https://scraperwiki.com/scrapers/gordo
# Currently this scrapes the div id="vessel_details_general" on each page
# Needs to be refined so that you're grabbing the <b> and <br/> tags within that - or a regex?
# Also needs simplifying/renaming of variables/comments etc.
# If you want to understand this scraper, start at the bottom where it says 'base_url'
import scraperwiki
#import urlparse
import lxml.html
# Create a function called 'scrape_table', which is called in the function 'scrape_page' below
# The 'scrape_page' function passes the contents of the page to this function as 'root',
# along with the vessel's ID number as 'ship_id'
def scrape_table(root, ship_id):
    # Use cssselect to find the contents of a particular HTML tag, and put the results in a new object, 'rows'
    # The hash in "div#vessel_details_general" means we want the <div> whose id is "vessel_details_general"
    rows = root.cssselect("div#vessel_details_general")
    for row in rows:
        # Create a new empty record
        record = {}
        # Put all the text inside that <div> into the column 'FSM'
        record['FSM'] = row.text_content()
        # Store the ID number that was passed in from the for loop at the bottom
        record['ID'] = ship_id
        print record, '------------'
        # Save in the SQLite database, with the ID code used as the unique key
        scraperwiki.sqlite.save(["ID"], record)
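
# A possible refinement for the TODO at the top - grabbing the <b> tags inside the
# div rather than one blob of text. This is only a sketch and hasn't been tested
# against the live page; it assumes each label sits in its own <b> tag:
# def scrape_labels(row):
#     for label in row.cssselect("b"):
#         print label.text_content()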
# This creates a new function and (re)names whatever parameters are passed to it -
# i.e. 'next_link' and 'item' below - as 'url' and 'ship_id'
def scrape_page(url, ship_id):
    # Now 'url' is scraped with the scraperwiki library imported above, and the contents put into a new object, 'html'
    html = scraperwiki.scrape(url)
    print html
    # Now we use the lxml.html function imported above to convert 'html' into a new object, 'root'
    root = lxml.html.fromstring(html)
    # Now we call another function on 'root', which we wrote above
    scrape_table(root, ship_id)
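
# If some ship pages fail to load, scraperwiki.scrape will raise an exception and
# stop the whole run. A sketch of one way to carry on past broken IDs instead
# (an assumption about the behaviour you want, so it is left commented out):
# def scrape_page_safely(url, ship_id):
#     try:
#         scrape_page(url, ship_id)
#     except Exception, e:
#         print 'Could not scrape', url, '-', e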
# START HERE: This is the part of the URL which all our pages share
base_url = 'http://www.marinetraffic.com/en/ais/details/ships/shipID:'
# And these are the numbers which we need to complete that URL to make each individual URL
# This list was compiled using the =JOIN formula in Google Docs on a column of mmsi numbers
shipIDs = ['408044']
# Go through the shipIDs list above, and for each ID...
for item in shipIDs:
    # ...show it in the console
    print item
    # Create a URL called 'next_link' by adding that ID to the end of the base_url variable
    next_link = base_url + item
    # Pass that new concatenated URL (and the ID) to the function 'scrape_page', which is scripted above
    scrape_page(next_link, item)
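
# To check what has been saved: ScraperWiki's classic library keeps records in a
# table called 'swdata' by default (an assumption here - adjust the table name if
# your datastore differs):
# print scraperwiki.sqlite.select("* from swdata")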