Skip to content
This repository has been archived by the owner on Jun 12, 2019. It is now read-only.

Commit

Permalink
Merge pull request #2 from emikulic/master
Browse files Browse the repository at this point in the history
Repair the Scenic Rim scraper.
  • Loading branch information
mlandauer committed Jun 17, 2015
2 parents 67965a5 + 1f6ca36 commit 8801c59
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 69 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "lib_icon_rest_xml"]
path = lib_icon_rest_xml
url = https://github.com/planningalerts-scrapers/lib_icon_rest_xml/
4 changes: 4 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
source 'https://rubygems.org'

gem 'scraperwiki', git: 'https://github.com/openaustralia/scraperwiki-ruby.git', branch: 'morph_defaults'
gem 'mechanize'
47 changes: 47 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
GIT
remote: https://github.com/openaustralia/scraperwiki-ruby.git
revision: fc50176812505e463077d5c673d504a6a234aa78
branch: morph_defaults
specs:
scraperwiki (3.0.1)
httpclient
sqlite_magic

GEM
remote: https://rubygems.org/
specs:
domain_name (0.5.24)
unf (>= 0.0.5, < 1.0.0)
http-cookie (1.0.2)
domain_name (~> 0.5)
httpclient (2.6.0.1)
mechanize (2.7.3)
domain_name (~> 0.5, >= 0.5.1)
http-cookie (~> 1.0)
mime-types (~> 2.0)
net-http-digest_auth (~> 1.1, >= 1.1.1)
net-http-persistent (~> 2.5, >= 2.5.2)
nokogiri (~> 1.4)
ntlm-http (~> 0.1, >= 0.1.1)
webrobots (>= 0.0.9, < 0.2)
mime-types (2.6.1)
mini_portile (0.6.2)
net-http-digest_auth (1.4)
net-http-persistent (2.9.4)
nokogiri (1.6.6.2)
mini_portile (~> 0.6.0)
ntlm-http (0.1.1)
sqlite3 (1.3.10)
sqlite_magic (0.0.5)
sqlite3
unf (0.1.4)
unf_ext
unf_ext (0.0.7.1)
webrobots (0.1.1)

PLATFORMS
ruby

DEPENDENCIES
mechanize
scraperwiki!
1 change: 1 addition & 0 deletions lib_icon_rest_xml
Submodule lib_icon_rest_xml added at 295449
82 changes: 13 additions & 69 deletions scraper.rb
Original file line number Diff line number Diff line change
@@ -1,70 +1,14 @@
require 'scraperwiki'
require 'mechanize'

# Search URL listing this week's development applications across all
# application types (the long 4a= parameter enumerates the type codes
# the PDOnline ASP.NET app expects).
starting_url = "http://pdonline.scenicrim.qld.gov.au/Modules/Applicationmaster/default.aspx?page=found&1=thisweek&4a=COM.Bd',%20'COM.Bn',%20'COM.Ip','pbeDevComp','DevEnf','MC.Bd1',%20'MC.Bd2',%20'MC.Bn',%20'MC.I','RL.Bd1','RL.Bd2','RL.Bn','RL.IP','OW.Bd1','OW.Bd2','OW.Bn','OW.Ip','Subdiv&6=F"
# Council takes comments by email only; stored verbatim on every record.
comment_url = 'mailto:mail@scenicrim.qld.gov.au'

# Normalise whitespace: CR and LF become spaces, runs of spaces collapse
# to one, and leading/trailing whitespace is trimmed.
def clean_whitespace(text)
  text.tr("\r\n", '  ').squeeze(' ').strip
end

# Scrape one page of search results: for each application row, fetch the
# detail page and save a record into the ScraperWiki sqlite store,
# skipping rows whose council_reference has already been saved.
#
# doc         - Mechanize page holding the results table
# comment_url - value stored in each record's 'comment_url' column
def scrape_table(doc, comment_url)
doc.search('table tbody tr').each do |tr|
# Columns in table
# Show Number Submitted Details
tds = tr.search('td')

# Yes, this is "where no records"[sic]
break if tds[0].inner_text =~ /There where no records/

h = tds.map{|td| td.inner_html}

# Detail link in the first column is relative to the Applicationmaster module.
info_url = 'http://pdonline.scenicrim.qld.gov.au/Modules/Applicationmaster/' + tds[0].at('a')['href'].strip
info_page = @agent.get(info_url)

record = {
'info_url' => info_url,
'comment_url' => comment_url,
'council_reference' => clean_whitespace(h[1]),
'date_received' => Date.strptime(clean_whitespace(h[2]), '%d/%m/%Y').to_s,
# TODO: Some DAs have multiple addresses, we're just getting the first :(
'address' => clean_whitespace(info_page.at('div#lblProp').at('a').inner_text.strip) + ", QLD",
'description' => CGI::unescapeHTML(info_page.at('div#lblDetails').inner_html.split('<br>')[0].split('Description: ')[1].strip),
'date_scraped' => Date.today.to_s
}
# NOTE(review): council_reference is interpolated straight into the SQL
# string — acceptable for trusted council data, but a quote in the
# reference would break the query; parameterise if the API allows.
if ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty?
ScraperWiki.save_sqlite(['council_reference'], record)
else
puts "Skipping already saved record " + record['council_reference']
end
end
end

# Scrape the current results page, then recurse through the paginator's
# "next" button until it is absent or disabled (onclick "return false").
#
# doc         - Mechanize page with the results grid
# comment_url - passed through to scrape_table for each record
def scrape_and_follow_next_link(doc, comment_url)
scrape_table(doc, comment_url)
nextButton = doc.at('.rgPageNext')
unless nextButton.nil? || nextButton['onclick'] =~ /return false/
form = doc.forms.first

# The joy of dealing with ASP.NET
# Simulate the JavaScript __doPostBack() call the button would make.
form['__EVENTTARGET'] = nextButton['name']
form['__EVENTARGUMENT'] = ''
# It doesn't seem to work without these stupid values being set.
# Would be good to figure out where precisely in the javascript these values are coming from.
# NOTE(review): these look like Telerik RadScriptManager state captured
# from a live session — likely to break if the site upgrades Telerik.
form['ctl00%24RadScriptManager1']=
'ctl00%24cphContent%24ctl00%24ctl00%24cphContent%24ctl00%24Radajaxpanel2Panel%7Cctl00%24cphContent%24ctl00%24ctl00%24RadGrid1%24ctl00%24ctl03%24ctl01%24ctl10'
form['ctl00_RadScriptManager1_HiddenField']=
'%3B%3BSystem.Web.Extensions%2C%20Version%3D3.5.0.0%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D31bf3856ad364e35%3Aen-US%3A0d787d5c-3903-4814-ad72-296cea810318%3Aea597d4b%3Ab25378d2%3BTelerik.Web.UI%2C%20Version%3D2009.1.527.35%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D121fae78165ba3d4%3Aen-US%3A1e3fef00-f492-4ed8-96ce-6371bc241e1c%3A16e4e7cd%3Af7645509%3A24ee1bba%3Ae330518b%3A1e771326%3Ac8618e41%3A4cacbc31%3A8e6f0d33%3Aed16cbdc%3A58366029%3Aaa288e2d'
doc = form.submit(form.button_with(:name => nextButton['name']))
scrape_and_follow_next_link(doc, comment_url)
end
end

# Shared Mechanize session (instance variable so scrape_table can reuse it).
@agent = Mechanize.new

# Jump through bollocks agree screen
doc = @agent.get(starting_url)
doc = doc.forms.first.submit(doc.forms.first.button_with(:value => "Agree"))
# Re-request the search now that the session has accepted the terms.
doc = @agent.get(starting_url)

scrape_and_follow_next_link(doc, comment_url)
# Shared ICON/PDOnline XML scraper pulled in via git submodule.
require File.dirname(__FILE__) + '/lib_icon_rest_xml/scraper'

agent = Mechanize.new
# Accept the terms & conditions page so the session may query applications.
doc = agent.get("http://pdonline.scenicrim.qld.gov.au/Common/Common/Terms.aspx")
form = doc.forms.first
# Tick the box.
# NOTE(review): assumes the 5th <input> on the page is the agreement
# checkbox — brittle if the markup changes; verify against the live page.
id = doc.search('input')[4].attribute('name').value
form.checkbox_with(:name => id).check
# Click the button.
# "identity" disables compressed responses — presumably works around a
# server content-encoding quirk; TODO confirm it is still needed.
doc = doc.forms.first.submit(doc.forms.first.button_with(:value => "I Agree"), "Accept-Encoding" => "identity")

# Use the ICON XML scraper.
scrape_icon_rest_xml("http://pdonline.scenicrim.qld.gov.au/Pages/XC.Track/SearchApplication.aspx", "d=last14days&o=xml", false, agent)

0 comments on commit 8801c59

Please sign in to comment.