Skip to content
This repository has been archived by the owner on Jun 12, 2019. It is now read-only.

Commit

Permalink
Merge pull request #4 from LoveMyData/master
Browse files Browse the repository at this point in the history
Fixes the current scraper, which is no longer working
  • Loading branch information
LoveMyData committed Nov 29, 2017
2 parents 1669e6d + 599a01e commit 9a94b06
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 70 deletions.
2 changes: 2 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
source 'https://rubygems.org'

ruby "~>2.4"

gem 'scraperwiki', git: 'https://github.com/openaustralia/scraperwiki-ruby.git', branch: 'morph_defaults'
gem 'mechanize'
9 changes: 6 additions & 3 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,11 @@ GEM
mime-types (3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2016.0521)
mini_portile2 (2.2.0)
mini_portile2 (2.3.0)
net-http-digest_auth (1.4.1)
net-http-persistent (2.9.4)
nokogiri (1.8.0)
mini_portile2 (~> 2.2.0)
nokogiri (1.8.1)
mini_portile2 (~> 2.3.0)
ntlm-http (0.1.1)
sqlite3 (1.3.13)
sqlite_magic (0.0.6)
Expand All @@ -48,5 +48,8 @@ DEPENDENCIES
mechanize
scraperwiki!

RUBY VERSION
ruby 2.4.1p111

BUNDLED WITH
1.15.1
104 changes: 37 additions & 67 deletions scraper.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
require 'scraperwiki'
require 'rubygems'
require 'mechanize'

case ENV['MORPH_PERIOD']
Expand All @@ -13,87 +12,58 @@
end
puts "Getting data in `" + ENV['MORPH_PERIOD'] + "`, changable via MORPH_PERIOD environment"

starting_url = 'http://masterview.northsydney.nsw.gov.au/Pages/XC.Track/SearchApplication.aspx?d=' + period + '&k=LodgementDate&'
base_url = 'http://masterview.northsydney.nsw.gov.au/Pages/XC.Track/SearchApplication.aspx'
starting_url = base_url + '?d=' + period + '&k=LodgementDate&'
comment_url = 'mailto:council@northsydney.nsw.gov.au'

def clean_whitespace(a)
a.gsub("\r", ' ').gsub("\n", ' ').squeeze(" ").strip
end

def scrape_table(doc, comment_url)
doc.search('.result').each do |tr|
# Columns in table
# Show Number Submitted Details
tds = tr.search('td')
agent = Mechanize.new

# Yes, this is "where no records"[sic]
break if tds[0].inner_text =~ /There where no records/
next if tds[0].nil?
# Jump through bollocks agree screen
page = agent.get(starting_url)
puts "Agreeing"
page = page.forms.first.submit(page.forms.first.button_with(:value => "I Agree"))
page = agent.get(starting_url + "&o=xml")

h = tds.map{|td| td.inner_html}
info_url = 'http://masterview.northsydney.nsw.gov.au/Modules/applicationmaster/' + tds[0].at('a')['href'].strip
info_page = @agent.get(info_url)
# Explicitly interpret as XML
page = Nokogiri::XML(page.content)

begin
date_received = Date.strptime(clean_whitespace(h[2]), '%d/%m/%Y').to_s
rescue
raise h[1..3].inspect
end
raise "Can't find any <Application> elements" unless page.search('Application').length > 0

begin
address = clean_whitespace(info_page.at('#b_ctl00_ctMain1_info_prop').at('a').inner_text.strip)
rescue
puts 'error with parsing address'
next
end
page.search('Application').each do |application|
council_reference = clean_whitespace(application.at("ReferenceNumber").inner_text)

application_id = clean_whitespace(application.at("ApplicationId").inner_text.strip)
info_url = "#{base_url}?id=#{application_id}"

unless application.at("Line1")
puts "Skipping due to lack of address for #{council_reference}"
next
end

address = clean_whitespace(application.at("Line1").inner_text)
if !application.at('Line2').inner_text.empty?
address += ", " + clean_whitespace(application.at("Line2").inner_text)
end

record = {
"council_reference" => council_reference,
"description" => clean_whitespace(application.at("ApplicationDetails").inner_text),
"date_received" => Date.parse(application.at("LodgementDate").inner_text).to_s,
"address" => address,
"date_scraped" => Date.today.to_s,
"info_url" => info_url,
"comment_url" => comment_url,
}

record = {
'info_url' => info_url,
'comment_url' => comment_url,
'council_reference' => clean_whitespace(h[1].sub("<strong>", "").sub("</strong>", "")),
'date_received' => date_received,
# TODO: Some DAs have multiple addresses, we're just getting the first :(
'address' => address,
'description' => CGI::unescapeHTML(info_page.at('#b_ctl00_ctMain1_info_app').inner_html.split('<br>')[0].sub("Development Application - ", "").strip),
'date_scraped' => Date.today.to_s
}
# puts record.to_yaml
if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true)
puts "Saving record " + record['council_reference'] + " - " + record['address']
# puts record
ScraperWiki.save_sqlite(['council_reference'], record)
else
puts "Skipping already saved record " + record['council_reference']
end
end
end

def scrape_and_follow_next_link(doc, comment_url)
scrape_table(doc, comment_url)
nextButton = doc.at('.rgPageNext')
puts "No further pages" if nextButton.nil?
unless nextButton.nil? || nextButton['onclick'] =~ /return false/
form = doc.forms.first

# The joy of dealing with ASP.NET
form['__EVENTTARGET'] = nextButton['name']
form['__EVENTARGUMENT'] = ''
# It doesn't seem to work without these stupid values being set.
# Would be good to figure out where precisely in the javascript these values are coming from.
form['ctl00%24RadScriptManager1']=
'ctl00%24cphContent%24ctl00%24ctl00%24cphContent%24ctl00%24Radajaxpanel2Panel%7Cctl00%24cphContent%24ctl00%24ctl00%24RadGrid1%24ctl00%24ctl03%24ctl01%24ctl10'
form['ctl00_RadScriptManager1_HiddenField']=
'%3B%3BSystem.Web.Extensions%2C%20Version%3D3.5.0.0%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D31bf3856ad364e35%3Aen-US%3A0d787d5c-3903-4814-ad72-296cea810318%3Aea597d4b%3Ab25378d2%3BTelerik.Web.UI%2C%20Version%3D2009.1.527.35%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D121fae78165ba3d4%3Aen-US%3A1e3fef00-f492-4ed8-96ce-6371bc241e1c%3A16e4e7cd%3Af7645509%3A24ee1bba%3Ae330518b%3A1e771326%3Ac8618e41%3A4cacbc31%3A8e6f0d33%3Aed16cbdc%3A58366029%3Aaa288e2d'
doc = form.submit(form.button_with(:name => nextButton['name']))
scrape_and_follow_next_link(doc, comment_url)
end
end

@agent = Mechanize.new

# Jump through bollocks agree screen
doc = @agent.get(starting_url)
puts "Agreeing"
doc = doc.forms.first.submit(doc.forms.first.button_with(:value => "I Agree"))
doc = @agent.get(starting_url)

scrape_and_follow_next_link(doc, comment_url)

0 comments on commit 9a94b06

Please sign in to comment.