This repository has been archived by the owner on Jun 12, 2019. It is now read-only.

Merge pull request #2 from LoveMyData/master
fixed scraper after the council changed its link
equivalentideas committed May 22, 2017
2 parents dca8650 + 0ee0bc9 commit 3a3e43c
Showing 5 changed files with 131 additions and 58 deletions.
6 changes: 6 additions & 0 deletions Gemfile
@@ -0,0 +1,6 @@
source "https://rubygems.org"

ruby "2.0.0"

gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "mechanize"
52 changes: 52 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,52 @@
GIT
  remote: https://github.com/openaustralia/scraperwiki-ruby.git
  revision: fc50176812505e463077d5c673d504a6a234aa78
  branch: morph_defaults
  specs:
    scraperwiki (3.0.1)
      httpclient
      sqlite_magic

GEM
  remote: https://rubygems.org/
  specs:
    domain_name (0.5.20170404)
      unf (>= 0.0.5, < 1.0.0)
    http-cookie (1.0.3)
      domain_name (~> 0.5)
    httpclient (2.8.3)
    mechanize (2.7.5)
      domain_name (~> 0.5, >= 0.5.1)
      http-cookie (~> 1.0)
      mime-types (>= 1.17.2)
      net-http-digest_auth (~> 1.1, >= 1.1.1)
      net-http-persistent (~> 2.5, >= 2.5.2)
      nokogiri (~> 1.6)
      ntlm-http (~> 0.1, >= 0.1.1)
      webrobots (>= 0.0.9, < 0.2)
    mime-types (3.1)
      mime-types-data (~> 3.2015)
    mime-types-data (3.2016.0521)
    mini_portile2 (2.1.0)
    net-http-digest_auth (1.4.1)
    net-http-persistent (2.9.4)
    nokogiri (1.6.8.1)
      mini_portile2 (~> 2.1.0)
    ntlm-http (0.1.1)
    sqlite3 (1.3.13)
    sqlite_magic (0.0.6)
      sqlite3
    unf (0.1.4)
      unf_ext
    unf_ext (0.0.7.4)
    webrobots (0.1.2)

PLATFORMS
  ruby

DEPENDENCIES
  mechanize
  scraperwiki!

BUNDLED WITH
   1.14.6
17 changes: 17 additions & 0 deletions README.md
@@ -0,0 +1,17 @@
# Waverley Council Scraper

* Server - XC.Track
* Cookie tracking - Yes
* Pagination - No
* Javascript - No
* Clearly defined data within a row - No, but acceptable


Set the MORPH_PERIOD environment variable to control how far back applications are fetched. The available options are:

* thisweek (default)
* thismonth
* lastmonth


Enjoy
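
For reference, a condensed sketch (ours, not part of the repo) of how scraper.rb resolves MORPH_PERIOD; any unset or unrecognised value falls back to the thisweek default:

period =
  case ENV['MORPH_PERIOD']
  when 'thismonth', 'lastmonth' then ENV['MORPH_PERIOD']
  else 'thisweek' # default period
  end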
1 change: 0 additions & 1 deletion README.textile

This file was deleted.

113 changes: 56 additions & 57 deletions scraper.rb
@@ -1,67 +1,66 @@
 require 'scraperwiki'
 require 'rubygems'
 require 'mechanize'

-starting_url = 'https://epwgate.waverley.nsw.gov.au/DA_Tracking/Modules/applicationmaster/default.aspx?page=found&1=thismonth&4a=448&6=F'
+case ENV['MORPH_PERIOD']
+when 'thismonth'
+  period = 'thismonth'
+when 'lastmonth'
+  period = 'lastmonth'
+else
+  period = 'thisweek'
+end
+puts "Getting '" + period + "' data, changeable via the MORPH_PERIOD environment variable"
+
+url_base = 'http://eservices.waverley.nsw.gov.au'
+da_url = url_base + '/Pages/XC.Track/SearchApplication.aspx?d=' + period + '&k=LodgementDate&t=A0,SP2A,TPO,B1,B1A,FPS'
 comment_url = 'mailto:waver@waverley.nsw.gov.au?subject='

-def clean_whitespace(a)
-  a.gsub("\r", ' ').gsub("\n", ' ').squeeze(" ").strip
-end
+# Disable gzip, otherwise the server will return the error message below
+#   in `response_content_encoding': unsupported content-encoding: gzip,gzip (Mechanize::Error)
+agent = Mechanize.new
+agent.request_headers = { "Accept-Encoding" => "" }

-def scrape_table(doc, comment_url)
-  doc.search('table tbody tr').each do |tr|
-    # Columns in table
-    # Show Number Submitted Details
-    tds = tr.search('td')
-    h = tds.map{|td| td.inner_html}
-
-    record = {
-      'info_url' => (doc.uri + tds[0].at('a')['href']).to_s,
-      'comment_url' => comment_url + CGI::escape("Development Application Enquiry: " + clean_whitespace(h[1])),
-      'council_reference' => clean_whitespace(h[1]),
-      'date_received' => Date.strptime(clean_whitespace(h[2]), '%d/%m/%Y').to_s,
-      'address' => clean_whitespace(h[3].split('<br>')[0]),
-      'description' => CGI::unescapeHTML(clean_whitespace(h[3].split('<br>')[1..-1].join)),
-      'date_scraped' => Date.today.to_s
-    }
-
-    #pp record
-    if ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty?
-      ScraperWiki.save_sqlite(['council_reference'], record)
-    else
-      puts "Skipping already saved record " + record['council_reference']
-    end
-  end
-end
+# Accept terms
+page = agent.get(url_base + '/Common/Common/terms.aspx')
+form = page.forms.first
+form.checkbox_with(:name => 'ctl00$ctMain$chkAgree$chk1').check
+page = form.click_button( form.button_with(:value => "I Agree") )

-def scrape_and_follow_next_link(doc, comment_url)
-  scrape_table(doc, comment_url)
-  nextButton = doc.at('.rgPageNext')
-  unless nextButton.nil? || nextButton['onclick'] =~ /return false/
-    form = doc.forms.first
-
-    # The joy of dealing with ASP.NET
-    form['__EVENTTARGET'] = nextButton['name']
-    form['__EVENTARGUMENT'] = ''
-    # It doesn't seem to work without these stupid values being set.
-    # Would be good to figure out where precisely in the javascript these values are coming from.
-    form['ctl00%24RadScriptManager1'] =
-      'ctl00%24cphContent%24ctl00%24ctl00%24cphContent%24ctl00%24Radajaxpanel2Panel%7Cctl00%24cphContent%24ctl00%24ctl00%24RadGrid1%24ctl00%24ctl03%24ctl01%24ctl10'
-    form['ctl00_RadScriptManager1_HiddenField'] =
-      '%3B%3BSystem.Web.Extensions%2C%20Version%3D3.5.0.0%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D31bf3856ad364e35%3Aen-US%3A0d787d5c-3903-4814-ad72-296cea810318%3Aea597d4b%3Ab25378d2%3BTelerik.Web.UI%2C%20Version%3D2009.1.527.35%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D121fae78165ba3d4%3Aen-US%3A1e3fef00-f492-4ed8-96ce-6371bc241e1c%3A16e4e7cd%3Af7645509%3A24ee1bba%3Ae330518b%3A1e771326%3Ac8618e41%3A4cacbc31%3A8e6f0d33%3Aed16cbdc%3A58366029%3Aaa288e2d'
-    doc = form.submit(form.button_with(:name => nextButton['name']))
-    scrape_and_follow_next_link(doc, comment_url)
-  end
-end
+# Scrape DA page
+page = agent.get(da_url)
+results = page.search('div.result')

-agent = Mechanize.new do |a|
-  a.verify_mode = OpenSSL::SSL::VERIFY_NONE
-end
+results.each do |result|
+  council_reference = result.search('a.search')[0].inner_text.strip.split.join(" ")
+
+  description = result.inner_text
+  description = description.split( /\r?\n/ )
+  description = description[3].strip.split.join(" ").split(' - ', 2)[1]
+
+  info_url = result.search('a.search')[0]['href']
+  info_url = info_url.sub!('../..', '')
+  info_url = url_base + info_url

-# Jump through bollocks agree screen
-doc = agent.get(starting_url)
-doc = doc.forms.first.submit(doc.forms.first.button_with(:value => "I Agree"))
-doc = agent.get(starting_url)
+  date_received = result.inner_text
+  date_received = date_received.split(/Submitted:\r\n/)[1].split( /\r?\n/ )
+  date_received = Date.parse(date_received[0].strip.to_s)

-scrape_and_follow_next_link(doc, comment_url)
+  record = {
+    'council_reference' => council_reference,
+    'address' => result.search('strong')[0].inner_text.strip.split.join(" "),
+    'description' => description,
+    'info_url' => info_url,
+    'comment_url' => comment_url + council_reference,
+    'date_scraped' => Date.today.to_s,
+    'date_received' => date_received.to_s
+  }
+
+  # Saving data
+  if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true)
+    puts "Saving record " + record['council_reference'] + ", " + record['address']
+    # puts record
+    ScraperWiki.save_sqlite(['council_reference'], record)
+  else
+    puts "Skipping already saved record " + record['council_reference']
+  end
+end
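
One detail of the saving logic above deserves a note: the `rescue true` is there because on the very first run the `data` table does not exist yet, so ScraperWiki.select raises; rescuing to true makes every record count as unseen and get saved. A standalone sketch of that pattern (the helper name save_unless_seen is ours, purely illustrative):

require 'scraperwiki'

# Save a record only if no row with the same council_reference exists yet.
# On the first run the `data` table is missing and select raises, so
# `rescue true` treats everything as unseen.
def save_unless_seen(record)
  unseen = (ScraperWiki.select(
    "* from data where `council_reference`='#{record['council_reference']}'"
  ).empty? rescue true)
  ScraperWiki.save_sqlite(['council_reference'], record) if unseen
end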
