Permalink
Browse files

Merge pull request #3 from LoveMyData/master

Fixed scraper as council changed system
  • Loading branch information...
henare committed Jul 21, 2017
2 parents 557b70a + 4c1d37d commit d72b849d068f72afcab788b6dca10a79478381fb
Showing with 90 additions and 26 deletions.
  1. +4 −0 Gemfile
  2. +55 −0 Gemfile.lock
  3. +9 −0 README.md
  4. +22 −26 scraper.rb
View
@@ -0,0 +1,4 @@
# All gems are fetched from the public RubyGems index.
source 'https://rubygems.org'
# Constrain to Ruby 2.4.x: any 2.4 patch level, but strictly below 2.5
# (matches the "ruby 2.4.1p111" recorded in Gemfile.lock).
ruby '~> 2.4', '< 2.5'
# scraperwiki comes from openaustralia's git fork, pinned to the
# 'morph_defaults' branch (the variant used on morph.io).
gem 'scraperwiki', git: 'https://github.com/openaustralia/scraperwiki-ruby.git', branch: 'morph_defaults'
# Mechanize drives the HTTP session and HTML parsing for scraper.rb.
gem 'mechanize'
View
@@ -0,0 +1,55 @@
GIT
remote: https://github.com/openaustralia/scraperwiki-ruby.git
revision: fc50176812505e463077d5c673d504a6a234aa78
branch: morph_defaults
specs:
scraperwiki (3.0.1)
httpclient
sqlite_magic
GEM
remote: https://rubygems.org/
specs:
domain_name (0.5.20170404)
unf (>= 0.0.5, < 1.0.0)
http-cookie (1.0.3)
domain_name (~> 0.5)
httpclient (2.8.3)
mechanize (2.7.5)
domain_name (~> 0.5, >= 0.5.1)
http-cookie (~> 1.0)
mime-types (>= 1.17.2)
net-http-digest_auth (~> 1.1, >= 1.1.1)
net-http-persistent (~> 2.5, >= 2.5.2)
nokogiri (~> 1.6)
ntlm-http (~> 0.1, >= 0.1.1)
webrobots (>= 0.0.9, < 0.2)
mime-types (3.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2016.0521)
mini_portile2 (2.2.0)
net-http-digest_auth (1.4.1)
net-http-persistent (2.9.4)
nokogiri (1.8.0)
mini_portile2 (~> 2.2.0)
ntlm-http (0.1.1)
sqlite3 (1.3.13)
sqlite_magic (0.0.6)
sqlite3
unf (0.1.4)
unf_ext
unf_ext (0.0.7.4)
webrobots (0.1.2)
PLATFORMS
ruby
DEPENDENCIES
mechanize
scraperwiki!
RUBY VERSION
ruby 2.4.1p111
BUNDLED WITH
1.15.1
View
@@ -0,0 +1,9 @@
# Yarra City Council
* Server - Standard HTTP
* Cookie tracking - Yes
* Pagination - Yes
* Javascript - No
* Clearly defined data within a row - Yes
Enjoy
View
@@ -1,20 +1,22 @@
require 'scraperwiki'
require 'mechanize'
# Base URL of the council's (new) HTTPS planning-application search,
# plus the initial query for all current applications.
# The old assignment to `url` (the pre-change http Results.aspx endpoint)
# was dead code — immediately overwritten below — and has been removed.
url_base = "https://www.yarracity.vic.gov.au/planning-application-search"
url = url_base + "?suburb=(All)&street=(All)&status=Current&ward=(All)"
# Normalise whitespace in scraped text: CR and LF become spaces,
# runs of spaces collapse to one, and the result is trimmed.
def clean_whitespace(a)
  a.tr("\r\n", '  ').squeeze(' ').strip
end
def get_page_data(page)
comment_url = "http://www.yarracity.vic.gov.au/planning--building/Planning-applications/Objecting-to-a-planning-applicationVCAT/"
def get_page_data(page, url_base)
comment_url = "mailto:info@yarracity.vic.gov.au"
trs = page.search('table#ContentPlaceHolder_dgResults/tr')
trs[1..-2].each do |tr|
trs = page.search('table.search tbody tr')
trs.each do |tr|
texts = tr.search('td').map{|n| n.inner_text}
council_reference = clean_whitespace(texts[0])
info_url = "http://www.yarracity.vic.gov.au/Planning-Application-Search/Results.aspx?ApplicationNumber=#{council_reference}&Suburb=(All)&Street=(All)&Status=(All)&Ward=(All)"
info_url = url_base + "?applicationNumber=#{council_reference}"
record = {
'info_url' => info_url,
'comment_url' => comment_url,
@@ -30,39 +32,33 @@ def get_page_data(page)
# In case the date is invalid
end
if ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty?
if ( ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true )
puts "Saving record " + council_reference + " - " + record['address']
# puts record
ScraperWiki.save_sqlite(['council_reference'], record)
else
puts "Skipping already saved record " + record['council_reference']
puts "Skipping already saved record " + record['council_reference'] + " - " + record['address']
end
end
end
# Crawl every page of search results, scraping each one.
#
# NOTE(review): the diff extraction interleaved the pre-change pagination
# (faking ASP.NET __doPostBack via a form submit, with a current_page
# counter) with the post-change pagination (a plain 'Next' link). Only the
# post-change logic is kept here; current_page is no longer needed.
agent = Mechanize.new
# NOTE(review): disables TLS certificate verification — presumably the new
# HTTPS site had certificate issues at the time; confirm before removing.
agent.verify_mode = OpenSSL::SSL::VERIFY_NONE

page = agent.get url
loop do
  get_page_data(page, url_base)

  # The new site paginates with ordinary links; follow "Next" until it
  # disappears on the last page.
  links = page.search('div.pagination-container').search('a')
  link = links.find { |a| a.inner_text == 'Next' }
  break unless link

  puts url_base + link["href"]
  page = agent.get(url_base + link["href"])
end

0 comments on commit d72b849

Please sign in to comment.