Merge pull request #3 from LoveMyData/master
Fixed scraper as council changed system
henare committed Jul 21, 2017
2 parents 557b70a + 4c1d37d commit d72b849
Showing 4 changed files with 90 additions and 26 deletions.
4 changes: 4 additions & 0 deletions Gemfile
@@ -0,0 +1,4 @@
source 'https://rubygems.org'
ruby '~> 2.4', '< 2.5'
gem 'scraperwiki', git: 'https://github.com/openaustralia/scraperwiki-ruby.git', branch: 'morph_defaults'
gem 'mechanize'
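
The scraperwiki gem is pulled from openaustralia's morph_defaults branch rather than rubygems.org; that branch points ScraperWiki.save_sqlite and ScraperWiki.select at the local data.sqlite database that morph.io reads. A minimal sketch of the two calls the scraper below relies on — the record values here are made up for illustration:

    require 'scraperwiki'

    # save_sqlite upserts into the default `data` table, keyed on the listed column(s)
    record = { 'council_reference' => 'PLN17/0001', 'address' => '1 Example St, Richmond VIC' }
    ScraperWiki.save_sqlite(['council_reference'], record)

    # select runs raw SQL against the same database and returns an array of hashes
    rows = ScraperWiki.select("* from data where `council_reference`='PLN17/0001'")
    puts rows.length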
55 changes: 55 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,55 @@
GIT
  remote: https://github.com/openaustralia/scraperwiki-ruby.git
  revision: fc50176812505e463077d5c673d504a6a234aa78
  branch: morph_defaults
  specs:
    scraperwiki (3.0.1)
      httpclient
      sqlite_magic

GEM
  remote: https://rubygems.org/
  specs:
    domain_name (0.5.20170404)
      unf (>= 0.0.5, < 1.0.0)
    http-cookie (1.0.3)
      domain_name (~> 0.5)
    httpclient (2.8.3)
    mechanize (2.7.5)
      domain_name (~> 0.5, >= 0.5.1)
      http-cookie (~> 1.0)
      mime-types (>= 1.17.2)
      net-http-digest_auth (~> 1.1, >= 1.1.1)
      net-http-persistent (~> 2.5, >= 2.5.2)
      nokogiri (~> 1.6)
      ntlm-http (~> 0.1, >= 0.1.1)
      webrobots (>= 0.0.9, < 0.2)
    mime-types (3.1)
      mime-types-data (~> 3.2015)
    mime-types-data (3.2016.0521)
    mini_portile2 (2.2.0)
    net-http-digest_auth (1.4.1)
    net-http-persistent (2.9.4)
    nokogiri (1.8.0)
      mini_portile2 (~> 2.2.0)
    ntlm-http (0.1.1)
    sqlite3 (1.3.13)
    sqlite_magic (0.0.6)
      sqlite3
    unf (0.1.4)
      unf_ext
    unf_ext (0.0.7.4)
    webrobots (0.1.2)

PLATFORMS
  ruby

DEPENDENCIES
  mechanize
  scraperwiki!

RUBY VERSION
   ruby 2.4.1p111

BUNDLED WITH
   1.15.1
9 changes: 9 additions & 0 deletions README.md
@@ -0,0 +1,9 @@
# Yarra City Council

* Server - Standard HTTP
* Cookie tracking - Yes
* Pagination - Yes
* Javascript - No
* Clearly defined data within a row - Yes

Enjoy
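
Those bullet points map directly onto the rewritten scraper: Mechanize carries the session cookie between requests, and pagination is handled by following the results table's "Next" link. A minimal sketch of that loop, using the same selectors as scraper.rb below — the printed output format is illustrative only:

    require 'mechanize'

    url_base = "https://www.yarracity.vic.gov.au/planning-application-search"

    agent = Mechanize.new  # keeps cookies across requests automatically
    page = agent.get(url_base + "?suburb=(All)&street=(All)&status=Current&ward=(All)")

    loop do
      # Each application is one clearly defined table row
      page.search('table.search tbody tr').each do |tr|
        puts tr.search('td').map { |td| td.inner_text.strip }.join(' | ')
      end
      # Follow the paginated results until there is no "Next" link
      next_link = page.search('div.pagination-container a').find { |a| a.inner_text == 'Next' }
      break unless next_link
      page = agent.get(url_base + next_link['href'])
    end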
48 changes: 22 additions & 26 deletions scraper.rb
@@ -1,20 +1,22 @@
 require 'scraperwiki'
 require 'mechanize'
 
-url = "http://www.yarracity.vic.gov.au/Planning-Application-Search/Results.aspx?ApplicationNumber=&Suburb=(All)&Street=(All)&Status=Current&Ward=(All)"
+url_base = "https://www.yarracity.vic.gov.au/planning-application-search"
+url = url_base + "?suburb=(All)&street=(All)&status=Current&ward=(All)"
 
 def clean_whitespace(a)
   a.gsub("\r", ' ').gsub("\n", ' ').squeeze(" ").strip
 end
 
-def get_page_data(page)
-  comment_url = "http://www.yarracity.vic.gov.au/planning--building/Planning-applications/Objecting-to-a-planning-applicationVCAT/"
+def get_page_data(page, url_base)
+  comment_url = "mailto:info@yarracity.vic.gov.au"
 
-  trs = page.search('table#ContentPlaceHolder_dgResults/tr')
-  trs[1..-2].each do |tr|
+  trs = page.search('table.search tbody tr')
+  trs.each do |tr|
     texts = tr.search('td').map{|n| n.inner_text}
     council_reference = clean_whitespace(texts[0])
-    info_url = "http://www.yarracity.vic.gov.au/Planning-Application-Search/Results.aspx?ApplicationNumber=#{council_reference}&Suburb=(All)&Street=(All)&Status=(All)&Ward=(All)"
+    info_url = url_base + "?applicationNumber=#{council_reference}"
     record = {
       'info_url' => info_url,
       'comment_url' => comment_url,
@@ -30,39 +32,33 @@ def get_page_data(page)
     # In case the date is invalid
     end
 
-    if ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty?
-    # puts record
+    if ( ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true )
+      puts "Saving record " + council_reference + " - " + record['address']
       ScraperWiki.save_sqlite(['council_reference'], record)
     else
-      puts "Skipping already saved record " + record['council_reference']
+      puts "Skipping already saved record " + record['council_reference'] + " - " + record['address']
    end
  end
 end
 
 agent = Mechanize.new
+agent.verify_mode = OpenSSL::SSL::VERIFY_NONE
 
-page = agent.get(url)
+page = agent.get url
 
-current_page = 1
 begin
-  get_page_data(page)
+  get_page_data(page, url_base)
 
   # Click on the link to the next page
-  links = page.search('table tr')[-1].search('a')
-  link = links.find{|a| a.inner_text.to_i == current_page + 1}
-  # This page has a really odd paging mechanism
-  if link.nil?
-    # Ignore the first link in case it's a "..." as well that will go back rather than forward
-    link = links[1..-1].find{|a| a.inner_text == "..."}
-  end
+  links = page.search('div.pagination-container').search('a')
+  link = links.find{|a| a.inner_text == 'Next'}
   if link
-    href = link["href"]
-    matches = href.match(/javascript:__doPostBack\('(.*)','(.*)'\)/)
-    # We're faking what the __doPostBack javascript does
-    form = page.forms.first
-    form["__EVENTTARGET"] = matches[1]
-    form["__EVENTARGUMENT"] = matches[2]
-    page = form.submit
-    current_page += 1
+    puts url_base + link["href"]
+    page = agent.get (url_base + link["href"])
  end
+  # end
 
 end while link
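
Two idioms in the new version are easy to misread. ScraperWiki.select raises on a brand-new morph.io run because the data table does not exist yet, so the `rescue true` treats that first run as "nothing saved yet" and saves the record anyway. And `begin ... end while link` is Ruby's post-condition loop: the body executes before the condition is checked, so the first results page is always scraped even when no "Next" link is ever found. A stripped-down sketch of that control flow, with a stand-in list instead of real page fetches:

    pages = ['page 1', 'page 2', 'page 3']  # stand-ins for fetched result pages
    index = 0
    begin
      puts "scraping #{pages[index]}"       # body always runs for the first page
      index += 1
      link = index < pages.length           # stand-in for finding a "Next" link
    end while link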

