This repository has been archived by the owner on Jun 5, 2019. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from LoveMyData/master
updated scraper
- Loading branch information
Showing
5 changed files
with
131 additions
and
34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# It's easy to add more libraries or choose different versions. Any libraries
# specified here will be installed and made available to your morph.io scraper.
# Find out more: https://morph.io/documentation/ruby

# Gems are fetched from the canonical RubyGems index.
source "https://rubygems.org"

# Pin the Ruby version morph.io should run this scraper under.
ruby "2.4.1"

# morph.io fork of scraperwiki with sqlite defaults suited to morph.
gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "mechanize"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
GIT | ||
remote: https://github.com/openaustralia/scraperwiki-ruby.git | ||
revision: fc50176812505e463077d5c673d504a6a234aa78 | ||
branch: morph_defaults | ||
specs: | ||
scraperwiki (3.0.1) | ||
httpclient | ||
sqlite_magic | ||
|
||
GEM | ||
remote: https://rubygems.org/ | ||
specs: | ||
domain_name (0.5.20170404) | ||
unf (>= 0.0.5, < 1.0.0) | ||
http-cookie (1.0.3) | ||
domain_name (~> 0.5) | ||
httpclient (2.8.3) | ||
mechanize (2.7.5) | ||
domain_name (~> 0.5, >= 0.5.1) | ||
http-cookie (~> 1.0) | ||
mime-types (>= 1.17.2) | ||
net-http-digest_auth (~> 1.1, >= 1.1.1) | ||
net-http-persistent (~> 2.5, >= 2.5.2) | ||
nokogiri (~> 1.6) | ||
ntlm-http (~> 0.1, >= 0.1.1) | ||
webrobots (>= 0.0.9, < 0.2) | ||
mime-types (3.1) | ||
mime-types-data (~> 3.2015) | ||
mime-types-data (3.2016.0521) | ||
mini_portile2 (2.3.0) | ||
net-http-digest_auth (1.4.1) | ||
net-http-persistent (2.9.4) | ||
nokogiri (1.8.1) | ||
mini_portile2 (~> 2.3.0) | ||
ntlm-http (0.1.1) | ||
sqlite3 (1.3.13) | ||
sqlite_magic (0.0.6) | ||
sqlite3 | ||
unf (0.1.4) | ||
unf_ext | ||
unf_ext (0.0.7.4) | ||
webrobots (0.1.2) | ||
|
||
PLATFORMS | ||
ruby | ||
|
||
DEPENDENCIES | ||
mechanize | ||
scraperwiki! | ||
|
||
RUBY VERSION | ||
ruby 2.4.1p111 | ||
|
||
BUNDLED WITH | ||
1.15.1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# City of Gold Coast Council Scraper | ||
|
||
This council scraper involves the following: |
|
||
* Server - Masterview, ApplicationMaster | ||
* Cookie - Yes | ||
* Scrape detail page - No | ||
* Clearly defined data within a row - Yes | ||
* JavaScript - Yes - doPostBack | ||
|
||
Set the MORPH_PERIOD environment variable to control the data retrieval period; the available options are: |
|
||
* thisweek (default) | ||
* thismonth | ||
* lastmonth | ||
|
||
Enjoy |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,63 +1,79 @@ | ||
require 'scraperwiki'
require 'mechanize'

# Which period of applications to fetch. Controlled by the MORPH_PERIOD
# environment variable; valid values are 'thismonth' and 'lastmonth',
# anything else (including unset) falls back to 'thisweek'.
case ENV['MORPH_PERIOD']
when 'thismonth'
  period = 'thismonth'
when 'lastmonth'
  period = 'lastmonth'
else
  period = 'thisweek'
end
puts "Getting '" + period + "' data, changeable via MORPH_PERIOD environment"

# Masterview/ApplicationMaster search results, filtered to BLD, MCU, OPW
# and ROL application types for the selected period.
starting_url = 'http://pdonline.goldcoast.qld.gov.au/masterview/modules/ApplicationMaster/default.aspx?page=found&1=' + period + '&4a=BLD%27,%27MCU%27,%27OPW%27,%27ROL&6=F'
# Council has no online comment form; comments go by email.
comment_url = 'mailto:gcccmail@goldcoast.qld.gov.au'
||
# Normalise whitespace in +a+: convert CR and LF to spaces, collapse
# runs of spaces to one, and trim leading/trailing whitespace.
def clean_whitespace(a)
  a.tr("\r\n", '  ').squeeze(' ').strip
end
|
||
# Extending Mechanize::Form to support ASP.NET's __doPostBack mechanism.
# http://scraperblog.blogspot.com.au/2012/10/asp-forms-with-dopostback-using-ruby.html
class Mechanize::Form
  # Sets the hidden __EVENTTARGET/__EVENTARGUMENT fields (as the page's
  # JavaScript __doPostBack(target, argument) would) and submits the form.
  def postback(target, argument)
    self['__EVENTTARGET'] = target
    self['__EVENTARGUMENT'] = argument
    submit
  end
end
|
||
# Scrapes one page of search results and saves each application row.
#
# doc         - Mechanize::Page of the results grid.
# comment_url - mailto: address recorded against every application.
#
# Columns in table:
#   Show | Number | Submitted | Details
def scrape_table(doc, comment_url)
  doc.search('table tbody tr').each do |tr|
    tds = tr.search('td')
    h = tds.map { |td| td.inner_html }

    record = {
      'council_reference' => clean_whitespace(h[1]),
      # Council pages omit the state, so append it for geocoding.
      'address' => clean_whitespace(tds[3].at('b').inner_text) + ' QLD',
      # Description is everything after the first <br> in the Details cell,
      # with bold tags stripped and HTML entities decoded.
      'description' => CGI::unescapeHTML(clean_whitespace(h[3].split('<br>')[1..-1].join.gsub(/<\/?b>/, ''))),
      'info_url' => (doc.uri + tds[0].at('a')['href']).to_s,
      'comment_url' => comment_url,
      'date_scraped' => Date.today.to_s,
      'date_received' => Date.strptime(clean_whitespace(h[2]), '%d/%m/%Y').to_s
    }

    # First run has no "data" table yet, so the select raises — treat that
    # as "record not seen before" and save.
    if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true)
      puts "Saving record " + record['council_reference'] + ", " + record['address']
      # puts record
      ScraperWiki.save_sqlite(['council_reference'], record)
    else
      puts "Skipping already saved record " + record['council_reference']
    end
  end
end
|
||
# Scrapes the current page, then recursively follows the Telerik RadGrid
# "next page" button until it is disabled.
# NOTE(review): the bottom of the script now paginates with a totalPages
# loop and Form#postback — this recursive variant appears unused.
def scrape_and_follow_next_link(doc, comment_url)
  scrape_table(doc, comment_url)
  nextButton = doc.at('.rgPageNext')
  # A disabled pager button has onclick containing "return false".
  unless nextButton.nil? || nextButton['onclick'] =~ /return false/
    form = doc.forms.first

    # The joy of dealing with ASP.NET
    form['__EVENTTARGET'] = nextButton['name']
    form['__EVENTARGUMENT'] = ''
    # It doesn't seem to work without these stupid values being set.
    # Would be good to figure out where precisely in the javascript these values are coming from.
    form['ctl00%24RadScriptManager1']=
    'ctl00%24cphContent%24ctl00%24ctl00%24cphContent%24ctl00%24Radajaxpanel2Panel%7Cctl00%24cphContent%24ctl00%24ctl00%24RadGrid1%24ctl00%24ctl03%24ctl01%24ctl10'
    form['ctl00_RadScriptManager1_HiddenField']=
    '%3B%3BSystem.Web.Extensions%2C%20Version%3D3.5.0.0%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D31bf3856ad364e35%3Aen-US%3A0d787d5c-3903-4814-ad72-296cea810318%3Aea597d4b%3Ab25378d2%3BTelerik.Web.UI%2C%20Version%3D2009.1.527.35%2C%20Culture%3Dneutral%2C%20PublicKeyToken%3D121fae78165ba3d4%3Aen-US%3A1e3fef00-f492-4ed8-96ce-6371bc241e1c%3A16e4e7cd%3Af7645509%3A24ee1bba%3Ae330518b%3A1e771326%3Ac8618e41%3A4cacbc31%3A8e6f0d33%3Aed16cbdc%3A58366029%3Aaa288e2d'
    # Submit via the pager button itself, then recurse into the next page.
    doc = form.submit(form.button_with(:name => nextButton['name']))
    scrape_and_follow_next_link(doc, comment_url)
  end
end
|
||
agent = Mechanize.new

# Jump through bollocks agree screen, then fetch the real results page.
doc = agent.get(starting_url)
doc = doc.forms.first.submit(doc.forms.first.button_with(:value => "Agree"))
doc = agent.get(starting_url)

# Always scrape page one.
scrape_table(doc, comment_url)

# Is there more than a page? Pager text reads "N items in M pages".
begin
  totalPages = doc.at('div .rgInfoPart').inner_text.split(' items in ')[1].split(' pages')[0].to_i
rescue
  totalPages = 1
end

# Walk the remaining pages via the ASP.NET doPostBack pager.
# (Removed a stray `i += i` — reassigning the block parameter of `each`
# has no effect on the iteration.)
(2..totalPages).each do |i|
  puts "scraping for page " + i.to_s + " of " + totalPages.to_s + " pages"

  nextButton = doc.at('.rgPageNext')
  # onclick looks like __doPostBack('target','argument') — pull both out.
  target, argument = nextButton[:onclick].scan(/'([^']*)'/).flatten
  doc = doc.form.postback target, argument
  scrape_table(doc, comment_url)
end