Update scraper now that they're using a new site. Fixes #3
There's a strong likelihood that the hack to circumvent their "protection" will only work for a short period of time.
mlandauer committed Aug 21, 2019
1 parent a5137dc commit bca6585
Showing 3 changed files with 109 additions and 17 deletions.
7 changes: 7 additions & 0 deletions Gemfile
@@ -0,0 +1,7 @@
source "https://rubygems.org"

ruby "2.5.1"

gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "mechanize"
gem "pdf-reader"
69 changes: 69 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,69 @@
GIT
  remote: https://github.com/openaustralia/scraperwiki-ruby.git
  revision: fc50176812505e463077d5c673d504a6a234aa78
  branch: morph_defaults
  specs:
    scraperwiki (3.0.1)
      httpclient
      sqlite_magic

GEM
  remote: https://rubygems.org/
  specs:
    Ascii85 (1.0.3)
    afm (0.2.2)
    connection_pool (2.2.2)
    domain_name (0.5.20190701)
      unf (>= 0.0.5, < 1.0.0)
    hashery (2.1.2)
    http-cookie (1.0.3)
      domain_name (~> 0.5)
    httpclient (2.8.3)
    mechanize (2.7.6)
      domain_name (~> 0.5, >= 0.5.1)
      http-cookie (~> 1.0)
      mime-types (>= 1.17.2)
      net-http-digest_auth (~> 1.1, >= 1.1.1)
      net-http-persistent (>= 2.5.2)
      nokogiri (~> 1.6)
      ntlm-http (~> 0.1, >= 0.1.1)
      webrobots (>= 0.0.9, < 0.2)
    mime-types (3.2.2)
      mime-types-data (~> 3.2015)
    mime-types-data (3.2019.0331)
    mini_portile2 (2.4.0)
    net-http-digest_auth (1.4.1)
    net-http-persistent (3.1.0)
      connection_pool (~> 2.2)
    nokogiri (1.10.4)
      mini_portile2 (~> 2.4.0)
    ntlm-http (0.1.1)
    pdf-reader (2.2.1)
      Ascii85 (~> 1.0.0)
      afm (~> 0.2.1)
      hashery (~> 2.0)
      ruby-rc4
      ttfunk
    ruby-rc4 (0.1.5)
    sqlite3 (1.4.1)
    sqlite_magic (0.0.6)
      sqlite3
    ttfunk (1.5.1)
    unf (0.1.4)
      unf_ext
    unf_ext (0.0.7.6)
    webrobots (0.1.2)

PLATFORMS
  ruby

DEPENDENCIES
  mechanize
  pdf-reader
  scraperwiki!

RUBY VERSION
   ruby 2.5.1p57

BUNDLED WITH
   1.16.2
50 changes: 33 additions & 17 deletions scraper.rb
@@ -45,7 +45,7 @@ def scrape_pdf(url)
   text = receiver.content.join
   puts text
   match = text.match(/(DEVELOPMENT|SUBDIVISION) APPLICATION(.*)APPLICANT:(.*)PROPOSAL:(.*)LOCATION:(.*)ADVERTISING EXPIRY DATE:([^.]*)\./)
-if match.nil?
+  if match.nil?
     puts "WARNING: Returned text isn't matching regular expression"
     nil
   else
@@ -62,24 +62,40 @@ def scrape_pdf(url)
 end
 
 a = Mechanize.new
-a.get("http://www.ccc.tas.gov.au/page.aspx?u=1581") do |page|
-  page.search('.uContentList a').each do |a|
+
+# Oh great. Thanks. This site is "protected" from scraping by a scheme that's just "work" to get around
+# Why do this? It's futile. It's extremely bad for accessibility
+# It's using https://sucuri.net/ which is owned by GoDaddy. So, basically super dodgy.
+
+# First, naive stab at this assumes that the cookies never have to change. So, hardcoding
+# two cookies that get set by the site that allow it to be loaded.
+cookie1 = Mechanize::Cookie.new domain: ".www.ccc.tas.gov.au", name: "sucuri_cloudproxy_uuid_7f1fb16fe", value: "2be47d328df84f3ed16371eef41e2e1c", path: "/"
+cookie2 = Mechanize::Cookie.new domain: ".www.ccc.tas.gov.au", name: "sucuri_cloudproxy_uuid_0755cafd7", value: "2250934b40bf9b8e6556017a3c643e31", path: "/"
+a.cookie_jar << cookie1
+a.cookie_jar << cookie2
+
+a.get("https://www.ccc.tas.gov.au/planning-development/planning/advertised-planning-permit-applications/") do |page|
+  page.search('.doc-list a').each do |a|
     unless a.at('img')
-      url = a['href']
-      s = a.inner_text.split('-')
-      # Skip over links that we don't know how to handle (e.g. Notice under Historic Cultural Heritage Act 1995)
-      if s.count >= 5
-        record = {
-          'council_reference' => s[0..1].join('-').strip,
-          'address' => s[2].strip + ", TAS",
-          'description' => s[3..-2].join('-').strip,
-          'on_notice_to' => Date.parse(s[-1].split(' ')[-3..-1].join(' ')).to_s,
-          'date_scraped' => Date.today.to_s,
-          'info_url' => ("http://www.ccc.tas.gov.au/" + url).gsub(" ", "%20"),
-          'comment_url' => 'mailto:clarence@ccc.tas.gov.au'
-        }
-        ScraperWiki.save_sqlite(['council_reference'], record)
+      # Long winded name of PDF
+      name = a.inner_text.strip
+      s = name.split(' - ').map(&:strip)
+      # Skip over links that we don't know how to handle
+      if s.count != 4
+        puts "Unexpected form of PDF name. So, skipping: #{name}"
+        next
       end
+
+      record = {
+        'council_reference' => s[0],
+        'address' => s[1] + ", TAS",
+        'description' => s[2],
+        'on_notice_to' => Date.parse(s[3]).to_s,
+        'date_scraped' => Date.today.to_s,
+        'info_url' => (page.uri + a["href"]).to_s
+      }
+
+      ScraperWiki.save_sqlite(['council_reference'], record)
     end
   end
 end
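For anyone who wants to poke at the Sucuri cookie workaround on its own, here is a minimal standalone sketch (not part of the commit). It simply reuses the hardcoded sucuri_cloudproxy_uuid_* cookie values from the diff above; as the commit message warns, those values are expected to expire, so treat this as illustrative only.

require "mechanize"

# Hardcoded Sucuri cookies copied from scraper.rb above; they will stop
# working once the site rotates them.
agent = Mechanize.new
[
  ["sucuri_cloudproxy_uuid_7f1fb16fe", "2be47d328df84f3ed16371eef41e2e1c"],
  ["sucuri_cloudproxy_uuid_0755cafd7", "2250934b40bf9b8e6556017a3c643e31"]
].each do |name, value|
  agent.cookie_jar << Mechanize::Cookie.new(domain: ".www.ccc.tas.gov.au", name: name, value: value, path: "/")
end

# Fetch the advertised applications page and print the PDF link names that
# the scraper would try to parse into records.
page = agent.get("https://www.ccc.tas.gov.au/planning-development/planning/advertised-planning-permit-applications/")
page.search(".doc-list a").each do |link|
  puts link.inner_text.strip unless link.at("img")
end

Note that the record-building step in the diff assumes each link name splits on " - " into exactly four parts (council reference, address, description, closing date); anything else is skipped with a warning.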
