diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..642cdae
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,7 @@
+source "https://rubygems.org"
+
+ruby "2.5.1"
+
+gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
+gem "mechanize"
+gem "pdf-reader"
diff --git a/Gemfile.lock b/Gemfile.lock
new file mode 100644
index 0000000..82ffb9b
--- /dev/null
+++ b/Gemfile.lock
@@ -0,0 +1,69 @@
+GIT
+  remote: https://github.com/openaustralia/scraperwiki-ruby.git
+  revision: fc50176812505e463077d5c673d504a6a234aa78
+  branch: morph_defaults
+  specs:
+    scraperwiki (3.0.1)
+      httpclient
+      sqlite_magic
+
+GEM
+  remote: https://rubygems.org/
+  specs:
+    Ascii85 (1.0.3)
+    afm (0.2.2)
+    connection_pool (2.2.2)
+    domain_name (0.5.20190701)
+      unf (>= 0.0.5, < 1.0.0)
+    hashery (2.1.2)
+    http-cookie (1.0.3)
+      domain_name (~> 0.5)
+    httpclient (2.8.3)
+    mechanize (2.7.6)
+      domain_name (~> 0.5, >= 0.5.1)
+      http-cookie (~> 1.0)
+      mime-types (>= 1.17.2)
+      net-http-digest_auth (~> 1.1, >= 1.1.1)
+      net-http-persistent (>= 2.5.2)
+      nokogiri (~> 1.6)
+      ntlm-http (~> 0.1, >= 0.1.1)
+      webrobots (>= 0.0.9, < 0.2)
+    mime-types (3.2.2)
+      mime-types-data (~> 3.2015)
+    mime-types-data (3.2019.0331)
+    mini_portile2 (2.4.0)
+    net-http-digest_auth (1.4.1)
+    net-http-persistent (3.1.0)
+      connection_pool (~> 2.2)
+    nokogiri (1.10.4)
+      mini_portile2 (~> 2.4.0)
+    ntlm-http (0.1.1)
+    pdf-reader (2.2.1)
+      Ascii85 (~> 1.0.0)
+      afm (~> 0.2.1)
+      hashery (~> 2.0)
+      ruby-rc4
+      ttfunk
+    ruby-rc4 (0.1.5)
+    sqlite3 (1.4.1)
+    sqlite_magic (0.0.6)
+      sqlite3
+    ttfunk (1.5.1)
+    unf (0.1.4)
+      unf_ext
+    unf_ext (0.0.7.6)
+    webrobots (0.1.2)
+
+PLATFORMS
+  ruby
+
+DEPENDENCIES
+  mechanize
+  pdf-reader
+  scraperwiki!
+
+RUBY VERSION
+   ruby 2.5.1p57
+
+BUNDLED WITH
+   1.16.2
diff --git a/scraper.rb b/scraper.rb
index 270006c..de3db2e 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -45,7 +45,7 @@ def scrape_pdf(url)
   text = receiver.content.join
   puts text
   match = text.match(/(DEVELOPMENT|SUBDIVISION) APPLICATION(.*)APPLICANT:(.*)PROPOSAL:(.*)LOCATION:(.*)ADVERTISING EXPIRY DATE:([^.]*)\./)
-  if match.nil? 
+  if match.nil?
     puts "WARNING: Returned text isn't matching regular expression"
     nil
   else
@@ -62,24 +62,40 @@ def scrape_pdf(url)
 end
 
 a = Mechanize.new
-a.get("http://www.ccc.tas.gov.au/page.aspx?u=1581") do |page|
-  page.search('.uContentList a').each do |a|
+
+# Oh great. Thanks. This site is "protected" from scraping by a scheme that's just "work" to get around
+# Why do this? It's futile. It's extremely bad for accessibility
+# It's using https://sucuri.net/ which is owned by GoDaddy. So, basically super dodgy.
+
+# First, naive stab at this assumes that the cookies never have to change. So, hardcoding
+# two cookies that get set by the site that allow it to be loaded.
+cookie1 = Mechanize::Cookie.new domain: ".www.ccc.tas.gov.au", name: "sucuri_cloudproxy_uuid_7f1fb16fe", value: "2be47d328df84f3ed16371eef41e2e1c", path: "/"
+cookie2 = Mechanize::Cookie.new domain: ".www.ccc.tas.gov.au", name: "sucuri_cloudproxy_uuid_0755cafd7", value: "2250934b40bf9b8e6556017a3c643e31", path: "/"
+a.cookie_jar << cookie1
+a.cookie_jar << cookie2
+
+a.get("https://www.ccc.tas.gov.au/planning-development/planning/advertised-planning-permit-applications/") do |page|
+  page.search('.doc-list a').each do |a|
     unless a.at('img')
-      url = a['href']
-      s = a.inner_text.split('-')
-      # Skip over links that we don't know how to handle (e.g. Notice under Historic Cultural Heritage Act 1995)
-      if s.count >= 5
-        record = {
-          'council_reference' => s[0..1].join('-').strip,
-          'address' => s[2].strip + ", TAS",
-          'description' => s[3..-2].join('-').strip,
-          'on_notice_to' => Date.parse(s[-1].split(' ')[-3..-1].join(' ')).to_s,
-          'date_scraped' => Date.today.to_s,
-          'info_url' => ("http://www.ccc.tas.gov.au/" + url).gsub(" ", "%20"),
-          'comment_url' => 'mailto:clarence@ccc.tas.gov.au'
-        }
-        ScraperWiki.save_sqlite(['council_reference'], record)
+      # Long winded name of PDF
+      name = a.inner_text.strip
+      s = name.split(' - ').map(&:strip)
+      # Skip over links that we don't know how to handle
+      if s.count != 4
+        puts "Unexpected form of PDF name. So, skipping: #{name}"
+        next
       end
+
+      record = {
+        'council_reference' => s[0],
+        'address' => s[1] + ", TAS",
+        'description' => s[2],
+        'on_notice_to' => Date.parse(s[3]).to_s,
+        'date_scraped' => Date.today.to_s,
+        'info_url' => (page.uri + a["href"]).to_s
+      }
+
+      ScraperWiki.save_sqlite(['council_reference'], record)
     end
   end
 end
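
Note on the hardcoded cookies: as the comments in scraper.rb say, this first stab assumes the two sucuri_cloudproxy_uuid_* name/value pairs never change, so the scraper will break whenever the site rotates them. A minimal sketch of one way to soften that, assuming the current pair is supplied via morph.io secret environment variables (the MORPH_SUCURI_COOKIE_NAME / MORPH_SUCURI_COOKIE_VALUE names are hypothetical, not part of this patch):

    # Prefer a cookie supplied via the environment so a rotated value only
    # needs a settings change on morph.io, not a new commit.
    if ENV["MORPH_SUCURI_COOKIE_NAME"] && ENV["MORPH_SUCURI_COOKIE_VALUE"]
      a.cookie_jar << Mechanize::Cookie.new(
        domain: ".www.ccc.tas.gov.au",
        name: ENV["MORPH_SUCURI_COOKIE_NAME"],
        value: ENV["MORPH_SUCURI_COOKIE_VALUE"],
        path: "/"
      )
    end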