This repository has been archived by the owner on Jun 12, 2019. It is now read-only.

Commit

Merge pull request #5 from LoveMyData/master
updated URL and changed to use XML library
jamezpolley committed Dec 19, 2018
2 parents 4055c5d + 2770750 commit a547a8e
Showing 4 changed files with 11 additions and 68 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
+[submodule "lib_icon_rest_xml"]
+	path = lib_icon_rest_xml
+	url = https://github.com/planningalerts-scrapers/lib_icon_rest_xml/
5 changes: 1 addition & 4 deletions README.md
@@ -2,10 +2,7 @@

 * Server - Microsoft
 * System - XC.Track
-* Cookie tracking - Yes
-* Pagination - Yes
-* Detail Page - Yes
-* Clearly defined data within a row - Sort of
+* XML - Yes


 Setup MORPH_PERIOD for data recovery, available options are
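Usage note (inferred from scraper.rb below, not part of the README text shown here): the available MORPH_PERIOD values are lastmonth, thismonth, and the default last14days; on morph.io this would typically be set as a scraper environment variable, e.g. MORPH_PERIOD=lastmonth to backfill the previous month.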
1 change: 1 addition & 0 deletions lib_icon_rest_xml
Submodule lib_icon_rest_xml added at 3d6284
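With the scraping logic now pulled in via a git submodule, a fresh checkout needs the submodule initialised before scraper.rb can require it, e.g. `git clone --recursive` up front, or `git submodule update --init` after a plain clone.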
70 changes: 6 additions & 64 deletions scraper.rb
@@ -1,74 +1,16 @@
 require 'scraperwiki'
 require 'mechanize'
+require File.dirname(__FILE__) + '/lib_icon_rest_xml/scraper'

 case ENV['MORPH_PERIOD']
 when 'lastmonth'
   period = "lastmonth"
 when 'thismonth'
   period = "thismonth"
 else
-  period = "thisweek"
-  ENV['MORPH_PERIOD'] = 'thisweek'
+  period = "last14days"
 end
-puts "Getting data in `" + ENV['MORPH_PERIOD'] + "`, changable via MORPH_PERIOD environment"
+puts "Getting data in `" + period + "`, changable via MORPH_PERIOD environment"

-base_url = 'https://apptracking.northsydney.nsw.gov.au/Pages/XC.Track/SearchApplication.aspx'
-starting_url = base_url + '?d=' + period + '&k=LodgementDate&'
-comment_url = 'mailto:council@northsydney.nsw.gov.au'
-
-def clean_whitespace(a)
-  a.gsub("\r", ' ').gsub("\n", ' ').squeeze(" ").strip
-end
-
-agent = Mechanize.new
-
-# Jump through bollocks agree screen
-page = agent.get(starting_url)
-puts "Agreeing"
-# Workaround for weird content-encoding "gzip,gzip".
-# See https://stackoverflow.com/questions/35444572/unsupported-content-encoding-gzip-gzip-when-submitting-form-with-mechanize
-agent.content_encoding_hooks << lambda { |httpagent, uri, response, body_io|
-  response['Content-Encoding'] = ''
-}
-page = page.forms.first.submit(page.forms.first.button_with(:value => "I Agree"))
-page = agent.get(starting_url + "&o=xml")
-
-# Explicitly interpret as XML
-page = Nokogiri::XML(page.content)
-
-raise "Can't find any <Application> elements" unless page.search('Application').length > 0
-
-page.search('Application').each do |application|
-  council_reference = clean_whitespace(application.at("ReferenceNumber").inner_text)
-
-  application_id = clean_whitespace(application.at("ApplicationId").inner_text.strip)
-  info_url = "#{base_url}?id=#{application_id}"
-
-  unless application.at("Line1")
-    puts "Skipping due to lack of address for #{council_reference}"
-    next
-  end
-
-  address = clean_whitespace(application.at("Line1").inner_text)
-  if !application.at('Line2').inner_text.empty?
-    address += ", " + clean_whitespace(application.at("Line2").inner_text)
-  end
-
-  record = {
-    "council_reference" => council_reference,
-    "description" => clean_whitespace(application.at("ApplicationDetails").inner_text),
-    "date_received" => Date.parse(application.at("LodgementDate").inner_text).to_s,
-    "address" => address,
-    "date_scraped" => Date.today.to_s,
-    "info_url" => info_url,
-    "comment_url" => comment_url,
-  }
-
-  if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true)
-    puts "Saving record " + record['council_reference'] + " - " + record['address']
-    # puts record
-    ScraperWiki.save_sqlite(['council_reference'], record)
-  else
-    puts "Skipping already saved record " + record['council_reference']
-  end
-end
+
+scrape_icon_rest_xml("https://apptracking.northsydney.nsw.gov.au/Pages/XC.Track/SearchApplication.aspx", "d=" + period + "&k=LodgementDate&o=xml")
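The fetch-parse-save logic now lives in the lib_icon_rest_xml submodule, whose source this diff does not show. As a rough sketch only, with the helper's body inferred from the deleted inline code and the two-argument call site rather than taken from the submodule itself, it presumably looks something like:

require 'scraperwiki'
require 'mechanize'
require 'nokogiri'
require 'date'

# Hypothetical reconstruction; the real implementation lives in
# planningalerts-scrapers/lib_icon_rest_xml and may differ.
def scrape_icon_rest_xml(base_url, query)
  agent = Mechanize.new
  page = agent.get(base_url + '?' + query)

  # Interpret the response explicitly as XML, as the deleted inline code did
  xml = Nokogiri::XML(page.content)
  raise "Can't find any <Application> elements" if xml.search('Application').empty?

  xml.search('Application').each do |application|
    council_reference = application.at('ReferenceNumber').inner_text.strip

    record = {
      'council_reference' => council_reference,
      'description'       => application.at('ApplicationDetails').inner_text.strip,
      'date_received'     => Date.parse(application.at('LodgementDate').inner_text).to_s,
      'address'           => application.at('Line1').inner_text.strip,
      'date_scraped'      => Date.today.to_s,
      'info_url'          => "#{base_url}?id=#{application.at('ApplicationId').inner_text.strip}",
    }

    # Save keyed on council_reference, as the old inline code did
    ScraperWiki.save_sqlite(['council_reference'], record)
  end
end

scraper.rb itself is then reduced to picking a period and making the single call shown above.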
