diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..3465900
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "lib_icon_rest_xml"]
+	path = lib_icon_rest_xml
+	url = https://github.com/planningalerts-scrapers/lib_icon_rest_xml/
diff --git a/README.md b/README.md
index d16db5b..d9f2acf 100644
--- a/README.md
+++ b/README.md
@@ -2,10 +2,7 @@
 
 * Server - Microsoft
 * System - XC.Track
-* Cookie tracking - Yes
-* Pagination - Yes
-* Detail Page - Yes
-* Clearly defined data within a row - Sort of
+* XML - Yes
 
 Setup MORPH_PERIOD for data recovery, available options are
 
diff --git a/lib_icon_rest_xml b/lib_icon_rest_xml
new file mode 160000
index 0000000..3d62847
--- /dev/null
+++ b/lib_icon_rest_xml
@@ -0,0 +1 @@
+Subproject commit 3d628471dbb6f74b1319d9ebe2492ac684dcd47f
diff --git a/scraper.rb b/scraper.rb
index 6bba850..5d9b60b 100644
--- a/scraper.rb
+++ b/scraper.rb
@@ -1,74 +1,15 @@
-require 'scraperwiki'
-require 'mechanize'
+require File.dirname(__FILE__) + '/lib_icon_rest_xml/scraper'
 
 case ENV['MORPH_PERIOD']
 when 'lastmonth'
   period = "lastmonth"
 when 'thismonth'
   period = "thismonth"
-else
+when 'thisweek'
   period = "thisweek"
-  ENV['MORPH_PERIOD'] = 'thisweek'
-end
-puts "Getting data in `" + ENV['MORPH_PERIOD'] + "`, changable via MORPH_PERIOD environment"
-
-base_url = 'https://apptracking.northsydney.nsw.gov.au/Pages/XC.Track/SearchApplication.aspx'
-starting_url = base_url + '?d=' + period + '&k=LodgementDate&'
-comment_url = 'mailto:council@northsydney.nsw.gov.au'
+else
+  period = "last14days"
-
-def clean_whitespace(a)
-  a.gsub("\r", ' ').gsub("\n", ' ').squeeze(" ").strip
 end
+puts "Getting data in `" + period + "`, changeable via MORPH_PERIOD environment"
-
-agent = Mechanize.new
-
-# Jump through bollocks agree screen
-page = agent.get(starting_url)
-puts "Agreeing"
-# Workaround for weird content-encoding "gzip,gzip". 
-# See https://stackoverflow.com/questions/35444572/unsupported-content-encoding-gzip-gzip-when-submitting-form-with-mechanize
-agent.content_encoding_hooks << lambda { |httpagent, uri, response, body_io|
-  response['Content-Encoding'] = ''
-}
-page = page.forms.first.submit(page.forms.first.button_with(:value => "I Agree"))
-page = agent.get(starting_url + "&o=xml")
-
-# Explicitly interpret as XML
-page = Nokogiri::XML(page.content)
-
-raise "Can't find any elements" unless page.search('Application').length > 0
-
-page.search('Application').each do |application|
-  council_reference = clean_whitespace(application.at("ReferenceNumber").inner_text)
-
-  application_id = clean_whitespace(application.at("ApplicationId").inner_text.strip)
-  info_url = "#{base_url}?id=#{application_id}"
-
-  unless application.at("Line1")
-    puts "Skipping due to lack of address for #{council_reference}"
-    next
-  end
-
-  address = clean_whitespace(application.at("Line1").inner_text)
-  if !application.at('Line2').inner_text.empty?
-    address += ", " + clean_whitespace(application.at("Line2").inner_text)
-  end
-
-  record = {
-    "council_reference" => council_reference,
-    "description" => clean_whitespace(application.at("ApplicationDetails").inner_text),
-    "date_received" => Date.parse(application.at("LodgementDate").inner_text).to_s,
-    "address" => address,
-    "date_scraped" => Date.today.to_s,
-    "info_url" => info_url,
-    "comment_url" => comment_url,
-  }
-
-  if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true)
-    puts "Saving record " + record['council_reference'] + " - " + record['address']
-#   puts record
-    ScraperWiki.save_sqlite(['council_reference'], record)
-  else
-    puts "Skipping already saved record " + record['council_reference']
-  end
-end
+
+scrape_icon_rest_xml("https://apptracking.northsydney.nsw.gov.au/Pages/XC.Track/SearchApplication.aspx", "d=" + period + "&k=LodgementDate&o=xml")