From 90e18414ecdfa17b50689355fc66b16c642f8c7f Mon Sep 17 00:00:00 2001 From: Eric Tam Date: Mon, 12 Nov 2018 15:11:22 +1100 Subject: [PATCH 1/4] added lib_icon_rest_xml git submodule --- .gitmodules | 3 +++ lib_icon_rest_xml | 1 + 2 files changed, 4 insertions(+) create mode 100644 .gitmodules create mode 160000 lib_icon_rest_xml diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..3465900 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "lib_icon_rest_xml"] + path = lib_icon_rest_xml + url = https://github.com/planningalerts-scrapers/lib_icon_rest_xml/ diff --git a/lib_icon_rest_xml b/lib_icon_rest_xml new file mode 160000 index 0000000..3d62847 --- /dev/null +++ b/lib_icon_rest_xml @@ -0,0 +1 @@ +Subproject commit 3d628471dbb6f74b1319d9ebe2492ac684dcd47f From 6982b85608e1a9209f04dbf898c7ef2c9c7cca66 Mon Sep 17 00:00:00 2001 From: Eric Tam Date: Mon, 12 Nov 2018 15:11:49 +1100 Subject: [PATCH 2/4] updated code with new URL and use lib_icon_rest_xml module to fetch data --- scraper.rb | 65 +++++------------------------------------------------- 1 file changed, 6 insertions(+), 59 deletions(-) diff --git a/scraper.rb b/scraper.rb index 85ef037..ef9747c 100644 --- a/scraper.rb +++ b/scraper.rb @@ -1,69 +1,16 @@ -require 'scraperwiki' -require 'mechanize' +require File.dirname(__FILE__) + '/lib_icon_rest_xml/scraper' case ENV['MORPH_PERIOD'] when 'lastmonth' period = "lastmonth" when 'thismonth' period = "thismonth" - else + when period = "thisweek" - ENV['MORPH_PERIOD'] = 'thisweek' + else + period = "last14days" + ENV['MORPH_PERIOD'] = 'last14days' end puts "Getting data in `" + ENV['MORPH_PERIOD'] + "`, changable via MORPH_PERIOD environment" -base_url = 'http://masterview.northsydney.nsw.gov.au/Pages/XC.Track/SearchApplication.aspx' -starting_url = base_url + '?d=' + period + '&k=LodgementDate&' -comment_url = 'mailto:council@northsydney.nsw.gov.au' - -def clean_whitespace(a) - a.gsub("\r", ' ').gsub("\n", ' ').squeeze(" ").strip -end - -agent = Mechanize.new - -# Jump through bollocks agree screen -page = agent.get(starting_url) -puts "Agreeing" -page = page.forms.first.submit(page.forms.first.button_with(:value => "I Agree")) -page = agent.get(starting_url + "&o=xml") - -# Explicitly interpret as XML -page = Nokogiri::XML(page.content) - -raise "Can't find any elements" unless page.search('Application').length > 0 - -page.search('Application').each do |application| - council_reference = clean_whitespace(application.at("ReferenceNumber").inner_text) - - application_id = clean_whitespace(application.at("ApplicationId").inner_text.strip) - info_url = "#{base_url}?id=#{application_id}" - - unless application.at("Line1") - puts "Skipping due to lack of address for #{council_reference}" - next - end - - address = clean_whitespace(application.at("Line1").inner_text) - if !application.at('Line2').inner_text.empty? - address += ", " + clean_whitespace(application.at("Line2").inner_text) - end - - record = { - "council_reference" => council_reference, - "description" => clean_whitespace(application.at("ApplicationDetails").inner_text), - "date_received" => Date.parse(application.at("LodgementDate").inner_text).to_s, - "address" => address, - "date_scraped" => Date.today.to_s, - "info_url" => info_url, - "comment_url" => comment_url, - } - - if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true) - puts "Saving record " + record['council_reference'] + " - " + record['address'] -# puts record - ScraperWiki.save_sqlite(['council_reference'], record) - else - puts "Skipping already saved record " + record['council_reference'] - end -end +scrape_icon_rest_xml("https://apptracking.northsydney.nsw.gov.au/Pages/XC.Track/SearchApplication.aspx", "d=" + period + "&k=LodgementDate&o=xml") From b3d3ec0dd5ecfe45ea86ec150e0d312d244ffbeb Mon Sep 17 00:00:00 2001 From: Eric Tam Date: Mon, 12 Nov 2018 15:17:26 +1100 Subject: [PATCH 3/4] updated README --- README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index d16db5b..d9f2acf 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,7 @@ * Server - Microsoft * System - XC.Track -* Cookie tracking - Yes -* Pagination - Yes -* Detail Page - Yes -* Clearly defined data within a row - Sort of +* XML - Yes Setup MORPH_PERIOD for data recovery, available options are From ee6daeea234cc5ed9795aaf725e3ec81bec92636 Mon Sep 17 00:00:00 2001 From: Eric Tam Date: Mon, 12 Nov 2018 15:20:27 +1100 Subject: [PATCH 4/4] removed useless variable --- scraper.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scraper.rb b/scraper.rb index ef9747c..4df7496 100644 --- a/scraper.rb +++ b/scraper.rb @@ -9,8 +9,7 @@ period = "thisweek" else period = "last14days" - ENV['MORPH_PERIOD'] = 'last14days' end -puts "Getting data in `" + ENV['MORPH_PERIOD'] + "`, changable via MORPH_PERIOD environment" +puts "Getting data in `" + period + "`, changable via MORPH_PERIOD environment" scrape_icon_rest_xml("https://apptracking.northsydney.nsw.gov.au/Pages/XC.Track/SearchApplication.aspx", "d=" + period + "&k=LodgementDate&o=xml")