This repository has been archived by the owner on Jun 12, 2019. It is now read-only.
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5 from LoveMyData/master
updated URL and change to use XML library
- Loading branch information
Showing 4 changed files with 11 additions and 68 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
[submodule "lib_icon_rest_xml"] | ||
path = lib_icon_rest_xml | ||
url = https://github.com/planningalerts-scrapers/lib_icon_rest_xml/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Submodule lib_icon_rest_xml added at commit 3d6284.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,74 +1,16 @@ | ||
require 'scraperwiki' | ||
require 'mechanize' | ||
require File.dirname(__FILE__) + '/lib_icon_rest_xml/scraper' | ||
|
||
# Select the reporting period from the MORPH_PERIOD environment variable.
# Recognised values: "lastmonth", "thismonth"; anything else (including
# unset) falls back to "thisweek", and MORPH_PERIOD is normalised so the
# status message below reports the period actually used.
# NOTE(review): the scraped diff mangled this case statement (an `else`
# immediately followed by a bare `when` is a syntax error); this is the
# coherent pre-change version reconstructed from the deletion side.
case ENV['MORPH_PERIOD']
when 'lastmonth'
  period = "lastmonth"
when 'thismonth'
  period = "thismonth"
else
  period = "thisweek"
  ENV['MORPH_PERIOD'] = 'thisweek'
end
puts "Getting data in `" + ENV['MORPH_PERIOD'] + "`, changeable via MORPH_PERIOD environment"
|
||
# Endpoints for the legacy scraping flow below.
# `period` is the reporting window chosen from MORPH_PERIOD above.
# NOTE(review): the scraped diff left an orphaned `else` and a stray
# `period = "last14days"` (displaced lines from the replacement version)
# in this span; they are syntax errors at top level and are dropped here.
base_url = 'https://apptracking.northsydney.nsw.gov.au/Pages/XC.Track/SearchApplication.aspx'
starting_url = base_url + '?d=' + period + '&k=LodgementDate&'
comment_url = 'mailto:council@northsydney.nsw.gov.au'
|
||
# Normalises the whitespace in +text+: CR and LF become spaces, runs of
# spaces collapse to one, and leading/trailing whitespace is removed.
def clean_whitespace(text)
  flattened = text.gsub("\r", ' ')
  flattened = flattened.gsub("\n", ' ')
  flattened.squeeze(" ").strip
end
puts "Getting data in `" + period + "`, changable via MORPH_PERIOD environment" | ||
|
||
# --- Legacy scraping flow (the deletion side of this commit; replaced by the
# shared lib_icon_rest_xml helper called at the bottom of the file). Fetches
# the council's application search page, clicks through the disclaimer, pulls
# the results as XML, and saves one row per application into ScraperWiki. ---
agent = Mechanize.new

# Jump through bollocks agree screen
page = agent.get(starting_url)
puts "Agreeing"
# Workaround for weird content-encoding "gzip,gzip".
# See https://stackoverflow.com/questions/35444572/unsupported-content-encoding-gzip-gzip-when-submitting-form-with-mechanize
agent.content_encoding_hooks << lambda { |httpagent, uri, response, body_io|
  response['Content-Encoding'] = ''
}
# The hook above must be installed before this submit so the mangled
# Content-Encoding header on the response does not crash Mechanize.
page = page.forms.first.submit(page.forms.first.button_with(:value => "I Agree"))
# Re-request the search results with o=xml to get machine-readable output.
page = agent.get(starting_url + "&o=xml")

# Explicitly interpret as XML
page = Nokogiri::XML(page.content)

raise "Can't find any <Application> elements" unless page.search('Application').length > 0

# One <Application> element per development application.
page.search('Application').each do |application|
  council_reference = clean_whitespace(application.at("ReferenceNumber").inner_text)

  # The public detail page is addressed by ApplicationId, not ReferenceNumber.
  application_id = clean_whitespace(application.at("ApplicationId").inner_text.strip)
  info_url = "#{base_url}?id=#{application_id}"

  # An application with no <Line1> has no street address; skip it.
  unless application.at("Line1")
    puts "Skipping due to lack of address for #{council_reference}"
    next
  end

  # Address is Line1, with Line2 appended when present and non-empty.
  address = clean_whitespace(application.at("Line1").inner_text)
  if !application.at('Line2').inner_text.empty?
    address += ", " + clean_whitespace(application.at("Line2").inner_text)
  end

  record = {
    "council_reference" => council_reference,
    "description" => clean_whitespace(application.at("ApplicationDetails").inner_text),
    "date_received" => Date.parse(application.at("LodgementDate").inner_text).to_s,
    "address" => address,
    "date_scraped" => Date.today.to_s,
    "info_url" => info_url,
    "comment_url" => comment_url,
  }

  # Save only unseen references; the `rescue true` treats a missing `data`
  # table on the very first run as "no existing record" rather than crashing.
  if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true)
    puts "Saving record " + record['council_reference'] + " - " + record['address']
    # puts record
    ScraperWiki.save_sqlite(['council_reference'], record)
  else
    puts "Skipping already saved record " + record['council_reference']
  end
end
scrape_icon_rest_xml("https://apptracking.northsydney.nsw.gov.au/Pages/XC.Track/SearchApplication.aspx", "d=" + period + "&k=LodgementDate&o=xml") |