This repository has been archived by the owner on Jun 5, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from LoveMyData/master
Changed URL hence refactor all code
- Loading branch information
Showing
2 changed files
with
82 additions
and
66 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# City Of Onkaparinga Council Scraper | ||
|
||
Note - This scraper will only scrape the primary DA and any subsequent DAs will not be scraped. ||
e.g. 683/2017/2 will not be scraped, but 683/2017 will ||
|
||
* Server - .NET - ePathway | ||
* Cookie tracking - Yes | ||
* Pagination - No | ||
* Javascript - Yes | ||
* Clearly defined data within a row - No but acceptable | ||
* Force scan one record at a time - Yes | ||
* User-Agent - Yes | ||
|
||
Set up MORPH_PERIOD for data recovery; available options are ||
|
||
* year value (default is this year) | ||
|
||
Enjoy | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,84 +1,81 @@ | ||
require 'scraperwiki' | ||
require 'mechanize' | ||
|
||
use_cache = false | ||
cache_fn = 'cache.html' | ||
domain = 'http://www.onkaparingacity.com' | ||
url = 'http://www.onkaparingacity.com/onka/living_here/planning_development/applications_for_public_comment.jsp' | ||
comment_url = 'mailto:mail@onkaparinga.sa.gov.au?Subject=Planning+application+' | ||
|
||
# Clean up repeated and Unicode spaces. | ||
def clean(t) | ||
t.gsub(/\u00A0|\uC2A0/,' ').squeeze(' ').strip | ||
end | ||
|
||
# The structure looks like: | ||
# <strong>Key:</strong>val<br> | ||
# | ||
def maybe_get_field(p, key) | ||
p.search('strong').each do |elem| | ||
if elem.inner_text.start_with?(key) | ||
val = elem.next_sibling | ||
return clean(val.inner_text) if val.name == 'text' | ||
def is_valid_year(date_str, min=2004, max=DateTime.now.year) | ||
if ( date_str.scan(/^(\d)+$/) ) | ||
if ( (min..max).include?(date_str.to_i) ) | ||
return true | ||
end | ||
end | ||
return '' | ||
return false | ||
end | ||
|
||
def get_field(p, key) | ||
val = maybe_get_field(p, key) | ||
raise "Can't find mandator field " + key if val == '' | ||
return val | ||
unless ( is_valid_year(ENV['MORPH_PERIOD'].to_s) ) | ||
ENV['MORPH_PERIOD'] = DateTime.now.year.to_s | ||
end | ||
puts "Getting data in year `" + ENV['MORPH_PERIOD'].to_s + "`, changable via MORPH_PERIOD environment" | ||
|
||
def maybe_get_date(p, key) | ||
val = maybe_get_field(p, key) | ||
return Date.parse(val).to_s if val != '' | ||
return '' | ||
end | ||
base_url = "http://pathway.onkaparinga.sa.gov.au/ePathway/Production/Web/" | ||
comment_url = "mailto:mail@onkaparinga.sa.gov.au" | ||
|
||
if use_cache and File.exist?(cache_fn) | ||
body = '' | ||
File.open(cache_fn, 'r') {|f| body = f.read() } | ||
page = Nokogiri(body) | ||
else | ||
agent = Mechanize.new | ||
page = agent.get(url) | ||
File.open(cache_fn, 'w') {|f| f.write(page.body) } | ||
end | ||
# get the right cookies | ||
agent = Mechanize.new | ||
agent.user_agent_alias = 'Mac Safari' | ||
page = agent.get base_url + "default.aspx" | ||
|
||
found = false | ||
page.search('div.centricGeneral p').each do |p| | ||
next unless p.inner_text =~ /\AApplication Number:/; | ||
found = true | ||
council_reference = get_field(p, 'Application Number:') | ||
# get to the page I can enter DA search | ||
page = agent.get base_url + "GeneralEnquiry/EnquiryLists.aspx?ModuleCode=LAP" | ||
|
||
# Fix up the address: | ||
address = get_field(p, 'Subject Land:') | ||
# "278 (Allot 55 Sec 159 DP 69079) Communication Road, TATACHILLA SA 5171" | ||
if address.include?('(') | ||
address = address.split('(').first + address.split(')').last | ||
# local DB lookup if DB exist and find out what is the maxDA number | ||
i = 1; | ||
sql = "select * from data where `council_reference` like '%/#{ENV['MORPH_PERIOD']}'" | ||
results = ScraperWiki.sqliteexecute(sql) rescue false | ||
if ( results ) | ||
results.each do |result| | ||
maxDA = result['council_reference'].gsub!("/#{ENV['MORPH_PERIOD']}", '') | ||
if maxDA.to_i > i | ||
i = maxDA.to_i | ||
end | ||
end | ||
address.squeeze!(' ') | ||
# "Allot 102 Sec 1242 Range Road West, WILLUNGA SOUTH SA 5172" | ||
address.sub!(/\AAllot \S+ Sec /, '') | ||
end | ||
|
||
record = { | ||
'council_reference' => council_reference, | ||
'address' => address, | ||
'description' => get_field(p, 'Nature of Development:'), | ||
'info_url' => url, | ||
'comment_url' => comment_url + CGI::escape(council_reference), | ||
'date_scraped' => Date.today.to_s, | ||
'on_notice_from' => maybe_get_date(p, 'Advertising Date:'), | ||
'on_notice_to' => maybe_get_date(p, 'Close Date:'), | ||
} | ||
error = 0 | ||
cont = true | ||
while cont do | ||
form = page.form | ||
form.field_with(:name=>'ctl00$MainBodyContent$mGeneralEnquirySearchControl$mTabControl$ctl04$mFormattedNumberTextBox').value = i.to_s + '/' + ENV['MORPH_PERIOD'].to_s | ||
button = form.button_with(:value => "Search") | ||
list = form.click_button(button) | ||
|
||
if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true) | ||
ScraperWiki.save_sqlite(['council_reference'], record) | ||
table = list.search("table.ContentPanel") | ||
unless ( table.empty? ) | ||
error = 0 | ||
tr = table.search("tr.ContentPanel") | ||
|
||
record = { | ||
'council_reference' => tr.search('a').inner_text, | ||
'address' => tr.search('span')[3].inner_text, | ||
'description' => tr.search('span')[2].inner_text.gsub("\n", '. ').squeeze(' '), | ||
'info_url' => base_url + 'GeneralEnquiry/' + tr.search('a')[0]['href'], | ||
'comment_url' => comment_url, | ||
'date_scraped' => Date.today.to_s, | ||
'date_received' => Date.parse(tr.search('span')[1].inner_text).to_s, | ||
} | ||
|
||
if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true) | ||
puts "Saving record " + record['council_reference'] + ", " + record['address'] | ||
# puts record | ||
ScraperWiki.save_sqlite(['council_reference'], record) | ||
else | ||
puts 'Skipping already saved record ' + record['council_reference'] | ||
end | ||
else | ||
puts 'Skipping already saved record ' + record['council_reference'] | ||
error += 1 | ||
end | ||
end | ||
|
||
raise "No entries found." unless found | ||
# increase i value and scan the next DA | ||
i += 1 | ||
if error == 10 | ||
cont = false | ||
end | ||
end |