Skip to content
This repository has been archived by the owner on Jun 5, 2019. It is now read-only.

Commit

Permalink
Merge pull request #4 from LoveMyData/master
Browse files Browse the repository at this point in the history
Changed URL hence refactor all code
  • Loading branch information
equivalentideas committed May 23, 2017
2 parents d284f9a + fa95e38 commit 5102c9d
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 66 deletions.
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# City Of Onkaparinga Council Scraper

Note - This scraper will only scrape the primary DA; any subsequent DAs will not be scraped.
e.g. 683/2017/2 will not be scraped but 683/2017 will

* Server - .NET - ePathway
* Cookie tracking - Yes
* Pagination - No
* Javascript - Yes
* Clearly defined data within a row - No but acceptable
* Force scan one record at a time - Yes
* User-Agent - Yes

Set the MORPH_PERIOD environment variable for data recovery; available options are:

* year value (default is this year)

Enjoy

129 changes: 63 additions & 66 deletions scraper.rb
Original file line number Diff line number Diff line change
@@ -1,84 +1,81 @@
require 'scraperwiki'
require 'mechanize'

use_cache = false
cache_fn = 'cache.html'
domain = 'http://www.onkaparingacity.com'
url = 'http://www.onkaparingacity.com/onka/living_here/planning_development/applications_for_public_comment.jsp'
comment_url = 'mailto:mail@onkaparinga.sa.gov.au?Subject=Planning+application+'

# Clean up repeated and Unicode spaces.
def clean(t)
t.gsub(/\u00A0|\uC2A0/,' ').squeeze(' ').strip
end

# The structure looks like:
# <strong>Key:</strong>val<br>
#
def maybe_get_field(p, key)
p.search('strong').each do |elem|
if elem.inner_text.start_with?(key)
val = elem.next_sibling
return clean(val.inner_text) if val.name == 'text'
def is_valid_year(date_str, min=2004, max=DateTime.now.year)
if ( date_str.scan(/^(\d)+$/) )
if ( (min..max).include?(date_str.to_i) )
return true
end
end
return ''
return false
end

def get_field(p, key)
val = maybe_get_field(p, key)
raise "Can't find mandator field " + key if val == ''
return val
unless ( is_valid_year(ENV['MORPH_PERIOD'].to_s) )
ENV['MORPH_PERIOD'] = DateTime.now.year.to_s
end
puts "Getting data in year `" + ENV['MORPH_PERIOD'].to_s + "`, changable via MORPH_PERIOD environment"

def maybe_get_date(p, key)
val = maybe_get_field(p, key)
return Date.parse(val).to_s if val != ''
return ''
end
base_url = "http://pathway.onkaparinga.sa.gov.au/ePathway/Production/Web/"
comment_url = "mailto:mail@onkaparinga.sa.gov.au"

if use_cache and File.exist?(cache_fn)
body = ''
File.open(cache_fn, 'r') {|f| body = f.read() }
page = Nokogiri(body)
else
agent = Mechanize.new
page = agent.get(url)
File.open(cache_fn, 'w') {|f| f.write(page.body) }
end
# get the right cookies
agent = Mechanize.new
agent.user_agent_alias = 'Mac Safari'
page = agent.get base_url + "default.aspx"

found = false
page.search('div.centricGeneral p').each do |p|
next unless p.inner_text =~ /\AApplication Number:/;
found = true
council_reference = get_field(p, 'Application Number:')
# get to the page I can enter DA search
page = agent.get base_url + "GeneralEnquiry/EnquiryLists.aspx?ModuleCode=LAP"

# Fix up the address:
address = get_field(p, 'Subject Land:')
# "278 (Allot 55 Sec 159 DP 69079) Communication Road, TATACHILLA SA 5171"
if address.include?('(')
address = address.split('(').first + address.split(')').last
# local DB lookup if DB exist and find out what is the maxDA number
i = 1;
sql = "select * from data where `council_reference` like '%/#{ENV['MORPH_PERIOD']}'"
results = ScraperWiki.sqliteexecute(sql) rescue false
if ( results )
results.each do |result|
maxDA = result['council_reference'].gsub!("/#{ENV['MORPH_PERIOD']}", '')
if maxDA.to_i > i
i = maxDA.to_i
end
end
address.squeeze!(' ')
# "Allot 102 Sec 1242 Range Road West, WILLUNGA SOUTH SA 5172"
address.sub!(/\AAllot \S+ Sec /, '')
end

record = {
'council_reference' => council_reference,
'address' => address,
'description' => get_field(p, 'Nature of Development:'),
'info_url' => url,
'comment_url' => comment_url + CGI::escape(council_reference),
'date_scraped' => Date.today.to_s,
'on_notice_from' => maybe_get_date(p, 'Advertising Date:'),
'on_notice_to' => maybe_get_date(p, 'Close Date:'),
}
error = 0
cont = true
while cont do
form = page.form
form.field_with(:name=>'ctl00$MainBodyContent$mGeneralEnquirySearchControl$mTabControl$ctl04$mFormattedNumberTextBox').value = i.to_s + '/' + ENV['MORPH_PERIOD'].to_s
button = form.button_with(:value => "Search")
list = form.click_button(button)

if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true)
ScraperWiki.save_sqlite(['council_reference'], record)
table = list.search("table.ContentPanel")
unless ( table.empty? )
error = 0
tr = table.search("tr.ContentPanel")

record = {
'council_reference' => tr.search('a').inner_text,
'address' => tr.search('span')[3].inner_text,
'description' => tr.search('span')[2].inner_text.gsub("\n", '. ').squeeze(' '),
'info_url' => base_url + 'GeneralEnquiry/' + tr.search('a')[0]['href'],
'comment_url' => comment_url,
'date_scraped' => Date.today.to_s,
'date_received' => Date.parse(tr.search('span')[1].inner_text).to_s,
}

if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true)
puts "Saving record " + record['council_reference'] + ", " + record['address']
# puts record
ScraperWiki.save_sqlite(['council_reference'], record)
else
puts 'Skipping already saved record ' + record['council_reference']
end
else
puts 'Skipping already saved record ' + record['council_reference']
error += 1
end
end

raise "No entries found." unless found
# increase i value and scan the next DA
i += 1
if error == 10
cont = false
end
end

0 comments on commit 5102c9d

Please sign in to comment.