This repository has been archived by the owner on Jun 5, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from LoveMyData/master
Changed URL hence refactor all code
- Loading branch information
Showing
2 changed files
with
82 additions
and
66 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# City Of Onkaparinga Council Scraper | ||
|
||
Note - This scraper will only scrape the primary DA and any subsequent DAs will not be scraped. ||
e.g. 683/2017/2 will not be scraped, but 683/2017 will ||
|
||
* Server - .NET - ePathway | ||
* Cookie tracking - Yes | ||
* Pagination - No | ||
* Javascript - Yes | ||
* Clearly defined data within a row - No but acceptable | ||
* Force scan one record at a time - Yes | ||
* User-Agent - Yes | ||
|
||
Set up MORPH_PERIOD for data recovery; available options are ||
|
||
* year value (default is this year) | ||
|
||
Enjoy | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,84 +1,81 @@ | ||
require 'scraperwiki' | ||
require 'mechanize' | ||
|
||
use_cache = false | ||
cache_fn = 'cache.html' | ||
domain = 'http://www.onkaparingacity.com' | ||
url = 'http://www.onkaparingacity.com/onka/living_here/planning_development/applications_for_public_comment.jsp' | ||
comment_url = 'mailto:mail@onkaparinga.sa.gov.au?Subject=Planning+application+' | ||
|
||
# Clean up repeated and Unicode spaces. | ||
def clean(t) | ||
t.gsub(/\u00A0|\uC2A0/,' ').squeeze(' ').strip | ||
end | ||
|
||
# The structure looks like: | ||
# <strong>Key:</strong>val<br> | ||
# | ||
def maybe_get_field(p, key) | ||
p.search('strong').each do |elem| | ||
if elem.inner_text.start_with?(key) | ||
val = elem.next_sibling | ||
return clean(val.inner_text) if val.name == 'text' | ||
def is_valid_year(date_str, min=2004, max=DateTime.now.year) | ||
if ( date_str.scan(/^(\d)+$/) ) | ||
if ( (min..max).include?(date_str.to_i) ) | ||
return true | ||
end | ||
end | ||
return '' | ||
return false | ||
end | ||
|
||
def get_field(p, key) | ||
val = maybe_get_field(p, key) | ||
raise "Can't find mandator field " + key if val == '' | ||
return val | ||
unless ( is_valid_year(ENV['MORPH_PERIOD'].to_s) ) | ||
ENV['MORPH_PERIOD'] = DateTime.now.year.to_s | ||
end | ||
puts "Getting data in year `" + ENV['MORPH_PERIOD'].to_s + "`, changable via MORPH_PERIOD environment" | ||
|
||
def maybe_get_date(p, key) | ||
val = maybe_get_field(p, key) | ||
return Date.parse(val).to_s if val != '' | ||
return '' | ||
end | ||
base_url = "http://pathway.onkaparinga.sa.gov.au/ePathway/Production/Web/" | ||
comment_url = "mailto:mail@onkaparinga.sa.gov.au" | ||
|
||
if use_cache and File.exist?(cache_fn) | ||
body = '' | ||
File.open(cache_fn, 'r') {|f| body = f.read() } | ||
page = Nokogiri(body) | ||
else | ||
agent = Mechanize.new | ||
page = agent.get(url) | ||
File.open(cache_fn, 'w') {|f| f.write(page.body) } | ||
end | ||
# get the right cookies | ||
agent = Mechanize.new | ||
agent.user_agent_alias = 'Mac Safari' | ||
page = agent.get base_url + "default.aspx" | ||
|
||
found = false | ||
page.search('div.centricGeneral p').each do |p| | ||
next unless p.inner_text =~ /\AApplication Number:/; | ||
found = true | ||
council_reference = get_field(p, 'Application Number:') | ||
# get to the page I can enter DA search | ||
page = agent.get base_url + "GeneralEnquiry/EnquiryLists.aspx?ModuleCode=LAP" | ||
|
||
# Fix up the address: | ||
address = get_field(p, 'Subject Land:') | ||
# "278 (Allot 55 Sec 159 DP 69079) Communication Road, TATACHILLA SA 5171" | ||
if address.include?('(') | ||
address = address.split('(').first + address.split(')').last | ||
# local DB lookup if DB exist and find out what is the maxDA number | ||
i = 1; | ||
sql = "select * from data where `council_reference` like '%/#{ENV['MORPH_PERIOD']}'" | ||
results = ScraperWiki.sqliteexecute(sql) rescue false | ||
if ( results ) | ||
results.each do |result| | ||
maxDA = result['council_reference'].gsub!("/#{ENV['MORPH_PERIOD']}", '') | ||
if maxDA.to_i > i | ||
i = maxDA.to_i | ||
end | ||
end | ||
address.squeeze!(' ') | ||
# "Allot 102 Sec 1242 Range Road West, WILLUNGA SOUTH SA 5172" | ||
address.sub!(/\AAllot \S+ Sec /, '') | ||
end | ||
|
||
record = { | ||
'council_reference' => council_reference, | ||
'address' => address, | ||
'description' => get_field(p, 'Nature of Development:'), | ||
'info_url' => url, | ||
'comment_url' => comment_url + CGI::escape(council_reference), | ||
'date_scraped' => Date.today.to_s, | ||
'on_notice_from' => maybe_get_date(p, 'Advertising Date:'), | ||
'on_notice_to' => maybe_get_date(p, 'Close Date:'), | ||
} | ||
error = 0 | ||
cont = true | ||
while cont do | ||
form = page.form | ||
form.field_with(:name=>'ctl00$MainBodyContent$mGeneralEnquirySearchControl$mTabControl$ctl04$mFormattedNumberTextBox').value = i.to_s + '/' + ENV['MORPH_PERIOD'].to_s | ||
button = form.button_with(:value => "Search") | ||
list = form.click_button(button) | ||
|
||
if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true) | ||
ScraperWiki.save_sqlite(['council_reference'], record) | ||
table = list.search("table.ContentPanel") | ||
unless ( table.empty? ) | ||
error = 0 | ||
tr = table.search("tr.ContentPanel") | ||
|
||
record = { | ||
'council_reference' => tr.search('a').inner_text, | ||
'address' => tr.search('span')[3].inner_text, | ||
'description' => tr.search('span')[2].inner_text.gsub("\n", '. ').squeeze(' '), | ||
'info_url' => base_url + 'GeneralEnquiry/' + tr.search('a')[0]['href'], | ||
'comment_url' => comment_url, | ||
'date_scraped' => Date.today.to_s, | ||
'date_received' => Date.parse(tr.search('span')[1].inner_text).to_s, | ||
} | ||
|
||
if (ScraperWiki.select("* from data where `council_reference`='#{record['council_reference']}'").empty? rescue true) | ||
puts "Saving record " + record['council_reference'] + ", " + record['address'] | ||
# puts record | ||
ScraperWiki.save_sqlite(['council_reference'], record) | ||
else | ||
puts 'Skipping already saved record ' + record['council_reference'] | ||
end | ||
else | ||
puts 'Skipping already saved record ' + record['council_reference'] | ||
error += 1 | ||
end | ||
end | ||
|
||
raise "No entries found." unless found | ||
# increase i value and scan the next DA | ||
i += 1 | ||
if error == 10 | ||
cont = false | ||
end | ||
end |