First pass

planningalerts-scrapers · Apr 4, 2014 · cd108f9 · cd108f9
1 parent 0b7cd6b
commit cd108f9
Showing 1 changed file with 35 additions and 22 deletions.
diff --git a/scraper.rb b/scraper.rb
@@ -1,24 +1,37 @@
-# This is a template for a Ruby scraper on Morph (https://morph.io)
-# including some code snippets below that you should find helpful
+require 'scraperwiki'
+require 'mechanize'
 
-# require 'scraperwiki'
-# require 'mechanize'
-#
-# agent = Mechanize.new
-#
-# # Read in a page
-# page = agent.get("http://foo.com")
-#
-# # Find somehing on the page using css selectors
-# p page.at('div.content')
-#
-# # Write out to the sqlite database using scraperwiki library
-# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
-#
-# # An arbitrary query against the database
-# ScraperWiki.select("* from data where 'name'='peter'")
+# One agent is reused for the initial page fetch and for every result page fetch below.
+agent = Mechanize.new
 
-# You don't have to do things with the Mechanize or ScraperWiki libraries. You can use whatever gems are installed
-# on Morph for Ruby (https://github.com/openaustralia/morph-docker-ruby/blob/master/Gemfile) and all that matters
-# is that your final data is written to an Sqlite database called data.sqlite in the current working directory which
-# has at least a table called data.
+# Enquiry list page that is fetched first; it is also passed to scrape_page,
+# which stores it as both "info_url" and "comment_url" on every record.
+url = "https://eservices.moreland.vic.gov.au/ePathway/Production/Web/GeneralEnquiry/EnquiryLists.aspx?ModuleCode=LAP"
+
+# Prints one record for each application row in the results table of +page+.
+#
+# page - parsed results page; must respond to +at+ with a CSS selector
+#        (a Mechanize page in this script).
+# url  - stored as both "info_url" and "comment_url" on every record.
+#
+# The first <tr> of the table is skipped (presumably the header row). Rows with
+# fewer than four <td> cells or without a link inside a <td> are skipped rather
+# than raising NoMethodError on nil. Cell text has surrounding whitespace removed.
+#
+# NOTE: records are only printed with +p+; nothing is persisted by this method.
+def scrape_page(page, url)
+  table = page.at("table.ContentPanel")
+  # No results table on the page means there is nothing to scrape.
+  return if table.nil?
+
+  # drop(1) skips the first row and, unlike [1..-1], never yields nil for an empty set.
+  table.search("tr").drop(1).each do |tr|
+    cells = tr.search("td")
+    reference = tr.at("td a")
+    next if cells.length < 4 || reference.nil?
+
+    # The second cell is split on "/" as day/month/year and rebuilt as year-month-day.
+    day, month, year = cells[1].inner_text.strip.split("/")
+    record = {
+      "info_url" => url,
+      "comment_url" => url,
+      "council_reference" => reference.inner_text.strip,
+      "date_received" => "#{year}-#{month}-#{day}",
+      "description" => cells[2].inner_text.strip,
+      "address" => cells[3].inner_text.strip
+    }
+    p record
+  end
+end
+
+page = agent.get(url)
+
+# Check the first radio button on the first form, then submit through its submit button.
+form = page.forms.first
+form.radiobuttons.first.check
+page = form.submit(form.button_with(type: "submit"))
+
+# Now do the paging magic
+# The fourth whitespace-separated word of the paging label is read as the page
+# count (presumably text like "Page 1 of 5" — TODO confirm against the live site).
+# If the label is missing, assume a single page of results instead of failing on nil.
+page_label = page.at("#ctl00_MainBodyContent_mPagingControl_pageNumberLabel")
+number_pages = page_label ? page_label.inner_text.split(" ")[3].to_i : 1
+
+(1..number_pages).each do |no|
+  page = agent.get("https://eservices.moreland.vic.gov.au/ePathway/Production/Web/GeneralEnquiry/EnquirySummaryView.aspx?PageNumber=#{no}")
+  puts "Scraping page #{no} of results..."
+  scrape_page(page, url)
+end