Skip to content
This repository has been archived by the owner on Jun 5, 2019. It is now read-only.

Commit

Permalink
Merge pull request #1 from JasonThomasData/fixing_monash
Browse files (browse the repository at this point in the history)
Fixing monash
  • Loading branch information
equivalentideas committed Aug 12, 2016
2 parents 4f48520 + ecfa9e7 commit 4894fee
Showing 1 changed file with 26 additions and 14 deletions.
40 changes: 26 additions & 14 deletions scraper.rb
Expand Up @@ -3,16 +3,21 @@

agent = Mechanize.new

def scrape_page(page)
def scrape_page(page, base_url)
table = page.at("table.ContentPanel")

table.search("tr")[1..-1].each do |tr|
tr_elems = table.search("tr")

tr_elems.each_with_index do |tr, index|
next if index == 0 # Skip the first row because it's got <th> not <td>

this_application_details = tr.css('td a')[0].attributes['href']
this_application_link = base_url + this_application_details.to_s

day, month, year = tr.search("td")[3].inner_text.split("/").map{|s| s.to_i}
default_url = "https://epathway.monash.vic.gov.au/ePathway/Production/Web/GeneralEnquiry/EnquiryLists.aspx?ModuleCode=LAP"


record = {
"info_url" => default_url,
"comment_url" => default_url,
"info_url" => this_application_link,
"comment_url" => 'mailto:mail@monash.vic.gov.au',
"council_reference" => tr.at("td a").inner_text,
"address" => tr.search("td")[1].inner_text,
"description" => tr.search("td")[2].inner_text,
Expand All @@ -29,16 +34,23 @@ def scrape_page(page)
end
end

# Load summary page.
url = "https://epathway.monash.vic.gov.au/ePathway/Production/Web/GeneralEnquiry/EnquiryLists.aspx?ModuleCode=LAP"
page = agent.get(url)
base_url = "https://epathway.monash.vic.gov.au/ePathway/Production/Web/GeneralEnquiry/"
entry_form_url_ext = "/EnquiryLists.aspx?ModuleCode=LAP"
first_page_url = base_url + entry_form_url_ext

# Get first page with radio buttons.
page = agent.get(first_page_url)
form = page.forms.first
form.radiobuttons[0].check
page = form.submit(form.button_with(:value => "Next"))

# Now do the paging magic
number_pages = page.at("#ctl00_MainBodyContent_mPagingControl_pageNumberLabel").inner_text.split(" ")[3].to_i

(1..number_pages).each do |no|
url = "https://epathway.monash.vic.gov.au/ePathway/Production/Web/GeneralEnquiry/EnquirySummaryView.aspx?PageNumber=#{no}"
page = agent.get(url)
result_page_extension = "/EnquirySummaryView.aspx?PageNumber=#{no}"
results_page_url = base_url + result_page_extension
page = agent.get(results_page_url)
puts "Scraping page #{no} of " + number_pages.to_s + "..."
scrape_page(page)
end
scrape_page(page, base_url)
end

0 comments on commit 4894fee

Please sign in to comment.