diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..66d464d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+# Ignore output of scraper
+data.sqlite
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d15e58e
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+Gets development applications for the [City of Ryde](http://www.ryde.nsw.gov.au/development/pn.htm).
\ No newline at end of file
diff --git a/scraper.rb b/scraper.rb
new file mode 100644
index 0000000..7c5ff27
--- /dev/null
+++ b/scraper.rb
@@ -0,0 +1,48 @@
+# Scrapes development applications (DAs) received by the City of Ryde and saves
+# one record per application, keyed by its council reference.
+require 'rubygems'
+require 'mechanize'
+require 'date'
+
+# NOTE(review): ScraperWiki is used below but never required here; presumably
+# the hosting platform preloads it - confirm before running elsewhere.
+
+url = 'http://www.ryde.nsw.gov.au/Development/Development+Applications/DAs+on+Exhibition/Received+Development+Applications'
+agent = Mechanize.new
+
+page = agent.get(url)
+
+# Fail with a clear message if the page layout changed, rather than with a
+# NoMethodError on nil.
+content = page.at('div.content-spacing')
+raise "No div.content-spacing element found at #{url}" if content.nil?
+
+content.search('p').each do |paragraph|
+  # Look the <strong> labels up once; every field is read relative to them.
+  labels = paragraph.search('strong')
+
+  # Skip if this isn't a DA
+  next if labels.count < 3
+
+  record = {
+    # Sibling node right after the second label, minus ': ' and '. ' separators.
+    'council_reference' => labels[1].next.inner_text.gsub(': ', '').gsub('. ', '').strip,
+    # Third sibling node after the third label.
+    'description' => labels[2].next.next.next.inner_text.strip,
+    # Sibling node right after the first label, minus the ': ' separator.
+    'address' => labels[0].next.inner_text.gsub(': ', '').strip,
+    # Both links point at the listing page itself.
+    'info_url' => url,
+    'comment_url' => url,
+    'date_scraped' => Date.today.to_s
+  }
+
+  # Bind the reference as a query parameter: it is scraped text, so
+  # interpolating it into the SQL would break on a quote character.
+  existing = ScraperWiki.select('* from swdata where `council_reference`=?', [record['council_reference']])
+  if existing.empty?
+    ScraperWiki.save_sqlite(['council_reference'], record)
+  else
+    puts "Skipping already saved record #{record['council_reference']}"
+  end
+end