
Commit

Add scraper code
lizconlan committed Aug 10, 2015
1 parent edb87f0 commit 20b96b8
Showing 4 changed files with 139 additions and 49 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,2 +1,5 @@
# Ignore output of scraper
data.sqlite

# Ignore local data
.cache/*
4 changes: 3 additions & 1 deletion Gemfile
@@ -7,4 +7,6 @@ source "https://rubygems.org"
ruby "2.0.0"

gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "mechanize"
gem "colorize"
gem "nokogiri"
gem "open-uri-cached"
32 changes: 9 additions & 23 deletions Gemfile.lock
@@ -10,38 +10,24 @@ GIT
GEM
remote: https://rubygems.org/
specs:
domain_name (0.5.24)
unf (>= 0.0.5, < 1.0.0)
http-cookie (1.0.2)
domain_name (~> 0.5)
colorize (0.7.7)
httpclient (2.6.0.1)
mechanize (2.7.3)
domain_name (~> 0.5, >= 0.5.1)
http-cookie (~> 1.0)
mime-types (~> 2.0)
net-http-digest_auth (~> 1.1, >= 1.1.1)
net-http-persistent (~> 2.5, >= 2.5.2)
nokogiri (~> 1.4)
ntlm-http (~> 0.1, >= 0.1.1)
webrobots (>= 0.0.9, < 0.2)
mime-types (2.5)
mini_portile (0.6.2)
net-http-digest_auth (1.4)
net-http-persistent (2.9.4)
nokogiri (1.6.6.2)
mini_portile (~> 0.6.0)
ntlm-http (0.1.1)
open-uri-cached (0.0.5)
sqlite3 (1.3.10)
sqlite_magic (0.0.3)
sqlite_magic (0.0.6)
sqlite3
unf (0.1.4)
unf_ext
unf_ext (0.0.7.1)
webrobots (0.1.1)

PLATFORMS
ruby

DEPENDENCIES
mechanize
colorize
nokogiri
open-uri-cached
scraperwiki!

BUNDLED WITH
1.10.6
149 changes: 124 additions & 25 deletions scraper.rb
100644 → 100755
@@ -1,25 +1,124 @@
# This is a template for a Ruby scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# require 'scraperwiki'
# require 'mechanize'
#
# agent = Mechanize.new
#
# # Read in a page
# page = agent.get("http://foo.com")
#
# # Find something on the page using CSS selectors
# p page.at('div.content')
#
# # Write out to the sqlite database using scraperwiki library
# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
#
# # An arbitrary query against the database
# ScraperWiki.select("* from data where name='peter'")

# You don't have to do things with the Mechanize or ScraperWiki libraries.
# You can use whatever gems you want: https://morph.io/documentation/ruby
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
#!/usr/bin/env ruby
# encoding: utf-8

require 'scraperwiki'
require 'nokogiri'
require 'open-uri'

require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'
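
# String#tidy (used in the scraping methods below) is not part of core Ruby;
# the original presumably picked it up from a shared helper. A minimal sketch,
# assuming it just collapses runs of whitespace and trims the ends:
class String
  def tidy
    gsub(/[[:space:]]+/, ' ').strip
  end
end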

base_url = 'http://www.caribbeanelections.com/ai/elections/'
start_page = "#{base_url}default.asp"

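# Fetch a page (cached via open-uri-cached, configured above) and parse it with Nokogiri.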
def get_page(url)
Nokogiri::HTML(open(url).read)
end

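# Collect the unique links to per-election results pages ("_results_" hrefs) from the index page.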
def scrape_election_list(start_page, base_url)
pages = []
doc = get_page(start_page)
doc.css('a[href*="_results_"]/@href').map(&:text).uniq.each do |page|
pages << "#{base_url}#{page}"
end
pages
end

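# Pull the four-digit year out of each results URL and return the years sorted ascending.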
def election_years(urls)
years = []
urls.each do |url|
if url =~ /(\d{4})/
years << $1.to_i
end
end
years.sort
end

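# Save one legislative term per pair of consecutive election years to the 'terms'
# table; the final term is assumed to run term_length years past the last election.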
def create_terms(years, term_length=5)
term_years = years.dup
  term_years << term_years[-1] + term_length
term_years.each_cons(2) do |s, e|
term = {
id: s,
name: "#{s}#{e}",
start_date: s,
end_date: e,
}
ScraperWiki.save_sqlite([:id], term, 'terms')
end
end

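# Scrape an MP's profile page for a portrait image and a Facebook link.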
def scrape_mp(url)
doc = get_page(url)
data = {
    image: doc.css('img[@src*="/people/"]').min_by { |i| i.attr('width').to_i }.attr('src'),
facebook: doc.css('a.inside[@href*="facebook.com"]/@href').text,
}
data[:image] = URI.join(url, data[:image]).to_s unless data[:image].to_s.empty?
data
end

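# Scrape a constituency results page: one row per representative, merging in
# profile details wherever an MP link exists.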
def scrape_constituency(url)
doc = get_page(url)
constituency = doc.css('.Article02').text
puts constituency
doc.xpath('.//span[@class="votes" and contains(.,"Representatives")]/ancestor::table[1]/tr[2]//table/tr').drop(1).each do |tr|
tds = tr.css('td')
data = {
name: tds[1].text.tidy,
party: tds[2].text.tidy,
term: tds[0].text.tidy,
constituency: constituency,
source: url.to_s,
}
mp_link = tds[1].css('a/@href')
unless mp_link.to_s.empty?
new_data = scrape_mp(URI.join(url, mp_link.text))
data.merge! new_data
end
# puts data
ScraperWiki.save_sqlite([:name, :term], data)
end
end

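# Scrape a single election's results page, following each district link through
# to its constituency page.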
def scrape_election(url, base_url)
doc = get_page(url)
doc.css('a[href*="/district/"]/@href').map(&:text).uniq.each do |page|
scrape_constituency("#{base_url}#{page}")
end
end


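# Entry point: list the election pages, store the terms they imply, then scrape each election.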
election_pages = scrape_election_list(start_page, base_url)

# store the term data
create_terms(election_years(election_pages))

election_pages.each do |election_page|
# scrape and store election data
  scrape_election(election_page, base_url)
end
