
Commit

Add scraper code
lizconlan committed Aug 10, 2015
1 parent edb87f0 commit 20b96b8
Showing 4 changed files with 139 additions and 49 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,2 +1,5 @@
# Ignore output of scraper
data.sqlite

# Ignore local data
.cache/*
4 changes: 3 additions & 1 deletion Gemfile
@@ -7,4 +7,6 @@ source "https://rubygems.org"
ruby "2.0.0"

gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "mechanize"
gem "colorize"
gem "nokogiri"
gem "open-uri-cached"
32 changes: 9 additions & 23 deletions Gemfile.lock
@@ -10,38 +10,24 @@ GIT
GEM
remote: https://rubygems.org/
specs:
domain_name (0.5.24)
unf (>= 0.0.5, < 1.0.0)
http-cookie (1.0.2)
domain_name (~> 0.5)
colorize (0.7.7)
httpclient (2.6.0.1)
mechanize (2.7.3)
domain_name (~> 0.5, >= 0.5.1)
http-cookie (~> 1.0)
mime-types (~> 2.0)
net-http-digest_auth (~> 1.1, >= 1.1.1)
net-http-persistent (~> 2.5, >= 2.5.2)
nokogiri (~> 1.4)
ntlm-http (~> 0.1, >= 0.1.1)
webrobots (>= 0.0.9, < 0.2)
mime-types (2.5)
mini_portile (0.6.2)
net-http-digest_auth (1.4)
net-http-persistent (2.9.4)
nokogiri (1.6.6.2)
mini_portile (~> 0.6.0)
ntlm-http (0.1.1)
open-uri-cached (0.0.5)
sqlite3 (1.3.10)
sqlite_magic (0.0.3)
sqlite_magic (0.0.6)
sqlite3
unf (0.1.4)
unf_ext
unf_ext (0.0.7.1)
webrobots (0.1.1)

PLATFORMS
ruby

DEPENDENCIES
mechanize
colorize
nokogiri
open-uri-cached
scraperwiki!

BUNDLED WITH
1.10.6
149 changes: 124 additions & 25 deletions scraper.rb
100644 → 100755
@@ -1,25 +1,124 @@
# This is a template for a Ruby scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# require 'scraperwiki'
# require 'mechanize'
#
# agent = Mechanize.new
#
# # Read in a page
# page = agent.get("http://foo.com")
#
# # Find something on the page using CSS selectors
# p page.at('div.content')
#
# # Write out to the sqlite database using scraperwiki library
# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
#
# # An arbitrary query against the database
# ScraperWiki.select("* from data where name='peter'")

# You don't have to do things with the Mechanize or ScraperWiki libraries.
# You can use whatever gems you want: https://morph.io/documentation/ruby
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
#!/usr/bin/env ruby
# encoding: utf-8

require 'scraperwiki'
require 'nokogiri'
require 'open-uri'

require 'open-uri/cached'
OpenURI::Cache.cache_path = '.cache'
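
# String#tidy (used in the scraping methods below) is not part of core Ruby;
# the original presumably picked it up from a shared helper. A minimal sketch,
# assuming it just collapses runs of whitespace and trims the ends:
class String
  def tidy
    gsub(/[[:space:]]+/, ' ').strip
  end
end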

base_url = 'http://www.caribbeanelections.com/ai/elections/'
start_page = "#{base_url}default.asp"

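# Fetch a page (cached via open-uri-cached, configured above) and parse it with Nokogiri.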
def get_page(url)
Nokogiri::HTML(open(url).read)
end

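# Collect the unique links to per-election results pages ("_results_" hrefs) from the index page.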
def scrape_election_list(start_page, base_url)
pages = []
doc = get_page(start_page)
doc.css('a[href*="_results_"]/@href').map(&:text).uniq.each do |page|
pages << "#{base_url}#{page}"
end
pages
end

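# Pull the four-digit year out of each results URL and return the years sorted ascending.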
def election_years(urls)
years = []
urls.each do |url|
if url =~ /(\d{4})/
years << $1.to_i
end
end
years.sort
end

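# Save one legislative term per pair of consecutive election years to the 'terms'
# table; the final term is assumed to run term_length years past the last election.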
def create_terms(years, term_length=5)
term_years = years.dup
  term_years << term_years[-1] + term_length
term_years.each_cons(2) do |s, e|
term = {
id: s,
name: "#{s}#{e}",
start_date: s,
end_date: e,
}
ScraperWiki.save_sqlite([:id], term, 'terms')
end
end

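# Scrape an MP's profile page for a portrait image and a Facebook link.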
def scrape_mp(url)
doc = get_page(url)
data = {
    image: doc.css('img[@src*="/people/"]').min_by { |i| i.attr('width').to_i }.attr('src'),
facebook: doc.css('a.inside[@href*="facebook.com"]/@href').text,
}
data[:image] = URI.join(url, data[:image]).to_s unless data[:image].to_s.empty?
data
end

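# Scrape a constituency results page: one row per representative, merging in
# profile details wherever an MP link exists.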
def scrape_constituency(url)
doc = get_page(url)
constituency = doc.css('.Article02').text
puts constituency
doc.xpath('.//span[@class="votes" and contains(.,"Representatives")]/ancestor::table[1]/tr[2]//table/tr').drop(1).each do |tr|
tds = tr.css('td')
data = {
name: tds[1].text.tidy,
party: tds[2].text.tidy,
term: tds[0].text.tidy,
constituency: constituency,
source: url.to_s,
}
mp_link = tds[1].css('a/@href')
unless mp_link.to_s.empty?
new_data = scrape_mp(URI.join(url, mp_link.text))
data.merge! new_data
end
# puts data
ScraperWiki.save_sqlite([:name, :term], data)
end
end

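# Scrape a single election's results page, following each district link through
# to its constituency page.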
def scrape_election(url, base_url)
doc = get_page(url)
doc.css('a[href*="/district/"]/@href').map(&:text).uniq.each do |page|
scrape_constituency("#{base_url}#{page}")
end
end


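# Entry point: list the election pages, store the terms they imply, then scrape each election.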
election_pages = scrape_election_list(start_page, base_url)

# store the term data
create_terms(election_years(election_pages))

election_pages.each do |election_page|
# scrape and store election data
  scrape_election(election_page, base_url)
end
