
Commit

initial commit
loren committed Jun 23, 2016
1 parent c153cfc commit 3dda2d8
Showing 3 changed files with 47 additions and 43 deletions.
4 changes: 1 addition & 3 deletions Gemfile
@@ -7,6 +7,4 @@ source "https://rubygems.org"
ruby "2.2.5"

gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "httparty"
gem "sanitize"
gem "htmlentities"
gem "httparty"
15 changes: 0 additions & 15 deletions Gemfile.lock
@@ -10,25 +10,12 @@ GIT
GEM
  remote: https://rubygems.org/
  specs:
    crass (1.0.2)
    htmlentities (4.3.4)
    httparty (0.13.7)
      json (~> 1.8)
      multi_xml (>= 0.5.2)
    httpclient (2.6.0.1)
    json (1.8.3)
    mini_portile2 (2.1.0)
    multi_xml (0.5.5)
    nokogiri (1.6.8)
      mini_portile2 (~> 2.1.0)
      pkg-config (~> 1.1.7)
    nokogumbo (1.4.7)
      nokogiri
    pkg-config (1.1.7)
    sanitize (4.0.1)
      crass (~> 1.0.2)
      nokogiri (>= 1.4.4)
      nokogumbo (~> 1.4.1)
    sqlite3 (1.3.10)
    sqlite_magic (0.0.3)
      sqlite3
@@ -37,9 +24,7 @@ PLATFORMS
  ruby

DEPENDENCIES
  htmlentities
  httparty
  sanitize
  scraperwiki!

RUBY VERSION
71 changes: 46 additions & 25 deletions scraper.rb
@@ -1,25 +1,46 @@
# This is a template for a Ruby scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# require 'scraperwiki'
# require 'mechanize'
#
# agent = Mechanize.new
#
# # Read in a page
# page = agent.get("http://foo.com")
#
# # Find something on the page using css selectors
# p page.at('div.content')
#
# # Write out to the sqlite database using scraperwiki library
# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
#
# # An arbitrary query against the database
# ScraperWiki.select("* from data where 'name'='peter'")

# You don't have to do things with the Mechanize or ScraperWiki libraries.
# You can use whatever gems you want: https://morph.io/documentation/ruby
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
require 'scraperwiki'
require 'httparty'
require 'date'

ENDPOINT = 'http://bids.state.gov/geoserver/opengeo/ows?service=WFS&version=1.0.0&request=GetFeature&srsName=EPSG:4326&typeName=opengeo%3ADATATABLE&outputformat=json&FILTER=%3CFilter%3E%0A%3CPropertyIsEqualTo%3E%0A%09%09%09%3CPropertyName%3ECleared%3C%2FPropertyName%3E%0A%09%09%09%3CLiteral%3E1%3C%2FLiteral%3E%0A%09%09%3C%2FPropertyIsEqualTo%3E%0A%3C%2FFilter%3E'
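# For readability, the FILTER parameter above URL-decodes to the following OGC
# filter (whitespace tidied), which restricts the GetFeature request to rows
# whose Cleared property equals 1:
#
#   <Filter>
#     <PropertyIsEqualTo>
#       <PropertyName>Cleared</PropertyName>
#       <Literal>1</Literal>
#     </PropertyIsEqualTo>
#   </Filter>
#
# A rough sketch of building the same URL programmatically instead of
# hand-encoding it (parameter names and values taken from the URL above):
#
#   require 'uri'
#   filter = '<Filter><PropertyIsEqualTo><PropertyName>Cleared</PropertyName>' \
#            '<Literal>1</Literal></PropertyIsEqualTo></Filter>'
#   query  = URI.encode_www_form(
#     service: 'WFS', version: '1.0.0', request: 'GetFeature',
#     srsName: 'EPSG:4326', typeName: 'opengeo:DATATABLE', outputformat: 'json',
#     FILTER: filter
#   )
#   "http://bids.state.gov/geoserver/opengeo/ows?#{query}"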

# Remove any rows left over from previous runs so the table only ever holds
# the entries returned by the current scrape.
def clean_table
  ScraperWiki.sqliteexecute('DELETE FROM data')
rescue SqliteMagic::NoSuchTable
  puts "Data table does not exist yet"
end

# Trim leading and trailing whitespace and collapse internal runs of
# whitespace (including Unicode spaces) into a single space.
def squish(str)
  str.gsub(/\A[[:space:]]+/, '').gsub(/[[:space:]]+\z/, '').gsub(/[[:space:]]+/, ' ')
end

# Decode HTML entities, then squish. CODER is not defined in this file and
# normalize is never called below; it would need an entity decoder (such as
# HTMLEntities.new from the htmlentities gem) assigned to CODER before use.
def normalize(str)
  squish(CODER.decode(str))
end

# Pull the cleared entries from the WFS endpoint, keep those whose tender date
# has not yet passed, normalise each one and write it to the "data" table
# keyed on the feature id.
def fetch_results
  response = HTTParty.get(ENDPOINT)
  results = JSON.parse(response.body, symbolize_names: true)
  # Tender_Date lives under :properties, so pass that hash to valid_entry?.
  results[:features].select { |article_hash| valid_entry?(article_hash[:properties]) }.
    map { |article_hash| process_entry_info(article_hash) }.
    each { |article_hash| ScraperWiki.save_sqlite(%i(id), article_hash) }
end
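
# A sketch of the response shape fetch_results assumes: a GeoJSON-style
# FeatureCollection whose features carry an id plus a properties hash with the
# fields referenced below (values here are illustrative, not real data):
#
#   {
#     type: "FeatureCollection",
#     features: [
#       { id: "DATATABLE.fid-1",
#         properties: { Tender_Date: "2016-09-01Z", Project_Title: "..." } }
#     ]
#   }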

# Flatten one feature into a row for the "data" table: copy the feature id
# onto its properties hash, convert the date fields to ISO 8601 and squish the
# free-text fields.
def process_entry_info(entry_hash)
  entry = entry_hash[:properties]
  entry[:id] = entry_hash[:id]
  %i(Project_Announced Tender_Date).each do |field|
    entry[field] &&= Date.parse(entry[field]).iso8601
  end
  %i(Post_Comments Project_Description Project_Title Keyword Project_POCs).each do |field|
    entry[field] &&= squish(entry[field])
  end
  entry
end
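
# A hypothetical before/after for process_entry_info (values invented, not
# taken from the live endpoint):
#
#   process_entry_info(
#     id: "DATATABLE.fid-1",
#     properties: { Tender_Date: "2016-09-01Z", Project_Title: "  Airport   upgrade " }
#   )
#   # => { Tender_Date: "2016-09-01", Project_Title: "Airport upgrade", id: "DATATABLE.fid-1" }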

# Keep entries with no tender date, or whose tender date has not yet passed.
def valid_entry?(entry)
  entry[:Tender_Date].nil? || Date.parse(entry[:Tender_Date]) >= Date.today
end

# Rebuild the table from scratch on each run.
clean_table
fetch_results
