
Commit

initial commit
loren committed Jun 23, 2016
1 parent c153cfc commit 3dda2d8
Showing 3 changed files with 47 additions and 43 deletions.
4 changes: 1 addition & 3 deletions Gemfile
@@ -7,6 +7,4 @@ source "https://rubygems.org"
ruby "2.2.5"

gem "scraperwiki", git: "https://github.com/openaustralia/scraperwiki-ruby.git", branch: "morph_defaults"
gem "httparty"
gem "sanitize"
gem "htmlentities"
gem "httparty"
15 changes: 0 additions & 15 deletions Gemfile.lock
@@ -10,25 +10,12 @@ GIT
GEM
  remote: https://rubygems.org/
  specs:
    crass (1.0.2)
    htmlentities (4.3.4)
    httparty (0.13.7)
      json (~> 1.8)
      multi_xml (>= 0.5.2)
    httpclient (2.6.0.1)
    json (1.8.3)
    mini_portile2 (2.1.0)
    multi_xml (0.5.5)
    nokogiri (1.6.8)
      mini_portile2 (~> 2.1.0)
      pkg-config (~> 1.1.7)
    nokogumbo (1.4.7)
      nokogiri
    pkg-config (1.1.7)
    sanitize (4.0.1)
      crass (~> 1.0.2)
      nokogiri (>= 1.4.4)
      nokogumbo (~> 1.4.1)
    sqlite3 (1.3.10)
    sqlite_magic (0.0.3)
      sqlite3
@@ -37,9 +24,7 @@ PLATFORMS
  ruby

DEPENDENCIES
  htmlentities
  httparty
  sanitize
  scraperwiki!

RUBY VERSION
71 changes: 46 additions & 25 deletions scraper.rb
@@ -1,25 +1,46 @@
# This is a template for a Ruby scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful

# require 'scraperwiki'
# require 'mechanize'
#
# agent = Mechanize.new
#
# # Read in a page
# page = agent.get("http://foo.com")
#
# # Find something on the page using css selectors
# p page.at('div.content')
#
# # Write out to the sqlite database using scraperwiki library
# ScraperWiki.save_sqlite(["name"], {"name" => "susan", "occupation" => "software developer"})
#
# # An arbitrary query against the database
# ScraperWiki.select("* from data where 'name'='peter'")

# You don't have to do things with the Mechanize or ScraperWiki libraries.
# You can use whatever gems you want: https://morph.io/documentation/ruby
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".
require 'scraperwiki'
require 'httparty'
require 'date'

ENDPOINT = 'http://bids.state.gov/geoserver/opengeo/ows?service=WFS&version=1.0.0&request=GetFeature&srsName=EPSG:4326&typeName=opengeo%3ADATATABLE&outputformat=json&FILTER=%3CFilter%3E%0A%3CPropertyIsEqualTo%3E%0A%09%09%09%3CPropertyName%3ECleared%3C%2FPropertyName%3E%0A%09%09%09%3CLiteral%3E1%3C%2FLiteral%3E%0A%09%09%3C%2FPropertyIsEqualTo%3E%0A%3C%2FFilter%3E'
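# For readability, the FILTER parameter above URL-decodes to the following OGC
# filter (whitespace tidied), which restricts the GetFeature request to rows
# whose Cleared property equals 1:
#
#   <Filter>
#     <PropertyIsEqualTo>
#       <PropertyName>Cleared</PropertyName>
#       <Literal>1</Literal>
#     </PropertyIsEqualTo>
#   </Filter>
#
# A rough sketch of building the same URL programmatically instead of
# hand-encoding it (parameter names and values taken from the URL above):
#
#   require 'uri'
#   filter = '<Filter><PropertyIsEqualTo><PropertyName>Cleared</PropertyName>' \
#            '<Literal>1</Literal></PropertyIsEqualTo></Filter>'
#   query  = URI.encode_www_form(
#     service: 'WFS', version: '1.0.0', request: 'GetFeature',
#     srsName: 'EPSG:4326', typeName: 'opengeo:DATATABLE', outputformat: 'json',
#     FILTER: filter
#   )
#   "http://bids.state.gov/geoserver/opengeo/ows?#{query}"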

# Remove any rows left over from previous runs so the table only ever holds
# the entries returned by the current scrape.
def clean_table
  ScraperWiki.sqliteexecute('DELETE FROM data')
rescue SqliteMagic::NoSuchTable
  puts "Data table does not exist yet"
end

# Trim leading and trailing whitespace and collapse internal runs of
# whitespace (including Unicode spaces) into a single space.
def squish(str)
  str.gsub(/\A[[:space:]]+/, '').gsub(/[[:space:]]+\z/, '').gsub(/[[:space:]]+/, ' ')
end

# Decode HTML entities, then squish. CODER is not defined in this file and
# normalize is never called below; it would need an entity decoder (such as
# HTMLEntities.new from the htmlentities gem) assigned to CODER before use.
def normalize(str)
  squish(CODER.decode(str))
end

# Pull the cleared entries from the WFS endpoint, keep those whose tender date
# has not yet passed, normalise each one and write it to the "data" table
# keyed on the feature id.
def fetch_results
  response = HTTParty.get(ENDPOINT)
  results = JSON.parse(response.body, symbolize_names: true)
  # Tender_Date lives under :properties, so pass that hash to valid_entry?.
  results[:features].select { |article_hash| valid_entry?(article_hash[:properties]) }.
    map { |article_hash| process_entry_info(article_hash) }.
    each { |article_hash| ScraperWiki.save_sqlite(%i(id), article_hash) }
end
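
# A sketch of the response shape fetch_results assumes: a GeoJSON-style
# FeatureCollection whose features carry an id plus a properties hash with the
# fields referenced below (values here are illustrative, not real data):
#
#   {
#     type: "FeatureCollection",
#     features: [
#       { id: "DATATABLE.fid-1",
#         properties: { Tender_Date: "2016-09-01Z", Project_Title: "..." } }
#     ]
#   }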

# Flatten one feature into a row for the "data" table: copy the feature id
# onto its properties hash, convert the date fields to ISO 8601 and squish the
# free-text fields.
def process_entry_info(entry_hash)
  entry = entry_hash[:properties]
  entry[:id] = entry_hash[:id]
  %i(Project_Announced Tender_Date).each do |field|
    entry[field] &&= Date.parse(entry[field]).iso8601
  end
  %i(Post_Comments Project_Description Project_Title Keyword Project_POCs).each do |field|
    entry[field] &&= squish(entry[field])
  end
  entry
end
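
# A hypothetical before/after for process_entry_info (values invented, not
# taken from the live endpoint):
#
#   process_entry_info(
#     id: "DATATABLE.fid-1",
#     properties: { Tender_Date: "2016-09-01Z", Project_Title: "  Airport   upgrade " }
#   )
#   # => { Tender_Date: "2016-09-01", Project_Title: "Airport upgrade", id: "DATATABLE.fid-1" }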

# Keep entries with no tender date, or whose tender date has not yet passed.
def valid_entry?(entry)
  entry[:Tender_Date].nil? || Date.parse(entry[:Tender_Date]) >= Date.today
end

# Rebuild the table from scratch on each run.
clean_table
fetch_results
