/
rakefile
44 lines (35 loc) · 1.26 KB
/
rakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
require "parallel"
# The feature name must be lowercase: `require "JSON"` only resolves on
# case-insensitive filesystems and raises LoadError everywhere else.
require "json"
load "scrape/scraper.rb"
load "scrape/NGAPainting.rb"
load "scrape/collection_to_csv.rb"

# Site root; combined with SYSCAT and with each object link in :retrieve.
BASEURL = "http://www.nga.gov"
# Path (relative to BASEURL) of the catalogue page whose object links are scraped.
SYSCAT = "/content/ngaweb/research/online-editions/17th-century-dutch-paintings.html"
# HTML pages found in scrape/html.
HTMLFILES = FileList["scrape/html/*.html"]
# One ".json" path per HTML page.
# NOTE: FileList#ext resolves the glob immediately, so both lists only contain
# the files that exist at the moment the Rakefile is loaded.
JSONFILES = HTMLFILES.ext(".json")

# Plain `rake` runs the three stages in the order listed.
task :default => [:retrieve, :convert, :write]
desc "Scrape all the Dutch syscat entries from nga.gov"
task :retrieve => "scrape/html" do
  # Collect the object links from the catalogue page.
  # NOTE(review): presumably site-relative paths, since BASEURL is prepended
  # below -- confirm against object_url_list.
  links = object_url_list("#{BASEURL}#{SYSCAT}")
  puts links

  # Download every page, spreading the work over 8 worker processes.
  # The block results are not needed, so `each` is used instead of a map.
  Parallel.each(links, :in_processes => 8) do |object_url|
    # The local file is named after the "<digits>.html" part of the link.
    basename = object_url[/(\d+)\.html/]
    unless basename
      # A link without that pattern used to raise NoMethodError on nil and
      # abort the whole batch; skip it and keep going instead.
      warn "Skipping #{object_url}: no numeric .html page name found"
      next
    end

    download_content("#{BASEURL}#{object_url}", "scrape/html/#{basename}")
  end
end
# Parse each HTML file in scrape/html into a JSON file with the same basename.
#
# The targets are globbed when the task runs rather than when the Rakefile is
# loaded, so pages downloaded by :retrieve earlier in the same `rake`
# invocation are converted as well.
task :convert do
  FileList["scrape/html/*.html"].ext(".json").each do |json_file|
    # Looking the target up by name lets Rake synthesise a file task from the
    # ".json" rule below; it is skipped when the JSON is newer than its HTML.
    Rake::Task[json_file].invoke
  end
end

# Build "<name>.json" from "<name>.html" by serialising the parsed page data.
rule ".json" => ".html" do |t|
  print "Converting #{t.source}..."
  parsed_data = parse_file(t.source)
  File.write(t.name, JSON.pretty_generate(parsed_data))
  puts "done."
end
# Directory task for the download folder; :retrieve lists it as a prerequisite
# so the folder is created before any page is saved.
directory "scrape/html"
desc "Create a raw csv file with all the information stored in JSON files"
task :write do
  # Glob the JSON files when the task runs: a list captured while the Rakefile
  # was loading would miss files produced earlier in the same `rake` invocation.
  json_files = FileList["scrape/html/*.json"]
  collection_to_csv(json_files, "collection_data.csv")
end