-
Notifications
You must be signed in to change notification settings - Fork 57
/
thomas_fetch_committee_reports.rb
98 lines (80 loc) · 2.96 KB
/
thomas_fetch_committee_reports.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env ruby
if __FILE__ == $0
require File.dirname(__FILE__) + '/../config/environment'
end
require 'hpricot'
require 'open-uri'
require 'fileutils'
# Root directory for downloaded committee reports. Kept as a global because
# Parser#parse reads $base_path when building its log and report file paths.
$base_path = Settings.committee_reports_path

# One sub-directory per report type.
types = %w[house senate conference joint].map { |type| "#{$base_path}/#{type}" }

# mkdir_p also creates a missing $base_path and does nothing for directories
# that already exist, so a first run on a fresh machine no longer fails.
types.each { |dir| FileUtils.mkdir_p(dir) }
# Index-page URL templates on THOMAS, one per report type. Each template has
# two %d placeholders; Parser#parse fills both with the congress number and
# appends the starting entry number to the end of the string.
# NOTE(review): the house template uses "list/" while the others use
# "./list/" -- preserved as-is; confirm whether the difference is intended.
thomas_list_query = "http://thomas.loc.gov/cgi-bin/cpquery/L?cp%d:"
house_reports_url = thomas_list_query + "list/cp%dch.lst:"
senate_reports_url = thomas_list_query + "./list/cp%dcs.lst:"
conference_reports_url = thomas_list_query + "./list/cp%dco.lst:"
joint_reports_url = thomas_list_query + "./list/cp%dcj.lst:"
# Walks the paginated committee-report index pages on THOMAS for one congress
# and saves the printable HTML of every listed report under
# "#{$base_path}/<type>/".
class Parser
  # gen is exposed for callers but is not assigned anywhere in this class.
  attr_reader :gen, :congress, :printable_base, :rows_per_page

  # Takes the congress number from Settings.default_congress and sets the
  # printable-report URL template and the page size used to detect the last
  # index page.
  def initialize
    @congress = Settings.default_congress
    @printable_base = "http://thomas.loc.gov/cgi-bin/cpquery/T?&report=%s&dbname=%s&"
    @rows_per_page = 50
  end

  # Downloads every report listed on the index pages for one report type.
  #
  # url_base - format string with two %d placeholders (both receive the
  #            congress number); the starting entry number is appended to it.
  # type     - sub-directory of $base_path that receives log.txt and the
  #            report files (the directory must already exist).
  #
  # Each report row is logged to "#{$base_path}/#{type}/log.txt" (truncated on
  # every run). Reports whose file already exists are not fetched again.
  # Errors while fetching an index page propagate to the caller; errors while
  # fetching or saving a single report are reported and skipped.
  #
  # Returns nil.
  def parse(url_base, type)
    index_url = url_base % [congress, congress]
    entry_num = 0

    # Block form guarantees the log is closed even when an index fetch raises.
    File.open("#{$base_path}/#{type}/log.txt", "w+") do |log|
      # This loops through the pages of results from THOMAS.
      loop do
        current_url = index_url + entry_num.to_s
        puts current_url
        doc = Hpricot(open(current_url))

        # Every row of every table on the index page; only rows with exactly
        # five cells describe a report.
        all_rows = doc/:table/:tr
        report_rows = all_rows.select { |row| (row/:td).size == 5 }

        report_rows.each do |row|
          cells = row/:td
          # Updates the outer entry_num so the next page starts after the
          # last entry seen on this one.
          entry_num = cells[0].inner_html.to_i
          name = cells[1].inner_html
          md = (cells[2]/:a).to_html.match(/report=(.*?)\&/)
          next if md.nil?

          reportname, dbname = md.captures[0].split(/\./)
          report_url = printable_base % [reportname, dbname]
          filename = "#{$base_path}/#{type}/#{entry_num}.#{reportname}.#{dbname}.html"
          log.puts "#{entry_num}\t#{name}\t#{reportname}\t#{dbname}\t#{filename}"
          log.flush

          save_report(report_url, filename) unless File.exist?(filename)
        end

        # If we ended on #46, let's fetch #47 next time around.
        entry_num += 1
        # Stop once a page holds fewer than rows_per_page table rows in total
        # (all rows are counted here, not only the report rows).
        break if all_rows.size < rows_per_page
      end
    end
    nil
  end

  private

  # Fetches one printable report page and writes its content <div>, wrapped in
  # <html> tags, to filename. Pages without a content <div> are ignored; any
  # StandardError is reported on stdout and the report is skipped.
  def save_report(report_url, filename)
    report = Hpricot(open(report_url))
    content = (report/'div[@id="content"]').first
    return if content.nil?

    # Render before opening the file so a rendering failure cannot leave a
    # truncated file behind that later runs would treat as already fetched.
    html = content.to_html
    File.open(filename, "w+") do |f|
      f.puts "<html>"
      f.puts html
      f.puts "</html>"
    end
  rescue StandardError => e
    puts "Bad HTML for report: #{report_url}. Skipping... (#{e.class}: #{e.message})"
  end
end
# Fetch every report type with a single Parser, in the order
# house, senate, conference, joint.
parser = Parser.new
[
  [house_reports_url, "house"],
  [senate_reports_url, "senate"],
  [conference_reports_url, "conference"],
  [joint_reports_url, "joint"]
].each do |index_url, type|
  parser.parse(index_url, type)
end