/
votes.py
209 lines (165 loc) · 6.56 KB
/
votes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
# -*- coding: utf-8 -*-
import re
import os
import pdb
import datetime
from operator import itemgetter
import contextlib
import sh
import tesseract
import scrapelib
from billy.scrape.utils import convert_pdf
from billy.scrape.votes import VoteScraper, Vote as BillyVote
from .lexers import with_image
from .lexers import without_image
@contextlib.contextmanager
def cd(path):
    '''Run the ``with`` body with ``path`` as the working directory.

    An attempt is made to create ``path`` (including parents) first; an
    ``OSError`` from that attempt is ignored, which covers the common
    case of the directory already existing.  The working directory that
    was current on entry is restored on exit, even if the body raises.
    '''
    previous_dir = os.getcwd()

    # Best-effort creation: if the path is genuinely unusable, the
    # chdir below raises.
    try:
        os.makedirs(path)
    except OSError:
        pass

    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(previous_dir)
class MAVoteScraper(VoteScraper):
    '''Scrape Massachusetts roll call votes from the published PDFs.

    Only the lower chamber is implemented; ``scrape_senate`` is a no-op.
    Roll calls are fetched one by one, by increasing roll call number,
    until the server answers with an HTTP error.
    '''
    jurisdiction = 'ma'

    # A PDF whose extracted text contains more than this many Y/N/P/X
    # characters is treated as machine readable; otherwise the roll call
    # is assumed to be an embedded image that needs OCR.
    # NOTE(review): presumably chosen relative to the number of House
    # seats -- confirm before changing.
    _TEXT_PDF_MIN_MARKS = 157

    class EndOfHouseVotes(Exception):
        '''Raise when there are no more house votes to scrape.
        '''

    class MiscellaneousVote(Exception):
        '''Sometimes the chamber will vote on something that isn't
        related to a bill, like whether to suspend the rules in order
        to continue to meet late in the night.
        See http://www.mass.gov/legis/journal/RollCallPdfs/188/00060.pdf?Session=188&RollCall=00060

        Also raised for roll calls that cannot be parsed, so that the
        scrape moves on to the next roll call instead of aborting.
        '''

    def scrape(self, chamber, session):
        '''Scrape every vote for ``chamber`` ('upper' or 'lower') in
        ``session``.  Any other chamber value is silently ignored.
        '''
        self.filenames = []
        if chamber == 'upper':
            self.scrape_senate(session)
        elif chamber == 'lower':
            self.scrape_house(session)

    def scrape_senate(self, session):
        '''Senate votes are not scraped; this is intentionally a no-op.
        '''
        pass

    def scrape_house(self, session):
        '''Scrape house roll calls 1, 2, 3, ... until ``scrape_vote``
        signals the end of the list with ``EndOfHouseVotes``.
        '''
        n = 1
        while True:
            try:
                self.scrape_vote(session, n)
            except self.EndOfHouseVotes:
                break
            except self.MiscellaneousVote:
                # Not a usable bill vote; carry on with the next one.
                pass
            n += 1

    def scrape_vote(self, session, rollcall_number):
        '''Download, parse and save a single house roll call.

        Raises ``EndOfHouseVotes`` when the PDF cannot be fetched and
        lets ``MiscellaneousVote`` from the parsers propagate.  The
        downloaded file is removed on every path once it was fetched.
        '''
        # Fetch this piece of garbage.
        url = (
            'http://www.mass.gov/legis/journal/RollCallPdfs/'
            '{session}/{rollcall}.pdf?Session={session}&RollCall={rollcall}')
        url_args = dict(
            # The last run of digits in the session name is used in the
            # URL.
            session=re.findall(r'\d+', session).pop(),
            rollcall=str(rollcall_number).zfill(5))
        url = url.format(**url_args)

        try:
            vote_file, _ = self.urlretrieve(url)
        except scrapelib.HTTPError:
            # We'll hit a 404 at the end of the votes.
            self.warning('Stopping; encountered a 404 at %s' % url)
            raise self.EndOfHouseVotes

        # From here on the downloaded file exists; make sure it is
        # cleaned up even when the vote is skipped or parsing fails.
        try:
            text = convert_pdf(vote_file, type='text')
            text = text.decode('utf8')

            # A hack to guess whether this PDF has embedded images or
            # contains machine readable text.
            marks = re.findall(r'[YNPX]', text)
            if len(marks) > self._TEXT_PDF_MIN_MARKS:
                vote = self.house_get_vote(text, vote_file, session)
            else:
                vote = self.house_get_vote_with_images(
                    text, vote_file, session)
                self.house_add_votes_from_image(vote_file, vote)

            vote.add_source(url)

            if not self.house_check_vote(vote):
                self.logger.warning('Bad vote counts for %s' % vote)
                return
            self.save_vote(vote)
        finally:
            os.remove(vote_file)

    def _skip_vote(self, msg):
        '''Log ``msg`` as a warning and raise ``MiscellaneousVote``.'''
        self.logger.warning(msg)
        raise self.MiscellaneousVote(msg)

    def house_get_vote(self, text, vote_file, session):
        '''Build a vote from a PDF whose text is machine readable.

        ``vote_file`` is unused and kept for signature parity with
        ``house_get_vote_with_images``.  Raises ``MiscellaneousVote``
        for quorum calls and for votes without a bill id.
        '''
        # Skip quorum votes.
        if 'QUORUM' in text:
            raise self.MiscellaneousVote

        # Parse the text into a tree.
        tree = without_image.Rollcall.parse(without_image.HeaderLexer(text))

        # Visit the tree to collect the vote's header data.
        vote_data = without_image.VoteVisitor().visit(tree)

        if 'bill_id' not in vote_data:
            msg = 'Skipping vote not associated with any bill_id'
            self.logger.warning(msg)
            raise self.MiscellaneousVote(msg)

        vote_data['passed'] = vote_data['yes_count'] > vote_data['no_count']
        vote_data['session'] = session
        # The first letter of the bill id names the originating chamber.
        vote_data['bill_chamber'] = {
            'S': 'upper',
            'H': 'lower'}[vote_data['bill_id'][0]]

        # 'votes' holds the raw roll call text, not a BillyVote argument.
        voters = vote_data.pop('votes')
        vote = BillyVote('lower', **vote_data)

        # Parse the roll call text into a tree.
        tree = with_image.Rollcall.parse(with_image.Lexer(voters))

        # Visit the tree and add rollcall votes to the vote object.
        with_image.VoteVisitor(vote).visit(tree)

        return vote

    def house_get_vote_with_images(self, text, vote_file, session):
        '''Build a vote (without individual voters) from the text
        around an image-based roll call.

        ``vote_file`` is unused.  Raises ``MiscellaneousVote`` when the
        text names no bill or when the header, date or counts cannot be
        found.
        '''
        chamber_re = r'(Senate|House),\s+No\. (\d+)'
        bill_match = re.search(chamber_re, text)
        if bill_match is None:
            raise self.MiscellaneousVote('Vote not related to a bill.')
        chamber, bill_number = bill_match.groups()
        bill_chamber, prefix = {
            'Senate': ('upper', 'S'),
            'House': ('lower', 'H')}[chamber]
        bill_id = '%s %s' % (prefix, bill_number)

        # The motion is whatever sits between the roll call header line
        # and the YEAS tally.
        header = re.search('Yea and Nay No.+', text)
        tally = re.search('YEAS', text)
        if header is None or tally is None:
            self._skip_vote('Skipping vote; could not locate the motion.')
        motion = text[header.end():tally.start()]
        motion = ' '.join(motion.strip().split())

        date_match = re.search(r'\S+ \d+, \d{4}', text)
        if date_match is None:
            self._skip_vote('Skipping vote; could not locate the date.')
        date = datetime.datetime.strptime(date_match.group(), '%B %d, %Y')

        counts_re = r'([A-Z\-]+):\s+(\d+)'
        counts = dict(re.findall(counts_re, text))
        try:
            yes = int(counts['YEAS'])
            no = int(counts['NAYS'])
        except KeyError as exc:
            self._skip_vote('Skipping vote; missing count for %s.' % exc)
        # A missing not-voting count is treated as zero.
        other = int(counts.get('N-V', 0))

        vote = BillyVote('lower', date, motion, (yes > no),
                         yes, no, other, session=session, bill_id=bill_id,
                         bill_chamber=bill_chamber)

        return vote

    def house_add_votes_from_image(self, vote_file, vote):
        '''OCR the roll call image embedded in ``vote_file`` and add the
        individual votes to ``vote``.

        The extracted image file is removed afterwards, also on failure.
        '''
        # Extract the image(s); vote_file doubles as the output root.
        with cd('/tmp'):
            sh.pdfimages(vote_file, vote_file)

        # Only the first extracted image is read.
        image_file = vote_file + '-000.pbm'

        try:
            with open(image_file, 'rb') as f:
                data = f.read()

            api = tesseract.TessBaseAPI()
            api.Init(".", "eng", tesseract.OEM_DEFAULT)
            api.SetPageSegMode(tesseract.PSM_SINGLE_BLOCK)
            whitelist = (
                "abcdefghijklmnopqrstuvwxyz',-.*"
                "ABCDEFGHIJKLMNOPQRSTUVWXYZ ")
            api.SetVariable("tessedit_char_whitelist", whitelist)
            text = tesseract.ProcessPagesBuffer(data, len(data), api)

            # Parse the text into a tree.
            tree = with_image.Rollcall.parse(with_image.Lexer(text))

            # Visit the tree and add rollcall votes to the vote object.
            with_image.VoteVisitor(vote).visit(tree)
        finally:
            # Guarded so a missing image does not mask the real error.
            if os.path.exists(image_file):
                os.remove(image_file)

    def house_check_vote(self, vote):
        '''Return True when the number of recorded yes/no/other voters
        matches the corresponding reported counts.
        '''
        return all([
            len(vote['yes_votes']) == vote['yes_count'],
            len(vote['no_votes']) == vote['no_count'],
            len(vote['other_votes']) == vote['other_count']])