import time
import datetime
from collections import deque

import lxml.html
import pytz
import icalendar
import requests
from pupa.scrape import Scraper
import scrapelib

from .base import LegistarScraper, LegistarAPIScraper


class LegistarEventsScraper(LegistarScraper):
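    '''Scraper for events listed in the Legistar web interface (InSite).'''
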
    def eventPages(self, since):
        # Directly use the requests library here, so that we do not
        # use a cached page, which may have expired .NET state values,
        # even in fastmode (which uses the cache).
        response = requests.get(self.EVENTSPAGE, verify=False)
        entry = response.text
        page = lxml.html.fromstring(entry)
        page.make_links_absolute(self.EVENTSPAGE)

        for page in self.eventSearch(page, since):
            yield page

    def eventSearch(self, page, since):
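        # Post the ASP.NET form state back to the events page with the year
        # list control set to `since`, then page through the filtered results.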
        payload = self.sessionSecrets(page)
        payload['ctl00_ContentPlaceHolder1_lstYears_ClientState'] = '{"value":"%s"}' % since
        payload['__EVENTTARGET'] = 'ctl00$ContentPlaceHolder1$lstYears'

        return self.pages(self.EVENTSPAGE, payload)

    def events(self, follow_links=True, since=None):
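        '''Yield (event, agenda) pairs from the web calendar, most recent year
        first. When follow_links is True, the agenda is scraped from each
        meeting's detail page; otherwise the agenda is None.
        '''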
        # If an event is added to the Legistar system while we are scraping,
        # it will shift the list of events down and we might revisit the same
        # event. So, we keep track of the last few events we've visited in
        # order to make sure we are not revisiting them.
        scraped_events = deque([], maxlen=10)

        current_year = self.now().year

        if since:
            if since > current_year:
                raise ValueError('Value of :since cannot exceed {}'.format(current_year))
            else:
                since_year = since - 1
        else:
            since_year = 0

        # Anticipate events will be scheduled for the following year to avoid
        # missing upcoming events during scrapes near the end of the current
        # year.
        for year in range(current_year + 1, since_year, -1):
            no_events_in_year = True

            for page in self.eventPages(year):
                no_events_in_year = False

                events_table = page.xpath("//table[@class='rgMasterTable']")[0]

                for event, _, _ in self.parseDataTable(events_table):
                    if follow_links and type(event["Meeting Details"]) == dict:
                        detail_url = event["Meeting Details"]['url']

                        if detail_url in scraped_events:
                            continue
                        else:
                            scraped_events.append(detail_url)

                        meeting_details = self.lxmlize(detail_url)
                        agenda = self.agenda(detail_url)
                    else:
                        agenda = None

                    yield event, agenda

            if no_events_in_year:  # Bail from the scrape if no results were returned for the year
                break

    def agenda(self, detail_url):
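        # Post back to the meeting detail page (the menuMain __EVENTTARGET
        # presumably switches the page to its agenda view), then parse each
        # row of the resulting grid.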
        page = self.lxmlize(detail_url)

        payload = self.sessionSecrets(page)
        payload.update({"__EVENTARGUMENT": "3:1",
                        "__EVENTTARGET": "ctl00$ContentPlaceHolder1$menuMain"})

        for page in self.pages(detail_url, payload):
            agenda_table = page.xpath(
                "//table[@id='ctl00_ContentPlaceHolder1_gridMain_ctl00']")[0]

            agenda = self.parseDataTable(agenda_table)

            yield from agenda

    def addDocs(self, e, events, doc_type):
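        # Attach the document stored under `doc_type` in the scraped event row
        # to the pupa Event `e`, skipping rows marked 'Not available'.
        # ValueErrors (e.g. raised by pupa for documents it rejects) are ignored.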
        try:
            if events[doc_type] != 'Not\xa0available':
                e.add_document(note=events[doc_type]['label'],
                               url=events[doc_type]['url'],
                               media_type="application/pdf")
        except ValueError:
            pass

    def extractRollCall(self, action_detail_url):
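        # Return a list of (attendance, person name) tuples parsed from the
        # roll call table on an action detail page, or [] if there is no table.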
        action_detail_page = self.lxmlize(action_detail_url)

        try:
            rollcall_table = action_detail_page.xpath("//table[@id='ctl00_ContentPlaceHolder1_gridRollCall_ctl00']")[0]
        except IndexError:
            self.warning("No rollcall found in table")
            return []

        roll_call = list(self.parseDataTable(rollcall_table))
        call_list = []

        for call, _, _ in roll_call:
            option = call['Attendance']
            call_list.append((option,
                              call['Person Name']['label']))

        return call_list

    def ical(self, ical_text):
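        # Parse the text of an iCalendar feed into an icalendar.Calendar object.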
        value = icalendar.Calendar.from_ical(ical_text)
        return value


class LegistarAPIEventScraper(LegistarAPIScraper):
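    '''Scraper that reads events from the Legistar web API and pairs each one
    with its counterpart in the web interface.
    '''
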
    def events(self, since_datetime=None):
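        '''Yield (api_event, web_event) pairs for API events modified since
        `since_datetime`, skipping events with no start time and events that
        cannot be found in the web interface.
        '''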
        # Set attribute equal to an instance of our generator yielding events
        # scraped from the Legistar web interface. This allows us to pause
        # and resume iteration as needed.
        self._events = self._scrapeWebCalendar()

        # Instantiate dictionary where events from generator are stored as they
        # are scraped.
        self._scraped_events = {}

        for api_event in self.api_events(since_datetime):
            # EventTime may be 'None': this try-except block catches those instances.
            try:
                start_time = time.strptime(api_event['EventTime'], '%I:%M %p')
            except TypeError:
                continue
            else:
                start = self.toTime(api_event['EventDate'])
                api_event['start'] = start.replace(hour=start_time.tm_hour,
                                                   minute=start_time.tm_min)

            api_event['status'] = self._event_status(api_event)

            if self._not_in_web_interface(api_event):
                continue
            else:
                # None if entire web calendar scraped but API event not found
                web_event = self.web_results(api_event)

                if web_event:
                    yield api_event, web_event
                else:
                    event_url = '{0}/events/{1}'.format(self.BASE_URL, api_event['EventId'])
                    self.warning('API event could not be found in web interface: {0}'.format(event_url))
                    continue

    def api_events(self, since_datetime=None):
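        '''Yield raw event dicts from the /events API endpoint, ordered from
        the oldest modification to the newest.
        '''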
        # Scrape from oldest to newest. This makes resuming big scraping jobs
        # easier, because upon a scrape failure we can import everything
        # scraped and then scrape everything newer than the last event we
        # scraped.
        params = {'$orderby': 'EventLastModifiedUtc'}

        if since_datetime:
            # Minutes are often published after an event occurs, without a
            # corresponding event modification. Query all update fields so later
            # changes are always caught by our scraper, particularly when
            # scraping narrower windows of time.
            update_fields = ('EventLastModifiedUtc',
                             'EventAgendaLastPublishedUTC',
                             'EventMinutesLastPublishedUTC')

            since_fmt = " gt datetime'{}'".format(since_datetime.isoformat())
            since_filter = ' or '.join(field + since_fmt for field in update_fields)

            params['$filter'] = since_filter

        events_url = self.BASE_URL + '/events/'

        yield from self.pages(events_url,
                              params=params,
                              item_key="EventId")

    def agenda(self, event):
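        '''Yield the titled agenda items for an event, ordered by minutes
        sequence when possible, falling back to agenda sequence, then to the
        order returned by the API.
        '''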
        agenda_url = self.BASE_URL + '/events/{}/eventitems'.format(event['EventId'])

        response = self.get(agenda_url)

        try:
            # Order the event items according to the EventItemMinutesSequence. If an
            # event item does not have a value for EventItemMinutesSequence, the script
            # will throw a TypeError. In that case, try to order by EventItemAgendaSequence.
            filtered_response = sorted((item for item in response.json()
                                        if item['EventItemTitle']),
                                       key=lambda item: item['EventItemMinutesSequence'])
        except TypeError:
            try:
                filtered_response = sorted((item for item in response.json()
                                            if item['EventItemTitle']),
                                           key=lambda item: item['EventItemAgendaSequence'])
            except TypeError:
                filtered_response = (item for item in response.json()
                                     if item['EventItemTitle'])

        for item in filtered_response:
            yield item

    def rollcalls(self, event):
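        '''Yield roll call records for each agenda item of an event that is
        flagged as a roll call.
        '''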
        for item in self.agenda(event):
            if item['EventItemRollCallFlag']:
                rollcall_url = self.BASE_URL + '/eventitems/{}/rollcalls'.format(item['EventItemId'])
                response = self.get(rollcall_url)

                for item in response.json():
                    yield item

    def web_results(self, event):
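        '''Return the web interface event matching an API event, keyed on
        (body name, start time). Advances the web calendar scrape, caching
        results, until a match is found; returns None if the calendar is
        exhausted without a match.
        '''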
        api_key = (event['EventBodyName'].strip(),
                   event['start'])

        # Check the cache of events we've already scraped from the web interface
        # for the API event at hand.
        if api_key in self._scraped_events:
            return self._scraped_events[api_key]
        else:
            # If API event not in web scrape cache, continue scraping the web
            # interface.
            for web_key, event in self._events:
                self._scraped_events[web_key] = event

                # When we find the API event, stop scraping.
                if web_key == api_key:
                    return event

    def _scrapeWebCalendar(self):
        '''Generator yielding events from Legistar in roughly reverse
        chronological order.
        '''
        web_scraper = LegistarEventsScraper(self.jurisdiction,
                                            self.datadir,
                                            strict_validation=self.strict_validation,
                                            fastmode=(self.requests_per_minute == 0))

        web_scraper.EVENTSPAGE = self.EVENTSPAGE
        web_scraper.BASE_URL = self.WEB_URL
        web_scraper.TIMEZONE = self.TIMEZONE
        web_scraper.date_format = '%m/%d/%Y'

        for event, _ in web_scraper.events(follow_links=False):
            event_key = self._event_key(event, web_scraper)
            yield event_key, event

    def _event_key(self, event, web_scraper):
        '''Since Legistar InSite contains more information about events than
        is available in the API, we need to scrape both. Then, we have to
        line them up. This method makes a key that should uniquely identify
        every event and allow us to link events from the two data sources.
        '''
        response = web_scraper.get(event['iCalendar']['url'], verify=False)

        event_time = web_scraper.ical(response.text).subcomponents[0]['DTSTART'].dt
        event_time = pytz.timezone(self.TIMEZONE).localize(event_time)

        key = (event['Name']['label'],
               event_time)

        return key

    def addDocs(self, e, events, doc_type):
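        # Same as LegistarEventsScraper.addDocs: attach the document stored
        # under `doc_type` to the pupa Event `e`, ignoring unavailable
        # documents and ValueErrors raised when adding them.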
        try:
            if events[doc_type] != 'Not\xa0available':
                e.add_document(note=events[doc_type]['label'],
                               url=events[doc_type]['url'],
                               media_type="application/pdf")
        except ValueError:
            pass

    def _event_status(self, event):
        '''Events can have a status of tentative, confirmed, cancelled, or
        passed (http://docs.opencivicdata.org/en/latest/data/event.html). By
        default, set status to passed if the current date and time exceeds the
        event date and time, or confirmed otherwise. Available for override in
        jurisdictional scrapers.
        '''
        if datetime.datetime.utcnow().replace(tzinfo=pytz.utc) > event['start']:
            status = 'passed'
        else:
            status = 'confirmed'

        return status

    def _not_in_web_interface(self, event):
        '''Occasionally, an event will appear in the API, but not in the web
        interface. This method checks attributes of the API event that tell us
        whether the given event is one of those cases, returning True if so, and
        False otherwise. Available for override in jurisdictional scrapers.
        '''
        return False