Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Use legistar_url to determine private vs. public bills #260

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 27 additions & 13 deletions lametro/bills.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,21 @@ def scrape(self, window=28, matter_ids=None) :
a window of 7 will scrape legislation updated in the last week. Pass
a window of 0 to scrape all legislation.
:matter_ids (str) - Comma-separated list of matter IDs to scrape


### Notes on scraping private bills
The Metro scraper scrapes private bills.

Private bills have 'MatterRestrictViewViaWeb' set to True.
However, the Metro system sometimes has discrepancies (e.g., a bill has
'MatterRestrictViewViaWeb' set to False, but has a status of 'draft' and no agenda date).
We thus also use the absence of a `legistar_web` link (the matter's `legistar_url`) to determine if a bill is private.

We do not want to capture significant data about private bills,
other than the value of `restrict_view` and a last modified timestamp.
We yield private bills early, wipe data from previously imported once-public bills,
and include only data *required* by the pupa schema.
https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py
'''

if matter_ids:
Expand Down Expand Up @@ -159,20 +174,19 @@ def scrape(self, window=28, matter_ids=None) :
classification=bill_type,
from_organization={"name":"Board of Directors"})

# The Metro scraper scrapes private bills.
# However, we do not want to capture significant data about private bills,
# other than the value of `restrict_view` and a last modified timestamp.
# We yield private bills early, wipe data from previously imported once-public bills,
# and include only data *required* by the pupa schema.
# https://github.com/opencivicdata/pupa/blob/master/pupa/scrape/schemas/bill.py
bill.extras = {'restrict_view' : matter['MatterRestrictViewViaWeb']}

# Add API source early.
# Private bills should have this url for debugging.
legistar_api = self.BASE_URL + '/matters/{0}'.format(matter_id)
bill.add_source(legistar_api, note='api')

if matter['MatterRestrictViewViaWeb']:
legistar_web = matter.get('legistar_url', '')
if legistar_web:
bill.add_source(legistar_web, note='web')

# Yield private bills early.
if matter['MatterRestrictViewViaWeb'] or not legistar_web:
# required fields
bill.title = 'Restricted View'
bill.add_subject('Restricted View')
Expand All @@ -189,10 +203,6 @@ def scrape(self, window=28, matter_ids=None) :
yield bill
continue

legistar_web = matter.get('legistar_url', '')
if legistar_web:
bill.add_source(legistar_web, note='web')

for identifier in alternate_identifiers:
bill.add_identifier(identifier)

Expand All @@ -215,6 +225,9 @@ def scrape(self, window=28, matter_ids=None) :
result=result,
bill=bill)

print("******sources")
print(legistar_web)
print(legistar_api + '/histories')
vote_event.add_source(legistar_web)
vote_event.add_source(legistar_api + '/histories')

Expand All @@ -227,7 +240,6 @@ def scrape(self, window=28, matter_ids=None) :

yield vote_event


for sponsorship in self.sponsorships(matter_id) :
bill.add_sponsorship(**sponsorship)

Expand Down Expand Up @@ -311,5 +323,7 @@ def scrape(self, window=28, matter_ids=None) :
'Ordinance / Administrative Code': None,
'Appointment': None,
'Public Hearing': None,
'Application': None}
'Application': None,
'Board Correspondence': None,
'Closed Session': None}