Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add PeriodicalSpider #467

Merged
merged 14 commits into from
Sep 21, 2020
114 changes: 103 additions & 11 deletions kingfisher_scrapy/base_spider.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import os
from abc import abstractmethod
from datetime import datetime
from io import BytesIO
from zipfile import ZipFile
Expand Down Expand Up @@ -58,7 +59,7 @@ class BaseSpider(scrapy.Spider):

MAX_SAMPLE = 10
MAX_RELEASES_PER_PACKAGE = 100
VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S', 'year-month': '%Y-%m'}
VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S'}

ocds_version = '1.1'
date_format = 'date'
Expand Down Expand Up @@ -115,17 +116,25 @@ def from_crawler(cls, crawler, *args, **kwargs):
if not spider.from_date:
# Default to `default_from_date` class attribute.
spider.from_date = spider.default_from_date
if isinstance(spider.from_date, str):
# convert to date format, if needed
spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
else:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we remove the else statement and always convert the date to datetime? to avoid duplicating spider.from_date = datetime.strptime(spider.from_date, spider.date_format)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This if-else block is used to know whether from_date was passed as a parameter or not. If we eliminate the else, a bad configuration in default_from_date by a subclass may raise a SpiderArgumentError.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we remove the else then? (following the same logic that there are not going to be bad configurations in hardcoded default dates)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@romifz not sure if you have seen this!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I'll check if we can remove the if-else block.

try:
spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
except ValueError as e:
raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e))

if not spider.until_date:
# Default to today.
spider.until_date = datetime.now().strftime(spider.date_format)
try:
spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
except ValueError as e:
raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e))
try:
spider.until_date = datetime.strptime(spider.until_date, spider.date_format)
except ValueError as e:
raise SpiderArgumentError('spider argument until_date: invalid date value: {}'.format(e))
spider.until_date = cls.get_default_until_date(spider)
if isinstance(spider.until_date, str):
# convert to date format, if needed
spider.until_date = datetime.strptime(spider.until_date, spider.date_format)
else:
try:
spider.until_date = datetime.strptime(spider.until_date, spider.date_format)
except ValueError as e:
raise SpiderArgumentError('spider argument until_date: invalid date value: {}'.format(e))

return spider

Expand Down Expand Up @@ -272,6 +281,10 @@ def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None
if self.sample:
break

@classmethod
def get_default_until_date(cls, spider):
return datetime.now()


class SimpleSpider(BaseSpider):
"""
Expand Down Expand Up @@ -435,3 +448,82 @@ def next_link(self, response, **kwargs):

if response.meta['depth'] == 0:
raise MissingNextLinkError('next link not found on the first page: {}'.format(response.url))


class PeriodicalSpider(SimpleSpider):
"""
This class helps to crawl urls that receive a year (YYYY) or a month and year (YYYY-mm) as parameters. To use it:

1. Extend from ``PeriodicalSpider``.
1. Set the ``date_format`` attribute if it's not defined already. Valid values are 'year' and 'year-month'.
1. Set a ``default_from_date`` year or month-year.
1. Optionally, set a ``default_until_date`` year or month-year. If absent, ``default_until_date`` defaults to the
current year or month-year.
1. Set the ``pattern`` parameter with the url to retrieve.
1. Implement the `get_formatter` method.

The ``pattern`` should include a placeholder for a year or month-year parameter. With the year parameter, an int is
passed. If the year-month parameter is used, a ``Date`` instance is passed. Example:

.. code-block: python

url = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}'

When the ``sample`` option is used, the latest year or month of data is retrieved.
"""
VALID_DATE_FORMATS = {'year': '%Y', 'year-month': '%Y-%m'}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should we remove 'year-month' from base_spider VALID_DATE_FORMATS ?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure. I think it should be possible to use the same format for filtering in other spiders, although I don't think we have such a case yet, do we?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I think that all of the spiders which uses 'year-month' extends from PeriodicalSpider now

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, no problem, I'll remove the option.


def __init__(self, *args, **kwargs):
self.date_format_key = self.date_format
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@romifz is this needed? cant we use self.date_format directly?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this case yes, because internally self.date_format is replaced by the date format it represents in VALID_DATE_FORMATS e.g. "year" gets replaced by "%Y". I prefer to have the original name in a separate variable.

super().__init__(*args, **kwargs)

if hasattr(self, 'start_requests_callback'):
self.start_requests_callback = getattr(self, self.start_requests_callback)
else:
self.start_requests_callback = self.parse

@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(SimpleSpider, cls).from_crawler(crawler, *args, **kwargs)

if not spider.from_date:
spider.from_date = spider.default_from_date
if isinstance(spider.from_date, str):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we are already doing this in the base class, aren't we?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but the conversion in the base class happens only if the user specifies from_date or until_date.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you can do something like

def from_crawler(cls, crawler, from_date=None, *args, **kwargs):
if not from_date:
from_date = cls.default_from_date
spider = super().from_crawler(crawler, from_date=from_date, *args, **kwargs)

to avoid the validation/conversion duplication

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This would have the same issue as the previous comment: a bad configuration in the spider subclass may raise a SpiderArgumentError. Since Scrapy does not provide a way to know if a property is a parameter set by the user (at least I couldn't find a way), I think it is best to never set from_date and until_date in a superclass like PeriodicalSpider.

I'm thinking that the best solution would be to have a class property to indicate the base class that we need a time interval to be set, like require_time_period or something similar. Does this make sense?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that as default_*_date variables are hardcoded variables we shouldn't have bad configurations here...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's fair.

spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
spider.until_date = cls.get_default_until_date(spider)
if isinstance(spider.until_date, str):
spider.until_date = datetime.strptime(spider.until_date, spider.date_format)

return spider

@classmethod
def get_default_until_date(cls, spider):
if hasattr(spider, 'default_until_date') and spider.default_until_date:
return spider.default_until_date
else:
return datetime.today()

def start_requests(self):
    """
    Yield one request per period (year or year-month) between ``from_date``
    and ``until_date``, building URLs from ``pattern`` via ``build_urls``.

    If the ``sample`` option is set, only the latest period is requested.
    """
    start = self.from_date
    stop = self.until_date
    if self.sample:
        # Retrieve only the most recent year or month of data.
        start = stop

    # self.date_format_key preserves the original 'year'/'year-month' key,
    # because self.date_format holds the strptime format string instead.
    if self.date_format_key == 'year':
        date_range = util.date_range_by_year(start.year, stop.year)
    else:
        date_range = util.date_range_by_month(start, stop)

    # The formatter is loop-invariant: build it once instead of per URL.
    formatter = self.get_formatter()
    for period in date_range:
        for url in self.build_urls(self.pattern, period):
            yield self.build_request(url, formatter, callback=self.start_requests_callback)

@abstractmethod
def get_formatter(self):
    """
    Return the formatter used to name the file saved for each request
    (e.g. ``components(-1)`` from ``kingfisher_scrapy.util``).

    Must be implemented by subclasses.
    """
    pass

def build_urls(self, pattern, date):
    """Yield the URL(s) to request for the given period by formatting *pattern*."""
    url = pattern.format(date)
    yield url
22 changes: 9 additions & 13 deletions kingfisher_scrapy/spiders/moldova_old.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import components, date_range_by_year
from kingfisher_scrapy.base_spider import PeriodicalSpider
from kingfisher_scrapy.util import components


class MoldovaOld(SimpleSpider):
class MoldovaOld(PeriodicalSpider):
"""
Bulk download documentation
http://opencontracting.date.gov.md/downloads
Expand All @@ -12,14 +12,10 @@ class MoldovaOld(SimpleSpider):
"""
name = 'moldova_old'
data_type = 'release_package'
default_from_date = '2012'
default_until_date = '2018'
pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}'
date_format = 'year'

def start_requests(self):
pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}'

start = 2012
stop = 2018
if self.sample:
start = 2018

for year in date_range_by_year(start, stop):
yield self.build_request(pattern.format(year), formatter=components(-1))
def get_formatter(self):
    # Name saved files after the last URL path component — presumably the
    # year in .../ocds-api/year/{}; confirm against util.components.
    return components(-1)
26 changes: 9 additions & 17 deletions kingfisher_scrapy/spiders/nepal_portal.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
from datetime import date
from kingfisher_scrapy.base_spider import PeriodicalSpider
from kingfisher_scrapy.util import components

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import components, date_range_by_year


class NepalPortal(SimpleSpider):
class NepalPortal(PeriodicalSpider):
"""
Bulk download documentation
http://ppip.gov.np/downloads
Expand All @@ -15,16 +13,10 @@ class NepalPortal(SimpleSpider):
name = 'nepal_portal'
data_type = 'release_package'
ocds_version = '1.0'
default_from_date = '2012'
default_until_date = '2018'
pattern = 'http://ppip.gov.np/bulk-download/{}'
date_format = 'year'

def start_requests(self):
pattern = 'http://ppip.gov.np/bulk-download/{}'

if self.sample:
start = 2018
stop = 2018
else:
start = 2012
stop = date.today().year # HTTP 500 after 2018

for year in date_range_by_year(start, stop):
yield self.build_request(pattern.format(year), formatter=components(-1))
def get_formatter(self):
    # Name saved files after the last URL path component — presumably the
    # year in .../bulk-download/{}; confirm against util.components.
    return components(-1)
75 changes: 33 additions & 42 deletions kingfisher_scrapy/spiders/scotland_base.py
Original file line number Diff line number Diff line change
@@ -1,51 +1,42 @@
from datetime import date

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import date_range_by_month, parameters
from kingfisher_scrapy.base_spider import PeriodicalSpider
from kingfisher_scrapy.util import parameters


class ScotlandBase(SimpleSpider):
class ScotlandBase(PeriodicalSpider):
default_from_date = '2019-01'
date_format = 'year-month'
default_until_date = date.today()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the base class already does this, doesn't it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we can remove this.

default_from_date = date(default_until_date.year - 1, default_until_date.month, 1)

@classmethod
def from_crawler(cls, crawler, from_date=None, *args, **kwargs):
if not from_date:
from_date = cls.default_from_date
notice_types = [
1, # OJEU - F1 - Prior Information Notice
2, # OJEU - F2 - Contract Notice
3, # OJEU - F3 - Contract Award Notice
4, # OJEU - F4 - Prior Information Notice(Utilities)
5, # OJEU - F5 - Contract Notice(Utilities)
6, # OJEU - F6 - Contract Award Notice(Utilities)
7, # OJEU - F7 - Qualification Systems(Utilities)
12, # OJEU - F12 - Design Contest Notice
13, # OJEU - F13 - Results Of Design Contest
14, # OJEU - F14 - Corrigendum
15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice
20, # OJEU - F20 - Modification Notice
21, # OJEU - F21 - Social And other Specific Services(Public Contracts)
22, # OJEU - F22 - Social And other Specific Services(Utilities)
23, # OJEU - F23 - Social And other Specific Services(Concessions)
24, # OJEU - F24 - Concession Notice
25, # OJEU - F25 - Concession Award Notice
101, # Site Notice - Website Contract Notice
102, # Site Notice - Website Prior Information Notice
103, # Site Notice - Website Contract Award Notice
104, # Site Notice - Quick Quote Award
]

return super().from_crawler(crawler, from_date=from_date, *args, **kwargs)
def build_urls(self, pattern, date):
    # Yield one URL per (period, notice type) pair: the pattern takes the
    # date first and the notice type second.
    for notice_type in self.notice_types:
        yield pattern.format(date, notice_type)

def start_requests(self):
notice_types = [
1, # OJEU - F1 - Prior Information Notice
2, # OJEU - F2 - Contract Notice
3, # OJEU - F3 - Contract Award Notice
4, # OJEU - F4 - Prior Information Notice(Utilities)
5, # OJEU - F5 - Contract Notice(Utilities)
6, # OJEU - F6 - Contract Award Notice(Utilities)
7, # OJEU - F7 - Qualification Systems(Utilities)
12, # OJEU - F12 - Design Contest Notice
13, # OJEU - F13 - Results Of Design Contest
14, # OJEU - F14 - Corrigendum
15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice
20, # OJEU - F20 - Modification Notice
21, # OJEU - F21 - Social And other Specific Services(Public Contracts)
22, # OJEU - F22 - Social And other Specific Services(Utilities)
23, # OJEU - F23 - Social And other Specific Services(Concessions)
24, # OJEU - F24 - Concession Notice
25, # OJEU - F25 - Concession Award Notice
101, # Site Notice - Website Contract Notice
102, # Site Notice - Website Prior Information Notice
103, # Site Notice - Website Contract Award Notice
104, # Site Notice - Quick Quote Award
]

for year_month in date_range_by_month(self.from_date, date.today()):
date_string = year_month.strftime('%m-%Y')
for notice_type in notice_types:
yield self.build_request(
self.url.format(date_string, notice_type),
formatter=parameters('noticeType', 'dateFrom')
)
if self.sample:
return
def get_formatter(self):
    # Name saved files after the noticeType and dateFrom query string
    # parameters of each request URL.
    return parameters('noticeType', 'dateFrom')
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/scotland_proactis.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ class ScotlandProactis(ScotlandBase):
"""
name = 'scotland_proactis'
data_type = 'release_package'
url = 'https://sandbox4.proactislabs.com/v1/Notices?dateFrom={}&outputType=0&noticeType={}'
pattern = 'https://sandbox4.proactislabs.com/v1/Notices?dateFrom={:%m-%Y}&outputType=0&noticeType={}'
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/scotland_public_contracts.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ class ScotlandPublicContracts(ScotlandBase):
"""
name = 'scotland_public_contracts'
data_type = 'release_package'
url = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={}&outputType=1&noticeType={}'
pattern = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={:%m-%Y}&outputType=0&noticeType={}'
24 changes: 7 additions & 17 deletions kingfisher_scrapy/spiders/uruguay_base.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,18 @@
from abc import abstractmethod

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import components, date_range_by_month
from kingfisher_scrapy.base_spider import PeriodicalSpider
from kingfisher_scrapy.util import components


class UruguayBase(SimpleSpider):
class UruguayBase(PeriodicalSpider):
download_delay = 0.9
default_from_date = '2017-11'
date_format = 'year-month'
pattern = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}'
start_requests_callback = 'parse_list'

@classmethod
def from_crawler(cls, crawler, from_date=None, *args, **kwargs):
if not from_date:
from_date = cls.default_from_date

return super().from_crawler(crawler, from_date=from_date, *args, **kwargs)

def start_requests(self):
url = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}'
if self.sample:
self.from_date = self.until_date

for d in date_range_by_month(self.from_date, self.until_date):
yield self.build_request(url.format(d), formatter=components(-2), callback=self.parse_list)
def get_formatter(self):
    # Name saved files after the trailing URL path components — presumably
    # the year/month in .../rss/{year}/{month}; confirm against util.components.
    return components(-2)

@abstractmethod
def parse_list(self):
Expand Down
23 changes: 10 additions & 13 deletions kingfisher_scrapy/spiders/uruguay_historical.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from kingfisher_scrapy.base_spider import CompressedFileSpider
from kingfisher_scrapy.util import components, date_range_by_year
from kingfisher_scrapy.base_spider import CompressedFileSpider, PeriodicalSpider
from kingfisher_scrapy.util import components


class UruguayHistorical(CompressedFileSpider):
class UruguayHistorical(CompressedFileSpider, PeriodicalSpider):
"""
Bulk download documentation
https://www.gub.uy/agencia-compras-contrataciones-estado/datos-y-estadisticas/datos/open-contracting
Expand All @@ -15,20 +15,17 @@ class UruguayHistorical(CompressedFileSpider):

# the files takes too long to be downloaded, so we increase the download timeout
download_timeout = 1000
default_from_date = '2002'
default_until_date = '2017'
date_format = 'year'
custom_settings = {
# It seems some websites don't like it and block when your user agent is not a browser.
# see https://github.com/scrapy/scrapy/issues/3103
'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/37.0.2049.0 Safari/537.36',
}
pattern = 'https://www.gub.uy/agencia-compras-contrataciones-estado/sites' \
'/agencia-compras-contrataciones-estado/files/2019-04/OCDS-{}.zip'

def start_requests(self):
start = 2002
stop = 2017
if self.sample:
start = stop

pattern = 'https://www.gub.uy/agencia-compras-contrataciones-estado/sites' \
'/agencia-compras-contrataciones-estado/files/2019-04/OCDS-{}.zip'
for year in date_range_by_year(start, stop):
yield self.build_request(pattern.format(year), formatter=components(-1))
def get_formatter(self):
    # Name saved files after the last URL path component — presumably the
    # OCDS-{year}.zip file name; confirm against util.components.
    return components(-1)