diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py
index d1d6076d..193e78b4 100644
--- a/kingfisher_scrapy/base_spider.py
+++ b/kingfisher_scrapy/base_spider.py
@@ -1,5 +1,6 @@
 import json
 import os
+from abc import abstractmethod
 from datetime import datetime
 from io import BytesIO
 from zipfile import ZipFile
@@ -58,7 +59,7 @@ class BaseSpider(scrapy.Spider):
     MAX_SAMPLE = 10
     MAX_RELEASES_PER_PACKAGE = 100

-    VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S', 'year-month': '%Y-%m'}
+    VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S'}

     ocds_version = '1.1'
     date_format = 'date'
@@ -115,15 +116,18 @@ def from_crawler(cls, crawler, *args, **kwargs):
         if not spider.from_date:
             # Default to `default_from_date` class attribute.
             spider.from_date = spider.default_from_date
-        if not spider.until_date:
-            # Default to today.
-            spider.until_date = datetime.now().strftime(spider.date_format)
         try:
-            spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
+            if isinstance(spider.from_date, str):
+                # Convert to a datetime; a subclass may have set a date object already.
+                spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
         except ValueError as e:
             raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e))
+
+        if not spider.until_date:
+            spider.until_date = cls.get_default_until_date(spider)
         try:
-            spider.until_date = datetime.strptime(spider.until_date, spider.date_format)
+            if isinstance(spider.until_date, str):
+                spider.until_date = datetime.strptime(spider.until_date, spider.date_format)
         except ValueError as e:
             raise SpiderArgumentError('spider argument until_date: invalid date value: {}'.format(e))
@@ -272,6 +276,10 @@ def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None
         if self.sample:
             break

+    @classmethod
+    def get_default_until_date(cls, spider):
+        return datetime.now()
+

 class SimpleSpider(BaseSpider):
     """
@@ -435,3 +443,77 @@ def next_link(self, response, **kwargs):

         if response.meta['depth'] == 0:
             raise MissingNextLinkError('next link not found on the first page: {}'.format(response.url))
+
+
+class PeriodicalSpider(SimpleSpider):
+    """
+    This class helps crawl URLs that take a year (YYYY) or a year and month (YYYY-mm) as a parameter. To use it:
+
+    1. Extend ``PeriodicalSpider``.
+    2. Set the ``date_format`` attribute if it isn't set already. Valid values are 'year' and 'year-month'.
+    3. Set a ``default_from_date`` year or year-month.
+    4. Optionally, set a ``default_until_date`` year or year-month. If absent, ``default_until_date`` defaults to
+       the current year or year-month.
+    5. Set the ``pattern`` attribute to the URL pattern to request.
+    6. Implement the ``get_formatter`` method.
+
+    The ``pattern`` must include a placeholder for the year or year-month parameter. With the 'year' format, an
+    ``int`` is passed; with the 'year-month' format, a ``date`` object is passed. For example:
+
+    .. code-block:: python
+
+        pattern = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}'
+
+    When the ``sample`` option is used, only the latest year or month of data is retrieved.
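+
+    As a sketch, a minimal subclass might look like this (the spider name and URL are
+    illustrative only, not a real source):
+
+    .. code-block:: python
+
+        from kingfisher_scrapy.base_spider import PeriodicalSpider
+        from kingfisher_scrapy.util import components
+
+        class ExampleYearSpider(PeriodicalSpider):
+            name = 'example_year'
+            data_type = 'release_package'
+            date_format = 'year'
+            default_from_date = '2015'
+            pattern = 'http://example.com/releases/{}.json'
+
+            def get_formatter(self):
+                return components(-1)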
+ """ + VALID_DATE_FORMATS = {'year': '%Y', 'year-month': '%Y-%m'} + + def __init__(self, *args, **kwargs): + self.date_format_key = self.date_format + super().__init__(*args, **kwargs) + + if hasattr(self, 'start_requests_callback'): + self.start_requests_callback = getattr(self, self.start_requests_callback) + else: + self.start_requests_callback = self.parse + + @classmethod + def from_crawler(cls, crawler, from_date=None, *args, **kwargs): + if not from_date: + from_date = cls.default_from_date + + spider = super(SimpleSpider, cls).from_crawler(crawler, from_date=from_date, *args, **kwargs) + + return spider + + @classmethod + def get_default_until_date(cls, spider): + if hasattr(spider, 'default_until_date') and spider.default_until_date: + return spider.default_until_date + else: + return datetime.today() + + def start_requests(self): + + start = self.from_date + + stop = self.until_date + + if self.sample: + start = stop + + if self.date_format_key == 'year': + date_range = util.date_range_by_year(start.year, stop.year) + else: + date_range = util.date_range_by_month(start, stop) + + for date in date_range: + for url in self.build_urls(self.pattern, date): + yield self.build_request(url, self.get_formatter(), callback=self.start_requests_callback) + + @abstractmethod + def get_formatter(self): + pass + + def build_urls(self, pattern, date): + yield pattern.format(date) diff --git a/kingfisher_scrapy/spiders/moldova_old.py b/kingfisher_scrapy/spiders/moldova_old.py index 938b1113..5d79b81b 100644 --- a/kingfisher_scrapy/spiders/moldova_old.py +++ b/kingfisher_scrapy/spiders/moldova_old.py @@ -1,8 +1,8 @@ -from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import components, date_range_by_year +from kingfisher_scrapy.base_spider import PeriodicalSpider +from kingfisher_scrapy.util import components -class MoldovaOld(SimpleSpider): +class MoldovaOld(PeriodicalSpider): """ Bulk download documentation http://opencontracting.date.gov.md/downloads @@ -12,14 +12,10 @@ class MoldovaOld(SimpleSpider): """ name = 'moldova_old' data_type = 'release_package' + default_from_date = '2012' + default_until_date = '2018' + pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}' + date_format = 'year' - def start_requests(self): - pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}' - - start = 2012 - stop = 2018 - if self.sample: - start = 2018 - - for year in date_range_by_year(start, stop): - yield self.build_request(pattern.format(year), formatter=components(-1)) + def get_formatter(self): + return components(-1) diff --git a/kingfisher_scrapy/spiders/nepal_portal.py b/kingfisher_scrapy/spiders/nepal_portal.py index 0bdabd3b..8b8c211a 100644 --- a/kingfisher_scrapy/spiders/nepal_portal.py +++ b/kingfisher_scrapy/spiders/nepal_portal.py @@ -1,10 +1,8 @@ -from datetime import date +from kingfisher_scrapy.base_spider import PeriodicalSpider +from kingfisher_scrapy.util import components -from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import components, date_range_by_year - -class NepalPortal(SimpleSpider): +class NepalPortal(PeriodicalSpider): """ Bulk download documentation http://ppip.gov.np/downloads @@ -15,16 +13,10 @@ class NepalPortal(SimpleSpider): name = 'nepal_portal' data_type = 'release_package' ocds_version = '1.0' + default_from_date = '2012' + default_until_date = '2018' + pattern = 'http://ppip.gov.np/bulk-download/{}' + date_format = 'year' - def start_requests(self): - pattern = 
'http://ppip.gov.np/bulk-download/{}' - - if self.sample: - start = 2018 - stop = 2018 - else: - start = 2012 - stop = date.today().year # HTTP 500 after 2018 - - for year in date_range_by_year(start, stop): - yield self.build_request(pattern.format(year), formatter=components(-1)) + def get_formatter(self): + return components(-1) diff --git a/kingfisher_scrapy/spiders/scotland_base.py b/kingfisher_scrapy/spiders/scotland_base.py index 2f8ac806..d22147af 100644 --- a/kingfisher_scrapy/spiders/scotland_base.py +++ b/kingfisher_scrapy/spiders/scotland_base.py @@ -1,51 +1,40 @@ from datetime import date -from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import date_range_by_month, parameters +from kingfisher_scrapy.base_spider import PeriodicalSpider +from kingfisher_scrapy.util import parameters -class ScotlandBase(SimpleSpider): - default_from_date = '2019-01' +class ScotlandBase(PeriodicalSpider): date_format = 'year-month' + default_from_date = date(date.today().year - 1, date.today().month, 1) - @classmethod - def from_crawler(cls, crawler, from_date=None, *args, **kwargs): - if not from_date: - from_date = cls.default_from_date + notice_types = [ + 1, # OJEU - F1 - Prior Information Notice + 2, # OJEU - F2 - Contract Notice + 3, # OJEU - F3 - Contract Award Notice + 4, # OJEU - F4 - Prior Information Notice(Utilities) + 5, # OJEU - F5 - Contract Notice(Utilities) + 6, # OJEU - F6 - Contract Award Notice(Utilities) + 7, # OJEU - F7 - Qualification Systems(Utilities) + 12, # OJEU - F12 - Design Contest Notice + 13, # OJEU - F13 - Results Of Design Contest + 14, # OJEU - F14 - Corrigendum + 15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice + 20, # OJEU - F20 - Modification Notice + 21, # OJEU - F21 - Social And other Specific Services(Public Contracts) + 22, # OJEU - F22 - Social And other Specific Services(Utilities) + 23, # OJEU - F23 - Social And other Specific Services(Concessions) + 24, # OJEU - F24 - Concession Notice + 25, # OJEU - F25 - Concession Award Notice + 101, # Site Notice - Website Contract Notice + 102, # Site Notice - Website Prior Information Notice + 103, # Site Notice - Website Contract Award Notice + 104, # Site Notice - Quick Quote Award + ] - return super().from_crawler(crawler, from_date=from_date, *args, **kwargs) + def build_urls(self, pattern, date): + for notice_type in self.notice_types: + yield pattern.format(date, notice_type) - def start_requests(self): - notice_types = [ - 1, # OJEU - F1 - Prior Information Notice - 2, # OJEU - F2 - Contract Notice - 3, # OJEU - F3 - Contract Award Notice - 4, # OJEU - F4 - Prior Information Notice(Utilities) - 5, # OJEU - F5 - Contract Notice(Utilities) - 6, # OJEU - F6 - Contract Award Notice(Utilities) - 7, # OJEU - F7 - Qualification Systems(Utilities) - 12, # OJEU - F12 - Design Contest Notice - 13, # OJEU - F13 - Results Of Design Contest - 14, # OJEU - F14 - Corrigendum - 15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice - 20, # OJEU - F20 - Modification Notice - 21, # OJEU - F21 - Social And other Specific Services(Public Contracts) - 22, # OJEU - F22 - Social And other Specific Services(Utilities) - 23, # OJEU - F23 - Social And other Specific Services(Concessions) - 24, # OJEU - F24 - Concession Notice - 25, # OJEU - F25 - Concession Award Notice - 101, # Site Notice - Website Contract Notice - 102, # Site Notice - Website Prior Information Notice - 103, # Site Notice - Website Contract Award Notice - 104, # Site Notice - Quick Quote Award - ] - - for year_month 
-            date_string = year_month.strftime('%m-%Y')
-            for notice_type in notice_types:
-                yield self.build_request(
-                    self.url.format(date_string, notice_type),
-                    formatter=parameters('noticeType', 'dateFrom')
-                )
-            if self.sample:
-                return
+    def get_formatter(self):
+        return parameters('noticeType', 'dateFrom')
diff --git a/kingfisher_scrapy/spiders/scotland_proactis.py b/kingfisher_scrapy/spiders/scotland_proactis.py
index a44c0d78..543a99bc 100644
--- a/kingfisher_scrapy/spiders/scotland_proactis.py
+++ b/kingfisher_scrapy/spiders/scotland_proactis.py
@@ -13,4 +13,4 @@ class ScotlandProactis(ScotlandBase):
     """
     name = 'scotland_proactis'
     data_type = 'release_package'
-    url = 'https://sandbox4.proactislabs.com/v1/Notices?dateFrom={}&outputType=0&noticeType={}'
+    pattern = 'https://sandbox4.proactislabs.com/v1/Notices?dateFrom={:%m-%Y}&outputType=0&noticeType={}'
diff --git a/kingfisher_scrapy/spiders/scotland_public_contracts.py b/kingfisher_scrapy/spiders/scotland_public_contracts.py
index 2be9fb68..e1d9f256 100644
--- a/kingfisher_scrapy/spiders/scotland_public_contracts.py
+++ b/kingfisher_scrapy/spiders/scotland_public_contracts.py
@@ -13,4 +13,4 @@ class ScotlandPublicContracts(ScotlandBase):
     """
     name = 'scotland_public_contracts'
     data_type = 'release_package'
-    url = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={}&outputType=1&noticeType={}'
+    pattern = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={:%m-%Y}&outputType=0&noticeType={}'
diff --git a/kingfisher_scrapy/spiders/uruguay_base.py b/kingfisher_scrapy/spiders/uruguay_base.py
index 97b4af3d..0824fc48 100644
--- a/kingfisher_scrapy/spiders/uruguay_base.py
+++ b/kingfisher_scrapy/spiders/uruguay_base.py
@@ -1,28 +1,18 @@
 from abc import abstractmethod

-from kingfisher_scrapy.base_spider import SimpleSpider
-from kingfisher_scrapy.util import components, date_range_by_month
+from kingfisher_scrapy.base_spider import PeriodicalSpider
+from kingfisher_scrapy.util import components


-class UruguayBase(SimpleSpider):
+class UruguayBase(PeriodicalSpider):
     download_delay = 0.9
     default_from_date = '2017-11'
     date_format = 'year-month'
+    pattern = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}'
+    start_requests_callback = 'parse_list'

-    @classmethod
-    def from_crawler(cls, crawler, from_date=None, *args, **kwargs):
-        if not from_date:
-            from_date = cls.default_from_date
-
-        return super().from_crawler(crawler, from_date=from_date, *args, **kwargs)
-
-    def start_requests(self):
-        url = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}'
-        if self.sample:
-            self.from_date = self.until_date
-
-        for d in date_range_by_month(self.from_date, self.until_date):
-            yield self.build_request(url.format(d), formatter=components(-2), callback=self.parse_list)
+    def get_formatter(self):
+        return components(-2)

     @abstractmethod
     def parse_list(self):
diff --git a/kingfisher_scrapy/spiders/uruguay_historical.py b/kingfisher_scrapy/spiders/uruguay_historical.py
index 1dcca0e3..f0bd0454 100644
--- a/kingfisher_scrapy/spiders/uruguay_historical.py
+++ b/kingfisher_scrapy/spiders/uruguay_historical.py
@@ -1,8 +1,8 @@
-from kingfisher_scrapy.base_spider import CompressedFileSpider
-from kingfisher_scrapy.util import components, date_range_by_year
+from kingfisher_scrapy.base_spider import CompressedFileSpider, PeriodicalSpider
+from kingfisher_scrapy.util import components


-class UruguayHistorical(CompressedFileSpider):
+class UruguayHistorical(CompressedFileSpider, PeriodicalSpider):
     """
     Bulk download documentation
       https://www.gub.uy/agencia-compras-contrataciones-estado/datos-y-estadisticas/datos/open-contracting
@@ -15,20 +15,17 @@
     # the files takes too long to be downloaded, so we increase the download timeout
     download_timeout = 1000
+    default_from_date = '2002'
+    default_until_date = '2017'
+    date_format = 'year'
     custom_settings = {
         # It seems some websites don't like it and block when your user agent is not a browser.
         # see https://github.com/scrapy/scrapy/issues/3103
         'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/37.0.2049.0 Safari/537.36',
     }
+    pattern = 'https://www.gub.uy/agencia-compras-contrataciones-estado/sites' \
+              '/agencia-compras-contrataciones-estado/files/2019-04/OCDS-{}.zip'

-    def start_requests(self):
-        start = 2002
-        stop = 2017
-        if self.sample:
-            start = stop
-
-        pattern = 'https://www.gub.uy/agencia-compras-contrataciones-estado/sites' \
-                  '/agencia-compras-contrataciones-estado/files/2019-04/OCDS-{}.zip'
-        for year in date_range_by_year(start, stop):
-            yield self.build_request(pattern.format(year), formatter=components(-1))
+    def get_formatter(self):
+        return components(-1)
diff --git a/tests/test_periodical_spider.py b/tests/test_periodical_spider.py
new file mode 100644
index 00000000..3430f64a
--- /dev/null
+++ b/tests/test_periodical_spider.py
@@ -0,0 +1,91 @@
+from datetime import datetime
+
+import pytest
+
+from kingfisher_scrapy.base_spider import PeriodicalSpider
+from kingfisher_scrapy.util import components, date_range_by_month, date_range_by_year
+
+from . import spider_with_crawler
+
+
+def _format_urls(arg_type, pattern, arg_start, arg_end):
+    if arg_type == 'year':
+        date_range = date_range_by_year
+        start = arg_start.year
+        end = arg_end.year
+    else:
+        date_range = date_range_by_month
+        start = arg_start
+        end = arg_end
+
+    return [pattern.format(x) for x in date_range(start, end)]
+
+
+TEST_CASES = [
+    # default from_date
+    ('year', 'http://example.com/{}', '2012', datetime.today().year, {'default_from_date': '2012'}, {}),
+    ('year-month', 'http://example.com/{:%Y-%m}', '2010-06', datetime.today().strftime('%Y-%m'), {
+        'default_from_date': '2010-06'
+    }, {}),
+    # default from_date and until_date
+    ('year', 'http://example.com/{}', '2012', '2016', {
+        'default_from_date': '2012',
+        'default_until_date': '2016'
+    }, {}),
+    ('year-month', 'http://example.com/{:%Y-%m}', '2010-06', '2019-12', {
+        'default_from_date': '2010-06',
+        'default_until_date': '2019-12'
+    }, {}),
+    # from_date specified by the user
+    ('year', 'http://example.com/{}', '2017', datetime.today().year, {
+        'default_from_date': '2008'
+    }, {
+        'from_date': '2017'
+    }),
+    ('year-month', 'http://example.com/{:%Y-%m}', '2018-01', datetime.today().strftime('%Y-%m'), {
+        'default_from_date': '2011-01'
+    }, {
+        'from_date': '2018-01'
+    }),
+    # until_date specified by the user
+    ('year', 'http://example.com/{}', '2008', '2010', {
+        'default_from_date': '2008',
+        'default_until_date': '2017'
+    }, {
+        'until_date': '2010'
+    }),
+    ('year-month', 'http://example.com/{:%Y-%m}', '2011-01', '2019-06', {
+        'default_from_date': '2011-01'
+    }, {
+        'until_date': '2019-06'
+    }),
+    # the 'sample' option
+    ('year', 'http://example.com/{}', datetime.today().year, datetime.today().year, {
+        'default_from_date': '2008',
+    }, {
+        'sample': 'true'
+    }),
+]
+
+
+@pytest.mark.parametrize(
+    'date_format,pattern,expected_start,expected_end,class_args,user_args',
+    TEST_CASES)
+def test_urls(date_format, pattern, expected_start, expected_end, class_args, user_args):
+    expected = _format_urls(
+        date_format,
+        pattern,
+        datetime.strptime(str(expected_start), PeriodicalSpider.VALID_DATE_FORMATS[date_format]),
+        datetime.strptime(str(expected_end), PeriodicalSpider.VALID_DATE_FORMATS[date_format])
+    )
+
+    test_spider = type('TestSpider', (PeriodicalSpider,), dict(date_format=date_format,
+                                                               get_formatter=lambda self: components(-1),
+                                                               pattern=pattern,
+                                                               **class_args))
+    spider = spider_with_crawler(spider_class=test_spider, **user_args)
+
+    requests = list(spider.start_requests())
+
+    for request, expected_url in zip(requests, expected):
+        assert request.url == expected_url
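+
+    # zip() stops at the shorter iterable, so also check that the counts match.
+    assert len(requests) == len(expected)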