From 4f39bd9a00ee644f20b1284da87f36d0503f9ae1 Mon Sep 17 00:00:00 2001 From: Romina Fernandez Date: Sat, 25 Jul 2020 23:26:34 -0400 Subject: [PATCH 01/11] Add PeriodicalSpider --- kingfisher_scrapy/base_spider.py | 78 ++++++++++++++++++- kingfisher_scrapy/spiders/moldova_old.py | 22 +++--- kingfisher_scrapy/spiders/nepal_portal.py | 26 +++---- kingfisher_scrapy/spiders/scotland_base.py | 76 ++++++++---------- .../spiders/scotland_proactis.py | 5 +- .../spiders/scotland_public_contracts.py | 5 +- kingfisher_scrapy/spiders/uruguay_base.py | 26 ++----- .../spiders/uruguay_historical.py | 23 +++--- 8 files changed, 147 insertions(+), 114 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index cb1cce49..92660c29 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -1,5 +1,6 @@ import json import os +from abc import abstractmethod from datetime import datetime from io import BytesIO from zipfile import ZipFile @@ -107,8 +108,7 @@ def from_crawler(cls, crawler, *args, **kwargs): # Default to `default_from_date` class attribute. spider.from_date = spider.default_from_date if not spider.until_date: - # Default to today. - spider.until_date = datetime.now().strftime(spider.date_format) + spider.until_date = cls.get_default_until_date(spider) try: spider.from_date = datetime.strptime(spider.from_date, spider.date_format) except ValueError as e: @@ -266,6 +266,10 @@ def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None if self.sample: break + @classmethod + def get_default_until_date(cls, spider): + return datetime.now().strftime(spider.date_format) + class SimpleSpider(BaseSpider): """ @@ -429,3 +433,73 @@ def next_link(self, response): if response.meta['depth'] == 0: raise MissingNextLinkError('next link not found on the first page: {}'.format(response.url)) + + +class PeriodicalSpider(SimpleSpider): + """ + This class helps to crawl urls that receive a year (YYYY) or a month and year (YYYY-mm) as parameters. To use it: + + 1. Extend from ``PeriodicalSpider``. + 1. Set the ``date_format`` attribute if it's not defined already. Valid values are 'year' and 'year-month'. + 1. Set a ``start`` year or month-year. + 1. Optionally, set a ``stop`` year or month-year. If absent, ``stop`` defaults to the current year or month-year. + 1. Set the ``pattern`` parameter with the url to retrieve. + 1. Implement the `get_formatter` method. + + The ``pattern`` should include a placeholder for a year or month-year parameter. With the year parameter, an int is + passed. If the year-month parameter is used, a ``Date`` instance is passed. Example: + + .. code-block: python + + url = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}' + + When the ``sample`` option is used, the latest year or month of data is retrieved. 
+ """ + VALID_DATE_FORMATS = {'year': '%Y', 'year-month': '%Y-%m'} + + def __init__(self, *args, **kwargs): + self.date_format_key = self.date_format + if hasattr(self, 'start_requests_callback'): + self.start_requests_callback = getattr(self, self.start_requests_callback) + else: + self.start_requests_callback = self.parse + + super().__init__(*args, **kwargs) + + @classmethod + def get_default_until_date(cls, spider): + try: + return str(spider.stop) + except AttributeError: + return super().get_default_until_date(spider) + + def start_requests(self): + + start = datetime.strptime(str(self.start), self.date_format) \ + if not (hasattr(self, 'from_date') and self.from_date) else self.from_date + + if hasattr(self, 'stop'): + self.stop = datetime.strptime(str(self.stop), self.date_format) + else: + self.stop = datetime.today() + + stop = self.stop if not (hasattr(self, 'until_date') and self.until_date) else self.until_date + + if self.sample: + start = stop + + if self.date_format_key == 'year': + date_range = util.date_range_by_year(start.year, stop.year) + else: + date_range = util.date_range_by_month(start, stop) + + for date in date_range: + for url in self.build_urls(self.pattern, date): + yield self.build_request(url, self.get_formatter(), callback=self.start_requests_callback) + + @abstractmethod + def get_formatter(self): + pass + + def build_urls(self, pattern, date): + yield pattern.format(date) diff --git a/kingfisher_scrapy/spiders/moldova_old.py b/kingfisher_scrapy/spiders/moldova_old.py index 938b1113..eb3e54e2 100644 --- a/kingfisher_scrapy/spiders/moldova_old.py +++ b/kingfisher_scrapy/spiders/moldova_old.py @@ -1,8 +1,8 @@ -from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import components, date_range_by_year +from kingfisher_scrapy.base_spider import PeriodicalSpider +from kingfisher_scrapy.util import components -class MoldovaOld(SimpleSpider): +class MoldovaOld(PeriodicalSpider): """ Bulk download documentation http://opencontracting.date.gov.md/downloads @@ -12,14 +12,10 @@ class MoldovaOld(SimpleSpider): """ name = 'moldova_old' data_type = 'release_package' + start = 2012 + stop = 2018 + pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}' + date_format = 'year' - def start_requests(self): - pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}' - - start = 2012 - stop = 2018 - if self.sample: - start = 2018 - - for year in date_range_by_year(start, stop): - yield self.build_request(pattern.format(year), formatter=components(-1)) + def get_formatter(self): + return components(-1) diff --git a/kingfisher_scrapy/spiders/nepal_portal.py b/kingfisher_scrapy/spiders/nepal_portal.py index 0bdabd3b..3c1f81cd 100644 --- a/kingfisher_scrapy/spiders/nepal_portal.py +++ b/kingfisher_scrapy/spiders/nepal_portal.py @@ -1,10 +1,8 @@ -from datetime import date +from kingfisher_scrapy.base_spider import PeriodicalSpider +from kingfisher_scrapy.util import components -from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import components, date_range_by_year - -class NepalPortal(SimpleSpider): +class NepalPortal(PeriodicalSpider): """ Bulk download documentation http://ppip.gov.np/downloads @@ -15,16 +13,10 @@ class NepalPortal(SimpleSpider): name = 'nepal_portal' data_type = 'release_package' ocds_version = '1.0' + start = 2012 + stop = 2018 + pattern = 'http://ppip.gov.np/bulk-download/{}' + date_format = 'year' - def start_requests(self): - pattern = 'http://ppip.gov.np/bulk-download/{}' - - if 
self.sample: - start = 2018 - stop = 2018 - else: - start = 2012 - stop = date.today().year # HTTP 500 after 2018 - - for year in date_range_by_year(start, stop): - yield self.build_request(pattern.format(year), formatter=components(-1)) + def get_formatter(self): + return components(-1) diff --git a/kingfisher_scrapy/spiders/scotland_base.py b/kingfisher_scrapy/spiders/scotland_base.py index 56aa38eb..e9300d2c 100644 --- a/kingfisher_scrapy/spiders/scotland_base.py +++ b/kingfisher_scrapy/spiders/scotland_base.py @@ -1,50 +1,40 @@ -from datetime import date +from datetime import date, timedelta -from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import date_range_by_month, parameters +from kingfisher_scrapy.base_spider import PeriodicalSpider +from kingfisher_scrapy.util import parameters -class ScotlandBase(SimpleSpider): +class ScotlandBase(PeriodicalSpider): date_format = 'year-month' + start = (date.today() - timedelta(days=1)).strftime("%Y-%m") - def parse_requests(self, pattern): + notice_types = [ + 1, # OJEU - F1 - Prior Information Notice + 2, # OJEU - F2 - Contract Notice + 3, # OJEU - F3 - Contract Award Notice + 4, # OJEU - F4 - Prior Information Notice(Utilities) + 5, # OJEU - F5 - Contract Notice(Utilities) + 6, # OJEU - F6 - Contract Award Notice(Utilities) + 7, # OJEU - F7 - Qualification Systems(Utilities) + 12, # OJEU - F12 - Design Contest Notice + 13, # OJEU - F13 - Results Of Design Contest + 14, # OJEU - F14 - Corrigendum + 15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice + 20, # OJEU - F20 - Modification Notice + 21, # OJEU - F21 - Social And other Specific Services(Public Contracts) + 22, # OJEU - F22 - Social And other Specific Services(Utilities) + 23, # OJEU - F23 - Social And other Specific Services(Concessions) + 24, # OJEU - F24 - Concession Notice + 25, # OJEU - F25 - Concession Award Notice + 101, # Site Notice - Website Contract Notice + 102, # Site Notice - Website Prior Information Notice + 103, # Site Notice - Website Contract Award Notice + 104, # Site Notice - Quick Quote Award + ] - notice_types = [ - 1, # OJEU - F1 - Prior Information Notice - 2, # OJEU - F2 - Contract Notice - 3, # OJEU - F3 - Contract Award Notice - 4, # OJEU - F4 - Prior Information Notice(Utilities) - 5, # OJEU - F5 - Contract Notice(Utilities) - 6, # OJEU - F6 - Contract Award Notice(Utilities) - 7, # OJEU - F7 - Qualification Systems(Utilities) - 12, # OJEU - F12 - Design Contest Notice - 13, # OJEU - F13 - Results Of Design Contest - 14, # OJEU - F14 - Corrigendum - 15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice - 20, # OJEU - F20 - Modification Notice - 21, # OJEU - F21 - Social And other Specific Services(Public Contracts) - 22, # OJEU - F22 - Social And other Specific Services(Utilities) - 23, # OJEU - F23 - Social And other Specific Services(Concessions) - 24, # OJEU - F24 - Concession Notice - 25, # OJEU - F25 - Concession Award Notice - 101, # Site Notice - Website Contract Notice - 102, # Site Notice - Website Prior Information Notice - 103, # Site Notice - Website Contract Award Notice - 104, # Site Notice - Quick Quote Award - ] + def build_urls(self, pattern, date): + for notice_type in self.notice_types: + yield pattern.format(date, notice_type) - now = date.today() - if self.from_date: - start = date(self.from_date.year, self.from_date.month, 1) - else: - start = date(now.year - 1, now.month, 1) - if self.sample: - start = now - - for d in date_range_by_month(start, now): - date_string = 
'{:02d}-{:04d}'.format(d.month, d.year) - for notice_type in notice_types: - yield self.build_request( - pattern.format(date_string, notice_type), - formatter=parameters('noticeType', 'dateFrom') - ) + def get_formatter(self): + return parameters('noticeType', 'dateFrom') diff --git a/kingfisher_scrapy/spiders/scotland_proactis.py b/kingfisher_scrapy/spiders/scotland_proactis.py index d3599c0c..3172089d 100644 --- a/kingfisher_scrapy/spiders/scotland_proactis.py +++ b/kingfisher_scrapy/spiders/scotland_proactis.py @@ -13,7 +13,4 @@ class ScotlandProactis(ScotlandBase): """ name = 'scotland_proactis' data_type = 'release_package' - - def start_requests(self): - pattern = 'https://sandbox4.proactislabs.com/v1/Notices?dateFrom={}&outputType=0¬iceType={}' - return self.parse_requests(pattern) + pattern = 'https://sandbox4.proactislabs.com/v1/Notices?dateFrom={:%m-%Y}&outputType=0¬iceType={}' diff --git a/kingfisher_scrapy/spiders/scotland_public_contracts.py b/kingfisher_scrapy/spiders/scotland_public_contracts.py index c1f20fd5..6fb556ec 100644 --- a/kingfisher_scrapy/spiders/scotland_public_contracts.py +++ b/kingfisher_scrapy/spiders/scotland_public_contracts.py @@ -13,7 +13,4 @@ class ScotlandPublicContracts(ScotlandBase): """ name = 'scotland_public_contracts' data_type = 'release_package' - - def start_requests(self): - pattern = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={}&outputType=1¬iceType={}' - return self.parse_requests(pattern) + pattern = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={0.year}&outputType=1¬iceType={1}' diff --git a/kingfisher_scrapy/spiders/uruguay_base.py b/kingfisher_scrapy/spiders/uruguay_base.py index 97b4af3d..9ce65412 100644 --- a/kingfisher_scrapy/spiders/uruguay_base.py +++ b/kingfisher_scrapy/spiders/uruguay_base.py @@ -1,28 +1,18 @@ from abc import abstractmethod -from kingfisher_scrapy.base_spider import SimpleSpider -from kingfisher_scrapy.util import components, date_range_by_month +from kingfisher_scrapy.base_spider import PeriodicalSpider +from kingfisher_scrapy.util import components -class UruguayBase(SimpleSpider): +class UruguayBase(PeriodicalSpider): download_delay = 0.9 - default_from_date = '2017-11' + start = '2017-11' date_format = 'year-month' + pattern = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}' + start_requests_callback = 'parse_list' - @classmethod - def from_crawler(cls, crawler, from_date=None, *args, **kwargs): - if not from_date: - from_date = cls.default_from_date - - return super().from_crawler(crawler, from_date=from_date, *args, **kwargs) - - def start_requests(self): - url = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}' - if self.sample: - self.from_date = self.until_date - - for d in date_range_by_month(self.from_date, self.until_date): - yield self.build_request(url.format(d), formatter=components(-2), callback=self.parse_list) + def get_formatter(self): + return components(-2) @abstractmethod def parse_list(self): diff --git a/kingfisher_scrapy/spiders/uruguay_historical.py b/kingfisher_scrapy/spiders/uruguay_historical.py index 1dcca0e3..e6913aa3 100644 --- a/kingfisher_scrapy/spiders/uruguay_historical.py +++ b/kingfisher_scrapy/spiders/uruguay_historical.py @@ -1,8 +1,8 @@ -from kingfisher_scrapy.base_spider import CompressedFileSpider -from kingfisher_scrapy.util import components, date_range_by_year +from kingfisher_scrapy.base_spider import CompressedFileSpider, PeriodicalSpider +from kingfisher_scrapy.util import components -class 
UruguayHistorical(CompressedFileSpider): +class UruguayHistorical(CompressedFileSpider, PeriodicalSpider): """ Bulk download documentation https://www.gub.uy/agencia-compras-contrataciones-estado/datos-y-estadisticas/datos/open-contracting @@ -15,20 +15,17 @@ class UruguayHistorical(CompressedFileSpider): # the files takes too long to be downloaded, so we increase the download timeout download_timeout = 1000 + start = 2002 + stop = 2017 + date_format = 'year' custom_settings = { # It seems some websites don't like it and block when your user agent is not a browser. # see https://github.com/scrapy/scrapy/issues/3103 'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/37.0.2049.0 Safari/537.36', } + pattern = 'https://www.gub.uy/agencia-compras-contrataciones-estado/sites' \ + '/agencia-compras-contrataciones-estado/files/2019-04/OCDS-{}.zip' - def start_requests(self): - start = 2002 - stop = 2017 - if self.sample: - start = stop - - pattern = 'https://www.gub.uy/agencia-compras-contrataciones-estado/sites' \ - '/agencia-compras-contrataciones-estado/files/2019-04/OCDS-{}.zip' - for year in date_range_by_year(start, stop): - yield self.build_request(pattern.format(year), formatter=components(-1)) + def get_formatter(self): + return components(-1) From 751f391d6983fed99b63c148f221396c846ed3c5 Mon Sep 17 00:00:00 2001 From: Romina Fernandez Date: Mon, 27 Jul 2020 14:36:57 -0400 Subject: [PATCH 02/11] Fix issues with Scotland spiders --- kingfisher_scrapy/base_spider.py | 25 ++++++++++++------- kingfisher_scrapy/spiders/scotland_base.py | 5 ++-- .../spiders/scotland_public_contracts.py | 2 +- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index 92660c29..05933835 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -1,6 +1,7 @@ import json import os from abc import abstractmethod +from datetime import date as DateClass from datetime import datetime from io import BytesIO from zipfile import ZipFile @@ -459,12 +460,21 @@ class PeriodicalSpider(SimpleSpider): def __init__(self, *args, **kwargs): self.date_format_key = self.date_format + super().__init__(*args, **kwargs) + if hasattr(self, 'start_requests_callback'): self.start_requests_callback = getattr(self, self.start_requests_callback) else: self.start_requests_callback = self.parse - super().__init__(*args, **kwargs) + if not isinstance(self.start, DateClass): + self.start = datetime.strptime(str(self.start), self.date_format) + + if hasattr(self, 'stop'): + if not isinstance(self.stop, DateClass): + self.stop = datetime.strptime(str(self.stop), self.date_format) + else: + self.stop = datetime.today() @classmethod def get_default_until_date(cls, spider): @@ -475,15 +485,9 @@ def get_default_until_date(cls, spider): def start_requests(self): - start = datetime.strptime(str(self.start), self.date_format) \ - if not (hasattr(self, 'from_date') and self.from_date) else self.from_date - - if hasattr(self, 'stop'): - self.stop = datetime.strptime(str(self.stop), self.date_format) - else: - self.stop = datetime.today() + start = self.start if not self.exists('from_date') else self.from_date - stop = self.stop if not (hasattr(self, 'until_date') and self.until_date) else self.until_date + stop = self.stop if not self.exists('until_date') else self.until_date if self.sample: start = stop @@ -503,3 +507,6 @@ def get_formatter(self): def build_urls(self, pattern, date): yield 
pattern.format(date) + + def exists(self, attr): + return hasattr(self, attr) and getattr(self, attr) diff --git a/kingfisher_scrapy/spiders/scotland_base.py b/kingfisher_scrapy/spiders/scotland_base.py index e9300d2c..bcb59fe0 100644 --- a/kingfisher_scrapy/spiders/scotland_base.py +++ b/kingfisher_scrapy/spiders/scotland_base.py @@ -1,4 +1,4 @@ -from datetime import date, timedelta +from datetime import date from kingfisher_scrapy.base_spider import PeriodicalSpider from kingfisher_scrapy.util import parameters @@ -6,7 +6,8 @@ class ScotlandBase(PeriodicalSpider): date_format = 'year-month' - start = (date.today() - timedelta(days=1)).strftime("%Y-%m") + stop = date.today() + start = date(stop.year - 1, stop.month, 1) notice_types = [ 1, # OJEU - F1 - Prior Information Notice diff --git a/kingfisher_scrapy/spiders/scotland_public_contracts.py b/kingfisher_scrapy/spiders/scotland_public_contracts.py index 6fb556ec..c4542e68 100644 --- a/kingfisher_scrapy/spiders/scotland_public_contracts.py +++ b/kingfisher_scrapy/spiders/scotland_public_contracts.py @@ -13,4 +13,4 @@ class ScotlandPublicContracts(ScotlandBase): """ name = 'scotland_public_contracts' data_type = 'release_package' - pattern = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={0.year}&outputType=1¬iceType={1}' + pattern = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={:%m-%Y}&outputType=1¬iceType={}' From 60c29ffb0a89b5fc0ad24c918f93f272e02fb9e4 Mon Sep 17 00:00:00 2001 From: Romina Fernandez Date: Mon, 27 Jul 2020 15:11:24 -0400 Subject: [PATCH 03/11] Fix issues with from_date parameter --- kingfisher_scrapy/base_spider.py | 7 +++---- kingfisher_scrapy/spiders/scotland_public_contracts.py | 1 - 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index 05933835..0521253d 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -461,6 +461,7 @@ class PeriodicalSpider(SimpleSpider): def __init__(self, *args, **kwargs): self.date_format_key = self.date_format super().__init__(*args, **kwargs) + self.default_from_date = datetime.strptime(str(self.start), self.date_format) if hasattr(self, 'start_requests_callback'): self.start_requests_callback = getattr(self, self.start_requests_callback) @@ -469,6 +470,7 @@ def __init__(self, *args, **kwargs): if not isinstance(self.start, DateClass): self.start = datetime.strptime(str(self.start), self.date_format) + self.default_from_date = self.start if hasattr(self, 'stop'): if not isinstance(self.stop, DateClass): @@ -478,10 +480,7 @@ def __init__(self, *args, **kwargs): @classmethod def get_default_until_date(cls, spider): - try: - return str(spider.stop) - except AttributeError: - return super().get_default_until_date(spider) + return spider.stop.strftime(spider.date_format) def start_requests(self): diff --git a/kingfisher_scrapy/spiders/scotland_public_contracts.py b/kingfisher_scrapy/spiders/scotland_public_contracts.py index 05647b1e..230ca1de 100644 --- a/kingfisher_scrapy/spiders/scotland_public_contracts.py +++ b/kingfisher_scrapy/spiders/scotland_public_contracts.py @@ -14,4 +14,3 @@ class ScotlandPublicContracts(ScotlandBase): name = 'scotland_public_contracts' data_type = 'release_package' pattern = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={:%m-%Y}&outputType=0¬iceType={}' - From 86518da7fcbec44c74cbef02c0013b32dae99426 Mon Sep 17 00:00:00 2001 From: Romina Fernandez Date: Mon, 27 Jul 2020 15:21:33 -0400 Subject: 
[PATCH 04/11] Fix test issue --- kingfisher_scrapy/base_spider.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index 0521253d..ab2d60a5 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -461,7 +461,6 @@ class PeriodicalSpider(SimpleSpider): def __init__(self, *args, **kwargs): self.date_format_key = self.date_format super().__init__(*args, **kwargs) - self.default_from_date = datetime.strptime(str(self.start), self.date_format) if hasattr(self, 'start_requests_callback'): self.start_requests_callback = getattr(self, self.start_requests_callback) @@ -470,7 +469,7 @@ def __init__(self, *args, **kwargs): if not isinstance(self.start, DateClass): self.start = datetime.strptime(str(self.start), self.date_format) - self.default_from_date = self.start + self.default_from_date = self.start.strftime(self.date_format) if hasattr(self, 'stop'): if not isinstance(self.stop, DateClass): From 100f2a5e7a11f42715ec8ed74599365d6d8a9548 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Romina=20Fern=C3=A1ndez?= Date: Mon, 27 Jul 2020 18:23:25 -0400 Subject: [PATCH 05/11] Apply suggestions from code review Add Yohanna's suggestions Co-authored-by: Yohanna Lisnichuk --- kingfisher_scrapy/base_spider.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index ab2d60a5..624e47b2 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -442,8 +442,8 @@ class PeriodicalSpider(SimpleSpider): 1. Extend from ``PeriodicalSpider``. 1. Set the ``date_format`` attribute if it's not defined already. Valid values are 'year' and 'year-month'. - 1. Set a ``start`` year or month-year. - 1. Optionally, set a ``stop`` year or month-year. If absent, ``stop`` defaults to the current year or month-year. + 1. Set a ``start`` year or year-month. + 1. Optionally, set a ``stop`` year or year-month. If absent, ``stop`` defaults to the current year or year-month. 1. Set the ``pattern`` parameter with the url to retrieve. 1. Implement the `get_formatter` method. From 11d2526ac2cb5c06102815bcf7f61f19c33fb7f5 Mon Sep 17 00:00:00 2001 From: Romina Fernandez Date: Fri, 31 Jul 2020 21:49:40 -0400 Subject: [PATCH 06/11] [WIP] Periodical spider tests --- tests/test_periodical_spider.py | 45 +++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 tests/test_periodical_spider.py diff --git a/tests/test_periodical_spider.py b/tests/test_periodical_spider.py new file mode 100644 index 00000000..9a9cd430 --- /dev/null +++ b/tests/test_periodical_spider.py @@ -0,0 +1,45 @@ +import pytest +from datetime import date, datetime + +from . 
import spider_with_crawler +from kingfisher_scrapy.base_spider import PeriodicalSpider +from kingfisher_scrapy.util import components, date_range_by_year, date_range_by_month + + +def _get_urls(type, pattern, arg_start, arg_end=datetime.now()): + if type == 'year': + date_range = date_range_by_year + start = arg_start.year + end = arg_end.year + else: + date_range = date_range_by_month + start = datetime.strptime(arg_start, '%Y-%m') + end = datetime.strptime(arg_end, '%Y-%m') + + return [pattern.format(x) for x in date_range(start, end)] + + +@pytest.mark.parametrize('date,date_format,expected', [ + (2008, 'year', '2008'), + ('2007-10', 'year-month', '2007-10') +]) +def test_default_from_date(date, date_format, expected): + spider = PeriodicalSpider(name='test', start=date, date_format=date_format) + assert spider.default_from_date == expected + + +@pytest.mark.parametrize('start,from_date,date_format,pattern,expected_start', [ + (2008, 2017, 'year', 'http://example.com/{}', 2017), + ('2011-01', '2018-01', 'year-month', 'http://example.com/{%Y-%m}', '2018-01') +]) +def test_start(start, from_date, date_format, pattern, expected_start): + expected = _get_urls(date_format, pattern, datetime.strptime(str(expected_start), date_format)) + + TestSpider = type('TestSpider', (PeriodicalSpider,), dict(start=start, date_format=date_format, + get_formatter=lambda x: components(-1), pattern=pattern)) + + spider = spider_with_crawler(spider_class=TestSpider, from_date=str(from_date)) + + requests = [x for x in spider.start_requests()] + + assert len(requests) == len(expected) From 8cdb0901fe01174ea040da2b0365b24514601d2e Mon Sep 17 00:00:00 2001 From: Romina Fernandez Date: Fri, 28 Aug 2020 02:29:24 -0400 Subject: [PATCH 07/11] Tests for PeriodicalSpider --- kingfisher_scrapy/base_spider.py | 66 ++++++++------ kingfisher_scrapy/spiders/moldova_old.py | 4 +- kingfisher_scrapy/spiders/nepal_portal.py | 4 +- kingfisher_scrapy/spiders/scotland_base.py | 4 +- kingfisher_scrapy/spiders/uruguay_base.py | 2 +- .../spiders/uruguay_historical.py | 4 +- tests/test_periodical_spider.py | 90 ++++++++++++++----- 7 files changed, 115 insertions(+), 59 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index 829b57ae..67992cc6 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -1,7 +1,6 @@ import json import os from abc import abstractmethod -from datetime import date as DateClass from datetime import datetime from io import BytesIO from zipfile import ZipFile @@ -60,7 +59,7 @@ class BaseSpider(scrapy.Spider): MAX_SAMPLE = 10 MAX_RELEASES_PER_PACKAGE = 100 - VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S', 'year-month': '%Y-%m'} + VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S'} ocds_version = '1.1' date_format = 'date' @@ -117,16 +116,25 @@ def from_crawler(cls, crawler, *args, **kwargs): if not spider.from_date: # Default to `default_from_date` class attribute. 
spider.from_date = spider.default_from_date + if isinstance(spider.from_date, str): + # convert to date format, if needed + spider.from_date = datetime.strptime(spider.from_date, spider.date_format) + else: + try: + spider.from_date = datetime.strptime(spider.from_date, spider.date_format) + except ValueError as e: + raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e)) + if not spider.until_date: spider.until_date = cls.get_default_until_date(spider) - try: - spider.from_date = datetime.strptime(spider.from_date, spider.date_format) - except ValueError as e: - raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e)) - try: - spider.until_date = datetime.strptime(spider.until_date, spider.date_format) - except ValueError as e: - raise SpiderArgumentError('spider argument until_date: invalid date value: {}'.format(e)) + if isinstance(spider.until_date, str): + # convert to date format, if needed + spider.until_date = datetime.strptime(spider.until_date, spider.date_format) + else: + try: + spider.until_date = datetime.strptime(spider.until_date, spider.date_format) + except ValueError as e: + raise SpiderArgumentError('spider argument until_date: invalid date value: {}'.format(e)) return spider @@ -275,7 +283,7 @@ def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None @classmethod def get_default_until_date(cls, spider): - return datetime.now().strftime(spider.date_format) + return datetime.now() class SimpleSpider(BaseSpider): @@ -448,8 +456,8 @@ class PeriodicalSpider(SimpleSpider): 1. Extend from ``PeriodicalSpider``. 1. Set the ``date_format`` attribute if it's not defined already. Valid values are 'year' and 'year-month'. - 1. Set a ``start`` year or month-year. - 1. Optionally, set a ``stop`` year or month-year. If absent, ``stop`` defaults to the current year or month-year. + 1. Set a ``default_from_date`` year or month-year. + 1. Optionally, set a ``default_until_date`` year or month-year. If absent, ``stop`` defaults to the current year or month-year. 1. Set the ``pattern`` parameter with the url to retrieve. 1. Implement the `get_formatter` method. 
@@ -473,25 +481,32 @@ def __init__(self, *args, **kwargs): else: self.start_requests_callback = self.parse - if not isinstance(self.start, DateClass): - self.start = datetime.strptime(str(self.start), self.date_format) - self.default_from_date = self.start.strftime(self.date_format) + @classmethod + def from_crawler(cls, crawler, *args, **kwargs): + spider = super(SimpleSpider, cls).from_crawler(crawler, *args, **kwargs) - if hasattr(self, 'stop'): - if not isinstance(self.stop, DateClass): - self.stop = datetime.strptime(str(self.stop), self.date_format) - else: - self.stop = datetime.today() + if not spider.from_date: + spider.from_date = spider.default_from_date + if isinstance(spider.from_date, str): + spider.from_date = datetime.strptime(spider.from_date, spider.date_format) + spider.until_date = cls.get_default_until_date(spider) + if isinstance(spider.until_date, str): + spider.until_date = datetime.strptime(spider.until_date, spider.date_format) + + return spider @classmethod def get_default_until_date(cls, spider): - return spider.stop.strftime(spider.date_format) + if hasattr(spider, 'default_until_date') and spider.default_until_date: + return spider.default_until_date + else: + return datetime.today() def start_requests(self): - start = self.start if not self.exists('from_date') else self.from_date + start = self.from_date - stop = self.stop if not self.exists('until_date') else self.until_date + stop = self.until_date if self.sample: start = stop @@ -511,6 +526,3 @@ def get_formatter(self): def build_urls(self, pattern, date): yield pattern.format(date) - - def exists(self, attr): - return hasattr(self, attr) and getattr(self, attr) diff --git a/kingfisher_scrapy/spiders/moldova_old.py b/kingfisher_scrapy/spiders/moldova_old.py index eb3e54e2..5d79b81b 100644 --- a/kingfisher_scrapy/spiders/moldova_old.py +++ b/kingfisher_scrapy/spiders/moldova_old.py @@ -12,8 +12,8 @@ class MoldovaOld(PeriodicalSpider): """ name = 'moldova_old' data_type = 'release_package' - start = 2012 - stop = 2018 + default_from_date = '2012' + default_until_date = '2018' pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}' date_format = 'year' diff --git a/kingfisher_scrapy/spiders/nepal_portal.py b/kingfisher_scrapy/spiders/nepal_portal.py index 3c1f81cd..8b8c211a 100644 --- a/kingfisher_scrapy/spiders/nepal_portal.py +++ b/kingfisher_scrapy/spiders/nepal_portal.py @@ -13,8 +13,8 @@ class NepalPortal(PeriodicalSpider): name = 'nepal_portal' data_type = 'release_package' ocds_version = '1.0' - start = 2012 - stop = 2018 + default_from_date = '2012' + default_until_date = '2018' pattern = 'http://ppip.gov.np/bulk-download/{}' date_format = 'year' diff --git a/kingfisher_scrapy/spiders/scotland_base.py b/kingfisher_scrapy/spiders/scotland_base.py index 9b4c6aa2..afb8935a 100644 --- a/kingfisher_scrapy/spiders/scotland_base.py +++ b/kingfisher_scrapy/spiders/scotland_base.py @@ -7,8 +7,8 @@ class ScotlandBase(PeriodicalSpider): default_from_date = '2019-01' date_format = 'year-month' - stop = date.today() - start = date(stop.year - 1, stop.month, 1) + default_until_date = date.today() + default_from_date = date(default_until_date.year - 1, default_until_date.month, 1) notice_types = [ 1, # OJEU - F1 - Prior Information Notice diff --git a/kingfisher_scrapy/spiders/uruguay_base.py b/kingfisher_scrapy/spiders/uruguay_base.py index 9ce65412..0824fc48 100644 --- a/kingfisher_scrapy/spiders/uruguay_base.py +++ b/kingfisher_scrapy/spiders/uruguay_base.py @@ -6,7 +6,7 @@ class 
UruguayBase(PeriodicalSpider): download_delay = 0.9 - start = '2017-11' + default_from_date = '2017-11' date_format = 'year-month' pattern = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}' start_requests_callback = 'parse_list' diff --git a/kingfisher_scrapy/spiders/uruguay_historical.py b/kingfisher_scrapy/spiders/uruguay_historical.py index e6913aa3..f0bd0454 100644 --- a/kingfisher_scrapy/spiders/uruguay_historical.py +++ b/kingfisher_scrapy/spiders/uruguay_historical.py @@ -15,8 +15,8 @@ class UruguayHistorical(CompressedFileSpider, PeriodicalSpider): # the files takes too long to be downloaded, so we increase the download timeout download_timeout = 1000 - start = 2002 - stop = 2017 + default_from_date = '2002' + default_until_date = '2017' date_format = 'year' custom_settings = { # It seems some websites don't like it and block when your user agent is not a browser. diff --git a/tests/test_periodical_spider.py b/tests/test_periodical_spider.py index 9a9cd430..4153c11e 100644 --- a/tests/test_periodical_spider.py +++ b/tests/test_periodical_spider.py @@ -1,45 +1,89 @@ import pytest -from datetime import date, datetime +from datetime import datetime from . import spider_with_crawler from kingfisher_scrapy.base_spider import PeriodicalSpider from kingfisher_scrapy.util import components, date_range_by_year, date_range_by_month -def _get_urls(type, pattern, arg_start, arg_end=datetime.now()): - if type == 'year': +def _format_urls(arg_type, pattern, arg_start, arg_end): + if arg_type == 'year': date_range = date_range_by_year start = arg_start.year end = arg_end.year else: date_range = date_range_by_month - start = datetime.strptime(arg_start, '%Y-%m') - end = datetime.strptime(arg_end, '%Y-%m') + start = arg_start + end = arg_end return [pattern.format(x) for x in date_range(start, end)] -@pytest.mark.parametrize('date,date_format,expected', [ - (2008, 'year', '2008'), - ('2007-10', 'year-month', '2007-10') -]) -def test_default_from_date(date, date_format, expected): - spider = PeriodicalSpider(name='test', start=date, date_format=date_format) - assert spider.default_from_date == expected +TEST_CASES = [ + # default from date + ('year', 'http://example.com/{}', '2012', datetime.today().year, {'default_from_date': '2012'}, {}), + ('year-month', 'http://example.com/{:%Y-%m}', '2010-06', datetime.today().strftime('%Y-%m'), { + 'default_from_date': '2010-06' + }, {}), + # default from & end dates + ('year', 'http://example.com/{}', '2012', '2016', { + 'default_from_date': '2012', + 'default_until_date': '2016' + }, {}), + ('year-month', 'http://example.com/{:%Y-%m}', '2010-06', '2019-12', { + 'default_from_date': '2010-06', + 'default_until_date': '2019-12' + }, {}), + # from_date specified by the user + ('year', 'http://example.com/{}', '2017', datetime.today().year, { + 'default_from_date': '2008' + }, { + 'from_date': '2017' + }), + ('year-month', 'http://example.com/{:%Y-%m}', '2018-01', datetime.today().strftime('%Y-%m'), { + 'default_from_date': '2011-01' + }, { + 'from_date': '2018-01' + }), + # until_date specified by the user + ('year', 'http://example.com/{}', '2008', '2010', { + 'default_from_date': '2008', + 'default_until_date': '2017' + }, { + 'until_date': '2010' + }), + ('year-month', 'http://example.com/{:%Y-%m}', '2011-01', '2019-06', { + 'default_from_date': '2011-01' + }, { + 'until_date': '2019-06' + }), + # pass the 'sample' parameter + ('year', 'http://example.com/{}', datetime.today().year, datetime.today().year, { + 'default_from_date': '2008', + 
}, { + 'sample': 'true' + }), +] -@pytest.mark.parametrize('start,from_date,date_format,pattern,expected_start', [ - (2008, 2017, 'year', 'http://example.com/{}', 2017), - ('2011-01', '2018-01', 'year-month', 'http://example.com/{%Y-%m}', '2018-01') -]) -def test_start(start, from_date, date_format, pattern, expected_start): - expected = _get_urls(date_format, pattern, datetime.strptime(str(expected_start), date_format)) +@pytest.mark.parametrize( + 'date_format,pattern,expected_start,expected_end,class_args,user_args', + TEST_CASES) +def test_urls(date_format, pattern, expected_start, expected_end, class_args, user_args): + expected = _format_urls( + date_format, + pattern, + datetime.strptime(str(expected_start), PeriodicalSpider.VALID_DATE_FORMATS[date_format]), + datetime.strptime(str(expected_end), PeriodicalSpider.VALID_DATE_FORMATS[date_format]) + ) - TestSpider = type('TestSpider', (PeriodicalSpider,), dict(start=start, date_format=date_format, - get_formatter=lambda x: components(-1), pattern=pattern)) - - spider = spider_with_crawler(spider_class=TestSpider, from_date=str(from_date)) + test_spider = type('TestSpider', (PeriodicalSpider,), dict(date_format=date_format, + get_formatter=lambda x: components(-1), + pattern=pattern, + **class_args)) + spider = spider_with_crawler(spider_class=test_spider, **user_args) requests = [x for x in spider.start_requests()] - assert len(requests) == len(expected) + for request, expected_url in zip(requests, expected): + assert request.url == expected_url From 7efb28d2dc884bcff187c61c5d0f2d641a54ab44 Mon Sep 17 00:00:00 2001 From: Romina Fernandez Date: Fri, 28 Aug 2020 02:40:39 -0400 Subject: [PATCH 08/11] Fix quality issues --- kingfisher_scrapy/base_spider.py | 3 ++- kingfisher_scrapy/spiders/scotland_base.py | 1 - kingfisher_scrapy/spiders/scotland_proactis.py | 1 - kingfisher_scrapy/spiders/scotland_public_contracts.py | 1 - tests/test_periodical_spider.py | 8 +++++--- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index 67992cc6..41a21306 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -457,7 +457,8 @@ class PeriodicalSpider(SimpleSpider): 1. Extend from ``PeriodicalSpider``. 1. Set the ``date_format`` attribute if it's not defined already. Valid values are 'year' and 'year-month'. 1. Set a ``default_from_date`` year or month-year. - 1. Optionally, set a ``default_until_date`` year or month-year. If absent, ``stop`` defaults to the current year or month-year. + 1. Optionally, set a ``default_until_date`` year or month-year. If absent, ``default_until_date`` defaults to the + current year or month-year. 1. Set the ``pattern`` parameter with the url to retrieve. 1. Implement the `get_formatter` method. 
diff --git a/kingfisher_scrapy/spiders/scotland_base.py b/kingfisher_scrapy/spiders/scotland_base.py index afb8935a..98d7ec29 100644 --- a/kingfisher_scrapy/spiders/scotland_base.py +++ b/kingfisher_scrapy/spiders/scotland_base.py @@ -40,4 +40,3 @@ def build_urls(self, pattern, date): def get_formatter(self): return parameters('noticeType', 'dateFrom') - diff --git a/kingfisher_scrapy/spiders/scotland_proactis.py b/kingfisher_scrapy/spiders/scotland_proactis.py index 9c8936d0..543a99bc 100644 --- a/kingfisher_scrapy/spiders/scotland_proactis.py +++ b/kingfisher_scrapy/spiders/scotland_proactis.py @@ -14,4 +14,3 @@ class ScotlandProactis(ScotlandBase): name = 'scotland_proactis' data_type = 'release_package' pattern = 'https://sandbox4.proactislabs.com/v1/Notices?dateFrom={:%m-%Y}&outputType=0¬iceType={}' - diff --git a/kingfisher_scrapy/spiders/scotland_public_contracts.py b/kingfisher_scrapy/spiders/scotland_public_contracts.py index 0425e740..e1d9f256 100644 --- a/kingfisher_scrapy/spiders/scotland_public_contracts.py +++ b/kingfisher_scrapy/spiders/scotland_public_contracts.py @@ -14,4 +14,3 @@ class ScotlandPublicContracts(ScotlandBase): name = 'scotland_public_contracts' data_type = 'release_package' pattern = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={:%m-%Y}&outputType=0¬iceType={}' - diff --git a/tests/test_periodical_spider.py b/tests/test_periodical_spider.py index 4153c11e..3430f64a 100644 --- a/tests/test_periodical_spider.py +++ b/tests/test_periodical_spider.py @@ -1,9 +1,11 @@ -import pytest from datetime import datetime -from . import spider_with_crawler +import pytest + from kingfisher_scrapy.base_spider import PeriodicalSpider -from kingfisher_scrapy.util import components, date_range_by_year, date_range_by_month +from kingfisher_scrapy.util import components, date_range_by_month, date_range_by_year + +from . 
import spider_with_crawler def _format_urls(arg_type, pattern, arg_start, arg_end): From 726b8d0b4cd8d49b44d55b325e846fde34f4dbc9 Mon Sep 17 00:00:00 2001 From: Romina Fernandez Date: Mon, 31 Aug 2020 11:41:54 -0400 Subject: [PATCH 09/11] Make changes suggested in the PR review --- kingfisher_scrapy/spiders/scotland_base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kingfisher_scrapy/spiders/scotland_base.py b/kingfisher_scrapy/spiders/scotland_base.py index 98d7ec29..d22147af 100644 --- a/kingfisher_scrapy/spiders/scotland_base.py +++ b/kingfisher_scrapy/spiders/scotland_base.py @@ -5,10 +5,8 @@ class ScotlandBase(PeriodicalSpider): - default_from_date = '2019-01' date_format = 'year-month' - default_until_date = date.today() - default_from_date = date(default_until_date.year - 1, default_until_date.month, 1) + default_from_date = date(date.today().year - 1, date.today().month, 1) notice_types = [ 1, # OJEU - F1 - Prior Information Notice From be0eba676d25516ca10c3f0a0e5ed5c412de7f6e Mon Sep 17 00:00:00 2001 From: Romina Fernandez Date: Mon, 31 Aug 2020 15:55:42 -0400 Subject: [PATCH 10/11] Make changes suggested in the PR review --- kingfisher_scrapy/base_spider.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index 41a21306..811cc5d4 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -121,7 +121,8 @@ def from_crawler(cls, crawler, *args, **kwargs): spider.from_date = datetime.strptime(spider.from_date, spider.date_format) else: try: - spider.from_date = datetime.strptime(spider.from_date, spider.date_format) + if isinstance(spider.from_date, str): + spider.from_date = datetime.strptime(spider.from_date, spider.date_format) except ValueError as e: raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e)) @@ -483,16 +484,11 @@ def __init__(self, *args, **kwargs): self.start_requests_callback = self.parse @classmethod - def from_crawler(cls, crawler, *args, **kwargs): - spider = super(SimpleSpider, cls).from_crawler(crawler, *args, **kwargs) - - if not spider.from_date: - spider.from_date = spider.default_from_date - if isinstance(spider.from_date, str): - spider.from_date = datetime.strptime(spider.from_date, spider.date_format) - spider.until_date = cls.get_default_until_date(spider) - if isinstance(spider.until_date, str): - spider.until_date = datetime.strptime(spider.until_date, spider.date_format) + def from_crawler(cls, crawler, from_date=None, *args, **kwargs): + if not from_date: + from_date = cls.default_from_date + + spider = super(SimpleSpider, cls).from_crawler(crawler, from_date=from_date, *args, **kwargs) return spider From 9bd5665559028b0dc69a3fee9242cbe55a5d37a8 Mon Sep 17 00:00:00 2001 From: Romina Fernandez Date: Thu, 17 Sep 2020 14:53:45 -0400 Subject: [PATCH 11/11] Remove date conversions from base_spider --- kingfisher_scrapy/base_spider.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/kingfisher_scrapy/base_spider.py b/kingfisher_scrapy/base_spider.py index 811cc5d4..193e78b4 100644 --- a/kingfisher_scrapy/base_spider.py +++ b/kingfisher_scrapy/base_spider.py @@ -116,26 +116,20 @@ def from_crawler(cls, crawler, *args, **kwargs): if not spider.from_date: # Default to `default_from_date` class attribute. 
spider.from_date = spider.default_from_date + try: if isinstance(spider.from_date, str): # convert to date format, if needed spider.from_date = datetime.strptime(spider.from_date, spider.date_format) - else: - try: - if isinstance(spider.from_date, str): - spider.from_date = datetime.strptime(spider.from_date, spider.date_format) - except ValueError as e: - raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e)) + except ValueError as e: + raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e)) if not spider.until_date: spider.until_date = cls.get_default_until_date(spider) + try: if isinstance(spider.until_date, str): - # convert to date format, if needed - spider.until_date = datetime.strptime(spider.until_date, spider.date_format) - else: - try: spider.until_date = datetime.strptime(spider.until_date, spider.date_format) - except ValueError as e: - raise SpiderArgumentError('spider argument until_date: invalid date value: {}'.format(e)) + except ValueError as e: + raise SpiderArgumentError('spider argument until_date: invalid date value: {}'.format(e)) return spider
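
For context, a spider built on the API as it stands at the end of this series needs only the class attributes and the formatter described in the PeriodicalSpider docstring. The sketch below is illustrative only — the spider name, URL and years are made up, not part of these patches; the real conversions are in the moldova_old, nepal_portal, scotland_* and uruguay_* diffs above.

    from kingfisher_scrapy.base_spider import PeriodicalSpider
    from kingfisher_scrapy.util import components


    class ExamplePortal(PeriodicalSpider):
        # Hypothetical spider, shown only to illustrate the PeriodicalSpider contract.
        name = 'example_portal'
        data_type = 'release_package'
        date_format = 'year'              # 'year' or 'year-month'
        default_from_date = '2015'
        default_until_date = '2019'       # optional; omit to crawl up to the current period
        pattern = 'http://example.com/ocds-api/year/{}'

        def get_formatter(self):
            # components(-1) names each saved file after the last URL path component (the year).
            return components(-1)

As with the existing date-filtered spiders, the from_date and until_date spider arguments narrow the range, and sample=true requests only the latest period.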