Skip to content

Commit

Permalink
Merge 726b8d0 into 5ab8eeb
Browse files Browse the repository at this point in the history
  • Loading branch information
romifz committed Aug 31, 2020
2 parents 5ab8eeb + 726b8d0 commit 73610b9
Show file tree
Hide file tree
Showing 9 changed files with 263 additions and 116 deletions.
114 changes: 103 additions & 11 deletions kingfisher_scrapy/base_spider.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import os
from abc import abstractmethod
from datetime import datetime
from io import BytesIO
from zipfile import ZipFile
Expand Down Expand Up @@ -58,7 +59,7 @@ class BaseSpider(scrapy.Spider):

MAX_SAMPLE = 10
MAX_RELEASES_PER_PACKAGE = 100
VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S', 'year-month': '%Y-%m'}
VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S'}

ocds_version = '1.1'
date_format = 'date'
Expand Down Expand Up @@ -115,17 +116,25 @@ def from_crawler(cls, crawler, *args, **kwargs):
if not spider.from_date:
# Default to `default_from_date` class attribute.
spider.from_date = spider.default_from_date
if isinstance(spider.from_date, str):
# convert to date format, if needed
spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
else:
try:
spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
except ValueError as e:
raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e))

if not spider.until_date:
# Default to today.
spider.until_date = datetime.now().strftime(spider.date_format)
try:
spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
except ValueError as e:
raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e))
try:
spider.until_date = datetime.strptime(spider.until_date, spider.date_format)
except ValueError as e:
raise SpiderArgumentError('spider argument until_date: invalid date value: {}'.format(e))
spider.until_date = cls.get_default_until_date(spider)
if isinstance(spider.until_date, str):
# convert to date format, if needed
spider.until_date = datetime.strptime(spider.until_date, spider.date_format)
else:
try:
spider.until_date = datetime.strptime(spider.until_date, spider.date_format)
except ValueError as e:
raise SpiderArgumentError('spider argument until_date: invalid date value: {}'.format(e))

return spider

Expand Down Expand Up @@ -272,6 +281,10 @@ def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None
if self.sample:
break

    @classmethod
    def get_default_until_date(cls, spider):
        """
        Return the default upper bound for date-filtered crawls when the user
        supplies no ``until_date``: the current moment.

        Subclasses may override this (e.g. ``PeriodicalSpider`` prefers an
        explicit ``default_until_date`` class attribute when one is set).
        """
        return datetime.now()


class SimpleSpider(BaseSpider):
"""
Expand Down Expand Up @@ -435,3 +448,82 @@ def next_link(self, response, **kwargs):

if response.meta['depth'] == 0:
raise MissingNextLinkError('next link not found on the first page: {}'.format(response.url))


class PeriodicalSpider(SimpleSpider):
    """
    This class helps to crawl URLs that receive a year (YYYY) or a month and year (YYYY-mm) as parameters. To use it:

    1. Extend from ``PeriodicalSpider``.
    1. Set the ``date_format`` attribute if it's not defined already. Valid values are 'year' and 'year-month'.
    1. Set a ``default_from_date`` year or month-year.
    1. Optionally, set a ``default_until_date`` year or month-year. If absent, ``default_until_date`` defaults to the
       current year or month-year.
    1. Set the ``pattern`` parameter with the URL to retrieve.
    1. Implement the ``get_formatter`` method.

    The ``pattern`` should include a placeholder for a year or month-year parameter. With the year parameter, an int is
    passed. If the year-month parameter is used, a ``Date`` instance is passed. Example:

    .. code-block: python

        url = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}'

    When the ``sample`` option is used, the latest year or month of data is retrieved.
    """
    # Overrides BaseSpider's day-level formats: periodical spiders work in whole
    # years or whole months.
    VALID_DATE_FORMATS = {'year': '%Y', 'year-month': '%Y-%m'}

    def __init__(self, *args, **kwargs):
        # Remember the symbolic key ('year' or 'year-month') for use in
        # start_requests. NOTE(review): assumes the base __init__ replaces
        # `date_format` with the strptime pattern from VALID_DATE_FORMATS —
        # confirm against BaseSpider.__init__.
        self.date_format_key = self.date_format
        super().__init__(*args, **kwargs)

        # A subclass may name (as a string attribute) an alternative callback for
        # the generated requests; resolve it to a bound method, defaulting to
        # the standard `parse`.
        if hasattr(self, 'start_requests_callback'):
            self.start_requests_callback = getattr(self, self.start_requests_callback)
        else:
            self.start_requests_callback = self.parse

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # `super(SimpleSpider, cls)` starts the MRO lookup *after* SimpleSpider —
        # NOTE(review): presumably to skip date handling unsuited to year /
        # year-month formats; confirm against the parent from_crawler methods.
        spider = super(SimpleSpider, cls).from_crawler(crawler, *args, **kwargs)

        # Fall back to the class default, then normalize strings to datetimes
        # using the year / year-month format selected by `date_format`.
        if not spider.from_date:
            spider.from_date = spider.default_from_date
        if isinstance(spider.from_date, str):
            spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
        # NOTE(review): this overwrites any until_date set by the parent
        # from_crawler — confirm that user-supplied until_date values are
        # still honored via get_default_until_date.
        spider.until_date = cls.get_default_until_date(spider)
        if isinstance(spider.until_date, str):
            spider.until_date = datetime.strptime(spider.until_date, spider.date_format)

        return spider

    @classmethod
    def get_default_until_date(cls, spider):
        # Prefer an explicit `default_until_date` class attribute; otherwise
        # default to the current year or month-year (today).
        if hasattr(spider, 'default_until_date') and spider.default_until_date:
            return spider.default_until_date
        else:
            return datetime.today()

    def start_requests(self):
        # Yield one request (or more, via build_urls) per period in
        # [from_date, until_date].
        start = self.from_date

        stop = self.until_date

        if self.sample:
            # Only crawl the latest period.
            start = stop

        if self.date_format_key == 'year':
            date_range = util.date_range_by_year(start.year, stop.year)
        else:
            date_range = util.date_range_by_month(start, stop)

        for date in date_range:
            for url in self.build_urls(self.pattern, date):
                yield self.build_request(url, self.get_formatter(), callback=self.start_requests_callback)

    @abstractmethod
    def get_formatter(self):
        """
        Return the file-name formatter for the generated requests.

        Must be implemented by subclasses.
        """
        pass

    def build_urls(self, pattern, date):
        """
        Yield the URL(s) for one period. Subclasses may override this to
        produce several requests per period (e.g. one per notice type).
        """
        yield pattern.format(date)
22 changes: 9 additions & 13 deletions kingfisher_scrapy/spiders/moldova_old.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import components, date_range_by_year
from kingfisher_scrapy.base_spider import PeriodicalSpider
from kingfisher_scrapy.util import components


class MoldovaOld(SimpleSpider):
class MoldovaOld(PeriodicalSpider):
"""
Bulk download documentation
http://opencontracting.date.gov.md/downloads
Expand All @@ -12,14 +12,10 @@ class MoldovaOld(SimpleSpider):
"""
name = 'moldova_old'
data_type = 'release_package'
default_from_date = '2012'
default_until_date = '2018'
pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}'
date_format = 'year'

def start_requests(self):
pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}'

start = 2012
stop = 2018
if self.sample:
start = 2018

for year in date_range_by_year(start, stop):
yield self.build_request(pattern.format(year), formatter=components(-1))
def get_formatter(self):
return components(-1)
26 changes: 9 additions & 17 deletions kingfisher_scrapy/spiders/nepal_portal.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
from datetime import date
from kingfisher_scrapy.base_spider import PeriodicalSpider
from kingfisher_scrapy.util import components

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import components, date_range_by_year


class NepalPortal(SimpleSpider):
class NepalPortal(PeriodicalSpider):
"""
Bulk download documentation
http://ppip.gov.np/downloads
Expand All @@ -15,16 +13,10 @@ class NepalPortal(SimpleSpider):
name = 'nepal_portal'
data_type = 'release_package'
ocds_version = '1.0'
default_from_date = '2012'
default_until_date = '2018'
pattern = 'http://ppip.gov.np/bulk-download/{}'
date_format = 'year'

def start_requests(self):
pattern = 'http://ppip.gov.np/bulk-download/{}'

if self.sample:
start = 2018
stop = 2018
else:
start = 2012
stop = date.today().year # HTTP 500 after 2018

for year in date_range_by_year(start, stop):
yield self.build_request(pattern.format(year), formatter=components(-1))
def get_formatter(self):
return components(-1)
75 changes: 32 additions & 43 deletions kingfisher_scrapy/spiders/scotland_base.py
Original file line number Diff line number Diff line change
@@ -1,51 +1,40 @@
from datetime import date

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import date_range_by_month, parameters
from kingfisher_scrapy.base_spider import PeriodicalSpider
from kingfisher_scrapy.util import parameters


class ScotlandBase(SimpleSpider):
default_from_date = '2019-01'
class ScotlandBase(PeriodicalSpider):
date_format = 'year-month'
default_from_date = date(date.today().year - 1, date.today().month, 1)

@classmethod
def from_crawler(cls, crawler, from_date=None, *args, **kwargs):
if not from_date:
from_date = cls.default_from_date
notice_types = [
1, # OJEU - F1 - Prior Information Notice
2, # OJEU - F2 - Contract Notice
3, # OJEU - F3 - Contract Award Notice
4, # OJEU - F4 - Prior Information Notice(Utilities)
5, # OJEU - F5 - Contract Notice(Utilities)
6, # OJEU - F6 - Contract Award Notice(Utilities)
7, # OJEU - F7 - Qualification Systems(Utilities)
12, # OJEU - F12 - Design Contest Notice
13, # OJEU - F13 - Results Of Design Contest
14, # OJEU - F14 - Corrigendum
15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice
20, # OJEU - F20 - Modification Notice
21, # OJEU - F21 - Social And other Specific Services(Public Contracts)
22, # OJEU - F22 - Social And other Specific Services(Utilities)
23, # OJEU - F23 - Social And other Specific Services(Concessions)
24, # OJEU - F24 - Concession Notice
25, # OJEU - F25 - Concession Award Notice
101, # Site Notice - Website Contract Notice
102, # Site Notice - Website Prior Information Notice
103, # Site Notice - Website Contract Award Notice
104, # Site Notice - Quick Quote Award
]

return super().from_crawler(crawler, from_date=from_date, *args, **kwargs)
def build_urls(self, pattern, date):
for notice_type in self.notice_types:
yield pattern.format(date, notice_type)

def start_requests(self):
notice_types = [
1, # OJEU - F1 - Prior Information Notice
2, # OJEU - F2 - Contract Notice
3, # OJEU - F3 - Contract Award Notice
4, # OJEU - F4 - Prior Information Notice(Utilities)
5, # OJEU - F5 - Contract Notice(Utilities)
6, # OJEU - F6 - Contract Award Notice(Utilities)
7, # OJEU - F7 - Qualification Systems(Utilities)
12, # OJEU - F12 - Design Contest Notice
13, # OJEU - F13 - Results Of Design Contest
14, # OJEU - F14 - Corrigendum
15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice
20, # OJEU - F20 - Modification Notice
21, # OJEU - F21 - Social And other Specific Services(Public Contracts)
22, # OJEU - F22 - Social And other Specific Services(Utilities)
23, # OJEU - F23 - Social And other Specific Services(Concessions)
24, # OJEU - F24 - Concession Notice
25, # OJEU - F25 - Concession Award Notice
101, # Site Notice - Website Contract Notice
102, # Site Notice - Website Prior Information Notice
103, # Site Notice - Website Contract Award Notice
104, # Site Notice - Quick Quote Award
]

for year_month in date_range_by_month(self.from_date, date.today()):
date_string = year_month.strftime('%m-%Y')
for notice_type in notice_types:
yield self.build_request(
self.url.format(date_string, notice_type),
formatter=parameters('noticeType', 'dateFrom')
)
if self.sample:
return
def get_formatter(self):
return parameters('noticeType', 'dateFrom')
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/scotland_proactis.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ class ScotlandProactis(ScotlandBase):
"""
name = 'scotland_proactis'
data_type = 'release_package'
url = 'https://sandbox4.proactislabs.com/v1/Notices?dateFrom={}&outputType=0&noticeType={}'
pattern = 'https://sandbox4.proactislabs.com/v1/Notices?dateFrom={:%m-%Y}&outputType=0&noticeType={}'
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/scotland_public_contracts.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ class ScotlandPublicContracts(ScotlandBase):
"""
name = 'scotland_public_contracts'
data_type = 'release_package'
url = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={}&outputType=1&noticeType={}'
pattern = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={:%m-%Y}&outputType=0&noticeType={}'
24 changes: 7 additions & 17 deletions kingfisher_scrapy/spiders/uruguay_base.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,18 @@
from abc import abstractmethod

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import components, date_range_by_month
from kingfisher_scrapy.base_spider import PeriodicalSpider
from kingfisher_scrapy.util import components


class UruguayBase(SimpleSpider):
class UruguayBase(PeriodicalSpider):
download_delay = 0.9
default_from_date = '2017-11'
date_format = 'year-month'
pattern = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}'
start_requests_callback = 'parse_list'

@classmethod
def from_crawler(cls, crawler, from_date=None, *args, **kwargs):
if not from_date:
from_date = cls.default_from_date

return super().from_crawler(crawler, from_date=from_date, *args, **kwargs)

def start_requests(self):
url = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}'
if self.sample:
self.from_date = self.until_date

for d in date_range_by_month(self.from_date, self.until_date):
yield self.build_request(url.format(d), formatter=components(-2), callback=self.parse_list)
def get_formatter(self):
return components(-2)

@abstractmethod
def parse_list(self):
Expand Down
23 changes: 10 additions & 13 deletions kingfisher_scrapy/spiders/uruguay_historical.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from kingfisher_scrapy.base_spider import CompressedFileSpider
from kingfisher_scrapy.util import components, date_range_by_year
from kingfisher_scrapy.base_spider import CompressedFileSpider, PeriodicalSpider
from kingfisher_scrapy.util import components


class UruguayHistorical(CompressedFileSpider):
class UruguayHistorical(CompressedFileSpider, PeriodicalSpider):
"""
Bulk download documentation
https://www.gub.uy/agencia-compras-contrataciones-estado/datos-y-estadisticas/datos/open-contracting
Expand All @@ -15,20 +15,17 @@ class UruguayHistorical(CompressedFileSpider):

# the files takes too long to be downloaded, so we increase the download timeout
download_timeout = 1000
default_from_date = '2002'
default_until_date = '2017'
date_format = 'year'
custom_settings = {
# It seems some websites don't like it and block when your user agent is not a browser.
# see https://github.com/scrapy/scrapy/issues/3103
'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/37.0.2049.0 Safari/537.36',
}
pattern = 'https://www.gub.uy/agencia-compras-contrataciones-estado/sites' \
'/agencia-compras-contrataciones-estado/files/2019-04/OCDS-{}.zip'

def start_requests(self):
start = 2002
stop = 2017
if self.sample:
start = stop

pattern = 'https://www.gub.uy/agencia-compras-contrataciones-estado/sites' \
'/agencia-compras-contrataciones-estado/files/2019-04/OCDS-{}.zip'
for year in date_range_by_year(start, stop):
yield self.build_request(pattern.format(year), formatter=components(-1))
def get_formatter(self):
return components(-1)

0 comments on commit 73610b9

Please sign in to comment.