From 5f171a53c078b0b171bdcf0282a0388f3a709e29 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Fri, 5 Apr 2024 16:36:00 -0400 Subject: [PATCH 1/5] fix(mexico_quien_es_quien*): remove records and fix releases records: pagination no longer working releases: add dates filters to get all data --- docs/spiders.rst | 7 ---- .../spiders/mexico_quien_es_quien_base.py | 15 ------- .../spiders/mexico_quien_es_quien_records.py | 40 ------------------- .../spiders/mexico_quien_es_quien_releases.py | 23 +++++++++-- 4 files changed, 20 insertions(+), 65 deletions(-) delete mode 100644 kingfisher_scrapy/spiders/mexico_quien_es_quien_base.py delete mode 100644 kingfisher_scrapy/spiders/mexico_quien_es_quien_records.py diff --git a/docs/spiders.rst b/docs/spiders.rst index abf91619..f0737543 100644 --- a/docs/spiders.rst +++ b/docs/spiders.rst @@ -796,13 +796,6 @@ Mexico scrapy crawl mexico_nuevo_leon_releases -.. autoclass:: kingfisher_scrapy.spiders.mexico_quien_es_quien_records.MexicoQuienEsQuienRecords - :no-members: - -.. code-block:: bash - - scrapy crawl mexico_quien_es_quien_records - .. autoclass:: kingfisher_scrapy.spiders.mexico_quien_es_quien_releases.MexicoQuienEsQuienReleases :no-members: diff --git a/kingfisher_scrapy/spiders/mexico_quien_es_quien_base.py b/kingfisher_scrapy/spiders/mexico_quien_es_quien_base.py deleted file mode 100644 index a1ed4923..00000000 --- a/kingfisher_scrapy/spiders/mexico_quien_es_quien_base.py +++ /dev/null @@ -1,15 +0,0 @@ -import scrapy - -from kingfisher_scrapy.base_spiders import IndexSpider - - -class MexicoQuienEsQuienBase(IndexSpider): - download_delay = 1 - - # IndexSpider - result_count_pointer = '/data/index/contracts/count' - limit = 1000 # >= 10000 causes "Search size is bigger than 10000. Elasticsearch does not allow it." - - def start_requests(self): - url = 'https://api.quienesquien.wiki/v3/sources' - yield scrapy.Request(url, meta={'file_name': 'list.json'}, callback=self.parse_list) diff --git a/kingfisher_scrapy/spiders/mexico_quien_es_quien_records.py b/kingfisher_scrapy/spiders/mexico_quien_es_quien_records.py deleted file mode 100644 index 1c0494b8..00000000 --- a/kingfisher_scrapy/spiders/mexico_quien_es_quien_records.py +++ /dev/null @@ -1,40 +0,0 @@ -import json - -from kingfisher_scrapy.spiders.mexico_quien_es_quien_base import MexicoQuienEsQuienBase -from kingfisher_scrapy.util import handle_http_error - - -class MexicoQuienEsQuienRecords(MexicoQuienEsQuienBase): - """ - Domain - QuiénEsQuién.Wiki - Caveats - The 'record' endpoint returns a 'data' array with the first entry as a record package and the subsequent ones - as records. This spider transform the output to package all the records within a package using the first package - metadata. The 'uri' and 'publicationDate' record package metadata fields are removed. - API documentation - https://qqwapi-elastic.readthedocs.io/es/latest/ - Swagger API documentation - https://api.quienesquien.wiki/v3/docs/ - """ - name = 'mexico_quien_es_quien_records' - - # SimpleSpider - data_type = 'record_package' - - # IndexSpider - base_url = 'https://api.quienesquien.wiki/v3/record' - limit = 100 # Decrease the limit so the output file is not too big. - - @handle_http_error - def parse(self, response): - data = response.json() - # The first entry of the array is a record package with 'records' as an object. The remaining entries - # are records. We use the package metadata to wrap all the records into a single record package. - package = data['data'][0].copy() - del package['uri'] - del package['publishedDate'] - package['records'] = [package['records']] - package['records'].extend(data['data'][1:]) - response = response.replace(body=json.dumps(package)) - yield self.build_file_from_response(response, data_type=self.data_type) diff --git a/kingfisher_scrapy/spiders/mexico_quien_es_quien_releases.py b/kingfisher_scrapy/spiders/mexico_quien_es_quien_releases.py index 540edde5..10c35dd6 100644 --- a/kingfisher_scrapy/spiders/mexico_quien_es_quien_releases.py +++ b/kingfisher_scrapy/spiders/mexico_quien_es_quien_releases.py @@ -1,22 +1,39 @@ -from kingfisher_scrapy.spiders.mexico_quien_es_quien_base import MexicoQuienEsQuienBase +from kingfisher_scrapy.base_spiders import IndexSpider, PeriodicSpider +from kingfisher_scrapy.util import parameters -class MexicoQuienEsQuienReleases(MexicoQuienEsQuienBase): +class MexicoQuienEsQuienReleases(IndexSpider, PeriodicSpider): """ Domain QuiénEsQuién.Wiki + Spider arguments + from_date + Download only data from this date onward (YYYY-MM-DD format). Defaults to '1999-01-01'. + until_date + Download only data until this date (YYYY-MM-DD format). Defaults to '2021-12-31'. API documentation https://qqwapi-elastic.readthedocs.io/es/latest/ Swagger API documentation https://api.quienesquien.wiki/v3/docs/ """ name = 'mexico_quien_es_quien_releases' + download_delay = 1 # BaseSpider + default_from_date = '1999-01-01' + default_until_date = '2021-12-31' + date_format = 'date' root_path = 'data.item' # SimpleSpider data_type = 'release' # IndexSpider - base_url = 'https://api.quienesquien.wiki/v3/contracts?sort=date&sort_direction=desc' + result_count_pointer = '/count' + limit = 1000 + + # PeriodicSpider + formatter = staticmethod(parameters('start_date_min', 'start_date_max', 'offset')) + pattern = 'https://api.quienesquien.wiki/v3/contracts?start_date_min={0:%Y-%m-%d}&start_date_max={' \ + '1:%Y-%m-%d}&limit=25&offset=0 ' + start_requests_callback = 'parse_list' From 3229788e520f7de6b49eb89f5231ddad2f22b92d Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Fri, 5 Apr 2024 18:27:48 -0400 Subject: [PATCH 2/5] docs: add mexico_quien_es_quien_records to history --- docs/history.rst | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/docs/history.rst b/docs/history.rst index df252924..4258073b 100644 --- a/docs/history.rst +++ b/docs/history.rst @@ -1,7 +1,12 @@ +Removed spiders +=============== + +This page records the spiders that were removed from Kingfisher Collect because they stopped publishing data or are broken. + Lapsed spiders -============== +-------------- -This page records the spiders that were available but stopped publishing and therefore where removed from Kingfisher Collect, since January 2022: +This section records the spiders that were available but stopped publishing, since January 2022: - 2024-01-09: `mexico_puebla_itaipue, nigeria_edo_state `__ - 2023-10-04: `honduras_cost, kenya_makueni, kyrgyzstan, portugal_bulk `__ @@ -12,3 +17,11 @@ This page records the spiders that were available but stopped publishing and the .. note:: Since January 2022, any spider that stops working for more than six months will be deleted. + + +Broken spiders +-------------- + +This section records the spiders that were available but then stop working, since April 2024: + +- 2024-04-05: `mexico_quien_es_quien_records `__ (the pagination is not working anymore) From d0ca56983dfe4b0356fa4996c7fd41aa2c9187a1 Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Fri, 5 Apr 2024 18:28:15 -0400 Subject: [PATCH 3/5] fix: remove hardcoded limit from mexico_quien_es_quien_releases --- kingfisher_scrapy/spiders/mexico_quien_es_quien_releases.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kingfisher_scrapy/spiders/mexico_quien_es_quien_releases.py b/kingfisher_scrapy/spiders/mexico_quien_es_quien_releases.py index 10c35dd6..018f72ce 100644 --- a/kingfisher_scrapy/spiders/mexico_quien_es_quien_releases.py +++ b/kingfisher_scrapy/spiders/mexico_quien_es_quien_releases.py @@ -34,6 +34,6 @@ class MexicoQuienEsQuienReleases(IndexSpider, PeriodicSpider): # PeriodicSpider formatter = staticmethod(parameters('start_date_min', 'start_date_max', 'offset')) - pattern = 'https://api.quienesquien.wiki/v3/contracts?start_date_min={0:%Y-%m-%d}&start_date_max={' \ - '1:%Y-%m-%d}&limit=25&offset=0 ' + pattern = 'https://api.quienesquien.wiki/v3/contracts?start_date_min={0:%Y-%m-%d}&start_date_max={1:%Y-%m-%d}' \ + f'&offset=0&limit={limit} ' start_requests_callback = 'parse_list' From fe7b3cf9ac9c55dcbceace30030a3487ad3837f3 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 5 Apr 2024 20:06:38 -0400 Subject: [PATCH 4/5] fix: Remove trailing space from URL pattern --- kingfisher_scrapy/spiders/mexico_quien_es_quien_releases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kingfisher_scrapy/spiders/mexico_quien_es_quien_releases.py b/kingfisher_scrapy/spiders/mexico_quien_es_quien_releases.py index 018f72ce..8c3af966 100644 --- a/kingfisher_scrapy/spiders/mexico_quien_es_quien_releases.py +++ b/kingfisher_scrapy/spiders/mexico_quien_es_quien_releases.py @@ -35,5 +35,5 @@ class MexicoQuienEsQuienReleases(IndexSpider, PeriodicSpider): # PeriodicSpider formatter = staticmethod(parameters('start_date_min', 'start_date_max', 'offset')) pattern = 'https://api.quienesquien.wiki/v3/contracts?start_date_min={0:%Y-%m-%d}&start_date_max={1:%Y-%m-%d}' \ - f'&offset=0&limit={limit} ' + f'&offset=0&limit={limit}' start_requests_callback = 'parse_list' From 2654d3bb81d94b04e353038dcd4e04baaea31057 Mon Sep 17 00:00:00 2001 From: James McKinney <26463+jpmckinney@users.noreply.github.com> Date: Fri, 5 Apr 2024 20:06:49 -0400 Subject: [PATCH 5/5] docs(history): Copy-edit --- docs/history.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/history.rst b/docs/history.rst index 4258073b..8d42bf2c 100644 --- a/docs/history.rst +++ b/docs/history.rst @@ -1,12 +1,12 @@ Removed spiders =============== -This page records the spiders that were removed from Kingfisher Collect because they stopped publishing data or are broken. +This page records the spiders removed from Kingfisher Collect. Lapsed spiders -------------- -This section records the spiders that were available but stopped publishing, since January 2022: +Spiders for publications that were available but stopped publishing, since January 2022: - 2024-01-09: `mexico_puebla_itaipue, nigeria_edo_state `__ - 2023-10-04: `honduras_cost, kenya_makueni, kyrgyzstan, portugal_bulk `__ @@ -18,10 +18,9 @@ This section records the spiders that were available but stopped publishing, sin Since January 2022, any spider that stops working for more than six months will be deleted. - Broken spiders -------------- -This section records the spiders that were available but then stop working, since April 2024: +Spiders for publications that became broken, since April 2024: -- 2024-04-05: `mexico_quien_es_quien_records `__ (the pagination is not working anymore) +- 2024-04-05: `mexico_quien_es_quien_records `__ (pagination is broken)