From 8cb68051801873c9ef7ae61f00662b55354af2cc Mon Sep 17 00:00:00 2001 From: Yohanna Lisnichuk Date: Thu, 9 May 2024 14:31:16 -0400 Subject: [PATCH] feat: add guatemala_bulk --- docs/spiders.rst | 10 +++ kingfisher_scrapy/spiders/guatemala_bulk.py | 65 +++++++++++++++++++ ...inistry_of_infrastructure_and_transport.py | 4 +- ...tate_seseap_plataforma_digital_nacional.py | 2 +- 4 files changed, 78 insertions(+), 3 deletions(-) create mode 100644 kingfisher_scrapy/spiders/guatemala_bulk.py diff --git a/docs/spiders.rst b/docs/spiders.rst index 4806a1b2..7e4860ca 100644 --- a/docs/spiders.rst +++ b/docs/spiders.rst @@ -505,6 +505,16 @@ Greece scrapy crawl greece_digiwhist +Guatemala +~~~~~~~~~ + +.. autoclass:: kingfisher_scrapy.spiders.guatemala_bulk.GuatemalaBulk + :no-members: + +.. code-block:: bash + + scrapy crawl guatemala_bulk + Honduras ~~~~~~~~ diff --git a/kingfisher_scrapy/spiders/guatemala_bulk.py b/kingfisher_scrapy/spiders/guatemala_bulk.py new file mode 100644 index 00000000..d2a1c3c5 --- /dev/null +++ b/kingfisher_scrapy/spiders/guatemala_bulk.py @@ -0,0 +1,65 @@ +from datetime import datetime + +import scrapy + +from kingfisher_scrapy.base_spiders import SimpleSpider +from kingfisher_scrapy.util import components, handle_http_error + + +class GuatemalaBulk(SimpleSpider): + """ + Domain + Ministerio de Finanzas Públicas - Dirección General de Adquisiciones del Estado + Spider arguments + from_date + Download only data from this month onward (YYYY-MM format). + If ``until_date`` is provided, defaults to '2020-01'. + until_date + Download only data until this month (YYYY-MM format). + If ``from_date`` is provided, defaults to the current month. + API documentation + https://ocds.guatecompras.gt/api-ocds + Bulk download documentation + https://ocds.guatecompras.gt/descarga-datos + """ + name = 'guatemala_bulk' + + # BaseSpider + date_format = 'year-month' + default_from_date = '2020-01' + + # SimpleSpider + data_type = 'record_package' + + def start_requests(self): + url = 'https://ocds.guatecompras.gt/files' + yield scrapy.Request(url, meta={'file_name': 'list.json'}, callback=self.parse_list) + + @handle_http_error + def parse_list(self, response): + # An example of expected response is: + # { + # "id": "gc-{year}-{month}" + # "results": [ + # { + # "files": { + # "csv": "...", + # "sha": "...", + # "json": "...", + # "xlsx": "..." + # }, + # "year": "values between 2020 to the current year", + # "month": "values between 1 and 12", + # "monthName": "values between enero to diciembre", + # "source": "Guatecompras", + # "timestamp": "last updated date in timestamp with time zone format" + # }, ... + # ] + # } + for item in response.json()["result"]: + if self.from_date and self.until_date: + date = datetime(int(item['year']), int(item['month']), 1) + if not (self.from_date <= date <= self.until_date): + continue + + yield self.build_request(item['files']['json'], formatter=components(-2)) diff --git a/kingfisher_scrapy/spiders/italy_ministry_of_infrastructure_and_transport.py b/kingfisher_scrapy/spiders/italy_ministry_of_infrastructure_and_transport.py index e7a993ed..535b3ceb 100644 --- a/kingfisher_scrapy/spiders/italy_ministry_of_infrastructure_and_transport.py +++ b/kingfisher_scrapy/spiders/italy_ministry_of_infrastructure_and_transport.py @@ -10,10 +10,10 @@ class ItalyMinistryOfInfrastructureAndTransport(SimpleSpider): Public Contracts Service (SCP) of the Ministry of Infrastructure and Transport Spider arguments from_date - Download only data from this time onward (YYYY-MM-DD format). + Download only data from this date onward (YYYY-MM-DD format). If ``until_date`` is provided, defaults to '2022-01-01'. until_date - Download only data until this time (YYYY-MM-DD format). + Download only data until this date (YYYY-MM-DD format). If ``from_date`` is provided, defaults to today. Swagger API documentation https://www.serviziocontrattipubblici.it/ocds-ms/swagger-ui.html diff --git a/kingfisher_scrapy/spiders/mexico_puebla_state_seseap_plataforma_digital_nacional.py b/kingfisher_scrapy/spiders/mexico_puebla_state_seseap_plataforma_digital_nacional.py index e25cb44b..90401a0c 100644 --- a/kingfisher_scrapy/spiders/mexico_puebla_state_seseap_plataforma_digital_nacional.py +++ b/kingfisher_scrapy/spiders/mexico_puebla_state_seseap_plataforma_digital_nacional.py @@ -1,7 +1,7 @@ from kingfisher_scrapy.spiders.mexico_plataforma_digital_nacional_base import MexicoPlataformaDigitalNacionalBase -class MexicoPueblaStateSESEAPlataformaDigitalNacional(MexicoPlataformaDigitalNacionalBase): +class MexicoPueblaStateSESEAPPlataformaDigitalNacional(MexicoPlataformaDigitalNacionalBase): """ Domain Secretaría Ejecutiva del Sistema Estatal Anticorrupción del Estado de Puebla (SESEAP) (Mexico) -