# kingfisher_scrapy/spiders/italy_ministry_of_infrastructure_and_transport.py
#
# Accompanying entry in docs/spiders.rst (section "Italy", after italy_digiwhist):
#
#   .. autoclass:: kingfisher_scrapy.spiders.italy_ministry_of_infrastructure_and_transport.ItalyMinistryOfInfrastructureAndTransport
#      :no-members:
#
#   .. code-block:: bash
#
#      scrapy crawl italy_ministry_of_infrastructure_and_transport
import scrapy

from kingfisher_scrapy.base_spiders import SimpleSpider
from kingfisher_scrapy.util import handle_http_error, parameters, replace_parameters


class ItalyMinistryOfInfrastructureAndTransport(SimpleSpider):
    """
    Domain
      Public Contracts Service (SCP) of the Ministry of Infrastructure and Transport
    Spider arguments
      from_date
        Download only data from this time onward (YYYY-MM-DD format).
        If ``until_date`` is provided, defaults to '2022-01-01'.
      until_date
        Download only data until this time (YYYY-MM-DD format).
        If ``from_date`` is provided, defaults to today.
    Swagger API documentation
      https://www.serviziocontrattipubblici.it/ocds-ms/swagger-ui.html
    """
    name = 'italy_ministry_of_infrastructure_and_transport'

    # BaseSpider
    date_format = 'date'
    default_from_date = '2022-01-01'

    # SimpleSpider
    data_type = 'release_package'

    def start_requests(self):
        """
        Yield the request for the first page of release packages.

        The ``dataInvioDa`` and ``dataInvioA`` query parameters are added only if both ``from_date`` and
        ``until_date`` are set.
        """
        # The URL must not contain whitespace: a trailing space after ``pageSize=5`` would become part of that
        # parameter's value (and would sit in the middle of the query string once the date filters are appended).
        url = (
            'https://www.serviziocontrattipubblici.it/ocdsReleasePackages-ms/v1.0/ocdsReleasePackages'
            '?page=1&pageSize=5'
        )
        if self.from_date and self.until_date:
            # NOTE(review): this assumes the base spider has already turned the 'date' key above into a strftime
            # pattern by the time start_requests() runs - confirm against BaseSpider.
            from_date = self.from_date.strftime(self.date_format)
            until_date = self.until_date.strftime(self.date_format)
            url = f'{url}&dataInvioDa={from_date}&dataInvioA={until_date}'
        yield scrapy.Request(url, meta={'file_name': 'page-1.json', 'page': 1})

    @handle_http_error
    def parse(self, response):
        """
        Yield the page's release package, followed by the request for the next page.

        Yields a file error instead if the body reports an error, and stops paginating as soon as a page has no
        ``releases`` key.
        """
        data = response.json()

        # A 200 HTTP response with a dict like the one below is returned instead of a 404, for example for date
        # periods that are not available:
        # {
        #   "esito": false,
        #   "errorData": "Si è verificato un errore durante la creazione di OCDS"
        # }
        if 'errorData' in data:
            data['http_code'] = response.status
            yield self.build_file_error_from_response(response, errors=data)
            # Stop explicitly, so that an error payload is never stored as data or used to request further pages.
            return

        # Once the last page has been passed, the API returns an empty release package (no "releases" key).
        if 'releases' not in data:
            return

        yield from super().parse(response)

        next_page = response.request.meta['page'] + 1
        yield self.build_request(
            replace_parameters(response.url, page=next_page),
            meta={'page': next_page},
            formatter=parameters('page'),
        )