Skip to content

Commit

Permalink
feat: add new spider italy_ministry_of_infrastructure_and_transport
Browse files Browse the repository at this point in the history
  • Loading branch information
yolile committed Apr 22, 2024
1 parent 477ef13 commit d67b3ad
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 0 deletions.
7 changes: 7 additions & 0 deletions docs/spiders.rst
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,13 @@ Italy
scrapy crawl italy_digiwhist
.. autoclass:: kingfisher_scrapy.spiders.italy_ministry_of_infrastructure_and_transport.ItalyMinistryOfInfrastructureAndTransport
:no-members:

.. code-block:: bash

   scrapy crawl italy_ministry_of_infrastructure_and_transport
Kosovo
~~~~~~

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import scrapy

from kingfisher_scrapy.base_spiders import SimpleSpider
from kingfisher_scrapy.util import handle_http_error, parameters, replace_parameters


class ItalyMinistryOfInfrastructureAndTransport(SimpleSpider):
    """
    Domain
      Public Contracts Service (SCP) of the Ministry of Infrastructure and Transport
    Spider arguments
      from_date
        Download only data from this time onward (YYYY-MM-DD format).
        If ``until_date`` is provided, defaults to '2022-01-01'.
      until_date
        Download only data until this time (YYYY-MM-DD format).
        If ``from_date`` is provided, defaults to today.
    Swagger API documentation
      https://www.serviziocontrattipubblici.it/ocds-ms/swagger-ui.html
    """
    name = 'italy_ministry_of_infrastructure_and_transport'

    # BaseSpider
    date_format = 'date'
    default_from_date = '2022-01-01'

    # SimpleSpider
    data_type = 'release_package'

    def start_requests(self):
        """Yield the request for the first page, optionally filtered by date."""
        # Fix: the original URL literal ended with '&pageSize=5 ' (trailing
        # space), which is sent to the API as part of the pageSize value.
        url = 'https://www.serviziocontrattipubblici.it/ocdsReleasePackages-ms/v1.0/ocdsReleasePackages' \
              '?page=1&pageSize=5'
        # from_date and until_date are either both set or both unset
        # (see the class docstring: providing one defaults the other).
        if self.from_date and self.until_date:
            from_date = self.from_date.strftime(self.date_format)
            until_date = self.until_date.strftime(self.date_format)
            url = f'{url}&dataInvioDa={from_date}&dataInvioA={until_date}'
        yield scrapy.Request(url, meta={'file_name': 'page-1.json', 'page': 1})

    @handle_http_error
    def parse(self, response):
        """Parse one page of release packages and request the next page."""
        data = response.json()
        # A 200 HTTP response with a dict like the below is returned instead of 404, for example for not available
        # date periods
        # {
        #   "esito": false,
        #   "errorData": "Si è verificato un errore durante la creazione di OCDS"
        # }
        if "errorData" in data:
            data['http_code'] = response.status
            yield self.build_file_error_from_response(response, errors=data)
            # Stop explicitly: do not treat an error payload as a page of data
            # or paginate past it.
            return

        # An empty release package is returned for pages after the last page
        # is reached, so stop paginating.
        if 'releases' not in data:
            return
        yield from super().parse(response)
        next_page = response.request.meta['page'] + 1
        yield self.build_request(replace_parameters(response.url, page=next_page), meta={'page': next_page},
                                 formatter=parameters('page'))

0 comments on commit d67b3ad

Please sign in to comment.