Skip to content

Commit

Permalink
Update uruguay historical spider #696
Browse files Browse the repository at this point in the history
  • Loading branch information
nativaldezt committed Apr 22, 2021
1 parent 4bf312c commit da84d96
Showing 1 changed file with 26 additions and 11 deletions.
37 changes: 26 additions & 11 deletions kingfisher_scrapy/spiders/uruguay_historical.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,50 @@
from kingfisher_scrapy.base_spider import CompressedFileSpider, PeriodicSpider, browser_user_agent
from kingfisher_scrapy.util import components
import datetime

import scrapy

class UruguayHistorical(CompressedFileSpider, PeriodicSpider):
from kingfisher_scrapy.base_spider import CompressedFileSpider
from kingfisher_scrapy.util import components, handle_http_error


class UruguayHistorical(CompressedFileSpider):
"""
Domain
Agencia Reguladora de Compras Estatales (ARCE)
Spider arguments
from_date
Download only data from this year onward (YYYY format). Defaults to '2002'.
until_date
Download only data until this month (YYYY format). Defaults to '2017'.
Download only data until this year (YYYY format).
Bulk download documentation
https://www.gub.uy/agencia-compras-contrataciones-estado/datos-y-estadisticas/datos/open-contracting
"""
name = 'uruguay_historical'
download_timeout = 1000
user_agent = browser_user_agent

# BaseSpider
date_format = 'year'
default_from_date = '2002'
default_until_date = '2017'
skip_pluck = 'Already covered (see code for details)' # uruguay_releases

# SimpleSpider
data_type = 'release_package'

# PeriodicSpider
pattern = 'https://www.gub.uy/agencia-compras-contrataciones-estado/sites' \
'/agencia-compras-contrataciones-estado/files/2019-04/OCDS-{}.zip'
def start_requests(self):
    """
    Request the CKAN package metadata that lists the historical resources.

    The response (a CKAN API JSON document) is handled by ``parse_list``.
    """
    yield scrapy.Request(
        'https://catalogodatos.gub.uy/api/3/action/package_show?id=arce-datos-historicos-de-compras',
        meta={'file_name': 'list.json'},
        callback=self.parse_list,
    )

def get_formatter(self):
return components(-1)
@handle_http_error
def parse_list(self, response):
    """
    Yield a download request for each JSON-format resource in the CKAN
    package, optionally filtered by the spider's from/until date range.
    """
    resources = response.json()['result']['resources']
    for resource in resources:
        # Only the JSON resources contain OCDS release packages.
        if resource['format'].upper() != 'JSON':
            continue
        url = resource['url']
        if self.from_date and self.until_date:
            # URL looks like
            # https://catalogodatos.gub.uy/dataset/44d3-b09c/resource/1e39-453d/download/ocds-2002.zip
            year = int(url.rsplit('-', 1)[1].split('.', 1)[0])
            if not (self.from_date <= datetime.datetime(year, 1, 1) <= self.until_date):
                continue
        yield self.build_request(url, formatter=components(-1))

0 comments on commit da84d96

Please sign in to comment.