Skip to content

Commit

Permalink
Merge 879a52e into 0daf57d
Browse files Browse the repository at this point in the history
  • Loading branch information
yolile committed May 13, 2020
2 parents 0daf57d + 879a52e commit 55d9d4c
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 10 deletions.
15 changes: 7 additions & 8 deletions kingfisher_scrapy/base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,15 +41,18 @@ class BaseSpider(scrapy.Spider):

MAX_SAMPLE = 10
MAX_RELEASES_PER_PACKAGE = 100
VALID_DATE_FORMATS = {'year_month_day': '%Y-%m-%d', 'year_month_day_time': '%Y-%m-%dT%H:%M:%S'}

def __init__(self, sample=None, note=None, from_date=None, until_date=None, *args, **kwargs):
def __init__(self, sample=None, note=None, from_date=None, until_date=None,
date_format='year_month_day', *args, **kwargs):
super().__init__(*args, **kwargs)

# https://docs.scrapy.org/en/latest/topics/spiders.html#spider-arguments
self.sample = sample == 'true'
self.from_date = from_date
self.until_date = until_date
self.note = note
self.date_format = self.VALID_DATE_FORMATS[date_format]

spider_arguments = {
'sample': sample,
Expand All @@ -66,22 +69,18 @@ def from_crawler(cls, crawler, *args, **kwargs):

# Check the spider's date range arguments
if spider.from_date or spider.until_date:
# YYYY-MM-DD format
date_format = '%Y-%m-%d'

if not spider.from_date:
# 'from_date' defaults to 'default_from_date' spider class attribute
spider.from_date = spider.default_from_date
if not spider.until_date:
# 'until_date' defaults to today
spider.until_date = datetime.now().strftime(date_format)

spider.until_date = datetime.now().strftime(spider.date_format)
try:
spider.from_date = datetime.strptime(spider.from_date, date_format)
spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
except ValueError as e:
raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e))
try:
spider.until_date = datetime.strptime(spider.until_date, date_format)
spider.until_date = datetime.strptime(spider.until_date, spider.date_format)
except ValueError as e:
raise SpiderArgumentError('spider argument until_date: invalid date value: {}'.format(e))

Expand Down
8 changes: 6 additions & 2 deletions kingfisher_scrapy/spiders/paraguay_dncp_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ class ParaguayDNCPBaseSpider(BaseSpider):
request_token = None
max_attempts = 10
data_type = None
default_from_date = '2010-01-01T00:00:00'

custom_settings = {
'DOWNLOADER_MIDDLEWARES': {
Expand All @@ -36,7 +37,8 @@ class ParaguayDNCPBaseSpider(BaseSpider):

@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super(ParaguayDNCPBaseSpider, cls).from_crawler(crawler, *args, **kwargs)
spider = super(ParaguayDNCPBaseSpider, cls).from_crawler(crawler, date_format='year_month_day_time',
*args, **kwargs)

spider.request_token = crawler.settings.get('KINGFISHER_PARAGUAY_DNCP_REQUEST_TOKEN')

Expand All @@ -47,6 +49,9 @@ def from_crawler(cls, crawler, *args, **kwargs):
return spider

def start_requests(self):
if self.from_date:
self.base_page_url = '{}/search/processes?tipo_fecha=fecha_release&fecha_desde={}'\
.format(self.base_url, self.from_date.strftime(self.date_format))
yield scrapy.Request(
self.base_page_url,
# Send duplicate requests when the token has expired and when continuing from the saved last_request.
Expand Down Expand Up @@ -126,7 +131,6 @@ def parse_pages(self, response):
else:
yield {
'success': False,
'file_name': response.request.meta['kf_filename'],
'url': response.request.url,
'errors': {'http_code': response.status}
}
Expand Down
13 changes: 13 additions & 0 deletions kingfisher_scrapy/spiders/paraguay_dncp_records.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,19 @@


class ParaguayDNCPRecords(ParaguayDNCPBaseSpider):
"""
Swagger API documentation
https://contrataciones.gov.py/datos/api/v3/doc
Spider arguments
sample
Download only 10 records.
from_date
Download only records from this date onward (YYYY-MM-DDTHH:mm:ss format).
If `from_date` is not provided, defaults to '2010-01-01T00:00:00'.
Environment variables
KINGFISHER_PARAGUAY_DNCP_REQUEST_TOKEN
To get an API account and request token, go to https://contrataciones.gov.py/datos/adm/login.
"""
name = 'paraguay_dncp_records'
data_type = 'record_package'

Expand Down
13 changes: 13 additions & 0 deletions kingfisher_scrapy/spiders/paraguay_dncp_releases.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,19 @@


class ParaguayDNCPReleases(ParaguayDNCPBaseSpider):
"""
Swagger API documentation
https://contrataciones.gov.py/datos/api/v3/doc
Spider arguments
sample
Download only 10 releases.
from_date
Download only releases whose release.date is from this date onward (YYYY-MM-DDTHH:mm:ss format).
If `from_date` is not provided, defaults to '2010-01-01T00:00:00'.
Environment variables
KINGFISHER_PARAGUAY_DNCP_REQUEST_TOKEN
To get an API account and request token, go to https://contrataciones.gov.py/datos/adm/login.
"""
name = 'paraguay_dncp_releases'
data_type = 'release_package'

Expand Down

0 comments on commit 55d9d4c

Please sign in to comment.