Merge branch 'master' into 337-index-spider
# Conflicts:
#	kingfisher_scrapy/base_spider.py
#	kingfisher_scrapy/spiders/honduras_portal_records.py
#	kingfisher_scrapy/spiders/honduras_portal_releases.py
romifz committed Sep 21, 2020
2 parents 9e5e658 + 93aef51 commit 33cb386
Showing 17 changed files with 335 additions and 179 deletions.
94 changes: 88 additions & 6 deletions kingfisher_scrapy/base_spider.py
@@ -1,5 +1,6 @@
import json
import os
from abc import abstractmethod
from datetime import datetime
from io import BytesIO
from math import ceil
@@ -59,7 +60,7 @@ class BaseSpider(scrapy.Spider):

MAX_SAMPLE = 10
MAX_RELEASES_PER_PACKAGE = 100
VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S', 'year-month': '%Y-%m'}
VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S'}

ocds_version = '1.1'
date_format = 'date'
@@ -116,15 +117,18 @@ def from_crawler(cls, crawler, *args, **kwargs):
if not spider.from_date:
# Default to `default_from_date` class attribute.
spider.from_date = spider.default_from_date
if not spider.until_date:
# Default to today.
spider.until_date = datetime.now().strftime(spider.date_format)
try:
spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
if isinstance(spider.from_date, str):
# convert to date format, if needed
spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
except ValueError as e:
raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e))

if not spider.until_date:
spider.until_date = cls.get_default_until_date(spider)
try:
spider.until_date = datetime.strptime(spider.until_date, spider.date_format)
if isinstance(spider.until_date, str):
spider.until_date = datetime.strptime(spider.until_date, spider.date_format)
except ValueError as e:
raise SpiderArgumentError('spider argument until_date: invalid date value: {}'.format(e))
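
The conversion above only applies to string values: a string default such as the ``default_from_date`` class attribute is parsed with ``datetime.strptime``, while a computed default such as the ``datetime`` returned by ``get_default_until_date`` passes through unchanged. A minimal sketch of that behaviour, assuming ``date_format`` has already been resolved to a strptime pattern (that resolution step is outside this hunk):

from datetime import datetime

def parse_date_argument(value, default, date_format='%Y-%m-%d'):
    # Hypothetical helper mirroring the from_date/until_date handling above.
    if not value:
        value = default  # may be a string class attribute or a computed datetime
    if isinstance(value, str):
        # Only strings need converting; datetime defaults are used as-is.
        value = datetime.strptime(value, date_format)
    return value

# A string default is parsed; a datetime default (as from get_default_until_date) passes through.
assert parse_date_argument(None, '2012-01-01') == datetime(2012, 1, 1)
assert parse_date_argument('2020-09-21', '2012-01-01') == datetime(2020, 9, 21)
assert parse_date_argument(None, datetime(2020, 9, 21)) == datetime(2020, 9, 21)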

@@ -273,6 +277,10 @@ def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None
if self.sample:
break

@classmethod
def get_default_until_date(cls, spider):
return datetime.now()


class SimpleSpider(BaseSpider):
"""
@@ -440,6 +448,80 @@ def next_link(self, response, **kwargs):
raise MissingNextLinkError('next link not found on the first page: {}'.format(response.url))


class PeriodicalSpider(SimpleSpider):
"""
This class helps to crawl URLs that take a year (YYYY) or a year and month (YYYY-mm) as a parameter. To use it:
1. Extend ``PeriodicalSpider``.
2. Set the ``date_format`` attribute if it isn't defined already. Valid values are 'year' and 'year-month'.
3. Set a ``default_from_date`` year or year-month.
4. Optionally, set a ``default_until_date`` year or year-month. If absent, ``default_until_date`` defaults to the
current year or year-month.
5. Set the ``pattern`` attribute to the URL pattern to retrieve.
6. Implement the ``get_formatter`` method.
The ``pattern`` should include a placeholder for the year or year-month parameter. With the 'year' format, an int is
passed; with 'year-month', a date object is passed. Example:
.. code-block:: python
    url = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}'
When the ``sample`` option is used, only the most recent year or month of data is retrieved.
"""
VALID_DATE_FORMATS = {'year': '%Y', 'year-month': '%Y-%m'}

def __init__(self, *args, **kwargs):
self.date_format_key = self.date_format
super().__init__(*args, **kwargs)

if hasattr(self, 'start_requests_callback'):
self.start_requests_callback = getattr(self, self.start_requests_callback)
else:
self.start_requests_callback = self.parse

@classmethod
def from_crawler(cls, crawler, from_date=None, *args, **kwargs):
if not from_date:
from_date = cls.default_from_date

spider = super(SimpleSpider, cls).from_crawler(crawler, from_date=from_date, *args, **kwargs)

return spider

@classmethod
def get_default_until_date(cls, spider):
if hasattr(spider, 'default_until_date') and spider.default_until_date:
return spider.default_until_date
else:
return datetime.today()

def start_requests(self):

start = self.from_date

stop = self.until_date

if self.sample:
start = stop

if self.date_format_key == 'year':
date_range = util.date_range_by_year(start.year, stop.year)
else:
date_range = util.date_range_by_month(start, stop)

for date in date_range:
for url in self.build_urls(self.pattern, date):
yield self.build_request(url, self.get_formatter(), callback=self.start_requests_callback)

@abstractmethod
def get_formatter(self):
pass

def build_urls(self, pattern, date):
yield pattern.format(date)
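
As a usage sketch (the spider name and URL below are hypothetical; the ``moldova_old`` and ``nepal_portal`` changes later in this diff are the real examples), a yearly spider built on ``PeriodicalSpider`` looks roughly like this:

from kingfisher_scrapy.base_spider import PeriodicalSpider
from kingfisher_scrapy.util import components


class ExampleYearly(PeriodicalSpider):
    name = 'example_yearly'  # hypothetical spider name
    data_type = 'release_package'
    date_format = 'year'  # use 'year-month' for monthly endpoints
    default_from_date = '2015'
    pattern = 'https://example.com/ocds/{}'  # hypothetical URL; {} receives the year

    def get_formatter(self):
        # Name each file after the last URL path component (the year).
        return components(-1)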


class IndexSpider(SimpleSpider):
"""
This class can be used to collect data from an API which includes the total number of results or pages in their
39 changes: 6 additions & 33 deletions kingfisher_scrapy/spiders/ecuador_emergency.py
@@ -14,43 +14,16 @@ class EcuadorEmergency(SimpleSpider):
"""
name = 'ecuador_emergency'
data_type = 'release_package'
custom_settings = {
'CONCURRENT_REQUESTS': 1,
}
urls = []

def start_requests(self):
url = 'https://portal.compraspublicas.gob.ec/sercop/data-estandar-ocds/'
url = 'https://datosabiertos.compraspublicas.gob.ec/OCDS/'
yield scrapy.Request(url, meta={'file_name': 'list.html'}, callback=self.parse_list)

@handle_http_error
def parse_list(self, response):
for row in response.xpath('//tr'):
html_url = row.xpath('td/strong/a/@href').extract_first()
filename = row.xpath('td/p/strong/text()').extract_first()
if html_url:
data_url = f'{html_url.replace("sharing", "fsdownload")}/ocds-{filename}.json'
self.urls.append((html_url, data_url))
if self.sample:
break
html_urls = response.xpath('//a/@href').getall()
for html_url in html_urls:
yield self.build_request(response.request.url + html_url, formatter=components(-1))

yield self.request_cookie()

def request_cookie(self):
# This request sets a cookie, which must be used immediately to download the data. So, we set
# `CONCURRENT_REQUESTS` to 1, and yield the requests in order.
html_url, data_url = self.urls.pop()
return self.build_request(html_url, meta={'next': data_url}, formatter=components(-1),
callback=self.parse_page)

@handle_http_error
def parse_page(self, response):
# If there is an error, a request for the data URL redirects to the html URL. To treat this as an error, we set
# `dont_redirect`.
yield self.build_request(response.meta['next'], meta={'dont_redirect': True}, formatter=components(-1))

def parse(self, response):
yield from super().parse(response)

if self.urls:
yield self.request_cookie()
if self.sample:
break
27 changes: 27 additions & 0 deletions kingfisher_scrapy/spiders/honduras_portal_base.py
@@ -0,0 +1,27 @@
import scrapy

from kingfisher_scrapy.base_spider import IndexSpider
from kingfisher_scrapy.util import parameters


class HondurasPortalBase(IndexSpider):
next_pointer = '/next'
formatter = staticmethod(parameters('page'))
total_pages_pointer = '/pages'
publishers = ['oncae', 'sefin']

download_delay = 0.9

@classmethod
def from_crawler(cls, crawler, publisher=None, *args, **kwargs):
spider = super().from_crawler(crawler, publisher=publisher, *args, **kwargs)
if publisher and publisher not in spider.publishers:
raise scrapy.exceptions.CloseSpider('Specified publisher is not recognized')

return spider

def start_requests(self):
url = self.url
if self.publisher:
url = url + '&publisher=' + self.publisher
yield scrapy.Request(url, meta={'file_name': 'page-1.json'}, callback=self.parse_list)
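
For context, the ``publisher`` argument is passed like any other spider argument to one of the subclasses further down in this diff; a sketch of a programmatic run, assuming a standard Scrapy project setup (the command-line equivalent would be ``scrapy crawl honduras_portal_records -a publisher=oncae``):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# Spider arguments become keyword arguments; from_crawler validates the publisher.
process.crawl('honduras_portal_records', publisher='oncae')
process.start()
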
24 changes: 21 additions & 3 deletions kingfisher_scrapy/spiders/honduras_portal_bulk_files.py
Expand Up @@ -11,12 +11,28 @@ class HondurasPortalBulkFiles(SimpleSpider):
Bulk download documentation
http://www.contratacionesabiertas.gob.hn/descargas/
Spider arguments
publisher
Filter the data by a specific publisher.
``oncae`` for "Oficina Normativa de Contratación y Adquisiciones del Estado" publisher.
``sefin`` for "Secretaria de Finanzas de Honduras" publisher.
sample
Downloads the first package listed in http://www.contratacionesabiertas.gob.hn/api/v1/descargas/?format=json.
If ``publisher`` is also provided, a single package is downloaded from that publisher.
"""
name = 'honduras_portal_bulk_files'
data_type = 'release_package'
skip_pluck = 'Already covered (see code for details)' # honduras_portal_releases
publishers = {'oncae': 'ONCAE', 'sefin': 'Secretaria de Finanzas'}

@classmethod
def from_crawler(cls, crawler, publisher=None, *args, **kwargs):
spider = super().from_crawler(crawler, publisher=publisher, *args, **kwargs)
if publisher and publisher not in spider.publishers.keys():
raise scrapy.exceptions.CloseSpider('Specified publisher is not recognized')

spider.publisher_filter = spider.publishers.get(publisher)

return spider

def start_requests(self):
yield scrapy.Request(
@@ -28,9 +44,11 @@ def start_requests(self):
@handle_http_error
def parse_list(self, response):
items = json.loads(response.text)
if self.sample:
items = [items[0]]

for item in items:
if self.publisher and self.publisher_filter not in item['publicador']:
continue
url = item['urls']['json']
yield self.build_request(url, formatter=components(-1))

if self.sample:
return
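
A small illustration of the filter above, with made-up list entries (the real ``publicador`` values come from the API and are an assumption here): the match is a substring check against the mapped display name.

publishers = {'oncae': 'ONCAE', 'sefin': 'Secretaria de Finanzas'}
publisher_filter = publishers['oncae']  # 'ONCAE'
items = [
    {'publicador': 'ONCAE - Catalogo Electronico', 'urls': {'json': 'https://example.com/a.json'}},  # hypothetical
    {'publicador': 'Secretaria de Finanzas', 'urls': {'json': 'https://example.com/b.json'}},  # hypothetical
]
kept = [item['urls']['json'] for item in items if publisher_filter in item['publicador']]
assert kept == ['https://example.com/a.json']
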
21 changes: 8 additions & 13 deletions kingfisher_scrapy/spiders/honduras_portal_records.py
@@ -1,28 +1,23 @@
import scrapy
from kingfisher_scrapy.spiders.honduras_portal_base import HondurasPortalBase

from kingfisher_scrapy.base_spider import IndexSpider
from kingfisher_scrapy.util import parameters


class HondurasPortalRecords(IndexSpider):
class HondurasPortalRecords(HondurasPortalBase):
"""
API documentation
http://www.contratacionesabiertas.gob.hn/manual_api/
Swagger API documentation
http://www.contratacionesabiertas.gob.hn/servicio/
Spider arguments
publisher
Filter the data by a specific publisher.
``oncae`` for "Oficina Normativa de Contratación y Adquisiciones del Estado" publisher.
``sefin`` for "Secretaria de Finanzas de Honduras" publisher.
sample
Download only the first record package in the dataset.
If ``publisher`` is also provided, a single package is downloaded from that publisher.
"""
name = 'honduras_portal_records'
data_type = 'record_package'
data_pointer = '/recordPackage'
total_pages_pointer = '/pages'
formatter = staticmethod(parameters('page'))
skip_pluck = 'Already covered (see code for details)' # honduras_portal_releases

download_delay = 0.9

def start_requests(self):
url = 'http://www.contratacionesabiertas.gob.hn/api/v1/record/?format=json'
yield scrapy.Request(url, meta={'file_name': 'page-1.json'}, callback=self.parse_list)
url = 'http://www.contratacionesabiertas.gob.hn/api/v1/record/?format=json'
21 changes: 8 additions & 13 deletions kingfisher_scrapy/spiders/honduras_portal_releases.py
@@ -1,27 +1,22 @@
import scrapy
from kingfisher_scrapy.spiders.honduras_portal_base import HondurasPortalBase

from kingfisher_scrapy.base_spider import IndexSpider
from kingfisher_scrapy.util import parameters


class HondurasPortalReleases(IndexSpider):
class HondurasPortalReleases(HondurasPortalBase):
"""
API documentation
http://www.contratacionesabiertas.gob.hn/manual_api/
Swagger API documentation
http://www.contratacionesabiertas.gob.hn/servicio/
Spider arguments
publisher
Filter the data by a specific publisher.
``oncae`` for "Oficina Normativa de Contratación y Adquisiciones del Estado" publisher.
``sefin`` for "Secretaria de Finanzas de Honduras" publisher.
sample
Download only the first release package in the dataset.
If ``publisher`` is also provided, a single package is downloaded from that publisher.
"""
name = 'honduras_portal_releases'
data_type = 'release_package'
data_pointer = '/releasePackage'
total_pages_pointer = '/pages'
formatter = staticmethod(parameters('page'))

download_delay = 0.9

def start_requests(self):
yield scrapy.Request('http://www.contratacionesabiertas.gob.hn/api/v1/release/?format=json',
meta={'file_name': 'page-1.json'}, callback=self.parse_list)
url = 'http://www.contratacionesabiertas.gob.hn/api/v1/release/?format=json'
22 changes: 9 additions & 13 deletions kingfisher_scrapy/spiders/moldova_old.py
@@ -1,8 +1,8 @@
from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import components, date_range_by_year
from kingfisher_scrapy.base_spider import PeriodicalSpider
from kingfisher_scrapy.util import components


class MoldovaOld(SimpleSpider):
class MoldovaOld(PeriodicalSpider):
"""
Bulk download documentation
http://opencontracting.date.gov.md/downloads
@@ -12,14 +12,10 @@ class MoldovaOld(SimpleSpider):
"""
name = 'moldova_old'
data_type = 'release_package'
default_from_date = '2012'
default_until_date = '2018'
pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}'
date_format = 'year'

def start_requests(self):
pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}'

start = 2012
stop = 2018
if self.sample:
start = 2018

for year in date_range_by_year(start, stop):
yield self.build_request(pattern.format(year), formatter=components(-1))
def get_formatter(self):
return components(-1)
26 changes: 9 additions & 17 deletions kingfisher_scrapy/spiders/nepal_portal.py
@@ -1,10 +1,8 @@
from datetime import date
from kingfisher_scrapy.base_spider import PeriodicalSpider
from kingfisher_scrapy.util import components

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import components, date_range_by_year


class NepalPortal(SimpleSpider):
class NepalPortal(PeriodicalSpider):
"""
Bulk download documentation
http://ppip.gov.np/downloads
@@ -15,16 +13,10 @@ name = 'nepal_portal'
name = 'nepal_portal'
data_type = 'release_package'
ocds_version = '1.0'
default_from_date = '2012'
default_until_date = '2018'
pattern = 'http://ppip.gov.np/bulk-download/{}'
date_format = 'year'

def start_requests(self):
pattern = 'http://ppip.gov.np/bulk-download/{}'

if self.sample:
start = 2018
stop = 2018
else:
start = 2012
stop = date.today().year # HTTP 500 after 2018

for year in date_range_by_year(start, stop):
yield self.build_request(pattern.format(year), formatter=components(-1))
def get_formatter(self):
return components(-1)