Skip to content

Commit

Permalink
Merge 726b8d0 into 5ab8eeb
Browse files Browse the repository at this point in the history
  • Loading branch information
romifz committed Aug 31, 2020
2 parents 5ab8eeb + 726b8d0 commit 73610b9
Show file tree
Hide file tree
Showing 9 changed files with 263 additions and 116 deletions.
114 changes: 103 additions & 11 deletions kingfisher_scrapy/base_spider.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import os
from abc import abstractmethod
from datetime import datetime
from io import BytesIO
from zipfile import ZipFile
Expand Down Expand Up @@ -58,7 +59,7 @@ class BaseSpider(scrapy.Spider):

MAX_SAMPLE = 10
MAX_RELEASES_PER_PACKAGE = 100
VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S', 'year-month': '%Y-%m'}
VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S'}

ocds_version = '1.1'
date_format = 'date'
Expand Down Expand Up @@ -115,17 +116,25 @@ def from_crawler(cls, crawler, *args, **kwargs):
if not spider.from_date:
# Default to `default_from_date` class attribute.
spider.from_date = spider.default_from_date
if isinstance(spider.from_date, str):
# convert to date format, if needed
spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
else:
try:
spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
except ValueError as e:
raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e))

if not spider.until_date:
# Default to today.
spider.until_date = datetime.now().strftime(spider.date_format)
try:
spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
except ValueError as e:
raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e))
try:
spider.until_date = datetime.strptime(spider.until_date, spider.date_format)
except ValueError as e:
raise SpiderArgumentError('spider argument until_date: invalid date value: {}'.format(e))
spider.until_date = cls.get_default_until_date(spider)
if isinstance(spider.until_date, str):
# convert to date format, if needed
spider.until_date = datetime.strptime(spider.until_date, spider.date_format)
else:
try:
spider.until_date = datetime.strptime(spider.until_date, spider.date_format)
except ValueError as e:
raise SpiderArgumentError('spider argument until_date: invalid date value: {}'.format(e))

return spider

Expand Down Expand Up @@ -272,6 +281,10 @@ def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None
if self.sample:
break

    @classmethod
    def get_default_until_date(cls, spider):
        """
        Return the default upper bound for date-filtered crawls when the user
        supplies no ``until_date``: the current moment.

        Subclasses may override this (e.g. ``PeriodicalSpider`` prefers an
        explicit ``default_until_date`` class attribute when one is set).
        """
        return datetime.now()


class SimpleSpider(BaseSpider):
"""
Expand Down Expand Up @@ -435,3 +448,82 @@ def next_link(self, response, **kwargs):

if response.meta['depth'] == 0:
raise MissingNextLinkError('next link not found on the first page: {}'.format(response.url))


class PeriodicalSpider(SimpleSpider):
    """
    This class helps to crawl URLs that receive a year (YYYY) or a month and year (YYYY-mm) as parameters. To use it:

    1. Extend from ``PeriodicalSpider``.
    1. Set the ``date_format`` attribute if it's not defined already. Valid values are 'year' and 'year-month'.
    1. Set a ``default_from_date`` year or month-year.
    1. Optionally, set a ``default_until_date`` year or month-year. If absent, ``default_until_date`` defaults to the
       current year or month-year.
    1. Set the ``pattern`` parameter with the URL to retrieve.
    1. Implement the ``get_formatter`` method.

    The ``pattern`` should include a placeholder for a year or month-year parameter. With the year parameter, an int is
    passed. If the year-month parameter is used, a ``Date`` instance is passed. Example:

    .. code-block: python

        url = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}'

    When the ``sample`` option is used, the latest year or month of data is retrieved.
    """
    # Overrides BaseSpider's day-level formats: periodical spiders work in whole
    # years or whole months.
    VALID_DATE_FORMATS = {'year': '%Y', 'year-month': '%Y-%m'}

    def __init__(self, *args, **kwargs):
        # Remember the symbolic key ('year' or 'year-month') for use in
        # start_requests. NOTE(review): assumes the base __init__ replaces
        # `date_format` with the strptime pattern from VALID_DATE_FORMATS —
        # confirm against BaseSpider.__init__.
        self.date_format_key = self.date_format
        super().__init__(*args, **kwargs)

        # A subclass may name (as a string attribute) an alternative callback for
        # the generated requests; resolve it to a bound method, defaulting to
        # the standard `parse`.
        if hasattr(self, 'start_requests_callback'):
            self.start_requests_callback = getattr(self, self.start_requests_callback)
        else:
            self.start_requests_callback = self.parse

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # `super(SimpleSpider, cls)` starts the MRO lookup *after* SimpleSpider —
        # NOTE(review): presumably to skip date handling unsuited to year /
        # year-month formats; confirm against the parent from_crawler methods.
        spider = super(SimpleSpider, cls).from_crawler(crawler, *args, **kwargs)

        # Fall back to the class default, then normalize strings to datetimes
        # using the year / year-month format selected by `date_format`.
        if not spider.from_date:
            spider.from_date = spider.default_from_date
        if isinstance(spider.from_date, str):
            spider.from_date = datetime.strptime(spider.from_date, spider.date_format)
        # NOTE(review): this overwrites any until_date set by the parent
        # from_crawler — confirm that user-supplied until_date values are
        # still honored via get_default_until_date.
        spider.until_date = cls.get_default_until_date(spider)
        if isinstance(spider.until_date, str):
            spider.until_date = datetime.strptime(spider.until_date, spider.date_format)

        return spider

    @classmethod
    def get_default_until_date(cls, spider):
        # Prefer an explicit `default_until_date` class attribute; otherwise
        # default to the current year or month-year (today).
        if hasattr(spider, 'default_until_date') and spider.default_until_date:
            return spider.default_until_date
        else:
            return datetime.today()

    def start_requests(self):
        # Yield one request (or more, via build_urls) per period in
        # [from_date, until_date].
        start = self.from_date

        stop = self.until_date

        if self.sample:
            # Only crawl the latest period.
            start = stop

        if self.date_format_key == 'year':
            date_range = util.date_range_by_year(start.year, stop.year)
        else:
            date_range = util.date_range_by_month(start, stop)

        for date in date_range:
            for url in self.build_urls(self.pattern, date):
                yield self.build_request(url, self.get_formatter(), callback=self.start_requests_callback)

    @abstractmethod
    def get_formatter(self):
        """
        Return the file-name formatter for the generated requests.

        Must be implemented by subclasses.
        """
        pass

    def build_urls(self, pattern, date):
        """
        Yield the URL(s) for one period. Subclasses may override this to
        produce several requests per period (e.g. one per notice type).
        """
        yield pattern.format(date)
22 changes: 9 additions & 13 deletions kingfisher_scrapy/spiders/moldova_old.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import components, date_range_by_year
from kingfisher_scrapy.base_spider import PeriodicalSpider
from kingfisher_scrapy.util import components


class MoldovaOld(SimpleSpider):
class MoldovaOld(PeriodicalSpider):
"""
Bulk download documentation
http://opencontracting.date.gov.md/downloads
Expand All @@ -12,14 +12,10 @@ class MoldovaOld(SimpleSpider):
"""
name = 'moldova_old'
data_type = 'release_package'
default_from_date = '2012'
default_until_date = '2018'
pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}'
date_format = 'year'

def start_requests(self):
pattern = 'http://opencontracting.date.gov.md/ocds-api/year/{}'

start = 2012
stop = 2018
if self.sample:
start = 2018

for year in date_range_by_year(start, stop):
yield self.build_request(pattern.format(year), formatter=components(-1))
def get_formatter(self):
return components(-1)
26 changes: 9 additions & 17 deletions kingfisher_scrapy/spiders/nepal_portal.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
from datetime import date
from kingfisher_scrapy.base_spider import PeriodicalSpider
from kingfisher_scrapy.util import components

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import components, date_range_by_year


class NepalPortal(SimpleSpider):
class NepalPortal(PeriodicalSpider):
"""
Bulk download documentation
http://ppip.gov.np/downloads
Expand All @@ -15,16 +13,10 @@ class NepalPortal(SimpleSpider):
name = 'nepal_portal'
data_type = 'release_package'
ocds_version = '1.0'
default_from_date = '2012'
default_until_date = '2018'
pattern = 'http://ppip.gov.np/bulk-download/{}'
date_format = 'year'

def start_requests(self):
pattern = 'http://ppip.gov.np/bulk-download/{}'

if self.sample:
start = 2018
stop = 2018
else:
start = 2012
stop = date.today().year # HTTP 500 after 2018

for year in date_range_by_year(start, stop):
yield self.build_request(pattern.format(year), formatter=components(-1))
def get_formatter(self):
return components(-1)
75 changes: 32 additions & 43 deletions kingfisher_scrapy/spiders/scotland_base.py
Original file line number Diff line number Diff line change
@@ -1,51 +1,40 @@
from datetime import date

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import date_range_by_month, parameters
from kingfisher_scrapy.base_spider import PeriodicalSpider
from kingfisher_scrapy.util import parameters


class ScotlandBase(SimpleSpider):
default_from_date = '2019-01'
class ScotlandBase(PeriodicalSpider):
date_format = 'year-month'
default_from_date = date(date.today().year - 1, date.today().month, 1)

@classmethod
def from_crawler(cls, crawler, from_date=None, *args, **kwargs):
if not from_date:
from_date = cls.default_from_date
notice_types = [
1, # OJEU - F1 - Prior Information Notice
2, # OJEU - F2 - Contract Notice
3, # OJEU - F3 - Contract Award Notice
4, # OJEU - F4 - Prior Information Notice(Utilities)
5, # OJEU - F5 - Contract Notice(Utilities)
6, # OJEU - F6 - Contract Award Notice(Utilities)
7, # OJEU - F7 - Qualification Systems(Utilities)
12, # OJEU - F12 - Design Contest Notice
13, # OJEU - F13 - Results Of Design Contest
14, # OJEU - F14 - Corrigendum
15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice
20, # OJEU - F20 - Modification Notice
21, # OJEU - F21 - Social And other Specific Services(Public Contracts)
22, # OJEU - F22 - Social And other Specific Services(Utilities)
23, # OJEU - F23 - Social And other Specific Services(Concessions)
24, # OJEU - F24 - Concession Notice
25, # OJEU - F25 - Concession Award Notice
101, # Site Notice - Website Contract Notice
102, # Site Notice - Website Prior Information Notice
103, # Site Notice - Website Contract Award Notice
104, # Site Notice - Quick Quote Award
]

return super().from_crawler(crawler, from_date=from_date, *args, **kwargs)
def build_urls(self, pattern, date):
for notice_type in self.notice_types:
yield pattern.format(date, notice_type)

def start_requests(self):
notice_types = [
1, # OJEU - F1 - Prior Information Notice
2, # OJEU - F2 - Contract Notice
3, # OJEU - F3 - Contract Award Notice
4, # OJEU - F4 - Prior Information Notice(Utilities)
5, # OJEU - F5 - Contract Notice(Utilities)
6, # OJEU - F6 - Contract Award Notice(Utilities)
7, # OJEU - F7 - Qualification Systems(Utilities)
12, # OJEU - F12 - Design Contest Notice
13, # OJEU - F13 - Results Of Design Contest
14, # OJEU - F14 - Corrigendum
15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice
20, # OJEU - F20 - Modification Notice
21, # OJEU - F21 - Social And other Specific Services(Public Contracts)
22, # OJEU - F22 - Social And other Specific Services(Utilities)
23, # OJEU - F23 - Social And other Specific Services(Concessions)
24, # OJEU - F24 - Concession Notice
25, # OJEU - F25 - Concession Award Notice
101, # Site Notice - Website Contract Notice
102, # Site Notice - Website Prior Information Notice
103, # Site Notice - Website Contract Award Notice
104, # Site Notice - Quick Quote Award
]

for year_month in date_range_by_month(self.from_date, date.today()):
date_string = year_month.strftime('%m-%Y')
for notice_type in notice_types:
yield self.build_request(
self.url.format(date_string, notice_type),
formatter=parameters('noticeType', 'dateFrom')
)
if self.sample:
return
def get_formatter(self):
return parameters('noticeType', 'dateFrom')
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/scotland_proactis.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ class ScotlandProactis(ScotlandBase):
"""
name = 'scotland_proactis'
data_type = 'release_package'
url = 'https://sandbox4.proactislabs.com/v1/Notices?dateFrom={}&outputType=0&noticeType={}'
pattern = 'https://sandbox4.proactislabs.com/v1/Notices?dateFrom={:%m-%Y}&outputType=0&noticeType={}'
2 changes: 1 addition & 1 deletion kingfisher_scrapy/spiders/scotland_public_contracts.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ class ScotlandPublicContracts(ScotlandBase):
"""
name = 'scotland_public_contracts'
data_type = 'release_package'
url = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={}&outputType=1&noticeType={}'
pattern = 'https://api.publiccontractsscotland.gov.uk/v1/Notices?dateFrom={:%m-%Y}&outputType=0&noticeType={}'
24 changes: 7 additions & 17 deletions kingfisher_scrapy/spiders/uruguay_base.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,18 @@
from abc import abstractmethod

from kingfisher_scrapy.base_spider import SimpleSpider
from kingfisher_scrapy.util import components, date_range_by_month
from kingfisher_scrapy.base_spider import PeriodicalSpider
from kingfisher_scrapy.util import components


class UruguayBase(SimpleSpider):
class UruguayBase(PeriodicalSpider):
download_delay = 0.9
default_from_date = '2017-11'
date_format = 'year-month'
pattern = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}'
start_requests_callback = 'parse_list'

@classmethod
def from_crawler(cls, crawler, from_date=None, *args, **kwargs):
if not from_date:
from_date = cls.default_from_date

return super().from_crawler(crawler, from_date=from_date, *args, **kwargs)

def start_requests(self):
url = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}'
if self.sample:
self.from_date = self.until_date

for d in date_range_by_month(self.from_date, self.until_date):
yield self.build_request(url.format(d), formatter=components(-2), callback=self.parse_list)
def get_formatter(self):
return components(-2)

@abstractmethod
def parse_list(self):
Expand Down
23 changes: 10 additions & 13 deletions kingfisher_scrapy/spiders/uruguay_historical.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from kingfisher_scrapy.base_spider import CompressedFileSpider
from kingfisher_scrapy.util import components, date_range_by_year
from kingfisher_scrapy.base_spider import CompressedFileSpider, PeriodicalSpider
from kingfisher_scrapy.util import components


class UruguayHistorical(CompressedFileSpider):
class UruguayHistorical(CompressedFileSpider, PeriodicalSpider):
"""
Bulk download documentation
https://www.gub.uy/agencia-compras-contrataciones-estado/datos-y-estadisticas/datos/open-contracting
Expand All @@ -15,20 +15,17 @@ class UruguayHistorical(CompressedFileSpider):

# the files takes too long to be downloaded, so we increase the download timeout
download_timeout = 1000
default_from_date = '2002'
default_until_date = '2017'
date_format = 'year'
custom_settings = {
# It seems some websites don't like it and block when your user agent is not a browser.
# see https://github.com/scrapy/scrapy/issues/3103
'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/37.0.2049.0 Safari/537.36',
}
pattern = 'https://www.gub.uy/agencia-compras-contrataciones-estado/sites' \
'/agencia-compras-contrataciones-estado/files/2019-04/OCDS-{}.zip'

def start_requests(self):
start = 2002
stop = 2017
if self.sample:
start = stop

pattern = 'https://www.gub.uy/agencia-compras-contrataciones-estado/sites' \
'/agencia-compras-contrataciones-estado/files/2019-04/OCDS-{}.zip'
for year in date_range_by_year(start, stop):
yield self.build_request(pattern.format(year), formatter=components(-1))
def get_formatter(self):
return components(-1)

0 comments on commit 73610b9

Please sign in to comment.