-
Notifications
You must be signed in to change notification settings - Fork 12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add PeriodicalSpider #467
Add PeriodicalSpider #467
Changes from all commits
4f39bd9
751f391
0303c64
60c29ff
86518da
100f2a5
11d2526
8b884a9
8cdb090
77f6c06
7efb28d
726b8d0
be0eba6
9bd5665
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
import json | ||
import os | ||
from abc import abstractmethod | ||
from datetime import datetime | ||
from io import BytesIO | ||
from zipfile import ZipFile | ||
|
@@ -58,7 +59,7 @@ class BaseSpider(scrapy.Spider): | |
|
||
MAX_SAMPLE = 10 | ||
MAX_RELEASES_PER_PACKAGE = 100 | ||
VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S', 'year-month': '%Y-%m'} | ||
VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S'} | ||
|
||
ocds_version = '1.1' | ||
date_format = 'date' | ||
|
@@ -115,15 +116,18 @@ def from_crawler(cls, crawler, *args, **kwargs): | |
if not spider.from_date: | ||
# Default to `default_from_date` class attribute. | ||
spider.from_date = spider.default_from_date | ||
if not spider.until_date: | ||
# Default to today. | ||
spider.until_date = datetime.now().strftime(spider.date_format) | ||
try: | ||
spider.from_date = datetime.strptime(spider.from_date, spider.date_format) | ||
if isinstance(spider.from_date, str): | ||
# convert to date format, if needed | ||
spider.from_date = datetime.strptime(spider.from_date, spider.date_format) | ||
except ValueError as e: | ||
raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e)) | ||
|
||
if not spider.until_date: | ||
spider.until_date = cls.get_default_until_date(spider) | ||
try: | ||
spider.until_date = datetime.strptime(spider.until_date, spider.date_format) | ||
if isinstance(spider.until_date, str): | ||
spider.until_date = datetime.strptime(spider.until_date, spider.date_format) | ||
except ValueError as e: | ||
raise SpiderArgumentError('spider argument until_date: invalid date value: {}'.format(e)) | ||
|
||
|
@@ -272,6 +276,10 @@ def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None | |
if self.sample: | ||
break | ||
|
||
@classmethod
def get_default_until_date(cls, spider):
    """
    Return the value to use for ``until_date`` when the spider has none set.

    :param spider: the spider being configured (not used by this implementation)
    :returns: the current local date and time
    """
    current_time = datetime.now()
    return current_time
|
||
|
||
class SimpleSpider(BaseSpider): | ||
""" | ||
|
@@ -435,3 +443,77 @@ def next_link(self, response, **kwargs): | |
|
||
if response.meta['depth'] == 0: | ||
raise MissingNextLinkError('next link not found on the first page: {}'.format(response.url)) | ||
|
||
|
||
class PeriodicalSpider(SimpleSpider): | ||
""" | ||
This class helps to crawl urls that receive a year (YYYY) or a month and year (YYYY-mm) as parameters. To use it: | ||
|
||
1. Extend from ``PeriodicalSpider``. | ||
1. Set the ``date_format`` attribute if it's not defined already. Valid values are 'year' and 'year-month'. | ||
1. Set a ``default_from_date`` year or month-year. | ||
1. Optionally, set a ``default_until_date`` year or month-year. If absent, ``default_until_date`` defaults to the | ||
current year or month-year. | ||
1. Set the ``pattern`` parameter with the url to retrieve. | ||
1. Implement the `get_formatter` method. | ||
|
||
The ``pattern`` should include a placeholder for a year or month-year parameter. With the year parameter, an int is | ||
passed. If the year-month parameter is used, a ``Date`` instance is passed. Example: | ||
|
||
.. code-block:: python | ||
|
||
pattern = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}' | ||
|
||
When the ``sample`` option is used, the latest year or month of data is retrieved. | ||
""" | ||
VALID_DATE_FORMATS = {'year': '%Y', 'year-month': '%Y-%m'} | ||
|
||
def __init__(self, *args, **kwargs): | ||
self.date_format_key = self.date_format | ||
There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more.
@romifz is this needed? Can't we use […]?
There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more.
In this case yes, because internally […] |
||
super().__init__(*args, **kwargs) | ||
|
||
if hasattr(self, 'start_requests_callback'): | ||
self.start_requests_callback = getattr(self, self.start_requests_callback) | ||
else: | ||
self.start_requests_callback = self.parse | ||
|
||
@classmethod
def from_crawler(cls, crawler, from_date=None, *args, **kwargs):
    """
    Create the spider, using the class's ``default_from_date`` when no ``from_date`` argument is given.

    :param crawler: the crawler, forwarded unchanged to the parent implementation
    :param from_date: the ``from_date`` spider argument; any falsy value is replaced by ``cls.default_from_date``
    :returns: the spider returned by the parent ``from_crawler``
    """
    if not from_date:
        from_date = cls.default_from_date

    # Zero-argument super() continues the lookup after *this* class in the MRO. The previous
    # super(SimpleSpider, cls) started after SimpleSpider, so a from_crawler defined on SimpleSpider
    # (or on a mixin placed before its parent) would have been silently skipped.
    return super().from_crawler(crawler, *args, from_date=from_date, **kwargs)
|
||
@classmethod
def get_default_until_date(cls, spider):
    """
    Return the spider's ``default_until_date`` if it has a truthy one, otherwise the current date and time.

    :param spider: the spider whose optional ``default_until_date`` attribute is consulted
    """
    configured = getattr(spider, 'default_until_date', None)
    if configured:
        return configured
    return datetime.today()
|
||
def start_requests(self):
    """
    Yield a request for every URL that ``build_urls`` produces from ``pattern``, for each date in the range
    running from ``from_date`` to ``until_date``.

    The range comes from ``util.date_range_by_year`` when ``date_format_key`` is 'year', and from
    ``util.date_range_by_month`` otherwise. If ``sample`` is set, ``until_date`` is used as both ends of the range.
    """
    first, last = self.from_date, self.until_date
    if self.sample:
        # Sampling: collapse the range onto its final period.
        first = last

    if self.date_format_key == 'year':
        periods = util.date_range_by_year(first.year, last.year)
    else:
        periods = util.date_range_by_month(first, last)

    for period in periods:
        for url in self.build_urls(self.pattern, period):
            yield self.build_request(url, self.get_formatter(), callback=self.start_requests_callback)
|
||
@abstractmethod
def get_formatter(self):
    """
    Return the formatter that ``start_requests`` passes to ``build_request`` for each generated URL.

    Subclasses are expected to implement this method; this placeholder returns ``None``.
    """
|
||
def build_urls(self, pattern, date):
    """
    Yield the URL(s) to request for ``date``: here, the single URL obtained by formatting ``pattern`` with it.

    :param str pattern: a format string with a placeholder for the date
    :param date: the value inserted into the pattern
    """
    url = pattern.format(date)
    yield url
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,51 +1,40 @@ | ||
from datetime import date | ||
|
||
from kingfisher_scrapy.base_spider import SimpleSpider | ||
from kingfisher_scrapy.util import date_range_by_month, parameters | ||
from kingfisher_scrapy.base_spider import PeriodicalSpider | ||
from kingfisher_scrapy.util import parameters | ||
|
||
|
||
class ScotlandBase(SimpleSpider): | ||
default_from_date = '2019-01' | ||
class ScotlandBase(PeriodicalSpider): | ||
date_format = 'year-month' | ||
default_from_date = date(date.today().year - 1, date.today().month, 1) | ||
|
||
@classmethod | ||
def from_crawler(cls, crawler, from_date=None, *args, **kwargs): | ||
if not from_date: | ||
from_date = cls.default_from_date | ||
notice_types = [ | ||
1, # OJEU - F1 - Prior Information Notice | ||
2, # OJEU - F2 - Contract Notice | ||
3, # OJEU - F3 - Contract Award Notice | ||
4, # OJEU - F4 - Prior Information Notice(Utilities) | ||
5, # OJEU - F5 - Contract Notice(Utilities) | ||
6, # OJEU - F6 - Contract Award Notice(Utilities) | ||
7, # OJEU - F7 - Qualification Systems(Utilities) | ||
12, # OJEU - F12 - Design Contest Notice | ||
13, # OJEU - F13 - Results Of Design Contest | ||
14, # OJEU - F14 - Corrigendum | ||
15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice | ||
20, # OJEU - F20 - Modification Notice | ||
21, # OJEU - F21 - Social And other Specific Services(Public Contracts) | ||
22, # OJEU - F22 - Social And other Specific Services(Utilities) | ||
23, # OJEU - F23 - Social And other Specific Services(Concessions) | ||
24, # OJEU - F24 - Concession Notice | ||
25, # OJEU - F25 - Concession Award Notice | ||
101, # Site Notice - Website Contract Notice | ||
102, # Site Notice - Website Prior Information Notice | ||
103, # Site Notice - Website Contract Award Notice | ||
104, # Site Notice - Quick Quote Award | ||
] | ||
|
||
return super().from_crawler(crawler, from_date=from_date, *args, **kwargs) | ||
def build_urls(self, pattern, date):
    """
    Yield one URL per entry in ``notice_types``, formatting ``pattern`` with the date and the notice type, in
    that order.
    """
    yield from (pattern.format(date, kind) for kind in self.notice_types)
|
||
def start_requests(self): | ||
notice_types = [ | ||
1, # OJEU - F1 - Prior Information Notice | ||
2, # OJEU - F2 - Contract Notice | ||
3, # OJEU - F3 - Contract Award Notice | ||
4, # OJEU - F4 - Prior Information Notice(Utilities) | ||
5, # OJEU - F5 - Contract Notice(Utilities) | ||
6, # OJEU - F6 - Contract Award Notice(Utilities) | ||
7, # OJEU - F7 - Qualification Systems(Utilities) | ||
12, # OJEU - F12 - Design Contest Notice | ||
13, # OJEU - F13 - Results Of Design Contest | ||
14, # OJEU - F14 - Corrigendum | ||
15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice | ||
20, # OJEU - F20 - Modification Notice | ||
21, # OJEU - F21 - Social And other Specific Services(Public Contracts) | ||
22, # OJEU - F22 - Social And other Specific Services(Utilities) | ||
23, # OJEU - F23 - Social And other Specific Services(Concessions) | ||
24, # OJEU - F24 - Concession Notice | ||
25, # OJEU - F25 - Concession Award Notice | ||
101, # Site Notice - Website Contract Notice | ||
102, # Site Notice - Website Prior Information Notice | ||
103, # Site Notice - Website Contract Award Notice | ||
104, # Site Notice - Quick Quote Award | ||
] | ||
|
||
for year_month in date_range_by_month(self.from_date, date.today()): | ||
date_string = year_month.strftime('%m-%Y') | ||
for notice_type in notice_types: | ||
yield self.build_request( | ||
self.url.format(date_string, notice_type), | ||
formatter=parameters('noticeType', 'dateFrom') | ||
) | ||
if self.sample: | ||
return | ||
def get_formatter(self):
    """
    Return the formatter built by ``parameters`` from the 'noticeType' and 'dateFrom' names.
    """
    formatter = parameters('noticeType', 'dateFrom')
    return formatter
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we remove 'year-month' from `base_spider`'s `VALID_DATE_FORMATS`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@romifz
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure. I think it should be possible to use the same format for filtering in other spiders, although I think we don't have such a case, do we?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, I think that all of the spiders which use 'year-month' extend from `PeriodicalSpider` now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, no problem, I'll remove the option.