-
Notifications
You must be signed in to change notification settings - Fork 12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add PeriodicalSpider #467
Add PeriodicalSpider #467
Changes from 11 commits
4f39bd9
751f391
0303c64
60c29ff
86518da
100f2a5
11d2526
8b884a9
8cdb090
77f6c06
7efb28d
726b8d0
be0eba6
9bd5665
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
@@ -1,5 +1,6 @@ | ||||||||||||
import json | ||||||||||||
import os | ||||||||||||
from abc import abstractmethod | ||||||||||||
from datetime import datetime | ||||||||||||
from io import BytesIO | ||||||||||||
from zipfile import ZipFile | ||||||||||||
|
@@ -58,7 +59,7 @@ class BaseSpider(scrapy.Spider): | |||||||||||
|
||||||||||||
MAX_SAMPLE = 10 | ||||||||||||
MAX_RELEASES_PER_PACKAGE = 100 | ||||||||||||
VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S', 'year-month': '%Y-%m'} | ||||||||||||
VALID_DATE_FORMATS = {'date': '%Y-%m-%d', 'datetime': '%Y-%m-%dT%H:%M:%S'} | ||||||||||||
|
||||||||||||
ocds_version = '1.1' | ||||||||||||
date_format = 'date' | ||||||||||||
|
@@ -115,17 +116,25 @@ def from_crawler(cls, crawler, *args, **kwargs): | |||||||||||
if not spider.from_date: | ||||||||||||
# Default to `default_from_date` class attribute. | ||||||||||||
spider.from_date = spider.default_from_date | ||||||||||||
if isinstance(spider.from_date, str): | ||||||||||||
# convert to date format, if needed | ||||||||||||
spider.from_date = datetime.strptime(spider.from_date, spider.date_format) | ||||||||||||
else: | ||||||||||||
try: | ||||||||||||
spider.from_date = datetime.strptime(spider.from_date, spider.date_format) | ||||||||||||
except ValueError as e: | ||||||||||||
raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e)) | ||||||||||||
|
||||||||||||
if not spider.until_date: | ||||||||||||
# Default to today. | ||||||||||||
spider.until_date = datetime.now().strftime(spider.date_format) | ||||||||||||
try: | ||||||||||||
spider.from_date = datetime.strptime(spider.from_date, spider.date_format) | ||||||||||||
except ValueError as e: | ||||||||||||
raise SpiderArgumentError('spider argument from_date: invalid date value: {}'.format(e)) | ||||||||||||
try: | ||||||||||||
spider.until_date = datetime.strptime(spider.until_date, spider.date_format) | ||||||||||||
except ValueError as e: | ||||||||||||
raise SpiderArgumentError('spider argument until_date: invalid date value: {}'.format(e)) | ||||||||||||
spider.until_date = cls.get_default_until_date(spider) | ||||||||||||
if isinstance(spider.until_date, str): | ||||||||||||
# convert to date format, if needed | ||||||||||||
spider.until_date = datetime.strptime(spider.until_date, spider.date_format) | ||||||||||||
else: | ||||||||||||
try: | ||||||||||||
spider.until_date = datetime.strptime(spider.until_date, spider.date_format) | ||||||||||||
except ValueError as e: | ||||||||||||
raise SpiderArgumentError('spider argument until_date: invalid date value: {}'.format(e)) | ||||||||||||
|
||||||||||||
return spider | ||||||||||||
|
||||||||||||
|
@@ -272,6 +281,10 @@ def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None | |||||||||||
if self.sample: | ||||||||||||
break | ||||||||||||
|
||||||||||||
@classmethod | ||||||||||||
def get_default_until_date(cls, spider): | ||||||||||||
return datetime.now() | ||||||||||||
|
||||||||||||
|
||||||||||||
class SimpleSpider(BaseSpider): | ||||||||||||
""" | ||||||||||||
|
@@ -435,3 +448,82 @@ def next_link(self, response, **kwargs): | |||||||||||
|
||||||||||||
if response.meta['depth'] == 0: | ||||||||||||
raise MissingNextLinkError('next link not found on the first page: {}'.format(response.url)) | ||||||||||||
|
||||||||||||
|
||||||||||||
class PeriodicalSpider(SimpleSpider): | ||||||||||||
""" | ||||||||||||
This class helps to crawl urls that receive a year (YYYY) or a month and year (YYYY-mm) as parameters. To use it: | ||||||||||||
|
||||||||||||
1. Extend from ``PeriodicalSpider``. | ||||||||||||
1. Set the ``date_format`` attribute if it's not defined already. Valid values are 'year' and 'year-month'. | ||||||||||||
1. Set a ``default_from_date`` year or month-year. | ||||||||||||
1. Optionally, set a ``default_until_date`` year or month-year. If absent, ``default_until_date`` defaults to the | ||||||||||||
current year or month-year. | ||||||||||||
1. Set the ``pattern`` parameter with the url to retrieve. | ||||||||||||
1. Implement the ``get_formatter`` method. | ||||||||||||
|
||||||||||||
The ``pattern`` should include a placeholder for a year or month-year parameter. With the year parameter, an int is | ||||||||||||
passed. If the year-month parameter is used, a ``Date`` instance is passed. Example: | ||||||||||||
|
||||||||||||
.. code-block:: python | ||||||||||||
|
||||||||||||
pattern = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}' | ||||||||||||
|
||||||||||||
When the ``sample`` option is used, the latest year or month of data is retrieved. | ||||||||||||
""" | ||||||||||||
VALID_DATE_FORMATS = {'year': '%Y', 'year-month': '%Y-%m'} | ||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we remove 'year-month' from the base class's VALID_DATE_FORMATS? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure. I think it should be possible to use the same format for filtering in other spiders, although I think we don't have such a case, do we? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, I think that all of the spiders which use 'year-month' extend from PeriodicalSpider. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, no problem, I'll remove the option. |
||||||||||||
|
||||||||||||
def __init__(self, *args, **kwargs): | ||||||||||||
self.date_format_key = self.date_format | ||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @romifz is this needed? can't we use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In this case yes, because internally |
||||||||||||
super().__init__(*args, **kwargs) | ||||||||||||
|
||||||||||||
if hasattr(self, 'start_requests_callback'): | ||||||||||||
self.start_requests_callback = getattr(self, self.start_requests_callback) | ||||||||||||
else: | ||||||||||||
self.start_requests_callback = self.parse | ||||||||||||
|
||||||||||||
@classmethod | ||||||||||||
def from_crawler(cls, crawler, *args, **kwargs): | ||||||||||||
spider = super(SimpleSpider, cls).from_crawler(crawler, *args, **kwargs) | ||||||||||||
|
||||||||||||
if not spider.from_date: | ||||||||||||
spider.from_date = spider.default_from_date | ||||||||||||
if isinstance(spider.from_date, str): | ||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we are already doing this in the base class, aren't we? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, but the conversion in the base class happens only if the user specifies There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you can do something like kingfisher-collect/kingfisher_scrapy/spiders/paraguay_dncp_base.py Lines 39 to 43 in 726b8d0
to avoid the validation/conversion duplication There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This would have the same issue as the previous comment, a bad configuration in the spider subclass may raise a SpiderArgumentError. I'm thinking that the best solution would be to have a class property to indicate to the base class that we need a time interval to be set, like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think that as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's fair. |
||||||||||||
spider.from_date = datetime.strptime(spider.from_date, spider.date_format) | ||||||||||||
spider.until_date = cls.get_default_until_date(spider) | ||||||||||||
if isinstance(spider.until_date, str): | ||||||||||||
spider.until_date = datetime.strptime(spider.until_date, spider.date_format) | ||||||||||||
|
||||||||||||
return spider | ||||||||||||
|
||||||||||||
@classmethod | ||||||||||||
def get_default_until_date(cls, spider): | ||||||||||||
if hasattr(spider, 'default_until_date') and spider.default_until_date: | ||||||||||||
return spider.default_until_date | ||||||||||||
else: | ||||||||||||
return datetime.today() | ||||||||||||
|
||||||||||||
def start_requests(self): | ||||||||||||
|
||||||||||||
start = self.from_date | ||||||||||||
|
||||||||||||
stop = self.until_date | ||||||||||||
|
||||||||||||
if self.sample: | ||||||||||||
start = stop | ||||||||||||
|
||||||||||||
if self.date_format_key == 'year': | ||||||||||||
date_range = util.date_range_by_year(start.year, stop.year) | ||||||||||||
else: | ||||||||||||
date_range = util.date_range_by_month(start, stop) | ||||||||||||
|
||||||||||||
for date in date_range: | ||||||||||||
for url in self.build_urls(self.pattern, date): | ||||||||||||
yield self.build_request(url, self.get_formatter(), callback=self.start_requests_callback) | ||||||||||||
|
||||||||||||
@abstractmethod | ||||||||||||
def get_formatter(self): | ||||||||||||
pass | ||||||||||||
|
||||||||||||
def build_urls(self, pattern, date): | ||||||||||||
yield pattern.format(date) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,51 +1,42 @@ | ||
from datetime import date | ||
|
||
from kingfisher_scrapy.base_spider import SimpleSpider | ||
from kingfisher_scrapy.util import date_range_by_month, parameters | ||
from kingfisher_scrapy.base_spider import PeriodicalSpider | ||
from kingfisher_scrapy.util import parameters | ||
|
||
|
||
class ScotlandBase(SimpleSpider): | ||
class ScotlandBase(PeriodicalSpider): | ||
default_from_date = '2019-01' | ||
date_format = 'year-month' | ||
default_until_date = date.today() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the base class already does this, doesn't it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, we can remove this. |
||
default_from_date = date(default_until_date.year - 1, default_until_date.month, 1) | ||
|
||
@classmethod | ||
def from_crawler(cls, crawler, from_date=None, *args, **kwargs): | ||
if not from_date: | ||
from_date = cls.default_from_date | ||
notice_types = [ | ||
1, # OJEU - F1 - Prior Information Notice | ||
2, # OJEU - F2 - Contract Notice | ||
3, # OJEU - F3 - Contract Award Notice | ||
4, # OJEU - F4 - Prior Information Notice(Utilities) | ||
5, # OJEU - F5 - Contract Notice(Utilities) | ||
6, # OJEU - F6 - Contract Award Notice(Utilities) | ||
7, # OJEU - F7 - Qualification Systems(Utilities) | ||
12, # OJEU - F12 - Design Contest Notice | ||
13, # OJEU - F13 - Results Of Design Contest | ||
14, # OJEU - F14 - Corrigendum | ||
15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice | ||
20, # OJEU - F20 - Modification Notice | ||
21, # OJEU - F21 - Social And other Specific Services(Public Contracts) | ||
22, # OJEU - F22 - Social And other Specific Services(Utilities) | ||
23, # OJEU - F23 - Social And other Specific Services(Concessions) | ||
24, # OJEU - F24 - Concession Notice | ||
25, # OJEU - F25 - Concession Award Notice | ||
101, # Site Notice - Website Contract Notice | ||
102, # Site Notice - Website Prior Information Notice | ||
103, # Site Notice - Website Contract Award Notice | ||
104, # Site Notice - Quick Quote Award | ||
] | ||
|
||
return super().from_crawler(crawler, from_date=from_date, *args, **kwargs) | ||
def build_urls(self, pattern, date): | ||
for notice_type in self.notice_types: | ||
yield pattern.format(date, notice_type) | ||
|
||
def start_requests(self): | ||
notice_types = [ | ||
1, # OJEU - F1 - Prior Information Notice | ||
2, # OJEU - F2 - Contract Notice | ||
3, # OJEU - F3 - Contract Award Notice | ||
4, # OJEU - F4 - Prior Information Notice(Utilities) | ||
5, # OJEU - F5 - Contract Notice(Utilities) | ||
6, # OJEU - F6 - Contract Award Notice(Utilities) | ||
7, # OJEU - F7 - Qualification Systems(Utilities) | ||
12, # OJEU - F12 - Design Contest Notice | ||
13, # OJEU - F13 - Results Of Design Contest | ||
14, # OJEU - F14 - Corrigendum | ||
15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice | ||
20, # OJEU - F20 - Modification Notice | ||
21, # OJEU - F21 - Social And other Specific Services(Public Contracts) | ||
22, # OJEU - F22 - Social And other Specific Services(Utilities) | ||
23, # OJEU - F23 - Social And other Specific Services(Concessions) | ||
24, # OJEU - F24 - Concession Notice | ||
25, # OJEU - F25 - Concession Award Notice | ||
101, # Site Notice - Website Contract Notice | ||
102, # Site Notice - Website Prior Information Notice | ||
103, # Site Notice - Website Contract Award Notice | ||
104, # Site Notice - Quick Quote Award | ||
] | ||
|
||
for year_month in date_range_by_month(self.from_date, date.today()): | ||
date_string = year_month.strftime('%m-%Y') | ||
for notice_type in notice_types: | ||
yield self.build_request( | ||
self.url.format(date_string, notice_type), | ||
formatter=parameters('noticeType', 'dateFrom') | ||
) | ||
if self.sample: | ||
return | ||
def get_formatter(self): | ||
return parameters('noticeType', 'dateFrom') |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we remove the `else` statement and always convert the date to datetime, to avoid duplicating `spider.from_date = datetime.strptime(spider.from_date, spider.date_format)`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This if-else block is used to know whether `from_date` was passed as a parameter or not. If we eliminate the `else`, a bad configuration in `default_from_date` by a subclass may raise a `SpiderArgumentError`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should we remove the else then? (following the same logic that there are not going to be bad configurations in hardcoded default dates)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@romifz not sure if you have seen this!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, I'll check if we can remove the if-else block.