-
Notifications
You must be signed in to change notification settings - Fork 12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add PeriodicalSpider #467
Add PeriodicalSpider #467
Changes from 6 commits
4f39bd9
751f391
0303c64
60c29ff
86518da
100f2a5
11d2526
8b884a9
8cdb090
77f6c06
7efb28d
726b8d0
be0eba6
9bd5665
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,7 @@ | ||
import json | ||
import os | ||
from abc import abstractmethod | ||
from datetime import date as DateClass | ||
from datetime import datetime | ||
from io import BytesIO | ||
from zipfile import ZipFile | ||
|
@@ -107,8 +109,7 @@ def from_crawler(cls, crawler, *args, **kwargs): | |
# Default to `default_from_date` class attribute. | ||
spider.from_date = spider.default_from_date | ||
if not spider.until_date: | ||
# Default to today. | ||
spider.until_date = datetime.now().strftime(spider.date_format) | ||
spider.until_date = cls.get_default_until_date(spider) | ||
try: | ||
spider.from_date = datetime.strptime(spider.from_date, spider.date_format) | ||
except ValueError as e: | ||
|
@@ -266,6 +267,10 @@ def parse_json_array(self, f_package, f_list, *, file_name='data.json', url=None | |
if self.sample: | ||
break | ||
|
||
@classmethod | ||
def get_default_until_date(cls, spider): | ||
return datetime.now().strftime(spider.date_format) | ||
|
||
|
||
class SimpleSpider(BaseSpider): | ||
""" | ||
|
@@ -429,3 +434,77 @@ def next_link(self, response): | |
|
||
if response.meta['depth'] == 0: | ||
raise MissingNextLinkError('next link not found on the first page: {}'.format(response.url)) | ||
|
||
|
||
class PeriodicalSpider(SimpleSpider): | ||
""" | ||
This class helps to crawl URLs that receive a year (YYYY) or a year and month (YYYY-mm) as parameters. To use it: | ||
|
||
1. Extend from ``PeriodicalSpider``. | ||
1. Set the ``date_format`` attribute if it's not defined already. Valid values are 'year' and 'year-month'. | ||
1. Set a ``start`` year or year-month. | ||
1. Optionally, set a ``stop`` year or year-month. If absent, ``stop`` defaults to the current year or year-month. | ||
1. Set the ``pattern`` parameter with the url to retrieve. | ||
1. Implement the ``get_formatter`` method. | ||
|
||
The ``pattern`` should include a placeholder for a year or year-month parameter. With the year parameter, an int is | ||
passed. If the year-month parameter is used, a ``Date`` instance is passed. Example: | ||
|
||
.. code-block:: python | ||
|
||
url = 'http://comprasestatales.gub.uy/ocds/rss/{0.year:d}/{0.month:02d}' | ||
|
||
When the ``sample`` option is used, the latest year or month of data is retrieved. | ||
""" | ||
VALID_DATE_FORMATS = {'year': '%Y', 'year-month': '%Y-%m'} | ||
|
||
def __init__(self, *args, **kwargs): | ||
self.date_format_key = self.date_format | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @romifz is this needed? Can't we use There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In this case yes, because internally |
||
super().__init__(*args, **kwargs) | ||
|
||
if hasattr(self, 'start_requests_callback'): | ||
self.start_requests_callback = getattr(self, self.start_requests_callback) | ||
else: | ||
self.start_requests_callback = self.parse | ||
|
||
if not isinstance(self.start, DateClass): | ||
self.start = datetime.strptime(str(self.start), self.date_format) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This can always be a str instead of an integer, and use There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sure |
||
self.default_from_date = self.start.strftime(self.date_format) | ||
|
||
if hasattr(self, 'stop'): | ||
if not isinstance(self.stop, DateClass): | ||
self.stop = datetime.strptime(str(self.stop), self.date_format) | ||
else: | ||
self.stop = datetime.today() | ||
|
||
@classmethod | ||
def get_default_until_date(cls, spider): | ||
return spider.stop.strftime(spider.date_format) | ||
|
||
def start_requests(self): | ||
|
||
start = self.start if not self.exists('from_date') else self.from_date | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @romifz maybe we can use There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we can |
||
|
||
stop = self.stop if not self.exists('until_date') else self.until_date | ||
|
||
if self.sample: | ||
start = stop | ||
|
||
if self.date_format_key == 'year': | ||
date_range = util.date_range_by_year(start.year, stop.year) | ||
else: | ||
date_range = util.date_range_by_month(start, stop) | ||
|
||
for date in date_range: | ||
for url in self.build_urls(self.pattern, date): | ||
yield self.build_request(url, self.get_formatter(), callback=self.start_requests_callback) | ||
|
||
@abstractmethod | ||
def get_formatter(self): | ||
pass | ||
|
||
def build_urls(self, pattern, date): | ||
yield pattern.format(date) | ||
|
||
def exists(self, attr): | ||
return hasattr(self, attr) and getattr(self, attr) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,50 +1,41 @@ | ||
from datetime import date | ||
|
||
from kingfisher_scrapy.base_spider import SimpleSpider | ||
from kingfisher_scrapy.util import date_range_by_month, parameters | ||
from kingfisher_scrapy.base_spider import PeriodicalSpider | ||
from kingfisher_scrapy.util import parameters | ||
|
||
|
||
class ScotlandBase(SimpleSpider): | ||
class ScotlandBase(PeriodicalSpider): | ||
date_format = 'year-month' | ||
stop = date.today() | ||
start = date(stop.year - 1, stop.month, 1) | ||
|
||
def parse_requests(self, pattern): | ||
notice_types = [ | ||
1, # OJEU - F1 - Prior Information Notice | ||
2, # OJEU - F2 - Contract Notice | ||
3, # OJEU - F3 - Contract Award Notice | ||
4, # OJEU - F4 - Prior Information Notice(Utilities) | ||
5, # OJEU - F5 - Contract Notice(Utilities) | ||
6, # OJEU - F6 - Contract Award Notice(Utilities) | ||
7, # OJEU - F7 - Qualification Systems(Utilities) | ||
12, # OJEU - F12 - Design Contest Notice | ||
13, # OJEU - F13 - Results Of Design Contest | ||
14, # OJEU - F14 - Corrigendum | ||
15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice | ||
20, # OJEU - F20 - Modification Notice | ||
21, # OJEU - F21 - Social And other Specific Services(Public Contracts) | ||
22, # OJEU - F22 - Social And other Specific Services(Utilities) | ||
23, # OJEU - F23 - Social And other Specific Services(Concessions) | ||
24, # OJEU - F24 - Concession Notice | ||
25, # OJEU - F25 - Concession Award Notice | ||
101, # Site Notice - Website Contract Notice | ||
102, # Site Notice - Website Prior Information Notice | ||
103, # Site Notice - Website Contract Award Notice | ||
104, # Site Notice - Quick Quote Award | ||
] | ||
|
||
notice_types = [ | ||
1, # OJEU - F1 - Prior Information Notice | ||
2, # OJEU - F2 - Contract Notice | ||
3, # OJEU - F3 - Contract Award Notice | ||
4, # OJEU - F4 - Prior Information Notice(Utilities) | ||
5, # OJEU - F5 - Contract Notice(Utilities) | ||
6, # OJEU - F6 - Contract Award Notice(Utilities) | ||
7, # OJEU - F7 - Qualification Systems(Utilities) | ||
12, # OJEU - F12 - Design Contest Notice | ||
13, # OJEU - F13 - Results Of Design Contest | ||
14, # OJEU - F14 - Corrigendum | ||
15, # OJEU - F15 - Voluntary Ex Ante Transparency Notice | ||
20, # OJEU - F20 - Modification Notice | ||
21, # OJEU - F21 - Social And other Specific Services(Public Contracts) | ||
22, # OJEU - F22 - Social And other Specific Services(Utilities) | ||
23, # OJEU - F23 - Social And other Specific Services(Concessions) | ||
24, # OJEU - F24 - Concession Notice | ||
25, # OJEU - F25 - Concession Award Notice | ||
101, # Site Notice - Website Contract Notice | ||
102, # Site Notice - Website Prior Information Notice | ||
103, # Site Notice - Website Contract Award Notice | ||
104, # Site Notice - Quick Quote Award | ||
] | ||
def build_urls(self, pattern, date): | ||
for notice_type in self.notice_types: | ||
yield pattern.format(date, notice_type) | ||
|
||
now = date.today() | ||
if self.from_date: | ||
start = date(self.from_date.year, self.from_date.month, 1) | ||
else: | ||
start = date(now.year - 1, now.month, 1) | ||
if self.sample: | ||
start = now | ||
|
||
for d in date_range_by_month(start, now): | ||
date_string = '{:02d}-{:04d}'.format(d.month, d.year) | ||
for notice_type in notice_types: | ||
yield self.build_request( | ||
pattern.format(date_string, notice_type), | ||
formatter=parameters('noticeType', 'dateFrom') | ||
) | ||
def get_formatter(self): | ||
return parameters('noticeType', 'dateFrom') |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we remove 'year-month' from `base_spider` `VALID_DATE_FORMATS`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@romifz
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure. I think it should be possible to use the same format for filtering in other spiders, although I think we don't have such a case, do we?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, I think that all of the spiders which use 'year-month' extend from `PeriodicalSpider` now.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, no problem, I'll remove the option.