Use Scrapy items #383

Merged 6 commits on May 27, 2020
19 changes: 3 additions & 16 deletions docs/writing-spiders.rst
@@ -42,7 +42,7 @@ Then, for *each* URL:

2. GET requests are passed to the crawler engine to be executed. The response is passed back to the ``parse`` method in the spider.
3. The ``parse`` method must check the response status code! If it is not 200, this must be reported by yielding a block of information.
4. If the ``parse`` method has got a file it wants to save, it must call ``save_response_to_disk`` to do so! It should then yield a block of information.
4. If the ``parse`` method has got a file it wants to save, it must yield something, typically using ``build_file_from_response``.
5. The ``parse`` method can then yield further requests for processing.
6. The blocks of information are passed to the pipeline, which fires them all off to the Kingfisher Process API.

@@ -66,24 +66,11 @@ Here is a sample:
# We must check the response code
if response.status == 200:
# It was a success!
# We must call to save to the disk
self.save_response_to_disk(response, response.request.meta['kf_filename'])
# We must send some information about this success
yield {
'success': True,
'file_name': response.request.meta['kf_filename'],
"data_type": "release_package",
"url": response.request.url,
}
yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type='release_package')
else:
# It was a failure :-(
# We must send some information about this failure
yield {
'success': False,
'file_name': response.request.meta['kf_filename'],
"url": response.request.url,
"errors": {"http_code": response.status}
}
yield self.build_file_error_from_response(response)
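
Because the diff view above flattens indentation, here is the same new-style branch reassembled as it would sit in a spider's ``parse`` method (a sketch; the enclosing ``def parse(self, response):`` line and the ``release_package`` data type are taken from the surrounding sample):

    def parse(self, response):
        # We must check the response code
        if response.status == 200:
            # Success: build and yield a File item from the response body.
            yield self.build_file_from_response(
                response, response.request.meta['kf_filename'], data_type='release_package')
        else:
            # Failure: yield a FileError item recording the HTTP status code.
            yield self.build_file_error_from_response(response)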

Spider properties
-----------------
54 changes: 26 additions & 28 deletions kingfisher_scrapy/base_spider.py
@@ -9,6 +9,7 @@

from kingfisher_scrapy import util
from kingfisher_scrapy.exceptions import SpiderArgumentError
from kingfisher_scrapy.items import File, FileError, FileItem


class BaseSpider(scrapy.Spider):
@@ -91,38 +92,45 @@ def get_start_time(self, format):
"""
return self.crawler.stats.get_value('start_time').strftime(format)

def save_response_to_disk(self, response, filename, data_type=None, encoding='utf-8', post_to_api=True):
def build_file_from_response(self, response, filename, data_type=None, encoding='utf-8', post_to_api=True):
"""
Returns an item to yield, based on the response to a request.
"""
return self.save_data_to_disk(response.body, filename, response.request.url, data_type, encoding,
post_to_api)
return self.build_file(response.body, filename, response.request.url, data_type, encoding, post_to_api)

def save_data_to_disk(self, data, filename, url=None, data_type=None, encoding='utf-8', post_to_api=True):
def build_file(self, data, filename, url=None, data_type=None, encoding='utf-8', post_to_api=True):
"""
Returns an item to yield.
"""
return {
'success': True,
return File({
'file_name': filename,
'data': data,
'data_type': data_type,
'url': url,
'encoding': encoding,
'post_to_api': post_to_api,
}
})

def _build_file_item(self, number, data, data_type, url, encoding, file_name):
return {
'success': True,
def build_file_item(self, number, data, data_type, url, encoding, file_name):
return FileItem({
'number': number,
'file_name': file_name,
'data': data,
'data_type': data_type,
'url': url,
'encoding': encoding,
'post_to_api': True,
})

def build_file_error_from_response(self, response, **kwargs):
file_error = {
'url': response.request.url,
'errors': {'http_code': response.status},
}
if 'kf_filename' in response.request.meta:
file_error['file_name'] = response.request.meta['kf_filename']
file_error.update(kwargs)
return FileError(file_error)

def _get_package_metadata(self, f, skip_key):
"""
@@ -144,7 +152,7 @@ def parse_json_lines(self, f, data_type, url, encoding='utf-8', file_name='data.
break
if isinstance(line, bytes):
line = line.decode(encoding=encoding)
yield self._build_file_item(number, line, data_type, url, encoding, file_name)
yield self.build_file_item(number, line, data_type, url, encoding, file_name)

def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases',
file_name='data.json'):
@@ -158,7 +166,7 @@ def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8',
for number, items in enumerate(util.grouper(ijson.items(f_list, '{}.item'.format(array_field_name)), size), 1):
package[array_field_name] = filter(None, items)
data = json.dumps(package, default=util.default)
yield self._build_file_item(number, data, data_type, url, encoding, file_name)
yield self.build_file_item(number, data, data_type, url, encoding, file_name)
if self.sample:
break

@@ -187,7 +195,7 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
if response.status == 200:
if file_format:
filename = '{}.zip'.format(hashlib.md5(response.url.encode('utf-8')).hexdigest())
self.save_response_to_disk(response, filename, post_to_api=False)
self.build_file_from_response(response, filename, post_to_api=False)

zip_file = ZipFile(BytesIO(response.body))
for finfo in zip_file.infolist():
@@ -205,15 +213,10 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
yield from self.parse_json_array(package, data, data_type, response.request.url,
encoding=encoding, file_name=filename)
else:
yield self.save_data_to_disk(data.read(), filename, data_type=data_type, url=response.request.url,
encoding=encoding)
yield self.build_file(data.read(), filename, data_type=data_type, url=response.request.url,
encoding=encoding)
else:
yield {
'success': False,
'file_name': response.request.meta['kf_filename'],
'url': response.request.url,
'errors': {'http_code': response.status}
}
yield self.build_file_error_from_response(response)


class LinksSpider(BaseSpider):
@@ -235,14 +238,9 @@ def next_link(response):
def parse_next_link(self, response, data_type):
if response.status == 200:

yield self.save_response_to_disk(response, response.request.meta['kf_filename'], data_type=data_type)
yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type=data_type)

if not self.sample:
yield self.next_link(response)
else:
yield {
'success': False,
'file_name': response.request.meta['kf_filename'],
'url': response.request.url,
'errors': {'http_code': response.status}
}
yield self.build_file_error_from_response(response)
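
For example, assuming a 404 response for a hypothetical URL whose request carried ``meta={'kf_filename': 'list.json'}``, ``build_file_error_from_response`` would return an item roughly equivalent to:

    from kingfisher_scrapy.items import FileError

    # Roughly what build_file_error_from_response returns in that (hypothetical) case.
    error = FileError({
        'file_name': 'list.json',
        'url': 'https://example.com/list.json',  # hypothetical URL
        'errors': {'http_code': 404},
    })

Keyword arguments passed by callers (such as ``file_name='list.json'`` in the list-page spiders below) are merged into the item last, so they override the values derived from the response.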
23 changes: 11 additions & 12 deletions kingfisher_scrapy/extensions.py
@@ -1,13 +1,15 @@
# https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension

import json
import os

from scrapy import signals
from scrapy.exceptions import NotConfigured

from kingfisher_scrapy.items import File, FileError, FileItem
from kingfisher_scrapy.kingfisher_process import Client


# https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
class KingfisherFilesStore:
def __init__(self, directory):
self.directory = directory
@@ -21,12 +23,11 @@ def from_crawler(cls, crawler):

def item_scraped(self, item, spider):
"""
Writes the item's data to the filename in the crawl's directory.
If the item is a file, writes its data to the filename in the crawl's directory.

Writes a ``<filename>.fileinfo`` metadata file in the crawl's directory, and returns a dict with the metadata.
"""
# Skip failures and parts of files.
if not item['success'] or 'number' in item:
if not isinstance(item, File):
return

# The crawl's relative directory, in the format `<spider_name>[_sample]/<YYMMDD_HHMMSS>`.
@@ -119,14 +120,18 @@ def item_scraped(self, item, spider):
'url': item['url'],
}

if item['success']:
if isinstance(item, FileError):
data['errors'] = json.dumps(item['errors'])

self._request(item, spider, 'create_file_error', data, name='File Errors API')
else:
data['data_type'] = item['data_type']
data['encoding'] = item.get('encoding', 'utf-8')
if spider.note:
data['collection_note'] = spider.note

# File Item
if 'number' in item:
if isinstance(item, FileItem):
data['number'] = item['number']
data['data'] = item['data']

@@ -145,12 +150,6 @@ def item_scraped(self, item, spider):

self._request(item, spider, 'create_file', data, files)

# File Error
else:
data['errors'] = json.dumps(item['errors'])

self._request(item, spider, 'create_file_error', data, name='File Errors API')

def _request(self, item, spider, method, *args, name='API'):
response = getattr(self.client, method)(*args)
if not response.ok:
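
The net change in both extensions is that branching now keys on the item's class instead of the old ``success`` flag and ``number`` key; a condensed sketch of that dispatch (the real methods also build and post the API payload):

    from kingfisher_scrapy.items import File, FileError, FileItem

    def route(item):
        # Replaces checks like `if not item['success']` and `if 'number' in item`.
        if isinstance(item, FileError):
            return 'create_file_error'          # File Errors API
        if isinstance(item, FileItem):
            return 'create_file (one segment)'  # carries number and data
        if isinstance(item, File):
            return 'create_file (whole file)'   # also written to disk by KingfisherFilesStore
        return 'ignored'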
36 changes: 26 additions & 10 deletions kingfisher_scrapy/items.py
@@ -1,14 +1,30 @@
# -*- coding: utf-8 -*-
import scrapy

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy
class File(scrapy.Item):
file_name = scrapy.Field()
url = scrapy.Field()
data = scrapy.Field()
data_type = scrapy.Field()
encoding = scrapy.Field()
post_to_api = scrapy.Field()

# Added by extensions.
path = scrapy.Field()
files_store = scrapy.Field()


class FileItem(scrapy.Item):
number = scrapy.Field()
file_name = scrapy.Field()
url = scrapy.Field()
data = scrapy.Field()
data_type = scrapy.Field()
encoding = scrapy.Field()
post_to_api = scrapy.Field()


class KingfisherScrapyItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
class FileError(scrapy.Item):
file_name = scrapy.Field()
url = scrapy.Field()
errors = scrapy.Field()
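
``scrapy.Item`` subclasses behave like dicts restricted to their declared fields, so a stray key now fails loudly instead of silently flowing through the pipeline; a small illustration (values are made up):

    from kingfisher_scrapy.items import File

    file = File({
        'file_name': 'data.json',
        'url': 'https://example.com/data.json',  # illustrative values
        'data': b'{"releases": []}',
        'data_type': 'release_package',
        'encoding': 'utf-8',
        'post_to_api': True,
    })
    assert file['data_type'] == 'release_package'

    # Undeclared fields are rejected, which plain dicts never did:
    # File({'success': True}) raises KeyError('File does not support field: success')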
26 changes: 8 additions & 18 deletions kingfisher_scrapy/spiders/afghanistan_records.py
@@ -32,17 +32,12 @@ def parse_list(self, response):
callback=self.parse_record
)
else:
yield {
'success': False,
'file_name': 'list.json',
"url": response.request.url,
"errors": {"http_code": response.status}
}
yield self.build_file_error_from_response(response, file_name='list.json')

def parse_record(self, response):
if response.status == 200:

yield self.save_response_to_disk(response, response.request.meta['kf_filename'], data_type="record")
yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type="record")

elif response.status == 429:
self.crawler.engine.pause()
@@ -51,15 +46,10 @@ def parse_record(self, response):
url = response.request.url
# This is dangerous as we might get stuck in a loop here if we always get a 429 response. Try this for now.
yield scrapy.Request(
url=url,
meta={'kf_filename': url.split('/')[-1]+'.json'},
callback=self.parse_record,
dont_filter=True,
)
url=url,
meta={'kf_filename': url.split('/')[-1]+'.json'},
callback=self.parse_record,
dont_filter=True,
)
else:
yield {
'success': False,
'file_name': response.request.meta['kf_filename'],
"url": response.request.url,
"errors": {"http_code": response.status}
}
yield self.build_file_error_from_response(response)
23 changes: 4 additions & 19 deletions kingfisher_scrapy/spiders/afghanistan_releases.py
@@ -31,12 +31,7 @@ def parse_list(self, response):
callback=self.parse_release_list
)
else:
yield {
'success': False,
'file_name': 'list.json',
"url": response.request.url,
"errors": {"http_code": response.status}
}
yield self.build_file_error_from_response(response, file_name='list.json')

def parse_release_list(self, response):
if response.status == 200:
@@ -64,17 +59,12 @@ def parse_release_list(self, response):
dont_filter=True,
)
else:
yield {
'success': False,
'file_name': response.request.meta['kf_filename'],
"url": response.request.url,
"errors": {"http_code": response.status}
}
yield self.build_file_error_from_response(response)

def parse_release(self, response):
if response.status == 200:

yield self.save_response_to_disk(response, response.request.meta['kf_filename'], data_type="release")
yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type="release")

elif response.status == 429:
self.crawler.engine.pause()
@@ -89,9 +79,4 @@ def parse_release(self, response):
dont_filter=True,
)
else:
yield {
'success': False,
'file_name': response.request.meta['kf_filename'],
"url": response.request.url,
"errors": {"http_code": response.status}
}
yield self.build_file_error_from_response(response)
7 changes: 1 addition & 6 deletions kingfisher_scrapy/spiders/argentina_buenos_aires.py
@@ -33,12 +33,7 @@ def parse_list(self, response):
if resource['format'].upper() == 'JSON':
yield scrapy.Request(url=resource['url'])
else:
yield {
'success': False,
'file_name': 'list.json',
'url': response.request.url,
'errors': {'http_code': response.status}
}
yield self.build_file_error_from_response(response, file_name='list.json')

def parse(self, response):
yield from self.parse_zipfile(response, 'release_package', file_format='release_package')
9 changes: 2 additions & 7 deletions kingfisher_scrapy/spiders/argentina_vialidad.py
@@ -13,11 +13,6 @@ def start_requests(self):

def parse(self, response):
if response.status == 200:
yield self.save_response_to_disk(response, 'all.json', data_type='release_package_list')
yield self.build_file_from_response(response, 'all.json', data_type='release_package_list')
else:
yield {
'success': False,
'file_name': 'all.json',
"url": response.request.url,
"errors": {"http_code": response.status}
}
yield self.build_file_error_from_response(response, file_name='all.json')