
Commit

Merge branch 'master' into 347-add_docstrings
aguilerapy committed May 28, 2020
2 parents e2269b1 + 4e59a97 commit 0f39a42
Showing 55 changed files with 301 additions and 595 deletions.
19 changes: 3 additions & 16 deletions docs/writing-spiders.rst
@@ -42,7 +42,7 @@ Then, for *each* URL:

2. GET requests are passed to the crawler engine to be executed. The response is passed back to the ``parse`` method in the spider.
3. The ``parse`` method must check the return status code! If it is not 200, this must be reported by yielding a block of information.
4. If the ``parse`` method has got a file it wants to save, it must call ``save_response_to_disk`` to do so! It should then yield a block of information.
4. If the ``parse`` method has got a file it wants to save, it must yield something, typically using ``build_file_from_response``.
5. The ``parse`` method can then yield further requests for processing.
6. The blocks of information are passed to the pipeline which fires it all off to the Kingfisher Process API.

@@ -66,24 +66,11 @@ Here is a sample:
# We must check the response code
if response.status == 200:
    # It was a success!
    # We must call to save to the disk
    self.save_response_to_disk(response, response.request.meta['kf_filename'])
    # We must send some information about this success
    yield {
        'success': True,
        'file_name': response.request.meta['kf_filename'],
        "data_type": "release_package",
        "url": response.request.url,
    }
    yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type='release_package')
else:
    # It was a failure :-(
    # We must send some information about this failure
    yield {
        'success': False,
        'file_name': response.request.meta['kf_filename'],
        "url": response.request.url,
        "errors": {"http_code": response.status}
    }
    yield self.build_file_error_from_response(response)
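
For orientation, here is a minimal sketch, not part of this commit, of a complete spider written against the new helpers; the spider name, URL and filename below are placeholders:

    import scrapy

    from kingfisher_scrapy.base_spider import BaseSpider


    class ExampleSpider(BaseSpider):
        """Hypothetical spider illustrating the pattern above."""
        name = 'example'

        def start_requests(self):
            # Placeholder URL; 'kf_filename' tells the framework what to name the stored file.
            yield scrapy.Request(
                'https://example.org/releases.json',
                meta={'kf_filename': 'releases.json'},
            )

        def parse(self, response):
            if response.status == 200:
                yield self.build_file_from_response(
                    response, response.request.meta['kf_filename'], data_type='release_package')
            else:
                yield self.build_file_error_from_response(response)
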
Spider properties
-----------------
55 changes: 26 additions & 29 deletions kingfisher_scrapy/base_spider.py
@@ -9,6 +9,7 @@

from kingfisher_scrapy import util
from kingfisher_scrapy.exceptions import SpiderArgumentError
from kingfisher_scrapy.items import File, FileError, FileItem


class BaseSpider(scrapy.Spider):
@@ -91,38 +92,44 @@ def get_start_time(self, format):
"""
return self.crawler.stats.get_value('start_time').strftime(format)

def save_response_to_disk(self, response, filename, data_type=None, encoding='utf-8', post_to_api=True):
def build_file_from_response(self, response, filename, data_type=None, encoding='utf-8', post_to_api=True):
"""
Returns an item to yield, based on the response to a request.
"""
return self.save_data_to_disk(response.body, filename, response.request.url, data_type, encoding,
post_to_api)
return self.build_file(response.body, filename, response.request.url, data_type, encoding, post_to_api)

def save_data_to_disk(self, data, filename, url=None, data_type=None, encoding='utf-8', post_to_api=True):
def build_file(self, data, filename, url=None, data_type=None, encoding='utf-8', post_to_api=True):
"""
Returns an item to yield.
"""
return {
'success': True,
return File({
'file_name': filename,
'data': data,
'data_type': data_type,
'url': url,
'encoding': encoding,
'post_to_api': post_to_api,
}
})

def _build_file_item(self, number, data, data_type, url, encoding, file_name):
return {
'success': True,
def build_file_item(self, number, data, data_type, url, encoding, file_name):
return FileItem({
'number': number,
'file_name': file_name,
'data': data,
'data_type': data_type,
'url': url,
'encoding': encoding,
'post_to_api': True,
})

def build_file_error_from_response(self, response, **kwargs):
file_error = {
'url': response.request.url,
'errors': {'http_code': response.status},
}
if 'kf_filename' in response.request.meta:
file_error['file_name'] = response.request.meta['kf_filename']
file_error.update(kwargs)
return FileError(file_error)

def _get_package_metadata(self, f, skip_key):
"""
@@ -144,7 +151,7 @@ def parse_json_lines(self, f, data_type, url, encoding='utf-8', file_name='data.
break
if isinstance(line, bytes):
line = line.decode(encoding=encoding)
yield self._build_file_item(number, line, data_type, url, encoding, file_name)
yield self.build_file_item(number, line, data_type, url, encoding, file_name)

def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases',
file_name='data.json'):
@@ -158,7 +165,7 @@ def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8',
for number, items in enumerate(util.grouper(ijson.items(f_list, '{}.item'.format(array_field_name)), size), 1):
package[array_field_name] = filter(None, items)
data = json.dumps(package, default=util.default)
yield self._build_file_item(number, data, data_type, url, encoding, file_name)
yield self.build_file_item(number, data, data_type, url, encoding, file_name)
if self.sample:
break

@@ -187,7 +194,7 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
if response.status == 200:
if file_format:
filename = '{}.zip'.format(hashlib.md5(response.url.encode('utf-8')).hexdigest())
self.save_response_to_disk(response, filename, post_to_api=False)
self.build_file_from_response(response, filename, post_to_api=False)

zip_file = ZipFile(BytesIO(response.body))
for finfo in zip_file.infolist():
@@ -205,15 +212,10 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8')
yield from self.parse_json_array(package, data, data_type, response.request.url,
encoding=encoding, file_name=filename)
else:
yield self.save_data_to_disk(data.read(), filename, data_type=data_type, url=response.request.url,
encoding=encoding)
yield self.build_file(data.read(), filename, data_type=data_type, url=response.request.url,
encoding=encoding)
else:
yield {
'success': False,
'file_name': response.request.meta['kf_filename'],
'url': response.request.url,
'errors': {'http_code': response.status}
}
yield self.build_file_error_from_response(response)


class LinksSpider(BaseSpider):
@@ -235,14 +237,9 @@ def next_link(response):
def parse_next_link(self, response, data_type):
if response.status == 200:

yield self.save_response_to_disk(response, response.request.meta['kf_filename'], data_type=data_type)
yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type=data_type)

if not self.sample:
yield self.next_link(response)
else:
yield {
'success': False,
'file_name': response.request.meta['kf_filename'],
'url': response.request.url,
'errors': {'http_code': response.status}
}
yield self.build_file_error_from_response(response)
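
As a rough companion sketch, assumed rather than taken from the diff: a spider handling a large line-delimited JSON response could defer to ``parse_json_lines``, so that each line is yielded as a numbered file item through the renamed ``build_file_item`` helper. The spider name and data type below are placeholders:

    from io import BytesIO

    from kingfisher_scrapy.base_spider import BaseSpider


    class ExampleJsonLinesSpider(BaseSpider):
        """Hypothetical spider for a line-delimited JSON endpoint."""
        name = 'example_json_lines'

        def parse(self, response):
            if response.status == 200:
                # Yields one FileItem per line, numbered from 1.
                yield from self.parse_json_lines(
                    BytesIO(response.body), 'release_package', response.request.url)
            else:
                yield self.build_file_error_from_response(response)
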
6 changes: 3 additions & 3 deletions kingfisher_scrapy/exceptions.py
@@ -2,9 +2,9 @@ class KingfisherScrapyError(Exception):
"""Base class for exceptions from within this application"""


class AuthenticationFailureException(KingfisherScrapyError):
"""Raised when the maximum attempts to get an access token has been reached"""
class AuthenticationError(KingfisherScrapyError):
"""Raised when the maximum number of attempts to retrieve an access token is reached"""


class SpiderArgumentError(KingfisherScrapyError):
"""Raises when an error has occurred with the spider arguments"""
"""Raises when a spider argument's value is invalid"""
33 changes: 17 additions & 16 deletions kingfisher_scrapy/extensions.py
@@ -1,13 +1,15 @@
# https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension

import json
import os

from scrapy import signals
from scrapy.exceptions import NotConfigured

from kingfisher_scrapy.items import File, FileError, FileItem
from kingfisher_scrapy.kingfisher_process import Client


# https://docs.scrapy.org/en/latest/topics/extensions.html#writing-your-own-extension
class KingfisherFilesStore:
def __init__(self, directory):
self.directory = directory
@@ -21,12 +23,11 @@ def from_crawler(cls, crawler):

def item_scraped(self, item, spider):
"""
Writes the item's data to the filename in the crawl's directory.
If the item is a file, writes its data to the filename in the crawl's directory.
Writes a ``<filename>.fileinfo`` metadata file in the crawl's directory, and returns a dict with the metadata.
"""
# Skip failures and parts of files.
if not item['success'] or 'number' in item:
if not isinstance(item, File):
return

# The crawl's relative directory, in the format `<spider_name>[_sample]/<YYMMDD_HHMMSS>`.
@@ -62,7 +63,11 @@ def _write_file(self, path, data, spider):
json.dump(data, f)


class KingfisherAPI:
class KingfisherProcessAPI:
"""
If the ``KINGFISHER_API_URI`` and ``KINGFISHER_API_KEY`` environment variables or configuration settings are set,
then messages are sent to a Kingfisher Process API for the ``item_scraped`` and ``spider_closed`` signals.
"""
def __init__(self, url, key, directory=None):
"""
Initializes a Kingfisher Process API client.
@@ -105,8 +110,7 @@ def spider_closed(self, spider, reason):

def item_scraped(self, item, spider):
"""
If the Scrapy item indicates success, sends a Kingfisher Process API request to create either a Kingfisher
Process file or file item. Otherwise, sends an API request to create a file error.
Sends an API request to store the file, file item or file error in Kingfisher Process.
"""
if not item.get('post_to_api', True):
return
@@ -119,14 +123,17 @@ def item_scraped(self, item, spider):
'url': item['url'],
}

if item['success']:
if isinstance(item, FileError):
data['errors'] = json.dumps(item['errors'])

self._request(item, spider, 'create_file_error', data, name='File Errors API')
else:
data['data_type'] = item['data_type']
data['encoding'] = item.get('encoding', 'utf-8')
if spider.note:
data['collection_note'] = spider.note

# File Item
if 'number' in item:
if isinstance(item, FileItem):
data['number'] = item['number']
data['data'] = item['data']

@@ -145,12 +152,6 @@ def item_scraped(self, item, spider):

self._request(item, spider, 'create_file', data, files)

# File Error
else:
data['errors'] = json.dumps(item['errors'])

self._request(item, spider, 'create_file_error', data, name='File Errors API')

def _request(self, item, spider, method, *args, name='API'):
response = getattr(self.client, method)(*args)
if not response.ok:
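
For context, an assumption drawn from the docstring above rather than from the diff itself: the renamed KingfisherProcessAPI extension only does anything when its two settings are provided, for example as environment variables set before launching a crawl:

    import os

    # Placeholder values; the variable names come from the KingfisherProcessAPI docstring.
    os.environ['KINGFISHER_API_URI'] = 'https://kingfisher-process.example.org'
    os.environ['KINGFISHER_API_KEY'] = '1234-abcd'
    # Without them, the extension presumably raises NotConfigured (imported above) and stays disabled.
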
37 changes: 27 additions & 10 deletions kingfisher_scrapy/items.py
@@ -1,14 +1,31 @@
# -*- coding: utf-8 -*-
import scrapy

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy
class File(scrapy.Item):
file_name = scrapy.Field()
url = scrapy.Field()
data = scrapy.Field()
data_type = scrapy.Field()
encoding = scrapy.Field()

# If a file is split into file items, the file is stored to disk, but not sent to Kingfisher Process.
post_to_api = scrapy.Field()

# Added by the KingfisherFilesStore extension, for the KingfisherProcessAPI extension to read the file.
path = scrapy.Field()
files_store = scrapy.Field()


class FileItem(scrapy.Item):
number = scrapy.Field()
file_name = scrapy.Field()
url = scrapy.Field()
data = scrapy.Field()
data_type = scrapy.Field()
encoding = scrapy.Field()


class KingfisherScrapyItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
class FileError(scrapy.Item):
file_name = scrapy.Field()
url = scrapy.Field()
errors = scrapy.Field()
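
A quick sketch, not from the commit, of instances of the three new item classes, using only the fields declared above and placeholder values:

    from kingfisher_scrapy.items import File, FileError, FileItem

    file = File({
        'file_name': 'data.json',
        'url': 'https://example.org/releases.json',
        'data': b'{"releases": []}',
        'data_type': 'release_package',
        'encoding': 'utf-8',
        'post_to_api': True,
    })

    part = FileItem({
        'number': 1,
        'file_name': 'data.json',
        'url': 'https://example.org/releases.json',
        'data': '{"releases": []}',
        'data_type': 'release_package',
        'encoding': 'utf-8',
    })

    error = FileError({
        'file_name': 'list.json',
        'url': 'https://example.org/list.json',
        'errors': {'http_code': 404},
    })
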
6 changes: 3 additions & 3 deletions kingfisher_scrapy/settings.py
@@ -67,10 +67,10 @@
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
EXTENSIONS = {
# `KingfisherFilesStore` must run before `KingfisherAPI`, because the file needs to be written before the request
# is sent to Kingfisher Process.
# `KingfisherFilesStore` must run before `KingfisherProcessAPI`, because the file needs to be written before the
# request is sent to Kingfisher Process.
'kingfisher_scrapy.extensions.KingfisherFilesStore': 100,
'kingfisher_scrapy.extensions.KingfisherAPI': 500,
'kingfisher_scrapy.extensions.KingfisherProcessAPI': 500,
}

# Configure item pipelines
26 changes: 8 additions & 18 deletions kingfisher_scrapy/spiders/afghanistan_records.py
@@ -32,17 +32,12 @@ def parse_list(self, response):
callback=self.parse_record
)
else:
yield {
'success': False,
'file_name': 'list.json',
"url": response.request.url,
"errors": {"http_code": response.status}
}
yield self.build_file_error_from_response(response, file_name='list.json')

def parse_record(self, response):
if response.status == 200:

yield self.save_response_to_disk(response, response.request.meta['kf_filename'], data_type="record")
yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type="record")

elif response.status == 429:
self.crawler.engine.pause()
@@ -51,15 +46,10 @@ def parse_record(self, response):
url = response.request.url
# This is dangerous as we might get stuck in a loop here if we always get a 429 response. Try this for now.
yield scrapy.Request(
url=url,
meta={'kf_filename': url.split('/')[-1]+'.json'},
callback=self.parse_record,
dont_filter=True,
)
url=url,
meta={'kf_filename': url.split('/')[-1]+'.json'},
callback=self.parse_record,
dont_filter=True,
)
else:
yield {
'success': False,
'file_name': response.request.meta['kf_filename'],
"url": response.request.url,
"errors": {"http_code": response.status}
}
yield self.build_file_error_from_response(response)
