Commit 9986476
Merge 958370b into c7a195f
jpmckinney committed May 27, 2020 (2 parents: c7a195f + 958370b)
Showing 52 changed files with 265 additions and 549 deletions.
54 changes: 26 additions & 28 deletions kingfisher_scrapy/base_spider.py
@@ -9,6 +9,7 @@
 
 from kingfisher_scrapy import util
 from kingfisher_scrapy.exceptions import SpiderArgumentError
+from kingfisher_scrapy.items import File, FileError, FileItem
 
 
 class BaseSpider(scrapy.Spider):
@@ -91,38 +92,45 @@ def get_start_time(self, format):
         """
         return self.crawler.stats.get_value('start_time').strftime(format)
 
-    def save_response_to_disk(self, response, filename, data_type=None, encoding='utf-8', post_to_api=True):
+    def build_file_from_response(self, response, filename, data_type=None, encoding='utf-8', post_to_api=True):
         """
         Returns an item to yield, based on the response to a request.
         """
-        return self.save_data_to_disk(response.body, filename, response.request.url, data_type, encoding,
-                                      post_to_api)
+        return self.build_file(response.body, filename, response.request.url, data_type, encoding, post_to_api)
 
-    def save_data_to_disk(self, data, filename, url=None, data_type=None, encoding='utf-8', post_to_api=True):
+    def build_file(self, data, filename, url=None, data_type=None, encoding='utf-8', post_to_api=True):
         """
         Returns an item to yield.
         """
-        return {
-            'success': True,
+        return File({
             'file_name': filename,
             'data': data,
             'data_type': data_type,
             'url': url,
             'encoding': encoding,
             'post_to_api': post_to_api,
-        }
+        })
 
-    def _build_file_item(self, number, data, data_type, url, encoding, file_name):
-        return {
-            'success': True,
+    def build_file_item(self, number, data, data_type, url, encoding, file_name):
+        return FileItem({
             'number': number,
             'file_name': file_name,
             'data': data,
             'data_type': data_type,
             'url': url,
             'encoding': encoding,
             'post_to_api': True,
-        }
+        })
 
+    def build_file_error_from_response(self, response, **kwargs):
+        file_error = {
+            'url': response.request.url,
+            'errors': {'http_code': response.status},
+        }
+        if 'kf_filename' in response.request.meta:
+            file_error['file_name'] = response.request.meta['kf_filename']
+        file_error.update(kwargs)
+        return FileError(file_error)

     def _get_package_metadata(self, f, skip_key):
         """
@@ -144,7 +152,7 @@ def parse_json_lines(self, f, data_type, url, encoding='utf-8', file_name='data.json'):
                 break
             if isinstance(line, bytes):
                 line = line.decode(encoding=encoding)
-            yield self._build_file_item(number, line, data_type, url, encoding, file_name)
+            yield self.build_file_item(number, line, data_type, url, encoding, file_name)
 
     def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8', array_field_name='releases',
                          file_name='data.json'):
@@ -158,7 +166,7 @@ def parse_json_array(self, f_package, f_list, data_type, url, encoding='utf-8',
         for number, items in enumerate(util.grouper(ijson.items(f_list, '{}.item'.format(array_field_name)), size), 1):
             package[array_field_name] = filter(None, items)
             data = json.dumps(package, default=util.default)
-            yield self._build_file_item(number, data, data_type, url, encoding, file_name)
+            yield self.build_file_item(number, data, data_type, url, encoding, file_name)
             if self.sample:
                 break
@@ -187,7 +195,7 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8'):
         if response.status == 200:
             if file_format:
                 filename = '{}.zip'.format(hashlib.md5(response.url.encode('utf-8')).hexdigest())
-                self.save_response_to_disk(response, filename, post_to_api=False)
+                self.build_file_from_response(response, filename, post_to_api=False)
 
             zip_file = ZipFile(BytesIO(response.body))
             for finfo in zip_file.infolist():
@@ -205,15 +213,10 @@ def parse_zipfile(self, response, data_type, file_format=None, encoding='utf-8'):
                     yield from self.parse_json_array(package, data, data_type, response.request.url,
                                                      encoding=encoding, file_name=filename)
                 else:
-                    yield self.save_data_to_disk(data.read(), filename, data_type=data_type, url=response.request.url,
-                                                 encoding=encoding)
+                    yield self.build_file(data.read(), filename, data_type=data_type, url=response.request.url,
+                                          encoding=encoding)
         else:
-            yield {
-                'success': False,
-                'file_name': response.request.meta['kf_filename'],
-                'url': response.request.url,
-                'errors': {'http_code': response.status}
-            }
+            yield self.build_file_error_from_response(response)


 class LinksSpider(BaseSpider):
@@ -235,14 +238,9 @@ def next_link(response):
     def parse_next_link(self, response, data_type):
         if response.status == 200:
 
-            yield self.save_response_to_disk(response, response.request.meta['kf_filename'], data_type=data_type)
+            yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type=data_type)
 
             if not self.sample:
                 yield self.next_link(response)
         else:
-            yield {
-                'success': False,
-                'file_name': response.request.meta['kf_filename'],
-                'url': response.request.url,
-                'errors': {'http_code': response.status}
-            }
+            yield self.build_file_error_from_response(response)
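
For context, the spider pattern this refactor standardizes looks roughly like the following minimal sketch; the spider name and URL are hypothetical, everything else comes from the methods above:

import scrapy

from kingfisher_scrapy.base_spider import BaseSpider


class ExampleSpider(BaseSpider):
    name = 'example'  # hypothetical

    def start_requests(self):
        # Hypothetical URL, for illustration only.
        yield scrapy.Request('https://example.com/releases.json', meta={'kf_filename': 'all.json'})

    def parse(self, response):
        if response.status == 200:
            # Yields a File item on success...
            yield self.build_file_from_response(response, response.request.meta['kf_filename'],
                                                data_type='release_package')
        else:
            # ...and a FileError item on failure, instead of a hand-built 'success' dict.
            yield self.build_file_error_from_response(response)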
18 changes: 8 additions & 10 deletions kingfisher_scrapy/extensions.py
@@ -4,6 +4,7 @@
 from scrapy import signals
 from scrapy.exceptions import NotConfigured
 
+from kingfisher_scrapy.items import File, FileError
 from kingfisher_scrapy.kingfisher_process import Client
 
 
@@ -21,12 +22,11 @@ def from_crawler(cls, crawler):
 
     def item_scraped(self, item, spider):
         """
-        Writes the item's data to the filename in the crawl's directory.
+        If the item is a file, writes its data to the filename in the crawl's directory.
         Writes a ``<filename>.fileinfo`` metadata file in the crawl's directory, and returns a dict with the metadata.
         """
-        # Skip failures and parts of files.
-        if not item['success'] or 'number' in item:
+        if not isinstance(item, File):
             return
 
         # The crawl's relative directory, in the format `<spider_name>[_sample]/<YYMMDD_HHMMSS>`.
@@ -119,7 +119,11 @@ def item_scraped(self, item, spider):
             'url': item['url'],
         }
 
-        if item['success']:
+        if isinstance(item, FileError):
+            data['errors'] = json.dumps(item['errors'])
+
+            self._request(item, spider, 'create_file_error', data, name='File Errors API')
+        else:
             data['data_type'] = item['data_type']
             data['encoding'] = item.get('encoding', 'utf-8')
             if spider.note:
@@ -145,12 +149,6 @@ def item_scraped(self, item, spider):
 
             self._request(item, spider, 'create_file', data, files)
 
-        # File Error
-        else:
-            data['errors'] = json.dumps(item['errors'])
-
-            self._request(item, spider, 'create_file_error', data, name='File Errors API')
-
     def _request(self, item, spider, method, *args, name='API'):
         response = getattr(self.client, method)(*args)
         if not response.ok:
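
Both extensions now dispatch on the item's class rather than a 'success' flag. A toy sketch of the two checks (simplified, not the actual extension code):

import json

from kingfisher_scrapy.items import File, FileError


def store_should_write(item):
    # FileItem (a numbered part of a file) and FileError both fail this check,
    # reproducing the old `not item['success'] or 'number' in item` guard.
    return isinstance(item, File)


def api_call_for(item):
    # FileError items go to the File Errors API; File and FileItem items go to create_file.
    if isinstance(item, FileError):
        return 'create_file_error', {'errors': json.dumps(item['errors'])}
    return 'create_file', {'data_type': item['data_type']}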
36 changes: 26 additions & 10 deletions kingfisher_scrapy/items.py
@@ -1,14 +1,30 @@
 # -*- coding: utf-8 -*-
+import scrapy
 
-# Define here the models for your scraped items
-#
-# See documentation in:
-# https://docs.scrapy.org/en/latest/topics/items.html
-
-import scrapy
+class File(scrapy.Item):
+    file_name = scrapy.Field()
+    url = scrapy.Field()
+    data = scrapy.Field()
+    data_type = scrapy.Field()
+    encoding = scrapy.Field()
+    post_to_api = scrapy.Field()
+
+    # Added by extensions.
+    path = scrapy.Field()
+    files_store = scrapy.Field()
+
+
+class FileItem(scrapy.Item):
+    number = scrapy.Field()
+    file_name = scrapy.Field()
+    url = scrapy.Field()
+    data = scrapy.Field()
+    data_type = scrapy.Field()
+    encoding = scrapy.Field()
+    post_to_api = scrapy.Field()
+
+
-class KingfisherScrapyItem(scrapy.Item):
-    # define the fields for your item here like:
-    # name = scrapy.Field()
-    pass
+class FileError(scrapy.Item):
+    file_name = scrapy.Field()
+    url = scrapy.Field()
+    errors = scrapy.Field()
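
Because these are scrapy.Item subclasses with declared fields, a misspelled key fails fast; this is standard Scrapy behavior, shown here as a quick sketch:

from kingfisher_scrapy.items import File

f = File({'file_name': 'all.json', 'url': 'http://example.com', 'data': b'{}',
          'data_type': 'release_package', 'encoding': 'utf-8', 'post_to_api': True})
isinstance(f, File)  # True; the extensions dispatch on this
f['filename'] = 'x'  # raises KeyError: File does not support field: filename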
26 changes: 8 additions & 18 deletions kingfisher_scrapy/spiders/afghanistan_records.py
@@ -32,17 +32,12 @@ def parse_list(self, response):
                 callback=self.parse_record
             )
         else:
-            yield {
-                'success': False,
-                'file_name': 'list.json',
-                "url": response.request.url,
-                "errors": {"http_code": response.status}
-            }
+            yield self.build_file_error_from_response(response, file_name='list.json')
 
     def parse_record(self, response):
         if response.status == 200:
 
-            yield self.save_response_to_disk(response, response.request.meta['kf_filename'], data_type="record")
+            yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type="record")
 
         elif response.status == 429:
             self.crawler.engine.pause()
@@ -51,15 +46,10 @@ def parse_record(self, response):
             url = response.request.url
             # This is dangerous as we might get stuck in a loop here if we always get a 429 response. Try this for now.
             yield scrapy.Request(
-                    url=url,
-                    meta={'kf_filename': url.split('/')[-1]+'.json'},
-                    callback=self.parse_record,
-                    dont_filter=True,
-                )
+                url=url,
+                meta={'kf_filename': url.split('/')[-1]+'.json'},
+                callback=self.parse_record,
+                dont_filter=True,
+            )
         else:
-            yield {
-                'success': False,
-                'file_name': response.request.meta['kf_filename'],
-                "url": response.request.url,
-                "errors": {"http_code": response.status}
-            }
+            yield self.build_file_error_from_response(response)
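
One way to bound the retry loop the comment above warns about is to carry a counter in the request meta. A hedged sketch, not part of this commit; the 'retries' meta key and the cap of 3 are hypothetical:

    def parse_record(self, response):
        if response.status == 429:
            retries = response.request.meta.get('retries', 0)  # hypothetical meta key
            if retries < 3:
                yield scrapy.Request(
                    url=response.request.url,
                    meta={'kf_filename': response.request.meta['kf_filename'], 'retries': retries + 1},
                    callback=self.parse_record,
                    dont_filter=True,
                )
            else:
                # Give up and report the failure instead of retrying forever.
                yield self.build_file_error_from_response(response)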
23 changes: 4 additions & 19 deletions kingfisher_scrapy/spiders/afghanistan_releases.py
@@ -31,12 +31,7 @@ def parse_list(self, response):
                 callback=self.parse_release_list
             )
         else:
-            yield {
-                'success': False,
-                'file_name': 'list.json',
-                "url": response.request.url,
-                "errors": {"http_code": response.status}
-            }
+            yield self.build_file_error_from_response(response, file_name='list.json')
 
     def parse_release_list(self, response):
         if response.status == 200:
@@ -64,17 +59,12 @@ def parse_release_list(self, response):
                 dont_filter=True,
             )
         else:
-            yield {
-                'success': False,
-                'file_name': response.request.meta['kf_filename'],
-                "url": response.request.url,
-                "errors": {"http_code": response.status}
-            }
+            yield self.build_file_error_from_response(response)
 
     def parse_release(self, response):
         if response.status == 200:
 
-            yield self.save_response_to_disk(response, response.request.meta['kf_filename'], data_type="release")
+            yield self.build_file_from_response(response, response.request.meta['kf_filename'], data_type="release")
 
         elif response.status == 429:
             self.crawler.engine.pause()
@@ -89,9 +79,4 @@ def parse_release(self, response):
                 dont_filter=True,
             )
         else:
-            yield {
-                'success': False,
-                'file_name': response.request.meta['kf_filename'],
-                "url": response.request.url,
-                "errors": {"http_code": response.status}
-            }
+            yield self.build_file_error_from_response(response)
7 changes: 1 addition & 6 deletions kingfisher_scrapy/spiders/argentina_buenos_aires.py
@@ -33,12 +33,7 @@ def parse_list(self, response):
             if resource['format'].upper() == 'JSON':
                 yield scrapy.Request(url=resource['url'])
         else:
-            yield {
-                'success': False,
-                'file_name': 'list.json',
-                'url': response.request.url,
-                'errors': {'http_code': response.status}
-            }
+            yield self.build_file_error_from_response(response, file_name='list.json')
 
     def parse(self, response):
         yield from self.parse_zipfile(response, 'release_package', file_format='release_package')
9 changes: 2 additions & 7 deletions kingfisher_scrapy/spiders/argentina_vialidad.py
@@ -13,11 +13,6 @@ def start_requests(self):
 
     def parse(self, response):
         if response.status == 200:
-            yield self.save_response_to_disk(response, 'all.json', data_type='release_package_list')
+            yield self.build_file_from_response(response, 'all.json', data_type='release_package_list')
         else:
-            yield {
-                'success': False,
-                'file_name': 'all.json',
-                "url": response.request.url,
-                "errors": {"http_code": response.status}
-            }
+            yield self.build_file_error_from_response(response, file_name='all.json')
11 changes: 3 additions & 8 deletions kingfisher_scrapy/spiders/armenia.py
@@ -19,8 +19,8 @@ def start_requests(self):
     def parse(self, response):
         if response.status == 200:
 
-            yield self.save_response_to_disk(response, response.request.meta['kf_filename'],
-                                             data_type='release_package')
+            yield self.build_file_from_response(response, response.request.meta['kf_filename'],
+                                                data_type='release_package')
 
             json_data = json.loads(response.text)
             if not (self.sample):
@@ -31,9 +31,4 @@ def parse(self, response):
                     meta={'kf_filename': hashlib.md5(url.encode('utf-8')).hexdigest()+'.json'}
                 )
         else:
-            yield {
-                'success': False,
-                'file_name': response.request.meta['kf_filename'],
-                "url": response.request.url,
-                "errors": {"http_code": response.status}
-            }
+            yield self.build_file_error_from_response(response)
19 changes: 5 additions & 14 deletions kingfisher_scrapy/spiders/australia_nsw.py
@@ -61,22 +61,13 @@ def parse_list(self, response):
             )
 
         else:
-            yield {
-                'success': False,
-                'file_name': hashlib.md5(response.request.url.encode('utf-8')).hexdigest() + '.json',
-                "url": response.request.url,
-                "errors": {"http_code": response.status}
-            }
+            yield self.build_file_error_from_response(
+                response, file_name=hashlib.md5(response.request.url.encode('utf-8')).hexdigest() + '.json')
 
     def parse(self, response):
         if response.status == 200:
-            yield self.save_response_to_disk(response, response.request.meta['kf_filename'],
-                                             data_type='release_package')
+            yield self.build_file_from_response(response, response.request.meta['kf_filename'],
+                                                data_type='release_package')
 
         else:
-            yield {
-                'success': False,
-                'file_name': response.request.meta['kf_filename'],
-                "url": response.request.url,
-                "errors": {"http_code": response.status}
-            }
+            yield self.build_file_error_from_response(response)
11 changes: 3 additions & 8 deletions kingfisher_scrapy/spiders/canada_buyandsell.py
@@ -29,12 +29,7 @@ def start_requests(self):
 
     def parse(self, response):
         if response.status == 200:
-            yield self.save_response_to_disk(response, response.request.meta['kf_filename'],
-                                             data_type='release_package')
+            yield self.build_file_from_response(response, response.request.meta['kf_filename'],
+                                                data_type='release_package')
         else:
-            yield {
-                'success': False,
-                'file_name': response.request.meta['kf_filename'],
-                "url": response.request.url,
-                "errors": {"http_code": response.status}
-            }
+            yield self.build_file_error_from_response(response)