
feat: Switch to dataclasses
jpmckinney committed Apr 10, 2024
1 parent 4eb004d commit 1913fe2
Showing 27 changed files with 537 additions and 486 deletions.
49 changes: 23 additions & 26 deletions kingfisher_scrapy/base_spiders/base_spider.py
@@ -284,7 +284,7 @@ def build_request(self, url, formatter, **kwargs):
meta.update(kwargs.pop('meta'))
return scrapy.Request(url, meta=meta, **kwargs)

def build_file_from_response(self, response, **kwargs):
def build_file_from_response(self, response, /, *, data_type, **kwargs):
"""
Returns a File item to yield, based on the response to a request.
@@ -299,45 +299,42 @@ def build_file_from_response(self, response, **kwargs):
if body.startswith(codecs.BOM_UTF8):
body = body[len(codecs.BOM_UTF8):]
kwargs['data'] = body
return self.build_file(**kwargs)
return self.build_file(data_type=data_type, **kwargs)

def build_file(self, *, file_name=None, url=None, data=None, data_type=None):
def build_file(self, *, file_name=None, url=None, data_type=None, data=None):
"""
Returns a File item to yield.
"""
return File({
'file_name': file_name,
'data': data,
'data_type': data_type,
'url': url,
})
return File(
file_name=file_name,
url=url,
data_type=data_type,
data=data,
)

def build_file_item(self, number, data, item):
"""
Returns a FileItem item to yield.
"""
return FileItem({
'number': number,
'file_name': item['file_name'],
'data': data,
'data_type': item['data_type'],
'url': item['url'],
})

def build_file_error_from_response(self, response, **kwargs):
return FileItem(
file_name=item.file_name,
url=item.url,
data_type=item.data_type,
data=data,
number=number,
)

def build_file_error_from_response(self, response, errors=None):
"""
Returns a FileError item to yield, based on the response to a request.
An ``errors`` keyword argument must be a ``dict``, and should set an ``http_code`` key.
"""
item = FileError({
'url': response.request.url,
'errors': {'http_code': response.status},
})
if 'file_name' in response.request.meta:
item['file_name'] = response.request.meta['file_name']
item.update(kwargs)
return item
return FileError(
file_name=response.request.meta.get('file_name', ''),
url=response.request.url,
errors=errors or {'http_code': response.status},
)

@classmethod
def get_default_until_date(cls, spider):
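
For orientation, a minimal sketch (not part of the commit) of how a spider callback might call the updated helper after this change. It is a method inside a spider class, shown standalone here; the data_type value is illustrative, and it assumes that, as before, the elided part of the method still fills in file_name and url from the request when they are not passed:

from kingfisher_scrapy.util import handle_http_error

@handle_http_error
def parse(self, response):
    # data_type is now a required keyword-only argument
    yield self.build_file_from_response(response, data_type='release_package')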
9 changes: 7 additions & 2 deletions kingfisher_scrapy/base_spiders/big_file_spider.py
@@ -1,5 +1,6 @@
from kingfisher_scrapy.base_spiders import SimpleSpider
from kingfisher_scrapy.exceptions import IncoherentConfigurationError
from kingfisher_scrapy.items import File
from kingfisher_scrapy.util import handle_http_error


@@ -43,5 +44,9 @@ def from_crawler(cls, crawler, *args, **kwargs):

@handle_http_error
def parse(self, response):
yield self.build_file(file_name=response.request.meta['file_name'], url=response.request.url,
data_type=self.data_type, data={'data': response.body, 'package': response.body})
yield File(
file_name=response.request.meta['file_name'],
url=response.request.url,
data_type=self.data_type,
data={'data': response.body, 'package': response.body},
)
12 changes: 6 additions & 6 deletions kingfisher_scrapy/base_spiders/compressed_file_spider.py
@@ -100,11 +100,11 @@ def parse(self, response):
else:
data = compressed_file

yield File({
'file_name': f'{archive_name}-{basename}',
'data': data,
'data_type': self.data_type,
'url': response.request.url,
})
yield File(
file_name=f'{archive_name}-{basename}',
url=response.request.url,
data_type=self.data_type,
data=data,
)

number += 1
8 changes: 4 additions & 4 deletions kingfisher_scrapy/extensions/files_store.py
@@ -74,15 +74,15 @@ def item_scraped(self, item, spider):
if not isinstance(item, (File, FileItem)):
return

file_name = item['file_name']
file_name = item.file_name
if isinstance(item, FileItem):
name, extension = util.get_file_name_and_extension(file_name)
file_name = f"{name}-{item['number']}.{extension}"
file_name = f"{name}-{item.number}.{extension}"

path = os.path.join(self.relative_crawl_directory(spider), self._get_subdirectory(file_name), file_name)
self._write_file(path, item['data'])
self._write_file(path, item.data)

item['path'] = path
item.path = path

# https://github.com/rails/rails/blob/05ed261/activesupport/lib/active_support/cache/file_store.rb#L150-L175
@staticmethod
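
To make the naming rule above concrete, a small illustrative sketch with made-up values, showing how a FileItem's on-disk name is derived using the new attribute access:

from kingfisher_scrapy import util
from kingfisher_scrapy.items import FileItem

item = FileItem(file_name='all.json', url='https://example.com/all.json',
                data_type='release_package', data=b'{}', number=3)
name, extension = util.get_file_name_and_extension(item.file_name)
file_name = f"{name}-{item.number}.{extension}"  # 'all-3.json'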
6 changes: 3 additions & 3 deletions kingfisher_scrapy/extensions/kingfisher_process_api2.py
@@ -162,13 +162,13 @@ def item_scraped(self, item, spider):

data = {
'collection_id': self.collection_id,
'url': item['url'],
'url': item.url,
}

if isinstance(item, FileError):
data['errors'] = json.dumps(item['errors'])
data['errors'] = json.dumps(item.errors)
else:
data['path'] = item['path']
data['path'] = item.path

cb = functools.partial(self._when_ready, self.client.publish, data, self.routing_key)
methods.add_callback_threadsafe(self.client.connection, cb)
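
For context, a standalone sketch of the payload built above, now reading dataclass attributes; the collection id and item values are made up:

import json

from kingfisher_scrapy.items import FileError

item = FileError(file_name='page-1.json', url='https://example.com/page-1.json',
                 errors={'http_code': 500})
data = {'collection_id': 123, 'url': item.url}
if isinstance(item, FileError):
    data['errors'] = json.dumps(item.errors)  # the errors dict is serialized to a JSON string
else:
    data['path'] = item.path  # File and FileItem carry the path set by the FilesStore extension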
2 changes: 1 addition & 1 deletion kingfisher_scrapy/extensions/pluck.py
@@ -59,7 +59,7 @@ def item_scraped(self, item, spider):

self.item_scraped_called = True

self._write(spider, item['value'])
self._write(spider, item.value)

def spider_closed(self, spider, reason):
if not spider.pluck or self.item_scraped_called:
4 changes: 2 additions & 2 deletions kingfisher_scrapy/item_schema/item.json
@@ -40,8 +40,8 @@
}
},
"required": [
"data",
"data_type"
"data_type",
"data"
]
}
}
39 changes: 20 additions & 19 deletions kingfisher_scrapy/items.py
@@ -1,34 +1,35 @@
# https://docs.scrapy.org/en/latest/topics/items.html
import typing
from dataclasses import dataclass

import scrapy


class Item(scrapy.Item):
file_name = scrapy.Field()
url = scrapy.Field()
validate = True
@dataclass
class Item:
file_name: str
url: str


@dataclass
class File(Item):
data = scrapy.Field()
data_type = scrapy.Field()

data_type: str
data: typing.Any
# Added by the FilesStore extension, for the KingfisherProcessAPI2 extension to refer to the file.
path = scrapy.Field()
path: str = ""


@dataclass
class FileItem(Item):
number = scrapy.Field()
data = scrapy.Field()
data_type = scrapy.Field()

data_type: str
data: typing.Any
number: int
# Added by the FilesStore extension, for the KingfisherProcessAPI2 extension to refer to the file.
path = scrapy.Field()
path: str = ""


@dataclass
class FileError(Item):
errors = scrapy.Field()
errors: dict


class PluckedItem(scrapy.Item):
value = scrapy.Field()
@dataclass
class PluckedItem:
value: typing.Any
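
To make the access-pattern change concrete, a small snippet (not part of the commit, values made up) comparing the old scrapy.Item style with the new dataclass style:

from kingfisher_scrapy.items import File

# Before: dict-like scrapy.Item instances
# item = File({'file_name': 'data.json', 'url': '...', 'data_type': 'release_package', 'data': b'{}'})
# item['path'] = 'some/dir/data.json'

# After: dataclasses, built with keyword arguments and read as attributes
item = File(file_name='data.json', url='https://example.com/data.json',
            data_type='release_package', data=b'{}')
item.path = 'some/dir/data.json'

# Omitting a required field now fails at construction time with a TypeError,
# rather than surfacing later during schema validation.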
50 changes: 27 additions & 23 deletions kingfisher_scrapy/pipelines.py
@@ -23,6 +23,7 @@ def _json_loads(basename):
return json.loads(pkgutil.get_data('kingfisher_scrapy', f'item_schema/{basename}.json'))


# https://docs.scrapy.org/en/latest/topics/item-pipeline.html#duplicates-filter
class Validate:
"""
Drops duplicate files based on ``file_name`` and file items based on ``file_name`` and ``number``.
@@ -42,19 +43,22 @@ def __init__(self):
self.validators[item] = Draft4Validator(_json_loads(item), registry=registry, format_checker=checker)

def process_item(self, item, spider):
if hasattr(item, 'validate'):
self.validators.get(item.__class__.__name__).validate(dict(item))
validator = self.validators.get(item.__class__.__name__)
if validator:
validator.validate(item.__dict__)

if isinstance(item, FileItem):
key = (item['file_name'], item['number'])
key = (item.file_name, item.number)
if key in self.file_items:
raise DropItem(f'Duplicate FileItem: {key!r}')
self.file_items.add(key)
else:
self.file_items.add(key)
elif isinstance(item, File):
key = item['file_name']
key = item.file_name
if key in self.files:
raise DropItem(f'Duplicate File: {key!r}')
self.files.add(key)
else:
self.files.add(key)

return item
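
For context, a minimal standalone sketch of why validating item.__dict__ works now that items are dataclasses: the field values land directly in the instance dict. The toy class and schema below are illustrative, not the project's real items or item schema:

from dataclasses import dataclass

from jsonschema import Draft4Validator

@dataclass
class ToyFile:
    file_name: str
    url: str
    data_type: str
    data: object
    path: str = ''

schema = {'type': 'object', 'required': ['data_type', 'data']}  # toy stand-in for the project's schema
item = ToyFile(file_name='a.json', url='https://example.com/a.json',
               data_type='release_package', data={})
Draft4Validator(schema).validate(item.__dict__)  # passes: all fields appear in __dict__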

@@ -98,11 +102,11 @@ def process_item(self, item, spider):
value = None
if spider.pluck_package_pointer:
pointer = spider.pluck_package_pointer
if isinstance(item['data'], dict):
value = _resolve_pointer(item['data'], pointer)
if isinstance(item.data, dict):
value = _resolve_pointer(item.data, pointer)
else:
try:
value = next(transcode(spider, ijson.items, item['data'], pointer[1:].replace('/', '.')))
value = next(transcode(spider, ijson.items, item.data, pointer[1:].replace('/', '.')))
except StopIteration:
value = f'error: {pointer} not found'
except ijson.common.IncompleteJSONError as e:
@@ -120,16 +124,16 @@
else:
raise
else: # spider.pluck_release_pointer
if isinstance(item['data'], dict):
data = item['data']
if isinstance(item.data, dict):
data = item.data
else:
data = json.loads(item['data'])
data = json.loads(item.data)

if item['data_type'].startswith('release'):
if item.data_type.startswith('release'):
releases = data['releases']
if releases:
value = max(_resolve_pointer(r, spider.pluck_release_pointer) for r in releases)
elif item['data_type'].startswith('record'):
elif item.data_type.startswith('record'):
records = data['records']
if records:
# This assumes that the first record in the record package has the desired value.
@@ -142,7 +146,7 @@
if value and spider.pluck_truncate:
value = value[:spider.pluck_truncate]

return PluckedItem({'value': value})
return PluckedItem(value=value)


class Unflatten:
@@ -154,16 +158,16 @@ def process_item(self, item, spider):
if not spider.unflatten or not isinstance(item, (File, FileItem)):
return item

input_name = item['file_name']
input_name = item.file_name
if input_name.endswith('.csv'):
item['file_name'] = f'{item["file_name"][:-4]}.json'
item.file_name = f'{item.file_name[:-4]}.json'
input_format = 'csv'
elif input_name.endswith('.xlsx'):
item['file_name'] = f'{item["file_name"][:-5]}.json'
item.file_name = f'{item.file_name[:-5]}.json'
input_format = 'xlsx'
else:
extension = os.path.splitext(input_name)[1]
raise NotSupported(f"Unsupported extension '{extension}' of {input_name} from {item['url']}")
raise NotSupported(f"Unsupported extension '{extension}' of {input_name} from {item.url}")

spider_ocds_version = spider.ocds_version.replace('.', '__')
for tag in reversed(get_tags()):
@@ -175,14 +179,14 @@

with tempfile.TemporaryDirectory() as directory:
input_path = os.path.join(directory, input_name)
output_name = os.path.join(directory, item['file_name'])
output_name = os.path.join(directory, item.file_name)
if input_format == 'csv':
input_name = directory
elif input_format == 'xlsx':
input_name = input_path

with open(input_path, 'wb') as f:
f.write(item['data'])
f.write(item.data)

with warnings.catch_warnings():
warnings.filterwarnings('ignore') # flattentool uses UserWarning, so we can't set a specific category
Expand All @@ -197,8 +201,8 @@ def process_item(self, item, spider):
**spider.unflatten_args
)

with open(output_name, 'r') as f:
item['data'] = f.read()
with open(output_name) as f:
item.data = f.read()

return item

