Use Pydantic #1068

Merged (15 commits) on Apr 12, 2024
1 change: 1 addition & 0 deletions kingfisher_scrapy/base_spiders/compressed_file_spider.py
@@ -58,6 +58,7 @@ def start_requests(self):
def parse(self, response):
archive_name, archive_format = get_file_name_and_extension(response.request.meta['file_name'])

# NOTE: If support is added for additional archive formats, remember to update the `Data` type in `items.py`.
if archive_format == 'zip':
cls = ZipFile
elif archive_format == 'rar':
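For context, the objects these branches produce are the file-like members of the Data union defined in items.py below: ZipFile.open() returns a zipfile.ZipExtFile, and rarfile.RarFile.open() a rarfile.RarExtFile. A minimal sketch (the archive path is hypothetical):

import zipfile

with zipfile.ZipFile('example.zip') as archive:  # hypothetical archive on disk
    member = archive.open(archive.namelist()[0])
    print(type(member).__name__)  # ZipExtFile, one member of the Data union
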
14 changes: 0 additions & 14 deletions kingfisher_scrapy/item_schema/File.json

This file was deleted.

17 changes: 0 additions & 17 deletions kingfisher_scrapy/item_schema/FileError.json

This file was deleted.

18 changes: 0 additions & 18 deletions kingfisher_scrapy/item_schema/FileItem.json

This file was deleted.

48 changes: 0 additions & 48 deletions kingfisher_scrapy/item_schema/item.json

This file was deleted.

77 changes: 54 additions & 23 deletions kingfisher_scrapy/items.py
@@ -1,37 +1,68 @@
from dataclasses import dataclass
from typing import Any
import enum
import io
import zipfile
from typing import Any, TypedDict

import pydantic
import rarfile

@dataclass
class Item:
file_name: str
url: str
Data = (
# https://docs.python.org/3/library/tarfile.html#tarfile.TarFile.extractfile (DigiwhistBase)
io.BufferedReader
# https://rarfile.readthedocs.io/api.html#rarfile.RarFile.open (CompressedFileSpider)
| rarfile.RarExtFile
# https://docs.python.org/3/library/zipfile.html#zipfile.ZipFile.open (CompressedFileSpider)
| zipfile.ZipExtFile
| pydantic.conbytes(strict=True, min_length=1)
# `dict` behaves better last. https://github.com/open-contracting/kingfisher-collect/issues/995
| dict
)

base_kwargs = {'validate_assignment': True}

# data can be: bytes | dict | list | io.BufferedReader | rarfile.RarExtFile | zipfile.ZipExtFile
@dataclass
class File(Item):
data_type: str
data: Any

class DataType(str, enum.Enum):
record = "record"
release = "release"
record_package = "record_package"
release_package = "release_package"


class Errors(TypedDict):
http_code: pydantic.conint(strict=True, ge=100, lt=600)


class Resource(pydantic.BaseModel, **base_kwargs):
file_name: pydantic.constr(strict=True, regex=r'^[^/\\]+$') # noqa: F722 pydantic/pydantic#2872
url: pydantic.HttpUrl


class DataResource(Resource, arbitrary_types_allowed=True, use_enum_values=True):
data_type: DataType
data: Data
# Added by the FilesStore extension, for the KingfisherProcessAPI2 extension to refer to the file.
path: str = ""

@pydantic.validator('data', pre=True) # `pre` is needed to prevent pydantic from type casting
def check_data(cls, v):
# pydantic has no `condict()` to set `strict=True` or `min_properties=1`. pydantic/pydantic#1277
assert isinstance(v, (Data, bytes)), f'{v.__class__.__name__} is not a valid type'
assert v, 'ensure this value is non-empty'
return v


class File(DataResource):
pass


# This doesn't inherit from the File class, because we want isinstance(item, File) to be false for FileItem instances.
@dataclass
class FileItem(Item):
data_type: str
data: Any
number: int
# Added by the FilesStore extension, for the KingfisherProcessAPI2 extension to refer to the file.
path: str = ""
class FileItem(DataResource):
number: pydantic.conint(strict=True, gt=0)


@dataclass
class FileError(Item):
errors: dict
class FileError(Resource):
errors: Errors


@dataclass
class PluckedItem:
class PluckedItem(pydantic.BaseModel, **base_kwargs):
value: Any
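
To illustrate how the new models behave (a sketch, assuming the pydantic v1 API pinned below): constraints are now enforced at construction time, so invalid items raise pydantic.ValidationError immediately, which is what lets the Validate pipeline drop its jsonschema machinery.

import pydantic

from kingfisher_scrapy.items import File, FileItem

# Valid: strict non-empty bytes, a known data_type, a file_name without path separators.
item = File(file_name='example.json', url='https://example.com/example.json',
            data_type='release_package', data=b'{"releases": []}')

try:
    # file_name contains a slash, url is not an HTTP URL, data is empty, number is not > 0.
    FileItem(file_name='bad/name.json', url='not-a-url',
             data_type='release', data=b'', number=0)
except pydantic.ValidationError as e:
    print(e)  # all four constraint violations are reported together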
2 changes: 2 additions & 0 deletions kingfisher_scrapy/log_formatter.py
@@ -16,6 +16,8 @@ def _omit_data(self, method, item, *args):
Omits an item's `data` and `path` (not set yet) values from the log message.
"""
item = item.__dict__.copy()
if 'url' in item:
item['url'] = str(item['url']) # avoid pydantic.AnyUrl.__repr__
item.pop('data', None)
item.pop('path', None)
return getattr(super(), method)(item, *args)
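
The str() call matters because pydantic v1 stores URL fields as AnyUrl instances, whose repr spells out the parsed components; converting keeps log messages short. A sketch of the difference (model and values are hypothetical):

import pydantic

class Example(pydantic.BaseModel):
    url: pydantic.HttpUrl

item = Example(url='https://example.com/data.json')
print(str(item.url))   # https://example.com/data.json
print(repr(item.url))  # HttpUrl('https://example.com/data.json', ...) with parsed parts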
21 changes: 0 additions & 21 deletions kingfisher_scrapy/pipelines.py
@@ -2,51 +2,30 @@
# https://docs.scrapy.org/en/latest/topics/signals.html#item-signals
import json
import os
import pkgutil
import tempfile
import warnings

import ijson
import jsonpointer
from flattentool import unflatten
from jsonschema import FormatChecker
from jsonschema.validators import Draft4Validator
from ocdsmerge.util import get_release_schema_url, get_tags
from referencing import Registry, Resource
from scrapy.exceptions import DropItem, NotSupported

from kingfisher_scrapy.items import File, FileItem, PluckedItem
from kingfisher_scrapy.util import transcode


def _json_loads(basename):
return json.loads(pkgutil.get_data('kingfisher_scrapy', f'item_schema/{basename}.json'))


# https://docs.scrapy.org/en/latest/topics/item-pipeline.html#duplicates-filter
class Validate:
"""
Drops duplicate files based on ``file_name`` and file items based on ``file_name`` and ``number``.

:raises jsonschema.ValidationError: if the item is invalid
"""

def __init__(self):
self.validators = {}
self.files = set()
self.file_items = set()

schema = Resource.from_contents(_json_loads('item'))
registry = Registry().with_resource('urn:item', schema)
checker = FormatChecker()
for item in ('File', 'FileError', 'FileItem'):
self.validators[item] = Draft4Validator(_json_loads(item), registry=registry, format_checker=checker)

def process_item(self, item, spider):
validator = self.validators.get(item.__class__.__name__)
if validator:
validator.validate(item.__dict__)

if isinstance(item, FileItem):
key = (item.file_name, item.number)
if key in self.file_items:
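With validation moved into the models, the pipeline reduces to deduplication. A sketch of the logic the truncated hunk keeps (the DropItem messages are assumed):

from scrapy.exceptions import DropItem

def process_item(self, item, spider):
    if isinstance(item, FileItem):
        key = (item.file_name, item.number)
        if key in self.file_items:
            raise DropItem(f'Duplicate FileItem: {key!r}')  # message wording assumed
        self.file_items.add(key)
    elif isinstance(item, File):
        if item.file_name in self.files:
            raise DropItem(f'Duplicate File: {item.file_name!r}')  # message wording assumed
        self.files.add(item.file_name)
    return item
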
9 changes: 3 additions & 6 deletions kingfisher_scrapy/spidermiddlewares.py
@@ -93,11 +93,7 @@ async def process_spider_output(self, response, result, spider):
:returns: a generator of File or FileItem objects, in which the ``data`` field is valid JSON
"""
async for item in result:
if (
not isinstance(item, (File, FileItem))
or not spider.validate_json
or isinstance(item.data, (dict, list))
):
if not isinstance(item, (File, FileItem)) or not spider.validate_json or isinstance(item.data, dict):
yield item
continue

@@ -133,7 +129,8 @@ async def process_spider_output(self, response, result, spider):

data = item.data
# Re-encode the data, to traverse the JSON using only ijson, instead of either ijson or Python.
if isinstance(data, (dict, list)):
# This is only expected to occur when both `root_path` and `concatenated_json` are set.
if isinstance(data, dict):
data = util.json_dumps(data).encode()

iterable = util.transcode(spider, ijson.items, data, spider.root_path)
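For reference, a minimal sketch of the ijson traversal that follows the re-encoding (the data and root path here are hypothetical; in the middleware, util.transcode wraps this call):

import io

import ijson

data = b'{"results": [{"ocid": "ocds-1"}, {"ocid": "ocds-2"}]}'
# A root_path of 'results.item' yields each element of the results array.
for obj in ijson.items(io.BytesIO(data), 'results.item'):
    print(obj['ocid'])  # ocds-1, then ocds-2
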
3 changes: 2 additions & 1 deletion kingfisher_scrapy/spiders/european_dynamics_base.py
@@ -49,9 +49,10 @@ def parse_list(self, response):
data = response.json()
# The response can be an HTML document with an error message like "temporary unavailable due to maintenance".
except JSONDecodeError:
return self.build_file_error_from_response(
yield self.build_file_error_from_response(
response, errors={'http_code': response.status, 'text': response.text}
)
return

for number, url in enumerate(reversed(data['packagesPerMonth'])):
path = urlsplit(url).path
@@ -23,4 +23,4 @@ class UnitedKingdomContractsFinderReleases(UnitedKingdomContractsFinderBase):

@handle_http_error
def parse_page(self, response):
return self.parse(response)
yield from self.parse(response)
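
Both spider fixes correct the same pitfall: these parse methods are generators, and a generator's return value is never delivered to Scrapy, so items must be emitted with yield (or delegated with yield from). A minimal illustration:

def parse_broken(data):
    if 'error' in data:
        return {'errors': data['error']}  # swallowed: generators discard return values
    yield from data['items']

def parse_fixed(data):
    if 'error' in data:
        yield {'errors': data['error']}
        return  # a bare return simply ends iteration
    yield from data['items']

print(list(parse_broken({'error': 'down'})))  # []
print(list(parse_fixed({'error': 'down'})))   # [{'errors': 'down'}]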
4 changes: 1 addition & 3 deletions requirements.in
@@ -1,14 +1,12 @@
flattentool
ijson
jsonpointer
jsonschema
ocdskit[perf]
ocdsmerge
psycopg2
pydantic<2
rarfile
referencing
requests
rfc3986-validator
scrapy
scrapyd
scrapyd-client
23 changes: 5 additions & 18 deletions requirements.txt
@@ -7,8 +7,6 @@
attrs==22.2.0
# via
# automat
# jsonschema
# referencing
# service-identity
# twisted
automat==0.8.0
@@ -84,10 +82,6 @@ jsonref==1.0.1
# ocdsextensionregistry
# ocdskit
# ocdsmerge
jsonschema==4.18.0a9
# via -r requirements.in
jsonschema-specifications==2023.5.1
# via jsonschema
lxml==4.9.2
# via
# flattentool
@@ -135,6 +129,8 @@ pyasn1-modules==0.2.7
# via service-identity
pycparser==2.19
# via cffi
pydantic==1.10.15
# via -r requirements.in
pydispatcher==2.0.6
# via scrapy
pyopenssl==24.0.0
@@ -145,11 +141,6 @@ queuelib==1.6.2
# via scrapy
rarfile==3.1
# via -r requirements.in
referencing==0.28.5
# via
# -r requirements.in
# jsonschema
# jsonschema-specifications
requests==2.31.0
# via
# -r requirements.in
@@ -163,12 +154,6 @@ requests-cache==0.6.3
# via ocdsextensionregistry
requests-file==1.5.1
# via tldextract
rfc3986-validator==0.1.1
# via -r requirements.in
rpds-py==0.7.1
# via
# jsonschema
# referencing
schema==0.7.2
# via flattentool
scrapy==2.11.1
@@ -202,7 +187,9 @@ twisted==23.10.0
# scrapy
# scrapyd
typing-extensions==4.8.0
# via twisted
# via
# pydantic
# twisted
uberegg==0.1.1
# via scrapyd-client
url-normalize==1.4.3
2 changes: 1 addition & 1 deletion requirements.txt.sha256
@@ -1 +1 @@
7b422dc5cc62fb663ac45e841547b7983537cfc528d3c43ef39afe09b62cc7ca requirements.txt
8e1eea5a0879c0002ed9878e2c470b3baaf54ae158179c7a7d260ff2e9219a94 requirements.txt