From ca6eab89c0a468f4dcb8b79045a7ccb9625787bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20W=C3=B6rpel?= Date: Tue, 2 May 2023 05:16:10 +0200 Subject: [PATCH] Add optional dataset category and metadata to align with aleph --- nomenklatura/dataset/dataset.py | 59 ++++++++++++++++++++++++++++----- nomenklatura/dataset/util.py | 13 ++++++-- tests/dataset/test_dataset.py | 18 ++++++++-- tests/fixtures/catalog.yml | 2 ++ 4 files changed, 79 insertions(+), 13 deletions(-) diff --git a/nomenklatura/dataset/dataset.py b/nomenklatura/dataset/dataset.py index f8d85e96..894e131e 100644 --- a/nomenklatura/dataset/dataset.py +++ b/nomenklatura/dataset/dataset.py @@ -1,21 +1,59 @@ -import yaml from functools import cached_property -from typing import TYPE_CHECKING -from typing import Any, Dict, TypeVar, Type, List, Optional, Set +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Type, TypeVar + +import yaml from followthemoney.types import registry -from nomenklatura.dataset.resource import DataResource -from nomenklatura.dataset.publisher import DataPublisher from nomenklatura.dataset.coverage import DataCoverage -from nomenklatura.dataset.util import Named, cleanup, string_list -from nomenklatura.dataset.util import type_check, type_require -from nomenklatura.util import iso_to_version, PathLike +from nomenklatura.dataset.publisher import DataPublisher +from nomenklatura.dataset.resource import DataResource +from nomenklatura.dataset.util import ( + Named, + cleanup, + string_list, + type_check, + type_require, +) +from nomenklatura.util import PathLike, iso_to_version if TYPE_CHECKING: from nomenklatura.dataset.catalog import DataCatalog DS = TypeVar("DS", bound="Dataset") +# aleph +CATEGORIES = ( + "news", + "leak", + "land", + "gazette", + "court", + "company", + "sanctions", + "procurement", + "finance", + "grey", + "library", + "license", + "regulatory", + "poi", + "customs", + "census", + "transport", + "casefile", + "other", +) + +# aleph +FREQUENCIES = ( + "unknown", + "never", + "daily", + "weekly", + "monthly", + "annual", +) + class Dataset(Named): """A unit of entities. A dataset is a set of data, sez W3C.""" @@ -44,6 +82,9 @@ def __init__(self, catalog: "DataCatalog[DS]", data: Dict[str, Any]) -> None: if rdata is not None: self.resources.append(DataResource(rdata)) + self.frequency = type_check(registry.string, data.get("frequency"), FREQUENCIES) + self.category = type_check(registry.string, data.get("category"), CATEGORIES) + # TODO: get rid of the legacy namings self._parents = set(string_list(data.get("parents", []))) self._parents.update(string_list(data.get("collections", []))) @@ -93,6 +134,8 @@ def to_dict(self) -> Dict[str, Any]: "url": self.url, "version": self.version, "updated_at": self.updated_at, + "frequency": self.frequency, + "category": self.category, "resources": [r.to_dict() for r in self.resources], "children": [c.name for c in self.children], } diff --git a/nomenklatura/dataset/util.py b/nomenklatura/dataset/util.py index caf4fcd9..ced9f1a1 100644 --- a/nomenklatura/dataset/util.py +++ b/nomenklatura/dataset/util.py @@ -1,18 +1,25 @@ -from normality import stringify -from typing import Any, Optional, Dict, List +from typing import Any, Dict, Iterable, List, Optional + from followthemoney.types import registry from followthemoney.types.common import PropertyType +from normality import stringify from nomenklatura.exceptions import MetadataException -def type_check(type_: PropertyType, value: Any) -> Optional[str]: +def type_check( + type_: PropertyType, value: Any, literal: Iterable[Any] = [] +) -> Optional[str]: text = stringify(value) if text is None: return None cleaned = type_.clean_text(text) if cleaned is None: raise MetadataException("Invalid %s: %r" % (type_.name, value)) + if literal and cleaned not in literal: + raise MetadataException( + "Invalid %s: %r not in %s" % (type_.name, value, ",".join(literal)) + ) return cleaned diff --git a/tests/dataset/test_dataset.py b/tests/dataset/test_dataset.py index 86ee7dd4..260d87f1 100644 --- a/tests/dataset/test_dataset.py +++ b/tests/dataset/test_dataset.py @@ -1,8 +1,9 @@ -import pytest -from typing import Dict, Any from pathlib import Path +from typing import Any, Dict +import pytest from nomenklatura.dataset import DataCatalog, Dataset +from nomenklatura.exceptions import MetadataException def test_donations_base(catalog_data: Dict[str, Any]): @@ -62,3 +63,16 @@ def test_from_path(catalog_path: Path): data = catalog.to_dict() assert isinstance(data, dict) assert "datasets" in data + + +def test_dataset_aleph_metadata(catalog_data: Dict[str, Any]): + catalog = DataCatalog(Dataset, catalog_data) + ds = catalog.require("leak") + assert ds.category == "leak" + assert ds.frequency == "unknown" + + # invalid metadata + with pytest.raises(MetadataException): + ds = Dataset( + catalog, {"name": "invalid", "title": "Invalid metadata", "category": "foo"} + ) diff --git a/tests/fixtures/catalog.yml b/tests/fixtures/catalog.yml index 061e435a..0de3584c 100644 --- a/tests/fixtures/catalog.yml +++ b/tests/fixtures/catalog.yml @@ -11,6 +11,8 @@ datasets: timestamp: 2021 - name: leak title: Some data leak + category: leak + frequency: unknown - name: company_data title: Company data publisher: