Skip to content

Commit

Permalink
Improve DCAT catalog exposed (#2860)
Browse files Browse the repository at this point in the history
  • Loading branch information
maudetes committed Jul 12, 2023
1 parent b6e4a15 commit f85f94c
Show file tree
Hide file tree
Showing 11 changed files with 59 additions and 27 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

- Improve DCAT harvest of mime type [#2857](https://github.com/opendatateam/udata/pull/2857)
- Don't crash on files not found when purging resources [#2858](https://github.com/opendatateam/udata/pull/2858)
- Improve DCAT catalog exposed [#2860](https://github.com/opendatateam/udata/pull/2860)
- Use the resource's extra `analysis:last-modified-at` in the `last_modified` property [#2863](https://github.com/opendatateam/udata/pull/2863)
- Add optional harvest validation form [#2864](https://github.com/opendatateam/udata/pull/2864)
- Fix dataset list default sorting [#2867](https://github.com/opendatateam/udata/pull/2867)
Expand Down
2 changes: 1 addition & 1 deletion docs/harvesting.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ Fields are extracted according these rules:
| last_modified | dct:modified | |
| format | dct:format | |
| mime | dcat:mediaType | |
| filesize | dcat:bytesSize | |
| filesize | dcat:byteSize | |
| checksum | spdx:checksum | See [Checksum](#checksum) |


Expand Down
2 changes: 1 addition & 1 deletion docs/rdf.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ and fields are mapped according to:
| last_modified | dct:modified | |
| format | dct:format | |
| mime | dcat:mediaType | |
| filesize | dcat:bytesSize | |
| filesize | dcat:byteSize | |
| checksum | spdx:checksum | |

| TemporalCoverage | dct:PeriodOfTime |
Expand Down
29 changes: 23 additions & 6 deletions udata/core/dataset/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,8 @@ def temporal_to_rdf(daterange, graph=None):
graph = graph or Graph(namespace_manager=namespace_manager)
pot = graph.resource(BNode())
pot.set(RDF.type, DCT.PeriodOfTime)
pot.set(SCHEMA.startDate, Literal(daterange.start))
pot.set(SCHEMA.endDate, Literal(daterange.end))
pot.set(DCAT.startDate, Literal(daterange.start))
pot.set(DCAT.endDate, Literal(daterange.end))
return pot


Expand All @@ -114,6 +114,17 @@ def frequency_to_rdf(frequency, graph=None):
return RDF_FREQUENCIES.get(frequency, getattr(FREQ, frequency))


def owner_to_rdf(dataset, graph=None):
    """Map a dataset's producer to its RDF representation.

    The owning user takes precedence over the organization; returns the
    RDF resource for whichever is set, or None when the dataset has neither.
    Imports are deferred to avoid circular imports between rdf modules.
    """
    from udata.core.organization.rdf import organization_to_rdf
    from udata.core.user.rdf import user_to_rdf

    owner = dataset.owner
    if owner:
        return user_to_rdf(owner, graph)
    organization = dataset.organization
    if organization:
        return organization_to_rdf(organization, graph)
    return None


def resource_to_rdf(resource, dataset=None, graph=None):
'''
Map a Resource domain model to a DCAT/RDF graph
Expand All @@ -125,7 +136,8 @@ def resource_to_rdf(resource, dataset=None, graph=None):
_anchor='resource-{0}'.format(resource.id)))
else:
id = BNode(resource.id)
permalink = endpoint_for('datasets.resource', 'api.resource_redirect', id=resource.id, _external=True)
permalink = endpoint_for('datasets.resource', 'api.resource_redirect', id=resource.id,
_external=True)
r = graph.resource(id)
r.set(RDF.type, DCAT.Distribution)
r.set(DCT.identifier, Literal(resource.id))
Expand All @@ -140,7 +152,7 @@ def resource_to_rdf(resource, dataset=None, graph=None):
if dataset.license.url:
r.add(DCT.license, URIRef(dataset.license.url))
if resource.filesize is not None:
r.add(DCAT.bytesSize, Literal(resource.filesize))
r.add(DCAT.byteSize, Literal(resource.filesize))
if resource.mime:
r.add(DCAT.mediaType, Literal(resource.mime))
if resource.format:
Expand Down Expand Up @@ -199,6 +211,10 @@ def dataset_to_rdf(dataset, graph=None):
if frequency:
d.set(DCT.accrualPeriodicity, frequency)

publisher = owner_to_rdf(dataset, graph)
if publisher:
d.set(DCT.publisher, publisher)

return d


Expand Down Expand Up @@ -402,7 +418,7 @@ def resource_from_rdf(graph_or_distrib, dataset=None):
resource.title = title_from_rdf(distrib, url)
resource.url = url
resource.description = sanitize_html(distrib.value(DCT.description))
resource.filesize = rdf_value(distrib, DCAT.bytesSize)
resource.filesize = rdf_value(distrib, DCAT.byteSize)
resource.mime = mime_from_rdf(distrib)
resource.format = format_from_rdf(distrib)
checksum = distrib.value(SPDX.checksum)
Expand Down Expand Up @@ -451,7 +467,8 @@ def dataset_from_rdf(graph, dataset=None, node=None):
dataset.acronym = acronym

tags = [tag.toPython() for tag in d.objects(DCAT.keyword)]
tags += [theme.toPython() for theme in d.objects(DCAT.theme) if not isinstance(theme, RdfResource)]
tags += [theme.toPython() for theme in d.objects(DCAT.theme)
if not isinstance(theme, RdfResource)]
dataset.tags = list(set(tags))

temporal_coverage = temporal_from_rdf(d.value(DCT.temporal))
Expand Down
10 changes: 6 additions & 4 deletions udata/core/organization/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from rdflib import Graph, URIRef, Literal, BNode
from rdflib.namespace import RDF, RDFS, FOAF

from udata.rdf import DCAT, DCT, DCAT, namespace_manager, paginate_catalog
from udata.rdf import DCAT, DCT, namespace_manager, paginate_catalog

from udata.core.dataset.rdf import dataset_to_rdf
from udata.utils import Paginable
Expand All @@ -21,7 +21,7 @@ def organization_to_rdf(org, graph=None):
graph = graph or Graph(namespace_manager=namespace_manager)
if org.id:
org_url = endpoint_for('organizations.show_redirect', 'api.organization',
org=org.id, _external=True)
org=org.id, _external=True)
id = URIRef(org_url)
else:
id = BNode()
Expand All @@ -42,13 +42,15 @@ def build_org_catalog(org, datasets, format=None):
catalog = graph.resource(URIRef(org_catalog_url))
catalog.set(RDF.type, DCAT.Catalog)
catalog.set(DCT.publisher, organization_to_rdf(org, graph))
catalog.set(DCT.title, Literal(f"{org.name}"))
catalog.set(DCT.description, Literal(f"{org.name}"))

for dataset in datasets:
catalog.add(DCAT.dataset, dataset_to_rdf(dataset, graph))

values = {'org': org.id}

if isinstance(datasets, Paginable):
paginate_catalog(catalog, graph, datasets, format, 'api.organization_rdf_format', **values)

return catalog
3 changes: 2 additions & 1 deletion udata/core/site/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from udata.core.dataset.rdf import dataset_to_rdf
from udata.core.organization.rdf import organization_to_rdf
from udata.core.user.rdf import user_to_rdf
from udata.rdf import DCAT, DCT, HYDRA, namespace_manager, paginate_catalog
from udata.rdf import DCAT, DCT, namespace_manager, paginate_catalog
from udata.utils import Paginable
from udata.uris import endpoint_for

Expand All @@ -22,6 +22,7 @@ def build_catalog(site, datasets, format=None):

catalog.set(RDF.type, DCAT.Catalog)
catalog.set(DCT.title, Literal(site.title))
catalog.set(DCT.description, Literal(f"{site.title}"))
catalog.set(DCT.language,
Literal(current_app.config['DEFAULT_LANGUAGE']))
catalog.set(FOAF.homepage, URIRef(site_url))
Expand Down
1 change: 1 addition & 0 deletions udata/harvest/tests/dcat/catalog.xml
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@
</dcterms:MediaTypeOrExtent>
</dcterms:format>
<dcat:mediaType rdf:resource="https://www.iana.org/assignments/media-types/application/json" />
<dcat:byteSize>12323</dcat:byteSize>
<dcterms:description>A JSON resource</dcterms:description>
<dcat:accessURL>http://data.test.org/datasets/1/resources/1/file.json</dcat:accessURL>
</dcat:Distribution>
Expand Down
11 changes: 6 additions & 5 deletions udata/harvest/tests/test_dcat_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,7 @@ def test_xml_catalog(self, rmock):
# Format is a IANA URI
assert resource_1.format == 'json'
assert resource_1.mime == 'application/json'
assert resource_1.filesize == 12323
assert resource_1.description == 'A JSON resource'
assert resource_1.url == 'http://data.test.org/datasets/1/resources/1/file.json'

Expand All @@ -305,11 +306,11 @@ def test_geonetwork_xml_catalog(self, rmock):
dataset = Dataset.objects.filter(organization=org).first()
assert dataset is not None
assert dataset.harvest is not None
assert dataset.harvest.remote_id == '0c456d2d-9548-4a2a-94ef-231d9d890ce2 https://sig.oreme.org/geonetwork/srv/resources0c456d2d-9548-4a2a-94ef-231d9d890ce2'
assert dataset.harvest.dct_identifier == '0c456d2d-9548-4a2a-94ef-231d9d890ce2 https://sig.oreme.org/geonetwork/srv/resources0c456d2d-9548-4a2a-94ef-231d9d890ce2'
assert dataset.harvest.remote_id == '0c456d2d-9548-4a2a-94ef-231d9d890ce2 https://sig.oreme.org/geonetwork/srv/resources0c456d2d-9548-4a2a-94ef-231d9d890ce2' # noqa
assert dataset.harvest.dct_identifier == '0c456d2d-9548-4a2a-94ef-231d9d890ce2 https://sig.oreme.org/geonetwork/srv/resources0c456d2d-9548-4a2a-94ef-231d9d890ce2' # noqa
assert dataset.harvest.created_at.date() == date(2004, 11, 3)
assert dataset.harvest.modified_at is None
assert dataset.harvest.uri == 'https://sig.oreme.org/geonetwork/srv/resources/datasets/0c456d2d-9548-4a2a-94ef-231d9d890ce2 https://sig.oreme.org/geonetwork/srv/resources0c456d2d-9548-4a2a-94ef-231d9d890ce2'
assert dataset.harvest.uri == 'https://sig.oreme.org/geonetwork/srv/resources/datasets/0c456d2d-9548-4a2a-94ef-231d9d890ce2 https://sig.oreme.org/geonetwork/srv/resources0c456d2d-9548-4a2a-94ef-231d9d890ce2' # noqa
assert dataset.harvest.remote_url is None # the uri validation failed
assert dataset.description.startswith('Data of type chemistry')
assert dataset.temporal_coverage is not None
Expand Down Expand Up @@ -338,8 +339,8 @@ def test_sigoreme_xml_catalog(self, rmock):
assert dataset.harvest.remote_id == '0437a976-cff1-4fa6-807a-c23006df2f8f'
assert dataset.harvest.created_at is None
assert dataset.harvest.modified_at is None
assert dataset.harvest.uri == 'https://sig.oreme.org/geonetwork/srv/eng/catalog.search#/metadata//datasets/0437a976-cff1-4fa6-807a-c23006df2f8f'
assert dataset.harvest.remote_url == 'https://sig.oreme.org/geonetwork/srv/eng/catalog.search#/metadata//datasets/0437a976-cff1-4fa6-807a-c23006df2f8f'
assert dataset.harvest.uri == 'https://sig.oreme.org/geonetwork/srv/eng/catalog.search#/metadata//datasets/0437a976-cff1-4fa6-807a-c23006df2f8f' # noqa
assert dataset.harvest.remote_url == 'https://sig.oreme.org/geonetwork/srv/eng/catalog.search#/metadata//datasets/0437a976-cff1-4fa6-807a-c23006df2f8f' # noqa
assert dataset.harvest.last_update.date() == date.today()

def test_unsupported_mime_type(self, rmock):
Expand Down
21 changes: 13 additions & 8 deletions udata/tests/dataset/test_dataset_rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from flask import url_for

from rdflib import Graph, URIRef, Literal, BNode
from rdflib.namespace import RDF
from rdflib.namespace import FOAF, RDF
from rdflib.resource import Resource as RdfResource

from udata.models import db
Expand All @@ -20,6 +20,7 @@
temporal_from_rdf, frequency_to_rdf, frequency_from_rdf,
EU_RDF_REQUENCIES
)
from udata.core.organization.factories import OrganizationFactory
from udata.i18n import gettext as _
from udata.rdf import DCAT, DCT, FREQ, SPDX, SCHEMA, SKOS
from udata.utils import faker
Expand Down Expand Up @@ -66,8 +67,9 @@ def test_minimal(self):

def test_all_dataset_fields(self):
resources = ResourceFactory.build_batch(3)
org = OrganizationFactory(name="organization")
dataset = DatasetFactory(tags=faker.words(nb=3), resources=resources,
frequency='daily', acronym='acro')
frequency='daily', acronym='acro', organization=org)
d = dataset_to_rdf(dataset)
g = d.graph

Expand All @@ -90,6 +92,9 @@ def test_all_dataset_fields(self):
expected_tags = set(Literal(t) for t in dataset.tags)
assert set(d.objects(DCAT.keyword)) == expected_tags
assert len(list(d.objects(DCAT.distribution))) == len(resources)
org = d.value(DCT.publisher)
assert org.value(RDF.type).identifier == FOAF.Organization
assert org.value(FOAF.name) == Literal("organization")

def test_map_unkownn_frequencies(self):
assert frequency_to_rdf('hourly') == FREQ.continuous
Expand Down Expand Up @@ -139,7 +144,7 @@ def test_all_resource_fields(self):
assert r.value(DCT.rights) == Literal(license.title)
assert r.value(DCAT.downloadURL).identifier == URIRef(resource.url)
assert r.value(DCAT.accessURL).identifier == URIRef(permalink)
assert r.value(DCAT.bytesSize) == Literal(resource.filesize)
assert r.value(DCAT.byteSize) == Literal(resource.filesize)
assert r.value(DCAT.mediaType) == Literal(resource.mime)
assert r.value(DCT.format) == Literal(resource.format)

Expand All @@ -161,8 +166,8 @@ def test_temporal_coverage(self):
pot = d.value(DCT.temporal)

assert pot.value(RDF.type).identifier == DCT.PeriodOfTime
assert pot.value(SCHEMA.startDate).toPython() == start
assert pot.value(SCHEMA.endDate).toPython() == end
assert pot.value(DCAT.startDate).toPython() == start
assert pot.value(DCAT.endDate).toPython() == end

def test_from_external_repository(self):
dataset = DatasetFactory(harvest=HarvestDatasetMetadata(
Expand Down Expand Up @@ -272,8 +277,8 @@ def test_all_fields(self):
pot = BNode()
g.add((node, DCT.temporal, pot))
g.set((pot, RDF.type, DCT.PeriodOfTime))
g.set((pot, SCHEMA.startDate, Literal(start)))
g.set((pot, SCHEMA.endDate, Literal(end)))
g.set((pot, DCAT.startDate, Literal(start)))
g.set((pot, DCAT.endDate, Literal(end)))
for tag in tags:
g.add((node, DCAT.keyword, Literal(tag)))

Expand Down Expand Up @@ -391,7 +396,7 @@ def test_all_resource_fields(self):
g.add((node, DCAT.downloadURL, Literal(url)))
g.add((node, DCT.issued, Literal(issued)))
g.add((node, DCT.modified, Literal(modified)))
g.add((node, DCAT.bytesSize, Literal(filesize)))
g.add((node, DCAT.byteSize, Literal(filesize)))
g.add((node, DCAT.mediaType, Literal(mime)))
g.add((node, DCT.format, Literal('CSV')))

Expand Down
5 changes: 4 additions & 1 deletion udata/tests/organization/test_organization_rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def test_catalog(self):

datasets = VisibleDatasetFactory.create_batch(3, organization=origin_org)
catalog = build_org_catalog(origin_org, datasets)

graph = catalog.graph

self.assertIsInstance(catalog, RdfResource)
Expand All @@ -77,6 +77,9 @@ def test_catalog(self):
self.assertEqual(org.value(RDF.type).identifier, FOAF.Organization)
self.assertEqual(org.value(FOAF.name), Literal(origin_org.name))

self.assertEqual(catalog.value(DCT.title), Literal(f"{origin_org.name}"))
self.assertEqual(catalog.value(DCT.description), Literal(f"{origin_org.name}"))

graph = catalog.graph
graph_datasets = graph.subjects(RDF.type, DCAT.Dataset)
self.assertEqual(len(list(graph_datasets)), len(datasets))
Expand Down
1 change: 1 addition & 0 deletions udata/tests/site/test_site_rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def test_minimal(self, app):
assert isinstance(catalog.identifier, URIRef)
assert str(catalog.identifier) == uri
assert catalog.value(DCT.title) == Literal(site.title)
assert catalog.value(DCT.description) == Literal(f"{site.title}")
lang = app.config['DEFAULT_LANGUAGE']
assert catalog.value(DCT.language) == Literal(lang)

Expand Down

0 comments on commit f85f94c

Please sign in to comment.