Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ISO DCAT XSLT conversion #2982

Merged
merged 23 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
4ecb725
Explore using GeoDCAT-AP XSLT for CSW parsing
maudetes Jan 25, 2024
ab60013
Dedicated csw-iso-xslt-dcat for testing
maudetes Feb 13, 2024
e4fac29
Stop building DCAT graph when max items reached
maudetes Feb 13, 2024
7b7dae3
Merge branch 'master' into feat/iso-dcat-xslt-conversion
maudetes Feb 19, 2024
96684bd
Use CSW to do the pagination
ThibaudDauce Mar 4, 2024
2453cdb
Disable URL resolution
ThibaudDauce Mar 4, 2024
92659a6
fix tests
ThibaudDauce Mar 4, 2024
c2f4635
Merge branch 'master' into feat/iso-dcat-xslt-conversion
ThibaudDauce Mar 5, 2024
e9c6bb1
Update changelog
ThibaudDauce Mar 5, 2024
b42cba8
Merge branch 'master' into feat/iso-dcat-xslt-conversion
maudetes Mar 12, 2024
605ea12
Merge branch 'master' into feat/iso-dcat-xslt-conversion
maudetes Mar 14, 2024
16a7ca6
Merge branch 'master' into feat/iso-dcat-xslt-conversion
ThibaudDauce Mar 19, 2024
ea93843
First tests on CSW ISO XSLT DCAT harvester
maudetes Mar 19, 2024
f14b09e
Add new harvester option in parse_url command
maudetes Mar 19, 2024
e11f0c5
Add text/xml in rdf mime types
maudetes Mar 19, 2024
7daeaf1
disable CoupledResourceLookUp and remove EmptyResolver hack
maudetes Mar 20, 2024
23cccf1
Merge branch 'master' into feat/iso-dcat-xslt-conversion
maudetes Mar 20, 2024
342ac45
Rename CSW ISO backend
maudetes Mar 20, 2024
91c4e33
Merge branch 'master' into feat/iso-dcat-xslt-conversion
maudetes Mar 21, 2024
109a3a3
Merge branch 'master' into feat/iso-dcat-xslt-conversion
maudetes Mar 22, 2024
f914d52
Add filter on dc:type dataset
maudetes Mar 22, 2024
153302c
Merge branch 'master' into feat/iso-dcat-xslt-conversion
maudetes Mar 22, 2024
050514e
Merge branch 'master' into feat/iso-dcat-xslt-conversion
ThibaudDauce Mar 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## Current (in progress)

- Add new harvester for ISO DCAT with XSLT transform [#2982](https://github.com/opendatateam/udata/pull/2982)
- Fix, do not fail on spatial coverage harvesting exception and allow literal spatial BBOX from Arcgis [#2998](https://github.com/opendatateam/udata/pull/2998)
- Mock calls to example.com [#3000](https://github.com/opendatateam/udata/pull/3000)
- Fix duplicate logs in console commands [#2996](https://github.com/opendatateam/udata/pull/2996)
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def pip(filename):
'udata.harvesters': [
'dcat = udata.harvest.backends.dcat:DcatBackend',
'csw-dcat = udata.harvest.backends.dcat:CswDcatBackend',
'csw-iso-19139 = udata.harvest.backends.dcat:CswIso19139DcatBackend'
],
'udata.avatars': [
'internal = udata.features.identicon.backends:internal',
Expand Down
11 changes: 7 additions & 4 deletions udata/commands/dcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from udata.commands import cli, green, yellow, cyan, echo, magenta
from udata.core.dataset.factories import DatasetFactory
from udata.core.dataset.rdf import dataset_from_rdf
from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend
from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend, CswIso19139DcatBackend
from udata.rdf import namespace_manager

log = logging.getLogger(__name__)
Expand All @@ -23,9 +23,10 @@ def grp():
@grp.command()
@click.argument('url')
@click.option('-q', '--quiet', is_flag=True, help='Ignore warnings')
@click.option('-i', '--rid', help='Inspect specific remote id (contains)')
@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint')
def parse_url(url, csw, quiet=False, rid=''):
@click.option('-r', '--rid', help='Inspect specific remote id (contains)')
@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint with DCAT output')
@click.option('-i', '--iso', is_flag=True, help='The target is a CSW endpoint with ISO output')
def parse_url(url, csw, iso, quiet=False, rid=''):
'''Parse the datasets in a DCAT format located at URL (debug)'''
if quiet:
verbose_loggers = ['rdflib', 'udata.core.dataset']
Expand All @@ -49,6 +50,8 @@ def _create(cls, model_class, *args, **kwargs):
source.url = url
if csw:
backend = CswDcatBackend(source, dryrun=True)
elif iso:
backend = CswIso19139DcatBackend(source, dryrun=True)
else:
backend = DcatBackend(source, dryrun=True)
backend.job = MockJob()
Expand Down
152 changes: 128 additions & 24 deletions udata/harvest/backends/dcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from rdflib import Graph, URIRef
from rdflib.namespace import RDF
import xml.etree.ElementTree as ET
import lxml.etree as ET
import boto3
from flask import current_app
from datetime import date
Expand Down Expand Up @@ -173,7 +173,36 @@ def process(self, item):
dataset = self.get_dataset(item.remote_id)
dataset = dataset_from_rdf(graph, dataset, node=node)
return dataset


def next_record_if_should_continue(self, start, search_results):
    '''
    Return the next CSW start position for pagination, or None to stop.

    :param start: the start position used for the current page
    :param search_results: the csw:SearchResults element of the current page
    :return: the next start position (int) or None when harvesting should end
    '''
    attrs = search_results.attrib
    next_record = int(attrs['nextRecord'])
    matched = int(attrs['numberOfRecordsMatched'])
    returned = int(attrs['numberOfRecordsReturned'])

    # Stop conditions copied gratefully from
    # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
    if next_record == 0:
        # Standard CSW: a value of 0 means all records have been returned.
        return None
    if next_record > matched:
        # Misbehaving CSW server returning a next record > matched count.
        return None
    if returned == 0:
        # No results returned already.
        return None
    if next_record < start:
        # Current next record is lower than the previous one.
        return None
    if self.max_items and len(self.job.items) >= self.max_items:
        # Enough items have been harvested already.
        return None
    return next_record

class CswDcatBackend(DcatBackend):
display_name = 'CSW-DCAT'
Expand Down Expand Up @@ -201,17 +230,18 @@ def parse_graph(self, url: str, fmt: str) -> List[Graph]:
graphs = []
page = 0
start = 1

response = self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
headers=headers)
response.raise_for_status()
content = response.text
content = response.content
tree = ET.fromstring(content)
if tree.tag == '{' + OWS_NAMESPACE + '}ExceptionReport':
raise ValueError(f'Failed to query CSW:\n{content}')
while tree:
graph = Graph(namespace_manager=namespace_manager)
search_results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
if not search_results:
if search_results is None:
log.error(f'No search results found for {url} on page {page}')
break
for child in search_results:
Expand All @@ -225,37 +255,111 @@ def parse_graph(self, url: str, fmt: str) -> List[Graph]:
kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
self.add_item(id, **kwargs)
graphs.append(graph)

next_record = self.next_record_if_should_continue(start, search_results)
if not next_record:
break

start = next_record
page += 1

next_record = int(search_results.attrib['nextRecord'])
matched_count = int(search_results.attrib['numberOfRecordsMatched'])
returned_count = int(search_results.attrib['numberOfRecordsReturned'])
tree = ET.fromstring(
self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
headers=headers).content)

return graphs



class CswIso19139DcatBackend(DcatBackend):
    '''
    A harvester that takes CSW ISO 19139 as input and transforms it to DCAT
    using the SEMIC GeoDCAT-AP XSLT.
    The parsing of items is then the same as for the DcatBackend.
    '''

    display_name = 'CSW-ISO-19139'

    ISO_SCHEMA = 'http://www.isotc211.org/2005/gmd'
    XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl"

    def parse_graph(self, url: str, fmt: str) -> List[Graph]:
        '''
        Parse a CSW endpoint, querying the ISO 19139 schema.

        Use SEMIC GeoDCAT-AP XSLT to map each result page to DCAT.
        See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more
        information on the XSLT.

        :param url: CSW endpoint URL to query
        :param fmt: RDF serialization format of the transformed output
        :return: one parsed Graph per CSW result page
        '''
        # Load the XSLT stylesheet once; reuse the compiled transform for
        # every page instead of re-fetching/re-parsing it.
        xsl = ET.fromstring(self.get(self.XSL_URL).content)
        transform = ET.XSLT(xsl)

        # GetRecords request template. Only records with dc:type == dataset
        # are requested, in the ISO 19139 (gmd) output schema.
        # NOTE(review): CSW 2.0.2 names the paging attribute "maxRecords";
        # "maxPosition" may be ignored by servers — confirm against targets.
        body = '''<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
                             xmlns:gmd="http://www.isotc211.org/2005/gmd"
                             service="CSW" version="2.0.2" resultType="results"
                             startPosition="{start}" maxPosition="10"
                             outputSchema="{schema}">
            <csw:Query typeNames="csw:Record">
                <csw:ElementSetName>full</csw:ElementSetName>
                <csw:Constraint version="1.1.0">
                    <ogc:Filter xmlns:ogc="http://www.opengis.net/ogc">
                        <ogc:PropertyIsEqualTo>
                            <ogc:PropertyName>dc:type</ogc:PropertyName>
                            <ogc:Literal>dataset</ogc:Literal>
                        </ogc:PropertyIsEqualTo>
                    </ogc:Filter>
                </csw:Constraint>
            </csw:Query>
        </csw:GetRecords>'''
        headers = {'Content-Type': 'application/xml'}

        graphs = []
        page = 0
        start = 1

        response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
                             headers=headers)
        response.raise_for_status()

        tree_before_transform = ET.fromstring(response.content)
        # Disabling CoupledResourceLookUp to prevent failure on xlink:href
        # https://github.com/SEMICeu/iso-19139-to-dcat-ap/blob/master/documentation/HowTo.md#parameter-coupledresourcelookup
        tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")

        while tree:
            # We query the tree before the transformation because the XSLT
            # removes the search results infos (useful for pagination).
            search_results = tree_before_transform.find('csw:SearchResults',
                                                        {'csw': CSW_NAMESPACE})
            if search_results is None:
                log.error(f'No search results found for {url} on page {page}')
                break

            subgraph = Graph(namespace_manager=namespace_manager)
            subgraph.parse(ET.tostring(tree), format=fmt)

            # `subjects()` returns a generator, which is always truthy:
            # `if not subgraph.subjects(...)` could never raise. Use any()
            # to actually detect an empty transformation result.
            if not any(subgraph.subjects(RDF.type, DCAT.Dataset)):
                raise ValueError("Failed to fetch CSW content")

            for node in subgraph.subjects(RDF.type, DCAT.Dataset):
                id = subgraph.value(node, DCT.identifier)
                kwargs = {'nid': str(node), 'page': page}
                kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
                self.add_item(id, **kwargs)
            graphs.append(subgraph)

            # Shared pagination logic (GeoNetwork-style break conditions).
            next_record = self.next_record_if_should_continue(start, search_results)
            if not next_record:
                break

            start = next_record
            page += 1

            response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
                                 headers=headers)
            response.raise_for_status()

            tree_before_transform = ET.fromstring(response.content)
            tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")

        return graphs
Loading