Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ISO DCAT XSLT conversion #2982

Merged
merged 23 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
4ecb725
Explore using GeoDCAT-AP XSLT for CSW parsing
maudetes Jan 25, 2024
ab60013
Dedicated csw-iso-xslt-dcat for testing
maudetes Feb 13, 2024
e4fac29
Stop building DCAT graph when max items reached
maudetes Feb 13, 2024
7b7dae3
Merge branch 'master' into feat/iso-dcat-xslt-conversion
maudetes Feb 19, 2024
96684bd
Use CSW to do the pagination
ThibaudDauce Mar 4, 2024
2453cdb
Disable URL resolution
ThibaudDauce Mar 4, 2024
92659a6
fix tests
ThibaudDauce Mar 4, 2024
c2f4635
Merge branch 'master' into feat/iso-dcat-xslt-conversion
ThibaudDauce Mar 5, 2024
e9c6bb1
Update changelog
ThibaudDauce Mar 5, 2024
b42cba8
Merge branch 'master' into feat/iso-dcat-xslt-conversion
maudetes Mar 12, 2024
605ea12
Merge branch 'master' into feat/iso-dcat-xslt-conversion
maudetes Mar 14, 2024
16a7ca6
Merge branch 'master' into feat/iso-dcat-xslt-conversion
ThibaudDauce Mar 19, 2024
ea93843
First tests on CSW ISO XSLT DCAT harvester
maudetes Mar 19, 2024
f14b09e
Add new harvester option in parse_url command
maudetes Mar 19, 2024
e11f0c5
Add text/xml in rdf mime types
maudetes Mar 19, 2024
7daeaf1
disable CoupledResourceLookUp and remove EmptyResolver hack
maudetes Mar 20, 2024
23cccf1
Merge branch 'master' into feat/iso-dcat-xslt-conversion
maudetes Mar 20, 2024
342ac45
Rename CSW ISO backend
maudetes Mar 20, 2024
91c4e33
Merge branch 'master' into feat/iso-dcat-xslt-conversion
maudetes Mar 21, 2024
109a3a3
Merge branch 'master' into feat/iso-dcat-xslt-conversion
maudetes Mar 22, 2024
f914d52
Add filter on dc:type dataset
maudetes Mar 22, 2024
153302c
Merge branch 'master' into feat/iso-dcat-xslt-conversion
maudetes Mar 22, 2024
050514e
Merge branch 'master' into feat/iso-dcat-xslt-conversion
ThibaudDauce Mar 26, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## Current (in progress)

- Add new harvester for ISO DCAT with XSLT transform [#2982](https://github.com/opendatateam/udata/pull/2982)
- Fix, do not fail on spatial coverage harvesting exception and allow literal spatial BBOX from Arcgis [#2998](https://github.com/opendatateam/udata/pull/2998)
- Mock calls to example.com [#3000](https://github.com/opendatateam/udata/pull/3000)
- Fix duplicate logs in console commands [#2996](https://github.com/opendatateam/udata/pull/2996)
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def pip(filename):
'udata.harvesters': [
'dcat = udata.harvest.backends.dcat:DcatBackend',
'csw-dcat = udata.harvest.backends.dcat:CswDcatBackend',
'csw-iso-19139 = udata.harvest.backends.dcat:CswIso19139DcatBackend'
],
'udata.avatars': [
'internal = udata.features.identicon.backends:internal',
Expand Down
11 changes: 7 additions & 4 deletions udata/commands/dcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from udata.commands import cli, green, yellow, cyan, echo, magenta
from udata.core.dataset.factories import DatasetFactory
from udata.core.dataset.rdf import dataset_from_rdf
from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend
from udata.harvest.backends.dcat import DcatBackend, CswDcatBackend, CswIso19139DcatBackend
from udata.rdf import namespace_manager

log = logging.getLogger(__name__)
Expand All @@ -23,9 +23,10 @@ def grp():
@grp.command()
@click.argument('url')
@click.option('-q', '--quiet', is_flag=True, help='Ignore warnings')
@click.option('-i', '--rid', help='Inspect specific remote id (contains)')
@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint')
def parse_url(url, csw, quiet=False, rid=''):
@click.option('-r', '--rid', help='Inspect specific remote id (contains)')
@click.option('-c', '--csw', is_flag=True, help='The target is a CSW endpoint with DCAT output')
@click.option('-i', '--iso', is_flag=True, help='The target is a CSW endpoint with ISO output')
def parse_url(url, csw, iso, quiet=False, rid=''):
'''Parse the datasets in a DCAT format located at URL (debug)'''
if quiet:
verbose_loggers = ['rdflib', 'udata.core.dataset']
Expand All @@ -49,6 +50,8 @@ def _create(cls, model_class, *args, **kwargs):
source.url = url
if csw:
backend = CswDcatBackend(source, dryrun=True)
elif iso:
backend = CswIso19139DcatBackend(source, dryrun=True)
else:
backend = DcatBackend(source, dryrun=True)
backend.job = MockJob()
Expand Down
152 changes: 128 additions & 24 deletions udata/harvest/backends/dcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from rdflib import Graph, URIRef
from rdflib.namespace import RDF
import xml.etree.ElementTree as ET
import lxml.etree as ET
import boto3
from flask import current_app
from datetime import date
Expand Down Expand Up @@ -173,7 +173,36 @@ def process(self, item):
dataset = self.get_dataset(item.remote_id)
dataset = dataset_from_rdf(graph, dataset, node=node)
return dataset


def next_record_if_should_continue(self, start, search_results):
    '''
    Return the next CSW start position for pagination, or None to stop.

    :param start: the start position used for the current page
    :param search_results: the csw:SearchResults element of the current page
    :return: the next start position (int) or None when harvesting should end
    '''
    attrs = search_results.attrib
    next_record = int(attrs['nextRecord'])
    matched = int(attrs['numberOfRecordsMatched'])
    returned = int(attrs['numberOfRecordsReturned'])

    # Stop conditions copied gratefully from
    # noqa https://github.com/geonetwork/core-geonetwork/blob/main/harvesters/src/main/java/org/fao/geonet/kernel/harvest/harvester/csw/Harvester.java#L338-L369
    if next_record == 0:
        # Standard CSW: a value of 0 means all records have been returned.
        return None
    if next_record > matched:
        # Misbehaving CSW server returning a next record > matched count.
        return None
    if returned == 0:
        # No results returned already.
        return None
    if next_record < start:
        # Current next record is lower than the previous one.
        return None
    if self.max_items and len(self.job.items) >= self.max_items:
        # Enough items have been harvested already.
        return None
    return next_record

class CswDcatBackend(DcatBackend):
display_name = 'CSW-DCAT'
Expand Down Expand Up @@ -201,17 +230,18 @@ def parse_graph(self, url: str, fmt: str) -> List[Graph]:
graphs = []
page = 0
start = 1

response = self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
headers=headers)
response.raise_for_status()
content = response.text
content = response.content
tree = ET.fromstring(content)
if tree.tag == '{' + OWS_NAMESPACE + '}ExceptionReport':
raise ValueError(f'Failed to query CSW:\n{content}')
while tree:
graph = Graph(namespace_manager=namespace_manager)
search_results = tree.find('csw:SearchResults', {'csw': CSW_NAMESPACE})
if not search_results:
if search_results is None:
log.error(f'No search results found for {url} on page {page}')
break
for child in search_results:
Expand All @@ -225,37 +255,111 @@ def parse_graph(self, url: str, fmt: str) -> List[Graph]:
kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
self.add_item(id, **kwargs)
graphs.append(graph)

next_record = self.next_record_if_should_continue(start, search_results)
if not next_record:
break

start = next_record
page += 1

next_record = int(search_results.attrib['nextRecord'])
matched_count = int(search_results.attrib['numberOfRecordsMatched'])
returned_count = int(search_results.attrib['numberOfRecordsReturned'])
tree = ET.fromstring(
self.post(url, data=body.format(start=start, schema=self.DCAT_SCHEMA),
headers=headers).content)

return graphs



class CswIso19139DcatBackend(DcatBackend):
    '''
    A harvester that takes CSW ISO 19139 as input and transforms it to DCAT
    using the SEMIC GeoDCAT-AP XSLT.
    The parsing of items is then the same as for the DcatBackend.
    '''

    display_name = 'CSW-ISO-19139'

    ISO_SCHEMA = 'http://www.isotc211.org/2005/gmd'
    XSL_URL = "https://raw.githubusercontent.com/SEMICeu/iso-19139-to-dcat-ap/master/iso-19139-to-dcat-ap.xsl"

    def parse_graph(self, url: str, fmt: str) -> List[Graph]:
        '''
        Parse a CSW endpoint, querying the ISO 19139 schema.

        Use SEMIC GeoDCAT-AP XSLT to map each result page to DCAT.
        See https://github.com/SEMICeu/iso-19139-to-dcat-ap for more
        information on the XSLT.

        :param url: CSW endpoint URL to query
        :param fmt: RDF serialization format of the transformed output
        :return: one parsed Graph per CSW result page
        '''
        # Load the XSLT stylesheet once; reuse the compiled transform for
        # every page instead of re-fetching/re-parsing it.
        xsl = ET.fromstring(self.get(self.XSL_URL).content)
        transform = ET.XSLT(xsl)

        # GetRecords request template. Only records with dc:type == dataset
        # are requested, in the ISO 19139 (gmd) output schema.
        # NOTE(review): CSW 2.0.2 names the paging attribute "maxRecords";
        # "maxPosition" may be ignored by servers — confirm against targets.
        body = '''<csw:GetRecords xmlns:csw="http://www.opengis.net/cat/csw/2.0.2"
                             xmlns:gmd="http://www.isotc211.org/2005/gmd"
                             service="CSW" version="2.0.2" resultType="results"
                             startPosition="{start}" maxPosition="10"
                             outputSchema="{schema}">
            <csw:Query typeNames="csw:Record">
                <csw:ElementSetName>full</csw:ElementSetName>
                <csw:Constraint version="1.1.0">
                    <ogc:Filter xmlns:ogc="http://www.opengis.net/ogc">
                        <ogc:PropertyIsEqualTo>
                            <ogc:PropertyName>dc:type</ogc:PropertyName>
                            <ogc:Literal>dataset</ogc:Literal>
                        </ogc:PropertyIsEqualTo>
                    </ogc:Filter>
                </csw:Constraint>
            </csw:Query>
        </csw:GetRecords>'''
        headers = {'Content-Type': 'application/xml'}

        graphs = []
        page = 0
        start = 1

        response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
                             headers=headers)
        response.raise_for_status()

        tree_before_transform = ET.fromstring(response.content)
        # Disabling CoupledResourceLookUp to prevent failure on xlink:href
        # https://github.com/SEMICeu/iso-19139-to-dcat-ap/blob/master/documentation/HowTo.md#parameter-coupledresourcelookup
        tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")

        while tree:
            # We query the tree before the transformation because the XSLT
            # removes the search results infos (useful for pagination).
            search_results = tree_before_transform.find('csw:SearchResults',
                                                        {'csw': CSW_NAMESPACE})
            if search_results is None:
                log.error(f'No search results found for {url} on page {page}')
                break

            subgraph = Graph(namespace_manager=namespace_manager)
            subgraph.parse(ET.tostring(tree), format=fmt)

            # `subjects()` returns a generator, which is always truthy:
            # `if not subgraph.subjects(...)` could never raise. Use any()
            # to actually detect an empty transformation result.
            if not any(subgraph.subjects(RDF.type, DCAT.Dataset)):
                raise ValueError("Failed to fetch CSW content")

            for node in subgraph.subjects(RDF.type, DCAT.Dataset):
                id = subgraph.value(node, DCT.identifier)
                kwargs = {'nid': str(node), 'page': page}
                kwargs['type'] = 'uriref' if isinstance(node, URIRef) else 'blank'
                self.add_item(id, **kwargs)
            graphs.append(subgraph)

            # Shared pagination logic (GeoNetwork-style break conditions).
            next_record = self.next_record_if_should_continue(start, search_results)
            if not next_record:
                break

            start = next_record
            page += 1

            response = self.post(url, data=body.format(start=start, schema=self.ISO_SCHEMA),
                                 headers=headers)
            response.raise_for_status()

            tree_before_transform = ET.fromstring(response.content)
            tree = transform(tree_before_transform, CoupledResourceLookUp="'disabled'")

        return graphs
Loading