From aafb615e5ccf097e10190ecc7523bb7d5c5e3fa9 Mon Sep 17 00:00:00 2001 From: David de Boer Date: Wed, 22 Mar 2023 09:20:00 +0100 Subject: [PATCH] feat: Support Hydra pagination (#708) --- jest.config.js | 8 +++--- src/comunica-config.json | 54 +++++++++++++++++++++++++++++++++++ src/fetch.ts | 46 ++++++++++++++++++++++++----- test/datasets/hydra-page1.ttl | 39 +++++++++++++++++++++++++ test/datasets/hydra-page2.ttl | 39 +++++++++++++++++++++++++ test/fetch.test.ts | 20 +++++++++++++ tsconfig.json | 2 +- 7 files changed, 196 insertions(+), 12 deletions(-) create mode 100644 src/comunica-config.json create mode 100644 test/datasets/hydra-page1.ttl create mode 100644 test/datasets/hydra-page2.ttl diff --git a/jest.config.js b/jest.config.js index 7c7b6011..4a16ac27 100644 --- a/jest.config.js +++ b/jest.config.js @@ -15,10 +15,10 @@ export default { coverageReporters: ['json-summary', 'text'], coverageThreshold: { global: { - lines: 69.15, - statements: 68.94, - branches: 55.1, - functions: 68.42, + lines: 69.81, + statements: 69.6, + branches: 60.36, + functions: 69.23, }, }, transform: {}, diff --git a/src/comunica-config.json b/src/comunica-config.json new file mode 100644 index 00000000..754239d1 --- /dev/null +++ b/src/comunica-config.json @@ -0,0 +1,54 @@ +{ + "@context": [ + "https://linkedsoftwaredependencies.org/bundles/npm/@comunica/config-query-sparql/^2.0.0/components/context.jsonld" + ], + "import": [ + "ccqs:config/context-preprocess/actors.json", + "ccqs:config/context-preprocess/mediators.json", + "ccqs:config/hash-bindings/actors.json", + "ccqs:config/hash-bindings/mediators.json", + "ccqs:config/http/actors.json", + "ccqs:config/http/mediators.json", + "ccqs:config/http-invalidate/actors.json", + "ccqs:config/http-invalidate/mediators.json", + "ccqs:config/init/actors.json", + "ccqs:config/optimize-query-operation/actors.json", + "ccqs:config/optimize-query-operation/mediators.json", + "ccqs:config/query-operation/actors.json", + "ccqs:config/query-operation/mediators.json", + "ccqs:config/query-parse/actors.json", + "ccqs:config/query-parse/mediators.json", + "ccqs:config/query-result-serialize/actors.json", + "ccqs:config/query-result-serialize/mediators.json", + "ccqs:config/dereference/actors.json", + "ccqs:config/dereference/mediators.json", + "ccqs:config/dereference-rdf/actors.json", + "ccqs:config/dereference-rdf/mediators.json", + "ccqs:config/rdf-join/actors.json", + "ccqs:config/rdf-join/mediators.json", + "ccqs:config/rdf-join-entries-sort/actors.json", + "ccqs:config/rdf-join-entries-sort/mediators.json", + "ccqs:config/rdf-join-selectivity/actors.json", + "ccqs:config/rdf-join-selectivity/mediators.json", + "ccqs:config/rdf-metadata/actors.json", + "ccqs:config/rdf-metadata/mediators.json", + "ccqs:config/rdf-metadata-extract/actors.json", + "ccqs:config/rdf-metadata-extract/mediators.json", + "ccqs:config/rdf-parse/actors.json", + "ccqs:config/rdf-parse/mediators.json", + "ccqs:config/rdf-parse-html/actors.json", + "ccqs:config/rdf-resolve-hypermedia/actors.json", + "ccqs:config/rdf-resolve-hypermedia/mediators.json", + "ccqs:config/rdf-resolve-hypermedia-links/mediators.json", + "ccqs:config/rdf-resolve-hypermedia-links-queue/actors.json", + "ccqs:config/rdf-resolve-hypermedia-links-queue/mediators.json", + "ccqs:config/rdf-resolve-quad-pattern/actors.json", + "ccqs:config/rdf-resolve-quad-pattern/mediators.json", + "ccqs:config/rdf-serialize/actors.json", + "ccqs:config/rdf-serialize/mediators.json", + "ccqs:config/rdf-update-hypermedia/actors.json", + "ccqs:config/rdf-update-hypermedia/mediators.json", + "ccqs:config/rdf-update-quads/actors.json", + "ccqs:config/rdf-update-quads/mediators.json" + ] +} diff --git a/src/fetch.ts b/src/fetch.ts index a961267b..d46a4f07 100644 --- a/src/fetch.ts +++ b/src/fetch.ts @@ -1,4 +1,4 @@ -import {QueryEngine} from '@comunica/query-sparql'; +import {QueryEngineFactory} from '@comunica/query-sparql'; import factory from 'rdf-ext'; import DatasetExt from 'rdf-ext/lib/Dataset'; import {URL} from 'url'; @@ -8,6 +8,7 @@ import {pipeline} from 'stream'; import {StandardizeSchemaOrgPrefixToHttps} from './transform'; import Pino from 'pino'; import {rdfDereferencer} from './rdf'; +import {resolve} from 'node:path'; export class HttpError extends Error { constructor(message: string, public readonly statusCode: number) { @@ -22,11 +23,10 @@ export class NoDatasetFoundAtUrl extends Error { } export async function fetch(url: URL): Promise { - let datasets = []; - try { - datasets = await query(url); - } catch (e) { - handleComunicaError(e); + let datasets = await doFetch(url); + const nextPage = await findNextPage(url); + if (nextPage !== null && nextPage !== url) { + datasets = [...datasets, ...(await doFetch(nextPage))]; } if (datasets.length === 0) { @@ -36,6 +36,14 @@ export async function fetch(url: URL): Promise { return datasets; } +export async function doFetch(url: URL) { + try { + return await query(url); + } catch (e) { + handleComunicaError(e); + } +} + /** * Fetch dataset description(s) by dereferencing the registration URL. */ @@ -53,7 +61,11 @@ export async function dereference(url: URL): Promise { } } -const engine = new QueryEngine(); +// Use custom config to disable "ccqs:config/rdf-resolve-hypermedia-links/actors.json", which causes +// many duplicate bindings and does not find any datasets on the second page ff. +const engine = await new QueryEngineFactory().create({ + configPath: resolve('src/comunica-config.json'), +}); /** * Fetch dataset descriptions by executing a SPARQL SELECT query. @@ -164,3 +176,23 @@ function handleComunicaError(e: unknown): never { throw e; } + +async function findNextPage(url: URL): Promise { + const bindingsStream = await engine.queryBindings( + ` + PREFIX hydra: + + SELECT ?nextPage { + ?s ?nextPagePredicate ?nextPage + VALUES ?nextPagePredicate { hydra:next hydra:nextPage } + } + `, + { + sources: [url.toString()], + } + ); + const bindings = await bindingsStream.toArray(); + const nextPage = bindings[0]?.get('nextPage')?.value; + + return nextPage ? new URL(nextPage) : null; +} diff --git a/test/datasets/hydra-page1.ttl b/test/datasets/hydra-page1.ttl new file mode 100644 index 00000000..18604c23 --- /dev/null +++ b/test/datasets/hydra-page1.ttl @@ -0,0 +1,39 @@ +@prefix adms: . +@prefix dcat: . +@prefix dct: . +@prefix foaf: . +@prefix gsp: . +@prefix hydra: . +@prefix locn: . +@prefix owl: . +@prefix rdf: . +@prefix rdfs: . +@prefix schema: . +@prefix skos: . +@prefix spdx: . +@prefix time: . +@prefix vcard: . +@prefix xml: . +@prefix xsd: . + + a hydra:PartialCollectionView ; + hydra:next . + + + a schema:Dataset ; + schema:name "B" ; + schema:dateModified "2021-09-05"^^xsd:date ; + schema:datePublished "2021-09-05"^^xsd:date ; + schema:description "Combinatie van kadastrale perceelnummers uit 1832 met Verpondingsnummers uit het verpondingsregister 1875 en de plaatselijke aanduiding uit het verpondingsregister." ; + schema:license ; + schema:publisher ; + schema:distribution . + + + a schema:Organization ; + schema:name "Gouda Tijdmachine" . + + + a schema:DataDownload ; + schema:contentUrl "https://www.goudatijdmachine.nl/wp-content/uploads/sites/7/2021/09/Totaal_perceel_Plaand_EPSG_4326.geojson" ; + schema:encodingFormat "application/geo+json" . diff --git a/test/datasets/hydra-page2.ttl b/test/datasets/hydra-page2.ttl new file mode 100644 index 00000000..57495924 --- /dev/null +++ b/test/datasets/hydra-page2.ttl @@ -0,0 +1,39 @@ +@prefix adms: . +@prefix dcat: . +@prefix dct: . +@prefix foaf: . +@prefix gsp: . +@prefix hydra: . +@prefix locn: . +@prefix owl: . +@prefix rdf: . +@prefix rdfs: . +@prefix schema: . +@prefix skos: . +@prefix spdx: . +@prefix time: . +@prefix vcard: . +@prefix xml: . +@prefix xsd: . + + a hydra:PartialCollectionView ; + hydra:previous . + + + a schema:Dataset ; + schema:name "Combinatie perceelnummers met verpondingsregister"^^xsd:string ; + schema:dateModified "2021-09-05"^^xsd:date ; + schema:datePublished "2022-09-05"^^xsd:date ; + schema:description "Combinatie van kadastrale perceelnummers uit 1832 met Verpondingsnummers uit het verpondingsregister 1875 en de plaatselijke aanduiding uit het verpondingsregister." ; + schema:license ; + schema:publisher ; + schema:distribution . + + + a schema:Organization ; + schema:name "Gouda Tijdmachine" . + + + a schema:DataDownload ; + schema:contentUrl "https://www.goudatijdmachine.nl/wp-content/uploads/sites/7/2021/09/Totaal_perceel_Plaand_EPSG_4326.geojson" ; + schema:encodingFormat "application/geo+json" . diff --git a/test/fetch.test.ts b/test/fetch.test.ts index 9a8f6107..fc2acb2c 100644 --- a/test/fetch.test.ts +++ b/test/fetch.test.ts @@ -202,4 +202,24 @@ describe('Fetch', () => { expect(e).toBeInstanceOf(NoDatasetFoundAtUrl); } }); + + it('handles paginated responses', async () => { + nock('https://example.com') + .get('/datasets/hydra-page1.ttl') + .replyWithFile(200, 'test/datasets/hydra-page1.ttl', { + 'Content-Type': 'text/turtle', + }); + + nock('https://example.com') + .get('/datasets/hydra-page2.ttl') + .replyWithFile(200, 'test/datasets/hydra-page2.ttl', { + 'Content-Type': 'text/turtle', + }); + + const datasets = await fetch( + new URL('https://example.com/datasets/hydra-page1.ttl') + ); + + expect(datasets).toHaveLength(2); + }); }); diff --git a/tsconfig.json b/tsconfig.json index 36be84f4..e154762d 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -7,7 +7,7 @@ "lib": [ "ES2020" ], - "module": "ES2020", + "module": "es2022", "declarationMap": true, "skipLibCheck": true, "esModuleInterop": true,