Skip to content

Commit

Permalink
feat: Support Hydra pagination (#708)
Browse files Browse the repository at this point in the history
  • Loading branch information
ddeboer committed Mar 22, 2023
1 parent d41c79f commit aafb615
Show file tree
Hide file tree
Showing 7 changed files with 196 additions and 12 deletions.
8 changes: 4 additions & 4 deletions jest.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ export default {
coverageReporters: ['json-summary', 'text'],
coverageThreshold: {
global: {
lines: 69.15,
statements: 68.94,
branches: 55.1,
functions: 68.42,
lines: 69.81,
statements: 69.6,
branches: 60.36,
functions: 69.23,
},
},
transform: {},
Expand Down
54 changes: 54 additions & 0 deletions src/comunica-config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"@context": [
"https://linkedsoftwaredependencies.org/bundles/npm/@comunica/config-query-sparql/^2.0.0/components/context.jsonld"
],
"import": [
"ccqs:config/context-preprocess/actors.json",
"ccqs:config/context-preprocess/mediators.json",
"ccqs:config/hash-bindings/actors.json",
"ccqs:config/hash-bindings/mediators.json",
"ccqs:config/http/actors.json",
"ccqs:config/http/mediators.json",
"ccqs:config/http-invalidate/actors.json",
"ccqs:config/http-invalidate/mediators.json",
"ccqs:config/init/actors.json",
"ccqs:config/optimize-query-operation/actors.json",
"ccqs:config/optimize-query-operation/mediators.json",
"ccqs:config/query-operation/actors.json",
"ccqs:config/query-operation/mediators.json",
"ccqs:config/query-parse/actors.json",
"ccqs:config/query-parse/mediators.json",
"ccqs:config/query-result-serialize/actors.json",
"ccqs:config/query-result-serialize/mediators.json",
"ccqs:config/dereference/actors.json",
"ccqs:config/dereference/mediators.json",
"ccqs:config/dereference-rdf/actors.json",
"ccqs:config/dereference-rdf/mediators.json",
"ccqs:config/rdf-join/actors.json",
"ccqs:config/rdf-join/mediators.json",
"ccqs:config/rdf-join-entries-sort/actors.json",
"ccqs:config/rdf-join-entries-sort/mediators.json",
"ccqs:config/rdf-join-selectivity/actors.json",
"ccqs:config/rdf-join-selectivity/mediators.json",
"ccqs:config/rdf-metadata/actors.json",
"ccqs:config/rdf-metadata/mediators.json",
"ccqs:config/rdf-metadata-extract/actors.json",
"ccqs:config/rdf-metadata-extract/mediators.json",
"ccqs:config/rdf-parse/actors.json",
"ccqs:config/rdf-parse/mediators.json",
"ccqs:config/rdf-parse-html/actors.json",
"ccqs:config/rdf-resolve-hypermedia/actors.json",
"ccqs:config/rdf-resolve-hypermedia/mediators.json",
"ccqs:config/rdf-resolve-hypermedia-links/mediators.json",
"ccqs:config/rdf-resolve-hypermedia-links-queue/actors.json",
"ccqs:config/rdf-resolve-hypermedia-links-queue/mediators.json",
"ccqs:config/rdf-resolve-quad-pattern/actors.json",
"ccqs:config/rdf-resolve-quad-pattern/mediators.json",
"ccqs:config/rdf-serialize/actors.json",
"ccqs:config/rdf-serialize/mediators.json",
"ccqs:config/rdf-update-hypermedia/actors.json",
"ccqs:config/rdf-update-hypermedia/mediators.json",
"ccqs:config/rdf-update-quads/actors.json",
"ccqs:config/rdf-update-quads/mediators.json"
]
}
46 changes: 39 additions & 7 deletions src/fetch.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import {QueryEngine} from '@comunica/query-sparql';
import {QueryEngineFactory} from '@comunica/query-sparql';
import factory from 'rdf-ext';
import DatasetExt from 'rdf-ext/lib/Dataset';
import {URL} from 'url';
Expand All @@ -8,6 +8,7 @@ import {pipeline} from 'stream';
import {StandardizeSchemaOrgPrefixToHttps} from './transform';
import Pino from 'pino';
import {rdfDereferencer} from './rdf';
import {resolve} from 'node:path';

export class HttpError extends Error {
constructor(message: string, public readonly statusCode: number) {
Expand All @@ -22,11 +23,10 @@ export class NoDatasetFoundAtUrl extends Error {
}

export async function fetch(url: URL): Promise<DatasetExt[]> {
let datasets = [];
try {
datasets = await query(url);
} catch (e) {
handleComunicaError(e);
let datasets = await doFetch(url);
const nextPage = await findNextPage(url);
if (nextPage !== null && nextPage !== url) {
datasets = [...datasets, ...(await doFetch(nextPage))];
}

if (datasets.length === 0) {
Expand All @@ -36,6 +36,14 @@ export async function fetch(url: URL): Promise<DatasetExt[]> {
return datasets;
}

/**
 * Run the dataset query against a single URL.
 *
 * Any error thrown while querying is passed to `handleComunicaError`, which is
 * declared to never return, so a failure always surfaces as a thrown error.
 *
 * @param url The page to query for dataset descriptions.
 * @returns Whatever `query` resolves to for the given URL.
 */
export async function doFetch(url: URL) {
  try {
    const datasets = await query(url);
    return datasets;
  } catch (error) {
    // `handleComunicaError` always throws; returning its (never) result keeps
    // the control flow explicit without changing behaviour.
    return handleComunicaError(error);
  }
}

/**
* Fetch dataset description(s) by dereferencing the registration URL.
*/
Expand All @@ -53,7 +61,11 @@ export async function dereference(url: URL): Promise<DatasetExt> {
}
}

const engine = new QueryEngine();
// Shared Comunica query engine, created once at module load via top-level await.
//
// A custom config is used so that "ccqs:config/rdf-resolve-hypermedia-links/actors.json"
// is NOT loaded: with those actors enabled, queries produce many duplicate bindings and
// no datasets are found on the second and subsequent pages.
//
// NOTE(review): `resolve()` resolves a relative path against the current working
// directory, so this presumably only works when the process is started from the
// project root – confirm, or derive the path from the module location instead.
const engine = await new QueryEngineFactory().create({
configPath: resolve('src/comunica-config.json'),
});

/**
* Fetch dataset descriptions by executing a SPARQL SELECT query.
Expand Down Expand Up @@ -164,3 +176,23 @@ function handleComunicaError(e: unknown): never {

throw e;
}

/**
 * Look up a Hydra pagination link in the document at the given URL.
 *
 * Queries the source for any triple whose predicate is `hydra:next` or
 * `hydra:nextPage` and uses the object of the first binding returned.
 *
 * @param url The page to inspect for a next-page link.
 * @returns The next page as a URL, or `null` when no (non-empty) link is found.
 */
async function findNextPage(url: URL): Promise<URL | null> {
  // Query text is kept verbatim; only the first result is consumed below.
  const nextPageQuery = `
PREFIX hydra: <http://www.w3.org/ns/hydra/core#>
SELECT ?nextPage {
?s ?nextPagePredicate ?nextPage
VALUES ?nextPagePredicate { hydra:next hydra:nextPage }
}
`;

  const stream = await engine.queryBindings(nextPageQuery, {
    sources: [url.toString()],
  });
  const [firstBinding] = await stream.toArray();
  const target = firstBinding?.get('nextPage')?.value;

  if (!target) {
    return null;
  }

  return new URL(target);
}
39 changes: 39 additions & 0 deletions test/datasets/hydra-page1.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
@prefix adms: <http://www.w3.org/ns/adms#> .
@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix gsp: <http://www.opengis.net/ont/geosparql#> .
@prefix hydra: <http://www.w3.org/ns/hydra/core#> .
@prefix locn: <http://www.w3.org/ns/locn#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema: <http://schema.org/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix spdx: <http://spdx.org/rdf/terms#> .
@prefix time: <http://www.w3.org/2006/time#> .
@prefix vcard: <http://www.w3.org/2006/vcard/ns#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

</datasets/hydra-page1.ttl> a hydra:PartialCollectionView ;
hydra:next </datasets/hydra-page2.ttl> .

<https://www.goudatijdmachine.nl/data/api/items/145>
a schema:Dataset ;
schema:name "B" ;
schema:dateModified "2021-09-05"^^xsd:date ;
schema:datePublished "2021-09-05"^^xsd:date ;
schema:description "Combinatie van kadastrale perceelnummers uit 1832 met Verpondingsnummers uit het verpondingsregister 1875 en de plaatselijke aanduiding uit het verpondingsregister." ;
schema:license <http://blacreativecommons.org/publicdomain/zero/1.0/deed.nl> ;
schema:publisher <https://www.goudatijdmachine.nl/data/api/items/232> ;
schema:distribution <https://www.goudatijdmachine.nl/data/api/items/144> .

<https://www.goudatijdmachine.nl/data/api/items/232>
a schema:Organization ;
schema:name "Gouda Tijdmachine" .

<https://www.goudatijdmachine.nl/data/api/items/144>
a schema:DataDownload ;
schema:contentUrl "https://www.goudatijdmachine.nl/wp-content/uploads/sites/7/2021/09/Totaal_perceel_Plaand_EPSG_4326.geojson" ;
schema:encodingFormat "application/geo+json" .
39 changes: 39 additions & 0 deletions test/datasets/hydra-page2.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
@prefix adms: <http://www.w3.org/ns/adms#> .
@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix gsp: <http://www.opengis.net/ont/geosparql#> .
@prefix hydra: <http://www.w3.org/ns/hydra/core#> .
@prefix locn: <http://www.w3.org/ns/locn#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix schema: <http://schema.org/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix spdx: <http://spdx.org/rdf/terms#> .
@prefix time: <http://www.w3.org/2006/time#> .
@prefix vcard: <http://www.w3.org/2006/vcard/ns#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

</datasets/hydra-page2.ttl> a hydra:PartialCollectionView ;
hydra:previous </datasets/hydra-page1.ttl> .

<https://example.com/444>
a schema:Dataset ;
schema:name "Combinatie perceelnummers met verpondingsregister"^^xsd:string ;
schema:dateModified "2021-09-05"^^xsd:date ;
schema:datePublished "2022-09-05"^^xsd:date ;
schema:description "Combinatie van kadastrale perceelnummers uit 1832 met Verpondingsnummers uit het verpondingsregister 1875 en de plaatselijke aanduiding uit het verpondingsregister." ;
schema:license <http://creativecommons.org/publicdomain/zero/1.0/deed.nl> ;
schema:publisher <https://www.goudatijdmachine.nl/data/api/items/232> ;
schema:distribution <https://www.goudatijdmachine.nl/data/api/items/144> .

<https://www.goudatijdmachine.nl/data/api/items/232>
a schema:Organization ;
schema:name "Gouda Tijdmachine" .

<https://www.goudatijdmachine.nl/data/api/items/144>
a schema:DataDownload ;
schema:contentUrl "https://www.goudatijdmachine.nl/wp-content/uploads/sites/7/2021/09/Totaal_perceel_Plaand_EPSG_4326.geojson" ;
schema:encodingFormat "application/geo+json" .
20 changes: 20 additions & 0 deletions test/fetch.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -202,4 +202,24 @@ describe('Fetch', () => {
expect(e).toBeInstanceOf(NoDatasetFoundAtUrl);
}
});

it('handles paginated responses', async () => {
  // Serve both Hydra pages from the local Turtle fixtures.
  for (const page of ['hydra-page1.ttl', 'hydra-page2.ttl']) {
    nock('https://example.com')
      .get(`/datasets/${page}`)
      .replyWithFile(200, `test/datasets/${page}`, {
        'Content-Type': 'text/turtle',
      });
  }

  // Fetching the first page is expected to yield the datasets of both pages:
  // one from page 1 and one from page 2.
  const firstPage = new URL('https://example.com/datasets/hydra-page1.ttl');
  const datasets = await fetch(firstPage);

  expect(datasets).toHaveLength(2);
});
});
2 changes: 1 addition & 1 deletion tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"lib": [
"ES2020"
],
"module": "ES2020",
"module": "es2022",
"declarationMap": true,
"skipLibCheck": true,
"esModuleInterop": true,
Expand Down

0 comments on commit aafb615

Please sign in to comment.