Skip to content

Commit

Permalink
fix: Read more duplicate triples (#937)
Browse files Browse the repository at this point in the history
* Raise the SPARQL LIMIT; this should be fine because we are streaming the results.
* Apply the LIMIT inside a subquery so that it is more representative.
  • Loading branch information
ddeboer committed Jun 21, 2024
1 parent fafe0f4 commit 45139cc
Showing 1 changed file with 56 additions and 53 deletions.
109 changes: 56 additions & 53 deletions src/query.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ export const rdf = (property: string): NamedNode =>
factory.namedNode(`http://www.w3.org/1999/02/22-rdf-syntax-ns#${property}`);

// Type term for datasets, produced by the dcat() helper (defined outside this
// view; presumably it builds a NamedNode the same way rdf() does — confirm).
export const datasetType = dcat('Dataset');
// NOTE(review): this is a rendered diff, so the two sparqlLimit lines below are
// the before/after sides of a single changed line, not two live declarations.
// Value before this commit:
export const sparqlLimit = 50000;
// Value after this commit; it is interpolated into the `LIMIT ${sparqlLimit}`
// clause of constructQuery.
export const sparqlLimit = 1_000_000;

export const constructQuery = `
PREFIX dcat: <http://www.w3.org/ns/dcat#>
Expand Down Expand Up @@ -99,62 +99,65 @@ export const constructQuery = `
dct:title ?${distributionName} ;
dcat:byteSize ?${distributionSize} .
} WHERE {
{
${schemaOrgQuery('schema')}
} UNION {
${schemaOrgQuery('httpSchema')}
} UNION {
?${dataset} a dcat:Dataset ;
dct:title ?${name} ;
dct:license ?${license} ;
dct:publisher ?${publisher} .
?${publisher} a ?foafOrganizationOrPerson ;
a ?${publisherType} ;
foaf:name ?${publisherName} .
OPTIONAL {
?${creator} a ?foafOrganizationOrPerson ;
a ?${creatorType} ;
foaf:name ?${creatorName} .
}
SELECT * WHERE {
{
${schemaOrgQuery('schema')}
} UNION {
${schemaOrgQuery('httpSchema')}
} UNION {
?${dataset} a dcat:Dataset ;
dct:title ?${name} ;
dct:license ?${license} ;
dct:publisher ?${publisher} .
?${publisher} a ?foafOrganizationOrPerson ;
a ?${publisherType} ;
foaf:name ?${publisherName} .
VALUES ?foafOrganizationOrPerson { foaf:Organization foaf:Person }
OPTIONAL {
?${dataset} dcat:distribution ?${distribution} .
?${distribution} a dcat:Distribution ;
dcat:accessURL ?${convertToIri(distributionUrl)} .
OPTIONAL {
?${creator} a ?foafOrganizationOrPerson ;
a ?${creatorType} ;
foaf:name ?${creatorName} .
}
OPTIONAL { ?${distribution} dct:format ?${distributionFormat} }
OPTIONAL { ?${distribution} dcat:mediaType ?${distributionMediaType} }
OPTIONAL { ?${distribution} dct:issued ${convertToXsdDate(
distributionDatePublished
)} }
OPTIONAL { ?${distribution} dct:modified ${convertToXsdDate(
distributionDateModified
)} }
OPTIONAL { ?${distribution} dct:description ?${distributionDescription} }
OPTIONAL { ?${distribution} dct:language ?${distributionLanguage} }
OPTIONAL { ?${distribution} dct:license ?${distributionLicense} }
OPTIONAL { ?${distribution} dct:title ?${distributionName} }
OPTIONAL { ?${distribution} dcat:byteSize ?${distributionSize} }
VALUES ?foafOrganizationOrPerson { foaf:Organization foaf:Person }
OPTIONAL {
?${dataset} dcat:distribution ?${distribution} .
?${distribution} a dcat:Distribution ;
dcat:accessURL ?${convertToIri(distributionUrl)} .
OPTIONAL { ?${distribution} dct:format ?${distributionFormat} }
OPTIONAL { ?${distribution} dcat:mediaType ?${distributionMediaType} }
OPTIONAL { ?${distribution} dct:issued ${convertToXsdDate(
distributionDatePublished
)} }
OPTIONAL { ?${distribution} dct:modified ${convertToXsdDate(
distributionDateModified
)} }
OPTIONAL { ?${distribution} dct:description ?${distributionDescription} }
OPTIONAL { ?${distribution} dct:language ?${distributionLanguage} }
OPTIONAL { ?${distribution} dct:license ?${distributionLicense} }
OPTIONAL { ?${distribution} dct:title ?${distributionName} }
OPTIONAL { ?${distribution} dcat:byteSize ?${distributionSize} }
}
OPTIONAL { ?${dataset} dct:description ?${description} }
OPTIONAL { ?${dataset} dct:identifier ?${identifier} }
OPTIONAL { ?${dataset} dct:alternative ?${alternateName} }
OPTIONAL { ?${dataset} dct:created ${convertToXsdDate(dateCreated)} }
OPTIONAL { ?${dataset} dct:issued ${convertToXsdDate(datePublished)} }
OPTIONAL { ?${dataset} dct:modified ${convertToXsdDate(dateModified)} }
OPTIONAL { ?${dataset} dct:language ?${language} }
OPTIONAL { ?${dataset} dct:source ?${source} }
OPTIONAL { ?${dataset} dcat:keyword ?${keyword} }
OPTIONAL { ?${dataset} owl:versionInfo ?${version} }
OPTIONAL { ?${dataset} dct:isPartOf ?${includedInDataCatalog} }
OPTIONAL { ?${dataset} dcat:landingPage ?${mainEntityOfPage} }
}
OPTIONAL { ?${dataset} dct:description ?${description} }
OPTIONAL { ?${dataset} dct:identifier ?${identifier} }
OPTIONAL { ?${dataset} dct:alternative ?${alternateName} }
OPTIONAL { ?${dataset} dct:created ${convertToXsdDate(dateCreated)} }
OPTIONAL { ?${dataset} dct:issued ${convertToXsdDate(datePublished)} }
OPTIONAL { ?${dataset} dct:modified ${convertToXsdDate(dateModified)} }
OPTIONAL { ?${dataset} dct:language ?${language} }
OPTIONAL { ?${dataset} dct:source ?${source} }
OPTIONAL { ?${dataset} dcat:keyword ?${keyword} }
OPTIONAL { ?${dataset} owl:versionInfo ?${version} }
OPTIONAL { ?${dataset} dct:isPartOf ?${includedInDataCatalog} }
OPTIONAL { ?${dataset} dcat:landingPage ?${mainEntityOfPage} }
}
} LIMIT ${sparqlLimit}`;
LIMIT ${sparqlLimit}
}`;

function schemaOrgQuery(prefix: string): string {
return `
Expand Down

0 comments on commit 45139cc

Please sign in to comment.