Skip to content

Commit

Permalink
#8 Better GeoSpatial API format categorising
Browse files Browse the repository at this point in the history
#11 Use format info in original metadata if it is "CSV-GEO-AU"
#12 Recognise zipped GeoTiff file as GeoTiff format
  • Loading branch information
t83714 committed Jul 1, 2021
1 parent 848d8de commit 91ca2d0
Show file tree
Hide file tree
Showing 11 changed files with 420 additions and 8 deletions.
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# 1.0.0

- #8 Better GeoSpatial API format categorising
- #11 Use format info in original metadata if it is "CSV-GEO-AU"
- #12 Recognise zipped GeoTiff file as GeoTiff format
13 changes: 13 additions & 0 deletions src/format-engine/measureEvaluatorByHierarchy.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,19 @@ export default function getBestMeasureResult(
) {
finalCandidate = dcatSet;
}

// if the format taken from the original metadata is "WMS" or "WFS", we should trust it
if (
["WMS", "WFS"].indexOf(dcatFormat) !== -1 &&
dcatFormat !== sortedFormat
) {
finalCandidate = dcatSet;
}

// if the format taken from the original metadata is "CSV-GEO-AU", we should trust it
if (dcatFormat === "CSV-GEO-AU" && dcatFormat !== sortedFormat) {
finalCandidate = dcatSet;
}
}
return {
format: finalCandidate.measureResult.formats[0],
Expand Down
39 changes: 37 additions & 2 deletions src/format-engine/measures/dcatFormatMeasure.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,10 +90,42 @@ function getFilteredBracketedFormats(formats: Array<string>) {
});
}

/**
 * Drop every candidate format string that contains no ASCII letter
 * (case-insensitive) and no digit, e.g. "" or "-".
 *
 * @param {string[]} formats candidate format strings
 * @return {string[]} a new array holding only the candidates with at least one letter or digit
 */
const filterNonAlphabetaNumberFormat = (formats: string[]) => {
    const letterOrDigit = /[a-z0-9]+/i;
    // `match` yields null when nothing matches, so keep only non-null results
    return formats.filter(candidate => candidate.match(letterOrDigit) !== null);
};

/**
 * Collapse known combinations of candidate formats into a single format.
 * Only one rule exists: a list containing both "ZIP" and "GEOTIFF",
 * e.g. [ 'ZIP', 'FILE', 'GEOTIFF' ], is reduced to [ "GEOTIFF" ].
 * See https://github.com/magda-io/magda-minion-linked-data-rating/issues/4
 *
 * Matching is exact (case-sensitive) string comparison.
 *
 * @param {string[]} formats candidate format strings
 * @return {string[]} [ "GEOTIFF" ] when the rule applies; otherwise the `formats` array itself, unchanged
 */
const filterKeepKnownFormat = (formats: string[]) => {
    const hasZip = formats.indexOf("ZIP") >= 0;
    const isZippedGeoTiff = hasZip && formats.indexOf("GEOTIFF") >= 0;
    return isZippedGeoTiff ? ["GEOTIFF"] : formats;
};

export default function getMeasureResult(
relatedDistribution: any,
synonymObject: any
): MeasureResult {
/**
 * Standardise each candidate format string to its known / common format by
 * passing it, together with `synonymObject`, to `getCommonFormat`.
 *
 * @param {string[]} formats candidate format strings
 */
const filterCommonFormat = (formats: string[]) => {
    return formats.map(candidate => getCommonFormat(candidate, synonymObject));
};

if (
!relatedDistribution ||
!relatedDistribution.aspects ||
Expand Down Expand Up @@ -143,7 +175,10 @@ export default function getMeasureResult(
replaceAmpersandFormats,
splitWhiteSpaceFormats,
reduceMimeType,
filterBracketedFormats
filterBracketedFormats,
filterNonAlphabetaNumberFormat,
filterCommonFormat,
filterKeepKnownFormat
];

processedFormats = cleanUpAssemblyChain
Expand All @@ -160,7 +195,7 @@ export default function getMeasureResult(
return {
formats: processedFormats.map(eachFormat => {
return {
format: getCommonFormat(eachFormat, synonymObject),
format: eachFormat,
correctConfidenceLevel: 100
};
}),
Expand Down
12 changes: 11 additions & 1 deletion src/format-engine/measures/downloadExtensionMeasure.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,17 @@ const URL_REGEXES: Array<[RegExp, string]> = [
[new RegExp(".*\\.geojson$", "i"), "GEOJSON"],
[new RegExp(".*\\?.*service=wms.*", "i"), "WMS"],
[new RegExp(".*\\?.*service=wfs.*", "i"), "WFS"],
[new RegExp("\\W+MapServer\\W*|\\W+FeatureServer\\W*", "i"), "ESRI REST"],
[new RegExp("\\W+MapServer\\W*", "i"), "ESRI MAPSERVER"],
// ESRI FeatureServer group: we categorise it as "ESRI MAPSERVER" for now, see https://github.com/magda-io/magda-minion-format/issues/8
[
new RegExp("(\\W+FeatureServer)|(\\W+FeatureServer/)$", "i"),
"ESRI MAPSERVER"
],
[new RegExp("\\W+FeatureServer/d", "i"), "ESRI MAPSERVER"], // ESRI FeatureServer
[
new RegExp("(\\W+SceneServer)|(\\W+SceneServer/)$", "i"),
"ESRI SCENESERVER"
], // ESRI SceneServer
[new RegExp(".*\\.(shp|shz|dbf)(\\.zip)?$", "i"), "SHP"],
[new RegExp(".*\\.(pdf)(\\.zip)?$", "i"), "PDF"],
[new RegExp(".*\\.(json)(\\.zip)?$", "i"), "JSON"],
Expand Down
53 changes: 48 additions & 5 deletions src/test/onRecordFound.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ import soilRiskMap2 from "./sampleDataFiles/soil-risk-map-2.json";

import { AuthorizedRegistryClient as Registry } from "@magda/minion-sdk";

describe("onRecordFound", function(this: Mocha.ISuiteCallbackContext) {
describe("onRecordFound", async function(this) {
async function testDistReturnsFormat(
distributionData: any,
format: string
Expand All @@ -40,7 +40,13 @@ describe("onRecordFound", function(this: Mocha.ISuiteCallbackContext) {
registry.putRecordAspect.callsFake(
(disId: any, aType: any, aspect: any) => {
resultAspect = aspect;
return new Promise((resolve, reject) => resolve());
return Promise.resolve({
id: disId,
tenantId: 0,
name: "test",
sourceTag: "xxxx",
aspects: []
});
}
);

Expand All @@ -51,9 +57,18 @@ describe("onRecordFound", function(this: Mocha.ISuiteCallbackContext) {
});
}

/**
 * Register one test case that loads a sample distribution record with
 * `require` and hands it, along with the expected format, to
 * `testDistReturnsFormat`.
 *
 * @param {string} distributionDataFile module path passed to `require` as-is
 * @param {string} format format the record is expected to be processed as
 */
function testDistFileReturnsFormat(
    distributionDataFile: string,
    format: string
) {
    // the record is loaded when the case is registered, not when it runs
    const sampleRecord = require(distributionDataFile);
    const caseTitle = `Should process "${distributionDataFile}" as "${format}"`;
    it(caseTitle, () => {
        return testDistReturnsFormat(sampleRecord, format);
    });
}

describe("Should process sample launceston dataset data correctly", function() {
it("Should return `ESRI REST` for distribution no. 2", () => {
return testDistReturnsFormat(launcestonDist2, "ESRI REST");
it("Should return `ESRI MAPSERVER` for distribution no. 2", () => {
return testDistReturnsFormat(launcestonDist2, "ESRI MAPSERVER");
});

it("Should return `WMS` for distribution no.7", () => {
Expand Down Expand Up @@ -123,6 +138,34 @@ describe("onRecordFound", function(this: Mocha.ISuiteCallbackContext) {
});
});

testDistFileReturnsFormat(
"./sampleDataFiles/esri-featureserver.json",
"ESRI MAPSERVER"
);

testDistFileReturnsFormat(
"./sampleDataFiles/esri-sceneserver.json",
"ESRI SCENESERVER"
);

// we categorise these as ESRI MAPSERVER for now; see https://github.com/magda-io/magda-minion-format/issues/8
testDistFileReturnsFormat(
"./sampleDataFiles/esri-featureserver-group1.json",
"ESRI MAPSERVER"
);

testDistFileReturnsFormat(
"./sampleDataFiles/esri-featureserver-group2.json",
"ESRI MAPSERVER"
);

testDistFileReturnsFormat("./sampleDataFiles/GeoTIFF-zip.json", "GEOTIFF");

testDistFileReturnsFormat(
"./sampleDataFiles/csv-geo-au.json",
"CSV-GEO-AU"
);

/**
* This test simply takes a bunch of formats that were previously causing the minion to use all its CPU and be
* killed by a liveness check and ensures that they all are able to execute in less than 5 seconds.
Expand All @@ -133,7 +176,7 @@ describe("onRecordFound", function(this: Mocha.ISuiteCallbackContext) {
const registry = sinon.createStubInstance(Registry);
registry.putRecordAspect.callsFake(
(disId: any, aType: any, aspect: any) => {
return new Promise((resolve, reject) => resolve());
return Promise.resolve({} as any);
}
);

Expand Down
51 changes: 51 additions & 0 deletions src/test/sampleDataFiles/GeoTIFF-zip.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
{
"aspects": {
"dcat-distribution-strings": {
"description": "Link to Catchment scale land use of Australia – Update December 2020 raster package (GeoTIFF and supporting files) – Zip [126 MB]. This zip file includes:\r\n\r\nCLUM raster dataset of Catchment scale land use of Australia. GeoTIFF, 16 bit integer, resolution 50 metres by 50 metres, coordinate system GDA94 / Australian Albers.\r\n\r\nRaster data assigned with 18 class, primary and secondary classes as depicted in the Australian Land Use and Management Classification (ALUMC) version 8, agricultural industries, date of mapping and scale of mapping.",
"downloadURL": "https://www.agriculture.gov.au/sites/default/files/documents/geotiff_clum_50m1220m.zip",
"format": "zipped file - geotiff",
"issued": "2021-02-25T00:52:51Z",
"license": "Creative Commons Attribution 4.0 International",
"mediaType": "application/zip",
"modified": "2021-02-25T00:00:00Z",
"title": "Raster package – GeoTIFF"
},
"ckan-resource": {
"cache_last_updated": null,
"cache_url": null,
"created": "2021-02-25T00:52:51.282127",
"datastore_active": false,
"description": "Link to Catchment scale land use of Australia – Update December 2020 raster package (GeoTIFF and supporting files) – Zip [126 MB]. This zip file includes:\r\n\r\nCLUM raster dataset of Catchment scale land use of Australia. GeoTIFF, 16 bit integer, resolution 50 metres by 50 metres, coordinate system GDA94 / Australian Albers.\r\n\r\nRaster data assigned with 18 class, primary and secondary classes as depicted in the Australian Land Use and Management Classification (ALUMC) version 8, agricultural industries, date of mapping and scale of mapping.",
"format": "zipped file - geotiff",
"hash": "",
"id": "bee76d8b-f129-4f26-8658-7b5b11e6cc86",
"last_modified": "2021-02-25T00:00:00",
"mimetype": "application/zip",
"mimetype_inner": null,
"name": "Raster package – GeoTIFF",
"package_id": "97bb9e54-f0df-4073-9288-e0ebded53a96",
"position": 4,
"resource_type": null,
"revision_id": "7c4feb68-08b9-4f91-bc88-aee8a20039c6",
"size": null,
"state": "active",
"url": "https://www.agriculture.gov.au/sites/default/files/documents/geotiff_clum_50m1220m.zip",
"url_type": null,
"wms_layer": ""
},
"dataset-format": {
"confidenceLevel": 90,
"format": "ZIP"
},
"source": {
"id": "dga",
"name": "data.gov.au",
"type": "ckan-resource",
"url": "https://data.gov.au/data/api/3/action/resource_show?id=bee76d8b-f129-4f26-8658-7b5b11e6cc86"
}
},
"id": "dist-dga-bee76d8b-f129-4f26-8658-7b5b11e6cc86",
"name": "Raster package – GeoTIFF",
"sourceTag": "5b35cb77-307c-440b-9505-2ff8e6abeb3a",
"tenantId": 0
}
55 changes: 55 additions & 0 deletions src/test/sampleDataFiles/csv-geo-au.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
{
"aspects": {
"source": {
"id": "nsw-ckan",
"name": "exmaple.com.au",
"type": "ckan-resource",
"url": "https://exmaple.com.au/data/api/3/action/resource_show?id=b8b7a7f8-fd13-4f22-b766-767dd9095417"
},
"ckan-resource": {
"cache_last_updated": null,
"cache_url": null,
"created": "2021-06-15T00:12:07.915995",
"datastore_active": false,
"datastore_contains_all_records_of_source_file": false,
"description": "xxxxx",
"format": "csv-geo-au",
"hash": "",
"id": "b8b7a7f8-fd13-4f22-b766-767dd9095417",
"last_modified": "2021-06-15T00:12:07.891368",
"metadata_modified": "2021-06-15T00:12:08.133726",
"mimetype": "text/csv",
"mimetype_inner": null,
"name": "Australian-towns-sample with geo2.csv-geo-au.csv",
"package_id": "91a74fbe-c426-4c5b-8c85-486364c94416",
"position": 0,
"resource_type": null,
"size": 70987,
"state": "active",
"url": "https://exmaple.com.au/data/dataset/91a74fbe-c426-4c5b-8c85-486364c94416/resource/b8b7a7f8-fd13-4f22-b766-767dd9095417/download/au-towns-sample-with-geo2.csv-geo-au.csv",
"url_type": "upload"
},
"dcat-distribution-strings": {
"description": "xxxxx",
"downloadURL": "https://exmaple.com.au/data/dataset/91a74fbe-c426-4c5b-8c85-486364c94416/resource/b8b7a7f8-fd13-4f22-b766-767dd9095417/download/au-towns-sample-with-geo2.csv-geo-au.csv",
"format": "csv-geo-au",
"issued": "2021-06-15T00:12:07Z",
"license": "License Not Specified",
"mediaType": "text/csv",
"modified": "2021-06-15T00:12:07Z",
"title": "Australian-towns-sample with geo2.csv-geo-au.csv"
},
"source-link-status": {
"httpStatusCode": 200,
"status": "active"
},
"dataset-format": {
"confidenceLevel": 90,
"format": "CSV"
}
},
"id": "dist-nsw-ckan-b8b7a7f8-fd13-4f22-b766-767dd9095417",
"name": "Australian-towns-sample with geo2.csv-geo-au.csv",
"sourceTag": "aae3c868-6e5b-4359-8582-b3d035c89926",
"tenantId": 0
}
50 changes: 50 additions & 0 deletions src/test/sampleDataFiles/esri-featureserver-group1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
{
"aspects": {
"source": {
"id": "nsw-ckan",
"name": "data.nsw.gov.au",
"type": "ckan-resource",
"url": "https://data.nsw.gov.au/data/api/3/action/resource_show?id=63e0bbd3-2dab-4965-af33-dc402dd95f51"
},
"ckan-resource": {
"cache_last_updated": null,
"cache_url": null,
"created": "2020-02-05T10:06:49",
"datastore_active": false,
"datastore_contains_all_records_of_source_file": false,
"format": "ARCGIS REST SERVICE",
"hash": "",
"id": "63e0bbd3-2dab-4965-af33-dc402dd95f51",
"last_modified": null,
"metadata_modified": "2021-06-30T14:22:38.854070",
"mimetype": null,
"mimetype_inner": null,
"package_id": "65bd07a8-9a63-4bbf-9bf3-1ee9204802ae",
"position": 0,
"resource_type": null,
"size": null,
"state": "active",
"url": "https://portal.spatial.nsw.gov.au/server/rest/services/NSW_Administrative_Boundaries_Theme/FeatureServer",
"url_type": null
},
"dcat-distribution-strings": {
"downloadURL": "https://portal.spatial.nsw.gov.au/server/rest/services/NSW_Administrative_Boundaries_Theme/FeatureServer",
"format": "ARCGIS REST SERVICE",
"issued": "2020-02-05T10:06:49Z",
"license": "Creative Commons Attribution",
"title": "63e0bbd3-2dab-4965-af33-dc402dd95f51"
},
"source-link-status": {
"httpStatusCode": 200,
"status": "active"
},
"dataset-format": {
"confidenceLevel": 33,
"format": "ARCGIS"
}
},
"id": "dist-nsw-ckan-63e0bbd3-2dab-4965-af33-dc402dd95f51",
"name": "63e0bbd3-2dab-4965-af33-dc402dd95f51",
"sourceTag": "eb74ae9c-a301-441b-857b-54d5db622d90",
"tenantId": 0
}
Loading

0 comments on commit 91ca2d0

Please sign in to comment.