From 83af05c7849efeffea7812895398b2f2f162e4d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Kir=C3=A1ly?= Date: Wed, 24 May 2023 18:06:47 +0200 Subject: [PATCH] PICA: general changes #163 --- scripts/sqlite/completeness.sqlite.sh | 4 +++- .../metadataqa/marc/cli/Completeness.java | 24 ++++++++----------- .../metadataqa/marc/cli/CompletenessTest.java | 16 ++++++------- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/scripts/sqlite/completeness.sqlite.sh b/scripts/sqlite/completeness.sqlite.sh index c491bcfb6..74aafe0cb 100755 --- a/scripts/sqlite/completeness.sqlite.sh +++ b/scripts/sqlite/completeness.sqlite.sh @@ -24,6 +24,7 @@ CREATE TABLE IF NOT EXISTS "marc_elements" ( "groupId" INTEGER, "documenttype" TEXT, "path" TEXT, + "sortkey" TEXT, "packageid" INTEGER, "package" TEXT, "tag" TEXT, @@ -38,6 +39,7 @@ CREATE TABLE IF NOT EXISTS "marc_elements" ( ); CREATE INDEX IF NOT EXISTS "gme_groupId" ON "marc_elements" ("groupId"); CREATE INDEX IF NOT EXISTS "gme_documenttype" ON "marc_elements" ("documenttype"); +CREATE INDEX IF NOT EXISTS "gme_sortkey" ON "marc_elements" ("sortkey"); EOF log "clean marc_elements" @@ -49,7 +51,7 @@ log "create headless CSV" if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then tail -n +2 ${OUTPUT_DIR}/completeness-grouped-marc-elements.csv > ${OUTPUT_DIR}/marc-elements-noheader.csv else - tail -n +2 ${OUTPUT_DIR}/marc-elements.csv | sed 's;^;0,;' > ${OUTPUT_DIR}/marc-elements-noheader.csv + tail -n +2 ${OUTPUT_DIR}/marc-elements.csv > ${OUTPUT_DIR}/marc-elements-noheader.csv fi log "import marc elements" diff --git a/src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java b/src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java index 9837d2117..6c60b18dc 100644 --- a/src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java +++ b/src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java @@ -213,7 +213,7 @@ private void saveMarcElements(String fileExtension, char separator) { Path path = Paths.get(parameters.getOutputDir(), "marc-elements" + fileExtension); try (var writer = Files.newBufferedWriter(path)) { writer.write(CsvUtils.createCsv( - "documenttype", "path", "packageid", "package", "tag", "subfield", + "groupId", "documenttype", "path", "sortkey", "packageid", "package", "tag", "subfield", "number-of-record", "number-of-instances", "min", "max", "mean", "stddev", "histogram" )); @@ -236,7 +236,7 @@ private void saveGroupedMarcElements(String fileExtension, char separator) { Path path = Paths.get(parameters.getOutputDir(), "completeness-grouped-marc-elements" + fileExtension); try (var writer = Files.newBufferedWriter(path)) { writer.write(CsvUtils.createCsv( - "groupId", "documenttype", "path", "packageid", "package", "tag", "subfield", + "groupId", "documenttype", "path", "sortkey", "packageid", "package", "tag", "subfield", "number-of-record", "number-of-instances", "min", "max", "mean", "stddev", "histogram" )); @@ -383,6 +383,7 @@ private String formatCardinality(String marcPath, } String marcPathLabel = marcPath.replace("!ind", "ind").replaceAll("\\|(\\d)$", "$1"); + String sortkey = marcPath.replaceAll("^leader", "000"); int packageId = TagCategory.OTHER.getId(); String packageLabel = TagCategory.OTHER.getLabel(); String tagLabel = ""; @@ -417,21 +418,16 @@ private String formatCardinality(String marcPath, BasicStatistics statistics = new BasicStatistics(histogram); List values = Arrays.asList( - documentType, marcPathLabel, packageId, packageLabel, tagLabel, subfieldLabel, - frequency, // = number-of-record - cardinality, // = number-of-instances - statistics.getMin(), statistics.getMax(), - statistics.getMean(), statistics.getStdDev(), - statistics.formatHistogram() + (groupId != null ? groupId : 0), + documentType, marcPathLabel, sortkey, packageId, packageLabel, tagLabel, subfieldLabel, + frequency, // = number-of-record + cardinality, // = number-of-instances + statistics.getMin(), statistics.getMax(), + statistics.getMean(), statistics.getStdDev(), + statistics.formatHistogram() ); - if (groupId != null) { - List merged = new ArrayList<>(List.of((Object)groupId)); - merged.addAll(values); - values = merged; - } return CsvUtils.createCsvFromObjects(values); - // return StringUtils.join(values, separator) + "\n"; } private char getSeparator(ValidationErrorFormat format) { diff --git a/src/test/java/de/gwdg/metadataqa/marc/cli/CompletenessTest.java b/src/test/java/de/gwdg/metadataqa/marc/cli/CompletenessTest.java index 40b1b5313..1498cf3b6 100644 --- a/src/test/java/de/gwdg/metadataqa/marc/cli/CompletenessTest.java +++ b/src/test/java/de/gwdg/metadataqa/marc/cli/CompletenessTest.java @@ -125,15 +125,15 @@ public void completeness_pica_groupBy() throws Exception { while ((record = reader.readNext()) != null) { if (lineNr == 0) assertEquals( - "groupId,documenttype,path,packageid,package,tag,subfield,number-of-record,number-of-instances,min,max,mean,stddev,histogram", + "groupId,documenttype,path,sortkey,packageid,package,tag,subfield,number-of-record,number-of-instances,min,max,mean,stddev,histogram", StringUtils.join(record, ",") ); else { - int records = Integer.parseInt(record[7]); - int occurrences = Integer.parseInt(record[8]); + int records = Integer.parseInt(record[8]); + int occurrences = Integer.parseInt(record[9]); assertTrue(records <= occurrences); int total = 0; - String histogram = record[13].replaceAll("^\"(.*)\"$", "$1"); + String histogram = record[14].replaceAll("^\"(.*)\"$", "$1"); for (String expr : histogram.split("; ")) { String[] parts = expr.split("="); total += Integer.parseInt(parts[0]) * Integer.parseInt(parts[1]); @@ -175,15 +175,15 @@ public void completeness_pica_groupBy_file() throws Exception { while ((record = reader.readNext()) != null) { if (lineNr == 0) assertEquals( - "groupId,documenttype,path,packageid,package,tag,subfield,number-of-record,number-of-instances,min,max,mean,stddev,histogram", + "groupId,documenttype,path,sortkey,packageid,package,tag,subfield,number-of-record,number-of-instances,min,max,mean,stddev,histogram", StringUtils.join(record, ",") ); else { - int records = Integer.parseInt(record[7]); - int occurrences = Integer.parseInt(record[8]); + int records = Integer.parseInt(record[8]); + int occurrences = Integer.parseInt(record[9]); assertTrue(records <= occurrences); int total = 0; - String histogram = record[13].replaceAll("^\"(.*)\"$", "$1"); + String histogram = record[14].replaceAll("^\"(.*)\"$", "$1"); for (String expr : histogram.split("; ")) { String[] parts = expr.split("="); total += Integer.parseInt(parts[0]) * Integer.parseInt(parts[1]);