Skip to content

Commit

Permalink
PICA: general changes #163
Browse files (browse the repository at this point in the history)
  • Loading branch information
pkiraly committed May 24, 2023
1 parent 0ed277c commit 83af05c
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 23 deletions.
4 changes: 3 additions & 1 deletion scripts/sqlite/completeness.sqlite.sh
Original file line number | Diff line number | Diff line change
Expand Up @@ -24,6 +24,7 @@ CREATE TABLE IF NOT EXISTS "marc_elements" (
"groupId" INTEGER,
"documenttype" TEXT,
"path" TEXT,
"sortkey" TEXT,
"packageid" INTEGER,
"package" TEXT,
"tag" TEXT,
Expand All @@ -38,6 +39,7 @@ CREATE TABLE IF NOT EXISTS "marc_elements" (
);
CREATE INDEX IF NOT EXISTS "gme_groupId" ON "marc_elements" ("groupId");
CREATE INDEX IF NOT EXISTS "gme_documenttype" ON "marc_elements" ("documenttype");
CREATE INDEX IF NOT EXISTS "gme_sortkey" ON "marc_elements" ("sortkey");
EOF

log "clean marc_elements"
Expand All @@ -49,7 +51,7 @@ log "create headless CSV"
if [[ "${HAS_GROUP_PARAM}" == "1" ]]; then
tail -n +2 ${OUTPUT_DIR}/completeness-grouped-marc-elements.csv > ${OUTPUT_DIR}/marc-elements-noheader.csv
else
tail -n +2 ${OUTPUT_DIR}/marc-elements.csv | sed 's;^;0,;' > ${OUTPUT_DIR}/marc-elements-noheader.csv
tail -n +2 ${OUTPUT_DIR}/marc-elements.csv > ${OUTPUT_DIR}/marc-elements-noheader.csv
fi

log "import marc elements"
Expand Down
24 changes: 10 additions & 14 deletions src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java
Original file line number | Diff line number | Diff line change
Expand Up @@ -213,7 +213,7 @@ private void saveMarcElements(String fileExtension, char separator) {
Path path = Paths.get(parameters.getOutputDir(), "marc-elements" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write(CsvUtils.createCsv(
"documenttype", "path", "packageid", "package", "tag", "subfield",
"groupId", "documenttype", "path", "sortkey", "packageid", "package", "tag", "subfield",
"number-of-record", "number-of-instances",
"min", "max", "mean", "stddev", "histogram"
));
Expand All @@ -236,7 +236,7 @@ private void saveGroupedMarcElements(String fileExtension, char separator) {
Path path = Paths.get(parameters.getOutputDir(), "completeness-grouped-marc-elements" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write(CsvUtils.createCsv(
"groupId", "documenttype", "path", "packageid", "package", "tag", "subfield",
"groupId", "documenttype", "path", "sortkey", "packageid", "package", "tag", "subfield",
"number-of-record", "number-of-instances",
"min", "max", "mean", "stddev", "histogram"
));
Expand Down Expand Up @@ -383,6 +383,7 @@ private String formatCardinality(String marcPath,
}

String marcPathLabel = marcPath.replace("!ind", "ind").replaceAll("\\|(\\d)$", "$1");
String sortkey = marcPath.replaceAll("^leader", "000");
int packageId = TagCategory.OTHER.getId();
String packageLabel = TagCategory.OTHER.getLabel();
String tagLabel = "";
Expand Down Expand Up @@ -417,21 +418,16 @@ private String formatCardinality(String marcPath,
BasicStatistics statistics = new BasicStatistics(histogram);

List<Object> values = Arrays.asList(
documentType, marcPathLabel, packageId, packageLabel, tagLabel, subfieldLabel,
frequency, // = number-of-record
cardinality, // = number-of-instances
statistics.getMin(), statistics.getMax(),
statistics.getMean(), statistics.getStdDev(),
statistics.formatHistogram()
(groupId != null ? groupId : 0),
documentType, marcPathLabel, sortkey, packageId, packageLabel, tagLabel, subfieldLabel,
frequency, // = number-of-record
cardinality, // = number-of-instances
statistics.getMin(), statistics.getMax(),
statistics.getMean(), statistics.getStdDev(),
statistics.formatHistogram()
);
if (groupId != null) {
List<Object> merged = new ArrayList<>(List.of((Object)groupId));
merged.addAll(values);
values = merged;
}

return CsvUtils.createCsvFromObjects(values);
// return StringUtils.join(values, separator) + "\n";
}

private char getSeparator(ValidationErrorFormat format) {
Expand Down
16 changes: 8 additions & 8 deletions src/test/java/de/gwdg/metadataqa/marc/cli/CompletenessTest.java
Original file line number | Diff line number | Diff line change
Expand Up @@ -125,15 +125,15 @@ public void completeness_pica_groupBy() throws Exception {
while ((record = reader.readNext()) != null) {
if (lineNr == 0)
assertEquals(
"groupId,documenttype,path,packageid,package,tag,subfield,number-of-record,number-of-instances,min,max,mean,stddev,histogram",
"groupId,documenttype,path,sortkey,packageid,package,tag,subfield,number-of-record,number-of-instances,min,max,mean,stddev,histogram",
StringUtils.join(record, ",")
);
else {
int records = Integer.parseInt(record[7]);
int occurrences = Integer.parseInt(record[8]);
int records = Integer.parseInt(record[8]);
int occurrences = Integer.parseInt(record[9]);
assertTrue(records <= occurrences);
int total = 0;
String histogram = record[13].replaceAll("^\"(.*)\"$", "$1");
String histogram = record[14].replaceAll("^\"(.*)\"$", "$1");
for (String expr : histogram.split("; ")) {
String[] parts = expr.split("=");
total += Integer.parseInt(parts[0]) * Integer.parseInt(parts[1]);
Expand Down Expand Up @@ -175,15 +175,15 @@ public void completeness_pica_groupBy_file() throws Exception {
while ((record = reader.readNext()) != null) {
if (lineNr == 0)
assertEquals(
"groupId,documenttype,path,packageid,package,tag,subfield,number-of-record,number-of-instances,min,max,mean,stddev,histogram",
"groupId,documenttype,path,sortkey,packageid,package,tag,subfield,number-of-record,number-of-instances,min,max,mean,stddev,histogram",
StringUtils.join(record, ",")
);
else {
int records = Integer.parseInt(record[7]);
int occurrences = Integer.parseInt(record[8]);
int records = Integer.parseInt(record[8]);
int occurrences = Integer.parseInt(record[9]);
assertTrue(records <= occurrences);
int total = 0;
String histogram = record[13].replaceAll("^\"(.*)\"$", "$1");
String histogram = record[14].replaceAll("^\"(.*)\"$", "$1");
for (String expr : histogram.split("; ")) {
String[] parts = expr.split("=");
total += Integer.parseInt(parts[0]) * Integer.parseInt(parts[1]);
Expand Down

0 comments on commit 83af05c

Please sign in to comment.