Group results in completeness #199
pkiraly committed Mar 14, 2023
1 parent 8577101 commit 282232c
Showing 7 changed files with 1,901 additions and 72 deletions.
2 changes: 1 addition & 1 deletion catalogues/k10plus_pica_groupped.sh
@@ -6,7 +6,7 @@ MARC_DIR=${BASE_PICA_INPUT_DIR}/k10plus_pica_groupped
SCHEMA=PICA
TYPE_PARAMS="--schemaType PICA --marcFormat PICA_NORMALIZED --emptyLargeCollectors"
TYPE_PARAMS="$TYPE_PARAMS --groupBy 001@\$0"
TYPE_PARAMS="$TYPE_PARAMS --groupListFile src/main/resources/kxp-uniq-library-names.tsv"
TYPE_PARAMS="$TYPE_PARAMS --groupListFile src/main/resources/k10plus-libraries-by-unique-iln.txt"
TYPE_PARAMS="$TYPE_PARAMS --ignorableFields 001@,001E,001L,001U,001U,001X,001X,002V,003C,003G,003Z,008G,017N,020F,027D,031B,037I,039V,042@,046G,046T,101@,101E,101U,102D,201E,201U,202D,1...,2..."
TYPE_PARAMS="$TYPE_PARAMS --ignorableIssueTypes undefinedField"
TYPE_PARAMS="$TYPE_PARAMS --allowableRecords base64:"$(echo '002@.0 !~ "^L" && 002@.0 !~ "^..[iktN]" && (002@.0 !~ "^.v" || 021A.a?)' | base64 -w 0)
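Two details of this hunk are worth spelling out. The group list switches from the tab-separated kxp-uniq-library-names.tsv to the plain-text k10plus-libraries-by-unique-iln.txt, which matches the non-TSV parsing branch added to K10OrganisationReader further down. The --allowableRecords value is a PICA filter expression passed base64-encoded (with a base64: prefix), apparently so the quotes and spaces inside it need no shell escaping. How qa-catalogue decodes that prefix is not shown in this diff; the round trip itself is plain Base64, as in this sketch:

```java
import java.nio.charset.StandardCharsets;
import java.util.Base64;

public class AllowableRecordsEncoding {
  public static void main(String[] args) {
    // The record filter passed to --allowableRecords, as written in the shell script above.
    String criterion = "002@.0 !~ \"^L\" && 002@.0 !~ \"^..[iktN]\" && (002@.0 !~ \"^.v\" || 021A.a?)";

    // Equivalent of: echo '<criterion>' | base64 -w 0 (ignoring the trailing newline echo adds).
    String encoded = Base64.getEncoder().encodeToString(criterion.getBytes(StandardCharsets.UTF_8));
    System.out.println("base64:" + encoded);

    // Decoding recovers the original expression.
    String decoded = new String(Base64.getDecoder().decode(encoded), StandardCharsets.UTF_8);
    System.out.println(decoded);
  }
}
```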
41 changes: 20 additions & 21 deletions src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java
@@ -29,16 +29,14 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import static de.gwdg.metadataqa.marc.Utils.createRow;
import static de.gwdg.metadataqa.marc.Utils.quote;

public class Completeness extends QACli implements BibliographicInputProcessor, Serializable {

private static final Logger logger = Logger.getLogger(Completeness.class.getCanonicalName());
@@ -193,10 +191,10 @@ private void saveLibraries003(String fileExtension, char separator) {
logger.info("Saving libraries003...");
var path = Paths.get(parameters.getOutputDir(), "libraries003" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write(CsvUtils.createCsv(List.of("library", "count")));
writer.write(CsvUtils.createCsv("library", "count"));
completenessDAO.getLibrary003Counter().forEach((key, value) -> {
try {
writer.write(CsvUtils.createCsv(List.of(key, value)));
writer.write(CsvUtils.createCsv(key, value));
} catch (IOException e) {
logger.log(Level.SEVERE, "saveLibraries003", e);
}
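The recurring change in this file replaces createCsv(List.of(...)) and the older createRow(...) helper with direct varargs calls to CsvUtils.createCsv(...). The CsvUtils implementation itself is not part of this diff, so the following is only a sketch of what a varargs overload consistent with these call sites could look like; the class name, escaping rules and sample values are assumptions, not the project's actual code.

```java
import java.util.Arrays;
import java.util.stream.Collectors;

// Illustrative only: this merely mirrors the varargs call sites in the hunks above,
// e.g. createCsv("library", "count") and createCsv(key, value).
public class CsvUtilsSketch {

  public static String createCsv(Object... values) {
    return Arrays.stream(values)
        .map(CsvUtilsSketch::escape)
        .collect(Collectors.joining(",")) + "\n";
  }

  private static String escape(Object value) {
    String s = String.valueOf(value);
    // Quote fields containing the separator, quotes or line breaks (RFC 4180 style).
    if (s.contains(",") || s.contains("\"") || s.contains("\n"))
      return "\"" + s.replace("\"", "\"\"") + "\"";
    return s;
  }

  public static void main(String[] args) {
    System.out.print(createCsv("library", "count"));     // header row, as written above
    System.out.print(createCsv("example-library", 42));  // hypothetical data row
  }
}
```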
Expand All @@ -209,11 +207,11 @@ private void saveLibraries003(String fileExtension, char separator) {
private void saveMarcElements(String fileExtension, char separator) {
Path path = Paths.get(parameters.getOutputDir(), "marc-elements" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write(CsvUtils.createCsv(List.of(
writer.write(CsvUtils.createCsv(
"documenttype", "path", "packageid", "package", "tag", "subfield",
"number-of-record", "number-of-instances",
"min", "max", "mean", "stddev", "histogram"
)));
));
completenessDAO.getElementCardinality().forEach((documentType, cardinalities) -> {
cardinalities.forEach((marcPath, cardinality) -> {
try {
Expand All @@ -232,7 +230,7 @@ private void saveGrouppedMarcElements(String fileExtension, char separator) {
logger.info("saving groupped MARC elements...");
Path path = Paths.get(parameters.getOutputDir(), "completeness-groupped-marc-elements" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write(createRow(
writer.write(CsvUtils.createCsv(
"groupId", "documenttype", "path", "packageid", "package", "tag", "subfield",
"number-of-record", "number-of-instances",
"min", "max", "mean", "stddev", "histogram"
Expand All @@ -257,7 +255,7 @@ private void savePackages(String fileExtension, char separator) {
logger.info("saving packages...");
var path = Paths.get(parameters.getOutputDir(), "packages" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write(CsvUtils.createCsv(List.of("documenttype", "packageid", "name", "label", "iscoretag", "count")));
writer.write(CsvUtils.createCsv("documenttype", "packageid", "name", "label", "iscoretag", "count"));
completenessDAO.getPackageCounter().forEach((documentType, packages) -> {
packages.forEach((packageName, count) -> {
try {
Expand All @@ -274,7 +272,7 @@ private void savePackages(String fileExtension, char separator) {
} else {
logger.severe(packageName + " has not been found in TagCategory");
}
writer.write(CsvUtils.createCsv(List.of(documentType, id, range, label, isPartOfMarcScore, count)));
writer.write(CsvUtils.createCsv(documentType, id, range, label, isPartOfMarcScore, count));
} catch (IOException e) {
logger.log(Level.SEVERE, "savePackages", e);
}
Expand All @@ -289,7 +287,7 @@ private void saveGrouppedPackages(String fileExtension, char separator) {
logger.info("saving groupped packages...");
var path = Paths.get(parameters.getOutputDir(), "completeness-groupped-packages" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write(CsvUtils.createCsv(List.of("group", "documenttype", "packageid", "name", "label", "iscoretag", "count")));
writer.write(CsvUtils.createCsv("group", "documenttype", "packageid", "name", "label", "iscoretag", "count"));
completenessDAO.getGrouppedPackageCounter().forEach((groupId, documentTypes) -> {
documentTypes.forEach((documentType, packages) -> {
packages.forEach((packageName, count) -> {
Expand All @@ -307,7 +305,7 @@ private void saveGrouppedPackages(String fileExtension, char separator) {
} else {
logger.severe(packageName + " has not been found in TagCategory");
}
writer.write(CsvUtils.createCsv(List.of(groupId, documentType, id, range, label, isPartOfMarcScore, count)));
writer.write(CsvUtils.createCsv(groupId, documentType, id, range, label, isPartOfMarcScore, count));
} catch (IOException e) {
logger.log(Level.SEVERE, "savePackages", e);
}
Expand All @@ -323,10 +321,10 @@ private void saveLibraries(String fileExtension, char separator) {
logger.info("Saving libraries...");
var path = Paths.get(parameters.getOutputDir(), "libraries" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write(CsvUtils.createCsv(List.of("library", "count")));
writer.write(CsvUtils.createCsv("library", "count"));
completenessDAO.getLibraryCounter().forEach((key, value) -> {
try {
writer.write(CsvUtils.createCsv(List.of(key, value)));
writer.write(CsvUtils.createCsv(key, value));
} catch (IOException e) {
logger.log(Level.SEVERE, "saveLibraries", e);
}
Expand All @@ -341,10 +339,10 @@ private void saveGroups(String fileExtension, char separator) {
GroupSelector groupSelector = new GroupSelector(parameters.getGroupListFile());
var path = Paths.get(parameters.getOutputDir(), "completeness-groups" + fileExtension);
try (var writer = Files.newBufferedWriter(path)) {
writer.write(CsvUtils.createCsv(List.of("id", "group", "count")));
writer.write(CsvUtils.createCsv("id", "group", "count"));
completenessDAO.getGroupCounter().forEach((key, value) -> {
try {
writer.write(CsvUtils.createCsv(List.of(key, groupSelector.getOrgName(key), value)));
writer.write(CsvUtils.createCsv(key, groupSelector.getOrgName(key), value));
} catch (IOException e) {
logger.log(Level.SEVERE, "saveLibraries", e);
}
@@ -396,18 +394,19 @@ private String formatCardinality(String marcPath,
}
BasicStatistics statistics = new BasicStatistics(histogram);

List<Object> values = quote(
Arrays.asList(
List<Object> values = Arrays.asList(
documentType, marcPathLabel, packageId, packageLabel, tagLabel, subfieldLabel,
frequency, // = number-of-record
cardinality, // = number-of-instances
statistics.getMin(), statistics.getMax(),
statistics.getMean(), statistics.getStdDev(),
statistics.formatHistogram()
)
);
if (groupId != null)
values.add(0, groupId);
if (groupId != null) {
List<Object> merged = new ArrayList<>(List.of((Object)groupId));
merged.addAll(values);
values = merged;
}

return CsvUtils.createCsvFromObjects(values);
// return StringUtils.join(values, separator) + "\n";
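The groupId handling in formatCardinality changes as well: instead of inserting the group id into the existing list with values.add(0, groupId), the new code copies everything into a fresh ArrayList. Since values now comes straight from Arrays.asList(...), which returns a fixed-size list, calling add() on it would throw an UnsupportedOperationException; building the merged list explicitly avoids that. A minimal illustration with made-up values:

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class PrependGroupId {
  public static void main(String[] args) {
    // Hypothetical row values; Arrays.asList returns a fixed-size list, as in formatCardinality.
    List<Object> values = Arrays.asList("BK", "001", 5);
    // values.add(0, "77");  // would throw UnsupportedOperationException

    // The pattern used in the commit: copy into a mutable list that starts with the group id.
    List<Object> merged = new ArrayList<>(List.of((Object) "77"));  // "77" stands in for groupId
    merged.addAll(values);
    values = merged;

    System.out.println(values);  // [77, BK, 001, 5]
  }
}
```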
@@ -18,6 +18,7 @@ public class K10OrganisationReader {
public static Map<String, K10Organisation> fileToCodeList(String fileName) {

// protected Map<String, EncodedValue> index = new HashMap<>();
boolean isTsv = fileName.endsWith(".tsv");

Map<String, K10Organisation> codes = new HashMap<>();
try {
Expand All @@ -26,10 +27,18 @@ public static Map<String, K10Organisation> fileToCodeList(String fileName) {
String line = it.nextLine();
if (line.equals("") || line.startsWith("#") || line.startsWith("--"))
continue;
String[] parts = line.split("\t", 3);
if (parts.length > 1) {
String id = removeLeadingZeros(parts[0]);
codes.put(id, new K10Organisation(id, parts[1], parts[2]));
if (isTsv) {
String[] parts = line.split("\t", 3);
if (parts.length > 1) {
String id = removeLeadingZeros(parts[0]);
codes.put(id, new K10Organisation(id, parts[1], parts[2]));
}
} else {
String[] parts = line.split(": ", 2);
if (parts.length > 1) {
String id = removeLeadingZeros(parts[0]);
codes.put(id, new K10Organisation(id, id, parts[1]));
}
}
}
} catch (IOException e) {
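The reader now branches on the file name: files ending in .tsv keep the previous tab-separated parsing (an identifier plus two name columns), while any other group list file is read as lines of the form "id: name", with the id reused as the second constructor argument. The sketch below only mirrors the two split calls; the identifiers and names in it are invented.

```java
// A minimal illustration of the two line layouts the reader now accepts.
public class GroupListLineFormats {

  public static void main(String[] args) {
    // TSV layout: id <TAB> second column <TAB> third column
    String tsvLine = "0077\tExample short name\tExample full name";
    String[] tsvParts = tsvLine.split("\t", 3);
    System.out.println(stripLeadingZeros(tsvParts[0]) + " -> " + tsvParts[1] + " / " + tsvParts[2]);

    // Plain-text layout (new in this commit): id ": " name
    String txtLine = "0077: Example library name";
    String[] txtParts = txtLine.split(": ", 2);
    System.out.println(stripLeadingZeros(txtParts[0]) + " -> " + txtParts[1]);
  }

  // Stand-in for the reader's removeLeadingZeros() helper, which is defined outside this hunk.
  private static String stripLeadingZeros(String id) {
    return id.replaceFirst("^0+(?!$)", "");
  }
}
```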
(The diffs of the remaining changed files are not shown here.)
