Skip to content

Commit

Permalink
issue #142: calculating PICA completeness
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Jun 16, 2022
1 parent 49b9232 commit 298513c
Show file tree
Hide file tree
Showing 9 changed files with 126 additions and 24 deletions.
8 changes: 6 additions & 2 deletions src/main/java/de/gwdg/metadataqa/marc/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,16 @@ public static String extractPackageName(DataField field) {

public static String extractPackageName(DataFieldDefinition field) {
return field.getClass().getPackage().getName()
.replace("de.gwdg.metadataqa.marc.definition.tags.", "");
.replace("de.gwdg.metadataqa.marc.definition.tags.", "")
.replace("de.gwdg.metadataqa.marc.utils.", "")
;
}

public static String extractPackageName(Class<? extends DataFieldDefinition> field) {
return field.getPackage().getName()
.replace("de.gwdg.metadataqa.marc.definition.tags.", "");
.replace("de.gwdg.metadataqa.marc.definition.tags.", "")
.replace("de.gwdg.metadataqa.marc.utils.", "")
;
}

public static MarcVersion getVersion(DataFieldDefinition field) {
Expand Down
41 changes: 24 additions & 17 deletions src/main/java/de/gwdg/metadataqa/marc/cli/Completeness.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import de.gwdg.metadataqa.marc.dao.MarcPositionalControlField;
import de.gwdg.metadataqa.marc.dao.MarcRecord;
import de.gwdg.metadataqa.marc.definition.ControlValue;
import de.gwdg.metadataqa.marc.definition.structure.DataFieldDefinition;
import de.gwdg.metadataqa.marc.definition.tags.TagCategory;
import de.gwdg.metadataqa.marc.model.validation.ValidationErrorFormat;
import de.gwdg.metadataqa.marc.utils.BasicStatistics;
Expand Down Expand Up @@ -59,6 +60,7 @@ public class Completeness implements BibliographicInputProcessor, Serializable {
private Map<String, Map<Integer, Integer>> fieldHistogram = new HashMap<>();
private boolean readyToProcess;
private CompletenessPlugin plugin;
private Map<DataFieldDefinition, String> packageNameCache = new HashMap<>();

public Completeness(String[] args) throws ParseException {
parameters = new CompletenessParameters(args);
Expand Down Expand Up @@ -118,9 +120,11 @@ public void processRecord(MarcRecord marcRecord, int recordNumber) throws IOExce
count(library, libraryCounter);
}

processLeader(marcRecord, recordFrequency, recordPackageCounter, documentType);
processSimpleControlfields(marcRecord, recordFrequency, recordPackageCounter, documentType);
processPositionalControlFields(marcRecord, recordFrequency, recordPackageCounter, documentType);
if (!parameters.isPica()) {
processLeader(marcRecord, recordFrequency, recordPackageCounter, documentType);
processSimpleControlfields(marcRecord, recordFrequency, recordPackageCounter, documentType);
processPositionalControlFields(marcRecord, recordFrequency, recordPackageCounter, documentType);
}
processDataFields(marcRecord, recordFrequency, recordPackageCounter, documentType);

for (String key : recordFrequency.keySet()) {
Expand Down Expand Up @@ -217,10 +221,15 @@ private List<String> getMarcPaths(DataField field) {
private String getPackageName(DataField field) {
String packageName;
if (field.getDefinition() != null) {
packageName = Utils.extractPackageName(field);
if (StringUtils.isBlank(packageName)) {
logger.warning(String.format("%s has no package. /%s", field, field.getDefinition().getClass()));
packageName = TagCategory.other.getPackageName();
if (packageNameCache.containsKey(field.getDefinition()))
packageName = packageNameCache.get(field.getDefinition());
else {
packageName = Utils.extractPackageName(field);
if (StringUtils.isBlank(packageName)) {
logger.warning(String.format("%s has no package. /%s", field, field.getDefinition().getClass()));
packageName = TagCategory.other.getPackageName();
}
packageNameCache.put(field.getDefinition(), packageName);
}
} else {
packageName = TagCategory.other.getPackageName();
Expand Down Expand Up @@ -401,16 +410,14 @@ private String formatCardinality(char separator,
String packageLabel = TagCategory.other.getLabel();
String tagLabel = "";
String subfieldLabel = "";
if (parameters.isMarc21()) {
TagHierarchy tagHierarchy = TagHierarchy.createFromPath(marcPathLabel, parameters.getMarcVersion());
if (tagHierarchy != null) {
packageId = tagHierarchy.getPackageId();
packageLabel = tagHierarchy.getPackageLabel();
tagLabel = tagHierarchy.getTagLabel();
subfieldLabel = tagHierarchy.getSubfieldLabel();
} else {
logger.severe("Key can not be found in the TagHierarchy: " + marcPathLabel);
}
TagHierarchy tagHierarchy = plugin.getTagHierarchy(marcPathLabel);
if (tagHierarchy != null) {
packageId = tagHierarchy.getPackageId();
packageLabel = tagHierarchy.getPackageLabel();
tagLabel = tagHierarchy.getTagLabel();
subfieldLabel = tagHierarchy.getSubfieldLabel();
} else {
logger.severe("Key can not be found in the TagHierarchy: " + marcPathLabel);
}

// Integer cardinality = entry.getValue();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
package de.gwdg.metadataqa.marc.cli.plugin;

import de.gwdg.metadataqa.marc.dao.MarcRecord;
import de.gwdg.metadataqa.marc.utils.TagHierarchy;

/**
 * Schema specific part of the completeness analysis.
 *
 * <p>Implementations tell the analysis which document type a record belongs to and
 * how a field path is to be labelled in the report.
 */
public interface CompletenessPlugin {
/**
 * Returns the document type of the given record.
 *
 * @param marcRecord the record to classify
 * @return the document type as a string
 */
String getDocumentType(MarcRecord marcRecord);
/**
 * Resolves the tag hierarchy (package, tag label, subfield label) of a field path.
 *
 * @param path the field path to look up
 * @return the hierarchy of the path; the completeness report treats {@code null}
 *         as "path not found"
 */
TagHierarchy getTagHierarchy(String path);
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import de.gwdg.metadataqa.marc.cli.parameters.CompletenessParameters;
import de.gwdg.metadataqa.marc.dao.MarcRecord;
import de.gwdg.metadataqa.marc.utils.TagHierarchy;

import java.util.regex.Pattern;

Expand All @@ -16,4 +17,9 @@ public Marc21CompletenessPlugin(CompletenessParameters parameters) {
public String getDocumentType(MarcRecord marcRecord) {
// The document type is the value of the record's type as reported by getType().
return marcRecord.getType().getValue();
}

/**
 * Resolves the tag hierarchy of a path by delegating to
 * {@code TagHierarchy.createFromPath}, passing the MARC version configured
 * in this plugin's parameters.
 *
 * @param path the field path to look up
 * @return whatever {@code TagHierarchy.createFromPath} returns for the path
 *         (NOTE(review): the completeness report null-checks this result, so it is
 *         presumably {@code null} for unknown paths; confirm in TagHierarchy)
 */
@Override
public TagHierarchy getTagHierarchy(String path) {
return TagHierarchy.createFromPath(path, parameters.getMarcVersion());
}
}
Original file line number Diff line number Diff line change
@@ -1,24 +1,85 @@
package de.gwdg.metadataqa.marc.cli.plugin;

import de.gwdg.metadataqa.marc.Utils;
import de.gwdg.metadataqa.marc.cli.parameters.CompletenessParameters;
import de.gwdg.metadataqa.marc.dao.MarcRecord;
import de.gwdg.metadataqa.marc.definition.structure.SubfieldDefinition;
import de.gwdg.metadataqa.marc.definition.tags.TagCategory;
import de.gwdg.metadataqa.marc.utils.TagHierarchy;
import de.gwdg.metadataqa.marc.utils.pica.FieldPath;
import de.gwdg.metadataqa.marc.utils.pica.PicaFieldDefinition;
import de.gwdg.metadataqa.marc.utils.pica.PicaSchemaReader;
import org.apache.commons.lang3.StringUtils;

import java.nio.file.Paths;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * {@link CompletenessPlugin} for PICA records.
 *
 * <p>The plugin derives the document type of a record from a configurable
 * field/subfield and resolves field and subfield labels from a PICA schema
 * loaded with {@link PicaSchemaReader}.
 */
public class PicaCompletenessPlugin implements CompletenessPlugin {
  /** Document type reported when the record type is missing or its code is unknown. */
  private static final String INVALID_TYPE = "invalid";

  /** Schema file used when the parameters do not name one; resolved against the working directory. */
  private static final String DEFAULT_SCHEMA_FILE = "src/main/resources/pica/avram-k10plus.json";

  /** Labels of the record types, keyed by the first character of the record type value. */
  private static final Map<String, String> TYPES = Map.ofEntries(
    Map.entry("A", "Druckschriften (einschließlich Bildbänden)"),
    Map.entry("B", "Tonträger, Videodatenträger, Bildliche Darstellungen"),
    Map.entry("C", "Blindenschriftträger und andere taktile Materialien"),
    Map.entry("E", "Mikroform"),
    Map.entry("H", "Handschriftliches Material"),
    Map.entry("L", "Lokales Katalogisat (nur GBV)"),
    Map.entry("O", "Elektronische Ressource im Fernzugriff"),
    Map.entry("S", "Elektronische Ressource auf Datenträger"),
    Map.entry("V", "Objekt"),
    Map.entry("Z", "Medienkombination"),
    Map.entry("a", "Mailboxsatz")
  );

  private final CompletenessParameters parameters;
  /** Tag of the field that holds the record type. */
  private final String field;
  /** Code of the subfield that holds the record type. */
  private final String subfield;
  /** Regex-quoted subfield separator used to split paths. */
  private final String separator;
  /** The PICA schema: field definitions keyed by field tag. */
  private final Map<String, PicaFieldDefinition> picaSchema;

  /**
   * Creates the plugin from the command line parameters.
   *
   * @param parameters supplies the subfield separator, the path of the record type
   *                   field and (optionally) the schema file to load
   */
  public PicaCompletenessPlugin(CompletenessParameters parameters) {
    this.parameters = parameters;
    // parse() splits with this regex, so it must be assigned before the first parse() call
    separator = Pattern.quote(parameters.getPicaSubfieldSeparator());

    FieldPath recordTypePath = parse(parameters.getPicaRecordTypeField());
    field = recordTypePath.getField();
    subfield = recordTypePath.getSubfield();

    String schemaFile = StringUtils.isNotEmpty(parameters.getPicaSchemaFile())
      ? parameters.getPicaSchemaFile()
      : Paths.get(DEFAULT_SCHEMA_FILE).toAbsolutePath().toString();
    picaSchema = PicaSchemaReader.create(schemaFile);
  }

  /**
   * Returns the label of the record's document type.
   *
   * <p>The type code is the first character of the first value found in the configured
   * field/subfield. Records without that field, subfield or value, and records with an
   * unknown code are reported as {@code "invalid"} instead of failing.
   *
   * @param marcRecord the record to classify
   * @return the label of the document type, or {@code "invalid"}
   */
  @Override
  public String getDocumentType(MarcRecord marcRecord) {
    return java.util.Optional.ofNullable(marcRecord.getDatafield(field))
      .filter(fields -> !fields.isEmpty())
      .map(fields -> fields.get(0).getSubfield(subfield))
      .filter(subfields -> !subfields.isEmpty())
      .map(subfields -> subfields.get(0).getValue())
      .filter(value -> !value.isEmpty())
      .map(value -> TYPES.getOrDefault(value.substring(0, 1), INVALID_TYPE))
      .orElse(INVALID_TYPE);
  }

  /**
   * Looks up the labels of a field path in the PICA schema.
   *
   * @param rawpath the path: a field tag, optionally followed by the separator and a subfield code
   * @return a hierarchy in the {@code pica} category with the labels found in the schema
   *         (the subfield label is empty if the path has no subfield or the subfield is
   *         not defined), or a hierarchy in the {@code other} category with empty labels
   *         if the field is not part of the schema; never {@code null}
   */
  @Override
  public TagHierarchy getTagHierarchy(String rawpath) {
    FieldPath path = parse(rawpath);
    PicaFieldDefinition fieldDefinition = picaSchema.get(path.getField());
    if (fieldDefinition == null) {
      return new TagHierarchy(TagCategory.other, "", "");
    }

    String subfieldLabel = "";
    if (!path.getSubfield().isEmpty()) {
      SubfieldDefinition subfieldDefinition = fieldDefinition.getSubfield(path.getSubfield());
      if (subfieldDefinition != null) {
        subfieldLabel = subfieldDefinition.getLabel();
      }
    }
    return new TagHierarchy(TagCategory.pica, fieldDefinition.getLabel(), subfieldLabel);
  }

  /**
   * Splits a path into its field and subfield part.
   *
   * <p>A path without a subfield part is reported on standard error and yields an
   * empty subfield. Parts after the second one are ignored.
   *
   * @param path the path to split
   * @return the parsed path
   */
  private FieldPath parse(String path) {
    String[] parts = path.split(separator);
    if (parts.length == 1) {
      System.err.println("problem with path: " + path);
      return new FieldPath(parts[0], "");
    }
    return new FieldPath(parts[0], parts[1]);
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ public void start() {
defaultRecordType = parameters.getDefaultRecordType();
replecementInControlFields = parameters.getReplecementInControlFields();
decimalFormat = new DecimalFormat();
if (parameters.getSchemaType().equals(SchemaType.PICA)) {
if (parameters.isPica()) {
String schemaFile = StringUtils.isNotEmpty(parameters.getPicaSchemaFile())
? parameters.getPicaSchemaFile()
: Paths.get("src/main/resources/pica/avram-k10plus.json").toAbsolutePath().toString();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ public enum TagCategory {
bl(20, "bltags", "BL", "Locally defined tags of the British Library", false),
uva(21, "uvatags", "UvA", "Locally defined tags of University of Amsterdam", false),
b3kat(22, "b3kattags", "B3Kat", "Locally defined tags of a German union cataogue B3Kat", false),
pica(50, "pica", "PICA", "PICA+ tags", false),
other(99, "unknown", "unknown", "unknown origin", false)
;

Expand Down
2 changes: 2 additions & 0 deletions src/main/java/de/gwdg/metadataqa/marc/utils/TagHierarchy.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package de.gwdg.metadataqa.marc.utils;

import de.gwdg.metadataqa.marc.Utils;
import de.gwdg.metadataqa.marc.definition.bibliographic.SchemaType;
import de.gwdg.metadataqa.marc.definition.structure.ControlFieldDefinition;
import de.gwdg.metadataqa.marc.definition.structure.ControlfieldPositionDefinition;
import de.gwdg.metadataqa.marc.definition.structure.DataFieldDefinition;
Expand All @@ -27,6 +28,7 @@ public class TagHierarchy {
private static final Pattern controlFieldPattern = Pattern.compile("^(00\\d)(/(\\d+|\\d+-\\d+))?$");
private static final Pattern controlFieldIdPattern = Pattern.compile("^(00[6-8])([a-z][a-zA-Z]+)(\\d+)$");
private static final Pattern dataFieldPattern = Pattern.compile("^(\\d\\d\\d)\\$(.*)$");
private static final Pattern picaDataFieldPattern = Pattern.compile("^(.*)\\$(.*)$");

private TagCategory category;
private String tagLabel;
Expand Down
19 changes: 19 additions & 0 deletions src/main/java/de/gwdg/metadataqa/marc/utils/pica/FieldPath.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package de.gwdg.metadataqa.marc.utils.pica;

/**
 * Immutable value object for a parsed field path: a field tag and a subfield code.
 *
 * <p>The class stores the two parts exactly as given; it performs no validation.
 */
public class FieldPath {
  private final String field;
  private final String subfield;

  /**
   * Creates a path from its two parts.
   *
   * @param field the field part of the path
   * @param subfield the subfield part of the path
   */
  public FieldPath(String field, String subfield) {
    this.field = field;
    this.subfield = subfield;
  }

  /**
   * @return the field part of the path, as passed to the constructor
   */
  public String getField() {
    return field;
  }

  /**
   * @return the subfield part of the path, as passed to the constructor
   */
  public String getSubfield() {
    return subfield;
  }
}

0 comments on commit 298513c

Please sign in to comment.