Skip to content

Commit

Permalink
issue #199: Group results in completeness
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Dec 18, 2022
1 parent 2b785c6 commit da387f1
Show file tree
Hide file tree
Showing 8 changed files with 584 additions and 222 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package de.gwdg.metadataqa.marc.analysis.completeness;

import de.gwdg.metadataqa.marc.definition.structure.DataFieldDefinition;

import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;

public class CompletenessDAO {

private Map<String, Map<String, Integer>> packageCounter = new TreeMap<>();
private Map<String, Map<String, Map<String, Integer>>> grouppedPackageCounter = new TreeMap<>();
private Map<String, Map<String, Integer>> elementCardinality = new TreeMap<>();
private Map<String, Map<String, Map<String, Integer>>> grouppedElementCardinality = new TreeMap<>();
private Map<String, Map<String, Integer>> elementFrequency = new TreeMap<>();
private Map<String, Integer> groupCounter = new TreeMap<>();
private Map<String, Integer> library003Counter = new TreeMap<>();
private Map<String, Integer> libraryCounter = new TreeMap<>();
private Map<DataFieldDefinition, String> packageNameCache = new HashMap<>();

public void initialize() {
packageCounter.put("all", new TreeMap<>());
elementCardinality.put("all", new TreeMap<>());
elementFrequency.put("all", new TreeMap<>());
}

public Map<String, Map<String, Integer>> getPackageCounter() {
return packageCounter;
}

public Map<String, Map<String, Map<String, Integer>>> getGrouppedPackageCounter() {
return grouppedPackageCounter;
}

public Map<String, Map<String, Integer>> getElementCardinality() {
return elementCardinality;
}

public Map<String, Map<String, Map<String, Integer>>> getGrouppedElementCardinality() {
return grouppedElementCardinality;
}

public Map<String, Map<String, Integer>> getElementFrequency() {
return elementFrequency;
}

public Map<String, Integer> getGroupCounter() {
return groupCounter;
}

public Map<String, Integer> getLibrary003Counter() {
return library003Counter;
}

public Map<String, Integer> getLibraryCounter() {
return libraryCounter;
}

public Map<DataFieldDefinition, String> getPackageNameCache() {
return packageNameCache;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,292 @@
package de.gwdg.metadataqa.marc.analysis.completeness;

import de.gwdg.metadataqa.marc.MarcSubfield;
import de.gwdg.metadataqa.marc.cli.parameters.CompletenessParameters;
import de.gwdg.metadataqa.marc.cli.plugin.CompletenessPlugin;
import de.gwdg.metadataqa.marc.dao.DataField;
import de.gwdg.metadataqa.marc.dao.MarcControlField;
import de.gwdg.metadataqa.marc.dao.MarcPositionalControlField;
import de.gwdg.metadataqa.marc.dao.record.BibliographicRecord;
import de.gwdg.metadataqa.marc.definition.ControlValue;
import de.gwdg.metadataqa.marc.definition.tags.TagCategory;
import de.gwdg.metadataqa.marc.utils.BibiographicPath;
import de.gwdg.metadataqa.marc.utils.pica.path.PicaPath;
import org.apache.commons.lang3.StringUtils;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.logging.Logger;
import java.util.regex.Pattern;

public class RecordCompleteness {

private static final Logger logger = Logger.getLogger(RecordCompleteness.class.getCanonicalName());
private static final Pattern numericalPattern = Pattern.compile("^(\\d)$");

private final BibiographicPath groupBy;
private final CompletenessParameters parameters;
private final CompletenessDAO completenessDAO;
private final CompletenessPlugin plugin;
BibliographicRecord bibliographicRecord;
String documentType;
boolean hasGroupBy;
Map<String, Integer> recordFrequency = new TreeMap<>();
Map<String, Integer> recordPackageCounter = new TreeMap<>();
Set<String> groupIds = new HashSet<>();

public RecordCompleteness(BibliographicRecord bibliographicRecord,
CompletenessParameters parameters,
CompletenessDAO completenessDAO,
CompletenessPlugin plugin,
BibiographicPath groupBy) {
this.bibliographicRecord = bibliographicRecord;
this.parameters = parameters;
this.completenessDAO = completenessDAO;
this.plugin = plugin;
this.groupBy = groupBy;
this.hasGroupBy = (groupBy != null);

if (hasGroupBy) {
List<String> idLists = parameters.isPica() ? bibliographicRecord.select((PicaPath) groupBy) : null; // TODO: MARC21
groupIds = extractGroupIds(idLists);
}
}

public void process() {
documentType = plugin.getDocumentType(bibliographicRecord);
completenessDAO.getElementCardinality().computeIfAbsent(documentType, s -> new TreeMap<>());
completenessDAO.getElementFrequency().computeIfAbsent(documentType, s -> new TreeMap<>());

if (bibliographicRecord.getControl003() != null)
count(bibliographicRecord.getControl003().getContent(), completenessDAO.getLibrary003Counter());

for (String library : extract(bibliographicRecord, "852", "a"))
count(library, completenessDAO.getLibraryCounter());

if (!parameters.isPica()) {
processLeader(bibliographicRecord, documentType, recordFrequency, recordPackageCounter);
processSimpleControlfields(bibliographicRecord, documentType, recordFrequency, recordPackageCounter);
processPositionalControlFields(bibliographicRecord, documentType, recordFrequency, recordPackageCounter);
}
processDataFields(bibliographicRecord, documentType, recordFrequency, recordPackageCounter);
}

private void processLeader(BibliographicRecord marcRecord,
String documentType,
Map<String, Integer> recordFrequency,
Map<String, Integer> recordPackageCounter) {
if (marcRecord.getLeader() != null) {
for (ControlValue position : marcRecord.getLeader().getValuesList()) {
String marcPath = position.getDefinition().getId();
count(marcPath, completenessDAO.getElementCardinality().get(documentType));
count(marcPath, completenessDAO.getElementCardinality().get("all"));
count(marcPath, recordFrequency);
count(TagCategory.TAGS_00X.getPackageName(), recordPackageCounter);
}
}
}

private void processSimpleControlfields(BibliographicRecord marcRecord,
String documentType,
Map<String, Integer> recordFrequency,
Map<String, Integer> recordPackageCounter) {
for (MarcControlField field : marcRecord.getSimpleControlfields()) {
if (field != null) {
String marcPath = field.getDefinition().getTag();
count(marcPath, completenessDAO.getElementCardinality().get(documentType));
count(marcPath, completenessDAO.getElementCardinality().get("all"));
count(marcPath, recordFrequency);
count(TagCategory.TAGS_00X.getPackageName(), recordPackageCounter);
}
}
}

private void processPositionalControlFields(BibliographicRecord marcRecord,
String documentType,
Map<String, Integer> recordFrequency,
Map<String, Integer> recordPackageCounter) {
for (MarcPositionalControlField field : marcRecord.getPositionalControlfields()) {
if (field != null) {
for (ControlValue position : field.getValuesList()) {
String marcPath = position.getDefinition().getId();
count(marcPath, completenessDAO.getElementCardinality().get(documentType));
count(marcPath, completenessDAO.getElementCardinality().get("all"));
count(marcPath, recordFrequency);
count(TagCategory.TAGS_00X.getPackageName(), recordPackageCounter);
}
}
}
}

private void processDataFields(BibliographicRecord marcRecord,
String documentType,
Map<String, Integer> recordFrequency,
Map<String, Integer> recordPackageCounter) {
for (DataField field : marcRecord.getDatafields()) {
if (parameters.getIgnorableFields().contains(field.getTag()))
continue;

count(getPackageName(field), recordPackageCounter);
if (groupBy != null) {
for (String groupId : groupIds) {
completenessDAO.getGrouppedElementCardinality().computeIfAbsent(groupId, s -> new TreeMap<>());
completenessDAO.getGrouppedElementCardinality().get(groupId).computeIfAbsent(documentType, s -> new TreeMap<>());
completenessDAO.getGrouppedElementCardinality().get(groupId).computeIfAbsent("all", s -> new TreeMap<>());
count(field.getTag(), completenessDAO.getGrouppedElementCardinality().get(groupId).get(documentType));
count(field.getTag(), completenessDAO.getGrouppedElementCardinality().get(groupId).get("all"));
count(field.getTag(), recordFrequency);

List<String> marcPaths = getMarcPaths(field);
for (String marcPath : marcPaths) {
count(marcPath, completenessDAO.getGrouppedElementCardinality().get(groupId).get(documentType));
count(marcPath, completenessDAO.getGrouppedElementCardinality().get(groupId).get("all"));
count(marcPath, recordFrequency);
}
}
} else {
count(field.getTag(), completenessDAO.getElementCardinality().get(documentType));
count(field.getTag(), completenessDAO.getElementCardinality().get("all"));
count(field.getTag(), recordFrequency);

List<String> marcPaths = getMarcPaths(field);
for (String marcPath : marcPaths) {
count(marcPath, completenessDAO.getElementCardinality().get(documentType));
count(marcPath, completenessDAO.getElementCardinality().get("all"));
count(marcPath, recordFrequency);
}
}
}
}

private void processDataFields() {
for (DataField field : bibliographicRecord.getDatafields()) {
if (parameters.getIgnorableFields().contains(field.getTag()))
continue;

count(getPackageName(field), recordPackageCounter);
if (groupBy != null) {
count(field.getTag(), completenessDAO.getElementCardinality().get(documentType));
count(field.getTag(), completenessDAO.getElementCardinality().get("all"));
count(field.getTag(), recordFrequency);

List<String> marcPaths = getMarcPaths(field);
for (String marcPath : marcPaths) {
count(marcPath, completenessDAO.getElementCardinality().get(documentType));
count(marcPath, completenessDAO.getElementCardinality().get("all"));
count(marcPath, recordFrequency);
}
} else {
count(field.getTag(), completenessDAO.getElementCardinality().get(documentType));
count(field.getTag(), completenessDAO.getElementCardinality().get("all"));
count(field.getTag(), recordFrequency);

List<String> marcPaths = getMarcPaths(field);
for (String marcPath : marcPaths) {
count(marcPath, completenessDAO.getElementCardinality().get(documentType));
count(marcPath, completenessDAO.getElementCardinality().get("all"));
count(marcPath, recordFrequency);
}
}
}
}

private <T extends Object> void count(T key, Map<T, Integer> counter) {
counter.computeIfAbsent(key, s -> 0);
counter.put(key, counter.get(key) + 1);
}

private Set<String> extractGroupIds(List<String> idLists) {
Set<String> groupIds = new HashSet<>();
groupIds.add("all");
if (idLists != null) {
for (String idList : idLists) {
String[] ids = idList.split(",");
for (String id : ids) {
groupIds.add(id);
}
}
}
return groupIds;
}

public Set<String> getGroupIds() {
return groupIds;
}

public Map<String, Integer> getRecordFrequency() {
return recordFrequency;
}

public Map<String, Integer> getRecordPackageCounter() {
return recordPackageCounter;
}

public String getDocumentType() {
return documentType;
}

public boolean hasGroupBy() {
return hasGroupBy;
}

private List<String> extract(BibliographicRecord marcRecord, String tag, String subfield) {
List<String> values = new ArrayList<>();
List<DataField> fields = marcRecord.getDatafield(tag);
if (fields != null && !fields.isEmpty()) {
for (DataField field : fields) {
List<MarcSubfield> subfieldInstances = field.getSubfield(subfield);
if (subfieldInstances != null) {
for (MarcSubfield subfieldInstance : subfieldInstances) {
values.add(subfieldInstance.getValue());
}
}
}
}
return values;
}

private String getPackageName(DataField field) {
String packageName;
if (field.getDefinition() != null) {
if (completenessDAO.getPackageNameCache().containsKey(field.getDefinition()))
packageName = completenessDAO.getPackageNameCache().get(field.getDefinition());
else {
packageName = plugin.getPackageName(field);
if (StringUtils.isBlank(packageName)) {
logger.warning(String.format("%s has no package. /%s", field, field.getDefinition().getClass()));
packageName = TagCategory.OTHER.getPackageName();
}
completenessDAO.getPackageNameCache().put(field.getDefinition(), packageName);
}
} else {
packageName = TagCategory.OTHER.getPackageName();
}
return packageName;
}

private List<String> getMarcPaths(DataField field) {
List<String> marcPaths = new ArrayList<>();

if (parameters.isMarc21()) {
if (field.getInd1() != null)
if (field.getDefinition() != null && field.getDefinition().getInd1().exists() || !field.getInd1().equals(" "))
marcPaths.add(String.format("%s$!ind1", field.getTag()));

if (field.getInd2() != null)
if (field.getDefinition() != null && field.getDefinition().getInd2().exists() || !field.getInd2().equals(" "))
marcPaths.add(String.format("%s$!ind2", field.getTag()));
}

for (MarcSubfield subfield : field.getSubfields())
if (numericalPattern.matcher(subfield.getCode()).matches())
marcPaths.add(String.format("%s$|%s", field.getTag(), subfield.getCode()));
else
marcPaths.add(String.format("%s$%s", field.getTag(), subfield.getCode()));

return marcPaths;
}
}
Loading

0 comments on commit da387f1

Please sign in to comment.