Skip to content

Commit

Permalink
issue #78, #79, #80, #81: fixing completeness issues
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Jan 11, 2021
1 parent 90b55a4 commit 1b3cf47
Show file tree
Hide file tree
Showing 19 changed files with 437 additions and 99 deletions.
17 changes: 15 additions & 2 deletions src/main/java/de/gwdg/metadataqa/marc/DataField.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
package de.gwdg.metadataqa.marc;

import de.gwdg.metadataqa.marc.definition.*;
import de.gwdg.metadataqa.marc.definition.Cardinality;
import de.gwdg.metadataqa.marc.definition.DataFieldDefinition;
import de.gwdg.metadataqa.marc.definition.Indicator;
import de.gwdg.metadataqa.marc.definition.MarcVersion;
import de.gwdg.metadataqa.marc.definition.SourceSpecificationType;
import de.gwdg.metadataqa.marc.definition.SubfieldDefinition;
import de.gwdg.metadataqa.marc.definition.TagDefinitionLoader;
import de.gwdg.metadataqa.marc.definition.general.Linkage;
import de.gwdg.metadataqa.marc.definition.general.indexer.FieldIndexer;
import de.gwdg.metadataqa.marc.definition.general.indexer.subject.*;
Expand All @@ -14,9 +20,16 @@
import org.apache.commons.lang3.StringUtils;

import java.io.Serializable;
import java.util.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;

import static de.gwdg.metadataqa.marc.definition.SourceSpecificationType.Indicator2For055AndSubfield2;
import static de.gwdg.metadataqa.marc.model.validation.ValidationErrorType.*;

public class DataField implements Extractable, Validatable, Serializable {
Expand Down
7 changes: 7 additions & 0 deletions src/main/java/de/gwdg/metadataqa/marc/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,13 @@ public static <T extends Object> void count(T key, Map<T, Integer> counter) {
counter.put(key, counter.get(key) + 1);
}

/**
 * Adds {@code i} to the value stored for {@code key} in {@code counter}.
 *
 * A key that is not yet present (or is mapped to {@code null}) is treated
 * as having the value 0, so after the call it is mapped to {@code i}.
 *
 * @param key the key whose counter should be increased
 * @param counter the map holding the counters; it is modified in place
 * @param i the amount to add
 */
public static <T extends Object> void add(T key, Map<T, Integer> counter, int i) {
  // merge() does the "insert or sum" in a single lookup instead of
  // containsKey + get + put, and does not fail on a null-valued entry
  counter.merge(key, i, Integer::sum);
}

/**
 * Convenience overload of {@code counterToList(char, Map)} that passes
 * {@code ':'} as the character argument (presumably the separator put
 * between key and count -- confirm against the two-argument overload).
 *
 * @param counter the counter map to convert
 * @return whatever the two-argument overload returns for {@code ':'}
 */
public static <T extends Object> List<String> counterToList(Map<T, Integer> counter) {
  final char defaultCharacter = ':';
  return counterToList(defaultCharacter, counter);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,19 @@
import de.gwdg.metadataqa.marc.cli.utils.Schema;
import de.gwdg.metadataqa.marc.definition.SourceSpecificationType;

import java.util.*;
import java.util.Map;
import java.util.HashMap;
import java.util.Set;
import java.util.HashSet;
import java.util.List;
import java.util.ArrayList;
import java.util.Collections;
import java.util.logging.Logger;
import java.util.regex.Pattern;

import static de.gwdg.metadataqa.marc.Utils.add;
import static de.gwdg.metadataqa.marc.Utils.count;

public class AuthorithyAnalyzer {

private static final Logger logger = Logger.getLogger(
Expand All @@ -27,23 +36,39 @@ public AuthorithyAnalyzer(MarcRecord marcRecord,
}

/**
 * Analyses the authority fields of the current record.
 *
 * Every field returned by {@code marcRecord.getAuthorityFields()} whose
 * definition has the source specification type {@code Subfield2} is handed
 * to {@code processFieldWithSubfield2()} exactly once. The value returned
 * for the field is added to the overall count and to the per-category
 * counter of the field's tag. Fields without a source specification type
 * are ignored, any other type is logged as unhandled. Finally the
 * per-category numbers are passed to
 * {@code updateAuthorityCategoryStatitics()}.
 *
 * @return the sum of the values returned by
 *         {@code processFieldWithSubfield2()} for the processed fields
 */
public int process() {
  Map<AuthorityCategory, Integer> categoryCounter = new HashMap<>();
  int count = 0;
  for (DataField field : marcRecord.getAuthorityFields()) {
    SourceSpecificationType type = field.getDefinition().getSourceSpecificationType();
    if (type == null) {
      // no source specification for this field: nothing to analyse
      continue;
    }
    if (type.equals(SourceSpecificationType.Subfield2)) {
      // the field must be processed only once: the call updates the
      // statistics as a side effect, and its result feeds both counters
      int fieldInstanceLevelCount = processFieldWithSubfield2(field);
      count += fieldInstanceLevelCount;
      AuthorityCategory category = AuthorityCategory.get(field.getTag());
      if (category != null) {
        add(category, categoryCounter, fieldInstanceLevelCount);
      }
      // a null category means the tag is not listed in AuthorityCategory;
      // it cannot be reported under any label, so it is not counted here
    } else {
      logger.severe("Unhandled type: " + type);
    }
  }
  updateAuthorityCategoryStatitics(categoryCounter);
  return count;
}

/**
 * Transfers the per-category numbers of one record into the overall
 * statistics. For each category with a value above zero the value is added
 * to the "instances per categories" counter and the "records per
 * categories" counter is increased via {@code count()}.
 *
 * @param categoryCounter the values collected per category for the record
 */
private void updateAuthorityCategoryStatitics(Map<AuthorityCategory, Integer> categoryCounter) {
  categoryCounter.forEach((category, instanceCount) -> {
    if (instanceCount <= 0) {
      return;
    }
    authoritiesStatistics.getInstancesPerCategories().add(category, instanceCount);
    authoritiesStatistics.getRecordsPerCategories().count(category);
  });
}

private int processFieldWithSubfield2(DataField field) {
int count = 0;
List<Schema> schemas = new ArrayList<>();
AuthorityCategory category = AuthorityCategory.get(field.getTag());

Schema currentSchema = extractFromSubfield0(field, schemas);
if (currentSchema == null)
Expand All @@ -53,6 +78,7 @@ private int processFieldWithSubfield2(DataField field) {

addSchemasToStatistics(authoritiesStatistics.getInstances(), schemas);
addSchemasToStatistics(authoritiesStatistics.getRecords(), deduplicateSchema(schemas));

return count;
}

Expand Down Expand Up @@ -117,16 +143,10 @@ private void updateSchemaSubfieldStatistics(DataField field,
}
}


/**
 * Counts every schema of the list in the given statistics map, once per
 * occurrence in the list.
 *
 * @param fieldStatistics the map of counters to update
 * @param schemes the schemas to count; an empty list leaves the map untouched
 */
private void addSchemasToStatistics(Map<Schema, Integer> fieldStatistics, List<Schema> schemes) {
  // a single pass only: keeping the former hand-written loop next to the
  // count() based one would register every schema twice
  for (Schema scheme : schemes) {
    count(scheme, fieldStatistics);
  }
}

private List<String> orderSubfields(List<MarcSubfield> originalSubfields) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package de.gwdg.metadataqa.marc.analysis;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Categories of authority names, each with a human readable label and the
 * list of MARC tags that belong to it.
 */
public enum AuthorityCategory {
  Personal("Personal names", "100", "700", "800"),
  Corporate("Corporate names", "110", "710", "810"),
  Meeting("Meeting names", "111", "711", "811"),
  Geographic("Geographic names", "751", "752"),
  Titles("Titles", "130", "730", "740", "830"),
  Other("Other", "720", "753", "754")
  ;

  /**
   * Tag to category lookup. It is filled once while the enum class is
   * initialised and only read afterwards, so callers never observe a
   * partially built index (which a lazy fill on first use could expose
   * when several threads call {@link #get(String)} at the same time).
   */
  private static final Map<String, AuthorityCategory> index = new HashMap<>();

  static {
    for (AuthorityCategory category : values()) {
      for (String tag : category.tags) {
        index.put(tag, category);
      }
    }
  }

  private final String label;
  private final List<String> tags;

  AuthorityCategory(String label, String... tags) {
    this.label = label;
    this.tags = Arrays.asList(tags);
  }

  /**
   * @return the human readable label of the category
   */
  public String getLabel() {
    return label;
  }

  /**
   * @return the tags belonging to the category, as a fixed-size list
   */
  public List<String> getTags() {
    return tags;
  }

  /**
   * Finds the category a tag belongs to.
   *
   * @param tag the tag to look up
   * @return the category listing the tag, or {@code null} if no category
   *         lists it
   */
  public static AuthorityCategory get(String tag) {
    return index.get(tag);
  }
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package de.gwdg.metadataqa.marc.analysis;

import de.gwdg.metadataqa.marc.cli.utils.Schema;
import de.gwdg.metadataqa.marc.utils.Counter;

import java.util.HashMap;
import java.util.List;
Expand All @@ -10,6 +11,10 @@ public class AuthorityStatistics {
// counters keyed by schema
private Map<Schema, Integer> instances = new HashMap<>();
private Map<Schema, Integer> records = new HashMap<>();
// for each schema: counters keyed by a list of strings
// (presumably the subfield codes of a field -- confirm against the analyzer)
private Map<Schema, Map<List<String>, Integer>> subfields = new HashMap<>();
// counters keyed by authority category; created with the diamond operator
// so that the type argument is checked instead of using the raw Counter type
private Counter<AuthorityCategory> instancesPerCategories = new Counter<>();
private Counter<AuthorityCategory> recordsPerCategories = new Counter<>();

/**
 * Creates a statistics object; all collections are set up by the field
 * initialisers, so there is nothing to do here.
 */
public AuthorityStatistics() {
}
Expand All @@ -25,4 +30,12 @@ public Map<Schema, Integer> getRecords() {
/**
 * @return the internal (live, modifiable) map of per-schema counters keyed
 *         by a list of strings
 */
public Map<Schema, Map<List<String>, Integer>> getSubfields() {
  return this.subfields;
}

/**
 * @return the internal (live) counter of instances per authority category
 */
public Counter<AuthorityCategory> getInstancesPerCategories() {
  return this.instancesPerCategories;
}

/**
 * @return the internal (live) counter of records per authority category
 */
public Counter<AuthorityCategory> getRecordsPerCategories() {
  return this.recordsPerCategories;
}
}
40 changes: 39 additions & 1 deletion src/main/java/de/gwdg/metadataqa/marc/cli/AuthorityAnalysis.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import de.gwdg.metadataqa.marc.MarcRecord;
import de.gwdg.metadataqa.marc.Utils;
import de.gwdg.metadataqa.marc.analysis.AuthorithyAnalyzer;
import de.gwdg.metadataqa.marc.analysis.AuthorityCategory;
import de.gwdg.metadataqa.marc.analysis.AuthorityStatistics;
import de.gwdg.metadataqa.marc.cli.parameters.CommonParameters;
import de.gwdg.metadataqa.marc.cli.parameters.ValidatorParameters;
Expand All @@ -20,10 +21,14 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;

import static de.gwdg.metadataqa.marc.Utils.count;
import static de.gwdg.metadataqa.marc.Utils.quote;

public class AuthorityAnalysis implements MarcFileProcessor, Serializable {

Expand Down Expand Up @@ -103,12 +108,45 @@ public void fileProcessed() {

/**
 * Runs the report printing methods one after the other, starting with the
 * per-category report ("authorities-by-categories.csv") followed by the
 * per-schema report ("authorities-by-schema.csv"). The remaining methods
 * presumably write their own report files as well -- see their bodies.
 *
 * @param numberOfprocessedRecords not used by this implementation
 */
@Override
public void afterIteration(int numberOfprocessedRecords) {
printAuthoritiesByCategories();
printAuthoritiesBySchema();
printAuthoritiesByRecords();
printAuthoritiesHistogram();
printAuthoritiesSubfieldsStatistics();
}

/**
 * Writes the file "authorities-by-categories.csv" into the output
 * directory. After a header row (category, recordcount, instancecount) it
 * writes one row per entry of the "records per categories" counter,
 * holding the quoted category label, the number of records and the number
 * of instances of that category.
 *
 * I/O problems are reported on standard error; a failure to write a single
 * row does not stop the remaining rows from being written.
 */
private void printAuthoritiesByCategories() {
  Path path = Paths.get(parameters.getOutputDir(), "authorities-by-categories.csv");
  try (BufferedWriter writer = Files.newBufferedWriter(path)) {
    writer.write(createRow("category", "recordcount", "instancecount"));
    statistics.getRecordsPerCategories().entrySet().forEach(entry -> {
      AuthorityCategory category = entry.getKey();
      if (category == null) {
        // AuthorityCategory.get() returns null for tags it does not list;
        // such an entry has no label, so skip it explicitly instead of
        // running into (and catching) a NullPointerException
        System.err.println("Skipping authority statistics entry without category");
        return;
      }
      int recordCount = entry.getValue();
      int instanceCount = statistics.getInstancesPerCategories().get(category);
      try {
        writer.write(createRow(
          quote(category.getLabel()),
          recordCount,
          instanceCount
        ));
      } catch (IOException ex) {
        ex.printStackTrace();
        System.err.println(category);
      }
    });
  } catch (IOException e) {
    e.printStackTrace();
  }
}

private void printAuthoritiesBySchema() {
Path path = Paths.get(parameters.getOutputDir(), "authorities-by-schema.csv");
try (BufferedWriter writer = Files.newBufferedWriter(path)) {
Expand Down

0 comments on commit 1b3cf47

Please sign in to comment.