Skip to content

Commit

Permalink
PICA: general changes #163
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Feb 26, 2023
1 parent eb9cf3f commit d9b844d
Show file tree
Hide file tree
Showing 9 changed files with 35 additions and 190 deletions.
6 changes: 3 additions & 3 deletions catalogues/k10plus_pica.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
. ./setdir.sh
NAME=k10plus_pica
# MARC_DIR=${BASE_INPUT_DIR}/k10plus_pica
MARC_DIR=/data/pica/k10plus_pica
MARC_DIR=${BASE_PICA_INPUT_DIR}/k10plus_pica
SCHEMA=PICA
TYPE_PARAMS="--schemaType PICA --marcFormat PICA_NORMALIZED --emptyLargeCollectors"
TYPE_PARAMS="$TYPE_PARAMS --ignorableFields 001@,001E,001L,001U,001U,001X,001X,002V,003C,003G,003Z,008G,017N,020F,027D,031B,037I,039V,042@,046G,046T,101@,101E,101U,102D,201E,201U,202D,1...,2..."
TYPE_PARAMS="$TYPE_PARAMS --ignorableIssueTypes undefinedField"
TYPE_PARAMS="$TYPE_PARAMS --allowableRecords base64:"$(echo '002@.0 !~ "^L" && 002@.0 !~ "^..[iktN]" && (002@.0 !~ "^.v" || 021A.a?)' | base64 -w 0)
# MASK=sample.pica
# MASK=kxp-sample-title_2022-09_30.dat
MASK=kxp-title_2022-09-30.dat
MASK=kxp-sample-title_2022-09_30.dat
# MASK=kxp-title_2022-09-30.dat
# MASK=small.dat

. ./common-script
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,15 @@ public IgnorableFields getIgnorableFields() {
public List<ValidationErrorType> getIgnorableIssueTypes() {
return ignorableIssueTypes;
}

/**
 * Renders this configuration as a single-line string for logging and debugging.
 *
 * @return the literal prefix {@code ValidatorConfiguration{} followed by the
 *   marcVersion, schemaType, doSummary, ignorableFields and ignorableIssueTypes
 *   values as comma separated {@code name=value} pairs, closed by {@code }}
 */
@Override
public String toString() {
  StringBuilder text = new StringBuilder("ValidatorConfiguration{");
  text.append("marcVersion=").append(marcVersion);
  text.append(", schemaType=").append(schemaType);
  text.append(", doSummary=").append(doSummary);
  text.append(", ignorableFields=").append(ignorableFields);
  text.append(", ignorableIssueTypes=").append(ignorableIssueTypes);
  text.append('}');
  return text.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@
import java.util.List;

public class ValidatorParameters extends CommonParameters implements Serializable {
public static final String DEFAULT_FILE_NAME = "validation-report.txt";
public static final String DEFAULT_DETAILS_FILE_NAME = "issue-details.csv";
public static final String DEFAULT_SUMMARY_FILE_NAME = "issue-summary.csv";

private String detailsFileName = DEFAULT_FILE_NAME;
private String summaryFileName;
private String detailsFileName = DEFAULT_DETAILS_FILE_NAME;
private String summaryFileName = DEFAULT_SUMMARY_FILE_NAME;
private boolean doDetails = true;
private boolean doSummary = false;
private ValidationErrorFormat format = ValidationErrorFormat.TEXT;
Expand All @@ -29,7 +30,7 @@ protected void setOptions() {
options.addOption("s", "summary", false, "show summary instead of record level display");
options.addOption("h", "details", false, "show record level display");
options.addOption("f", "detailsFileName", true,
String.format("the report file name (default is '%s')", ValidatorParameters.DEFAULT_FILE_NAME));
String.format("the report file name (default is '%s')", ValidatorParameters.DEFAULT_DETAILS_FILE_NAME));
options.addOption("r", "format", true, "specify a format");
options.addOption("w", "emptyLargeCollectors", false, "empty large collectors");
options.addOption("t", "collectAllErrors", false, "collect all errors (useful only for validating small number of records)");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -449,186 +449,12 @@ public String asJson() {
return json;
}

/*
@Override
public boolean validate(MarcVersion marcVersion) {
return validate(marcVersion, false, null);
}
public boolean validate(MarcVersion marcVersion, boolean isSummary) {
return validate(marcVersion, isSummary, null);
}
public boolean validate(MarcVersion marcVersion,
boolean isSummary,
IgnorableFields ignorableFields) {
return validate(marcVersion, isSummary, null, null);
}
public boolean validate(MarcVersion marcVersion,
boolean isSummary,
IgnorableFields ignorableFields,
List<ValidationErrorType> ignorableIssueTypes) {
logger.info("validate!");
// validationErrors = new ArrayList<>();
boolean isValidRecord = true;
if (!schemaType.equals(SchemaType.PICA))
isValidRecord = validateLeader(marcVersion, isValidRecord, ignorableIssueTypes);
isValidRecord = validateUnhandledTags(isSummary, isValidRecord, ignorableFields, ignorableIssueTypes);
isValidRecord = validateControlfields(marcVersion, isValidRecord, ignorableIssueTypes);
isValidRecord = validateDatafields(marcVersion, isValidRecord, ignorableFields, ignorableIssueTypes);
// TODO: use reflection to get all validator class
// ValidatorResponse validatorResponse;
return isValidRecord;
}
*/

/*
private boolean validateLeader(MarcVersion marcVersion,
boolean isValidRecord,
List<ValidationErrorType> ignorableIssueTypes) {
boolean isValidComponent;
isValidComponent = leader.validate(marcVersion);
if (!isValidComponent) {
List<ValidationError> leaderErrors = leader.getValidationErrors();
for (ValidationError leaderError : leaderErrors)
if (leaderError.getRecordId() == null)
leaderError.setRecordId(getId());
validationErrors.addAll(filterErrors(leaderErrors, ignorableIssueTypes));
isValidRecord = isValidComponent;
}
return isValidRecord;
}
private boolean validateUnhandledTags(boolean isSummary,
boolean isValidRecord,
IgnorableFields ignorableFields,
List<ValidationErrorType> ignorableIssueTypes) {
if (!unhandledTags.isEmpty()) {
if (isSummary) {
for (String tag : unhandledTags) {
if (!isIgnorableField(tag, ignorableFields)
&& (ignorableIssueTypes == null
|| ignorableIssueTypes.isEmpty()
|| !ignorableIssueTypes.contains(ValidationErrorType.FIELD_UNDEFINED)))
validationErrors.add(new ValidationError(getId(), tag, ValidationErrorType.FIELD_UNDEFINED, tag, null));
}
} else {
Map<String, Integer> tags = new LinkedHashMap<>();
for (String tag : unhandledTags)
Utils.count(tag, tags);
List<String> unhandledTagsList = new ArrayList<>();
for (Map.Entry<String, Integer> entry : tags.entrySet()) {
String tag = entry.getKey();
if (entry.getValue() == 1)
unhandledTagsList.add(tag);
else
unhandledTagsList.add(String.format("%s (%d*)", tag, entry.getValue()));
}
for (String tag : unhandledTagsList) {
if (!isIgnorableField(tag, ignorableFields)
&& !ignorableIssueTypes.contains(ValidationErrorType.FIELD_UNDEFINED))
validationErrors.add(new ValidationError(getId(), tag, ValidationErrorType.FIELD_UNDEFINED, tag, null));
}
}
isValidRecord = false;
}
return isValidRecord;
}
private boolean validateControlfields(MarcVersion marcVersion,
boolean isValidRecord,
List<ValidationErrorType> ignorableIssueTypes) {
boolean isValidComponent;
for (MarcControlField controlField : getControlfields()) {
if (controlField != null) {
isValidComponent = controlField.validate(marcVersion);
if (!isValidComponent) {
validationErrors.addAll(filterErrors(controlField.getValidationErrors(), ignorableIssueTypes));
isValidRecord = isValidComponent;
}
}
}
return isValidRecord;
}
private boolean validateDatafields(MarcVersion marcVersion,
boolean isValidRecord,
IgnorableFields ignorableFields,
List<ValidationErrorType> ignorableIssueTypes) {
ValidatorResponse validatorResponse;
Map<DataFieldDefinition, Integer> repetitionCounter = new HashMap<>();
for (DataField field : datafields) {
if (field.getDefinition() != null && !isIgnorableField(field.getTag(), ignorableFields)) {
count(field.getDefinition(), repetitionCounter);
if (!field.validate(marcVersion)) {
isValidRecord = false;
validationErrors.addAll(filterErrors(field.getValidationErrors(), ignorableIssueTypes));
}
validatorResponse = ClassificationReferenceValidator.validate(field);
if (!validatorResponse.isValid()) {
validationErrors.addAll(filterErrors(validatorResponse.getValidationErrors(), ignorableIssueTypes));
isValidRecord = false;
}
}
}
for (Map.Entry<DataFieldDefinition, Integer> entry : repetitionCounter.entrySet()) {
DataFieldDefinition fieldDefinition = entry.getKey();
Integer count = entry.getValue();
if (count > 1
&& fieldDefinition.getCardinality().equals(Cardinality.Nonrepeatable)) {
validationErrors.add(new ValidationError(getId(), fieldDefinition.getTag(),
ValidationErrorType.FIELD_NONREPEATABLE,
String.format("there are %d instances", count),
fieldDefinition.getDescriptionUrl()
));
isValidRecord = false;
}
}
return isValidRecord;
}
*/

/**
* Remove ignorable errors from the list of errors
*
* @param errors The list of error objects
* @param ignorableIssueTypes The list of ignorable error types
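* @return The errors whose type is not among the ignorable types (the original list if no ignorable types are given)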
*/
/*
private static List<ValidationError> filterErrors(List<ValidationError> errors,
List<ValidationErrorType> ignorableIssueTypes) {
if (ignorableIssueTypes == null || ignorableIssueTypes.isEmpty())
return errors;
List<ValidationError> filtered = errors
.stream()
.filter(error -> !ignorableIssueTypes.contains(error.getType()))
.collect(Collectors.toList());
logger.info(errors.size() + " -> " + filtered.size());
return filtered;
}
*/

/**
 * Tells whether a tag is covered by the supplied set of ignorable fields.
 *
 * @param tag The field tag to look up
 * @param ignorableFields The ignorable fields to check against; may be null
 * @return false if ignorableFields is null, otherwise the result of
 *   ignorableFields.contains(tag)
 */
public boolean isIgnorableField(String tag, IgnorableFields ignorableFields) {
  // the null check short-circuits, so contains() is only called on a real object
  return ignorableFields != null && ignorableFields.contains(tag);
}

/*
@Override
public List<ValidationError> getValidationErrors() {
return validationErrors;
}
*/

public List<String> search(String path, String query) {
List<String> results = new ArrayList<>();
if (path.equals("001") || path.equals("003") || path.equals("005")) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,5 +189,4 @@ private void initializeThompsonTrailTags() {
}
}
}

}
1 change: 1 addition & 0 deletions src/main/resources/pica/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
The list of PICA schemas
3 changes: 2 additions & 1 deletion src/main/resources/pica/update-avram-k10plus-title.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

command -v jq >/dev/null 2>&1 || { echo >&2 "Missing command jq"; exit 1; }

TIMESTAMP=$(date +"%Y-%m-%d")
# See <https://github.com/pkiraly/metadata-qa-marc/issues/193> why repeatable is changed
curl "https://format.k10plus.de/avram.pl?profile=k10plus-title" \
| jq -S '(.fields[]|select(.subfields.U and .subfields.T)).repeatable=true' \
| jq -S '(.fields[]|select(.occurrence=="00")).occurrence=null' \
> avram-k10plus-title.json
> avram-k10plus-title-$TIMESTAMP.json

Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@

import de.gwdg.metadataqa.marc.MarcFactory;
import de.gwdg.metadataqa.marc.cli.CliTestUtils;
import de.gwdg.metadataqa.marc.dao.DataField;
import de.gwdg.metadataqa.marc.dao.record.BibliographicRecord;
import de.gwdg.metadataqa.marc.definition.MarcFormat;
import de.gwdg.metadataqa.marc.definition.MarcVersion;
import de.gwdg.metadataqa.marc.model.validation.ValidationError;
import de.gwdg.metadataqa.marc.definition.bibliographic.SchemaType;
import de.gwdg.metadataqa.marc.model.validation.ValidationErrorType;
import de.gwdg.metadataqa.marc.utils.QAMarcReaderFactory;
import de.gwdg.metadataqa.marc.utils.pica.PicaSchemaManager;
import de.gwdg.metadataqa.marc.utils.pica.PicaSchemaReader;
Expand Down Expand Up @@ -37,12 +36,21 @@ public void validate() {
BibliographicRecord marcRecord = MarcFactory.createPicaFromMarc4j(record, schema);
assertNotNull(marcRecord);

Validator validator = new Validator();
Validator validator = new Validator(new ValidatorConfiguration().withSchemaType(SchemaType.PICA));
boolean valid = validator.validate(marcRecord);
assertFalse(valid);
assertEquals(3, validator.getValidationErrors().size());
assertEquals(26, validator.getValidationErrors().size());
assertEquals("001@", validator.getValidationErrors().get(0).getMarcPath());
assertEquals(ValidationErrorType.FIELD_UNDEFINED, validator.getValidationErrors().get(0).getType());
assertEquals(null, validator.getValidationErrors().get(0).getUrl());

assertEquals("001U", validator.getValidationErrors().get(1).getMarcPath());
assertEquals("036F/01", validator.getValidationErrors().get(2).getMarcPath());

assertEquals("013D", validator.getValidationErrors().get(3).getMarcPath());
assertEquals(ValidationErrorType.SUBFIELD_UNDEFINED, validator.getValidationErrors().get(3).getType());
assertEquals("https://format.k10plus.de/k10plushelp.pl?cmd=kat&katalog=Standard&val=1131", validator.getValidationErrors().get(3).getUrl());
assertEquals("013D", validator.getValidationErrors().get(3).getMarcPath());
assertEquals("V", validator.getValidationErrors().get(3).getMessage());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,8 @@ public void validate_pica_normal() throws Exception {
RecordIterator iterator = new RecordIterator(processor);
iterator.start();
assertEquals("done", iterator.getStatus());
System.err.println("DONENEOENEN");

for (String outputFile : grouppedOutputFiles) {
System.err.println(outputFile);
File output = new File(outputDir, outputFile);
assertTrue(outputFile + " should exist", output.exists());
List<String> lines = FileUtils.readLinesFromFile("src/test/resources/output/" + outputFile);
Expand Down

0 comments on commit d9b844d

Please sign in to comment.