Skip to content

Commit

Permalink
issue #127: Fixing issues in indexing version specific subfields
Browse files Browse the repository at this point in the history
  • Loading branch information
pkiraly committed Feb 4, 2022
1 parent bbfcaf8 commit d0ae69d
Show file tree
Hide file tree
Showing 14 changed files with 118 additions and 23 deletions.
4 changes: 2 additions & 2 deletions common-script
Original file line number Diff line number Diff line change
Expand Up @@ -274,8 +274,8 @@ do_export_schema_files() {
fi
printf "%s %s> [avram]\n" $(date +"%F %T")
./export-schema --withSubfieldCodelists > marc-schema/marc-schema.json
./export-schema --withSubfieldCodelists --solrFieldType human-readable --withSelfDescriptiveCode > marc-schema/marc-schema-with-solr.json
./export-schema --withSubfieldCodelists --solrFieldType human-readable --withSelfDescriptiveCode --withLocallyDefinedFields > marc-schema/marc-schema-with-solr-and-extensions.json
./export-schema --withSubfieldCodelists --solrFieldType mixed --withSelfDescriptiveCode > marc-schema/marc-schema-with-solr.json
./export-schema --withSubfieldCodelists --solrFieldType mixed --withSelfDescriptiveCode --withLocallyDefinedFields > marc-schema/marc-schema-with-solr-and-extensions.json
printf "%s %s> 3 files generated at 'marc-schema' directory: marc-schema.json, marc-schema-with-solr.json, marc-schema-with-solr-and-extensions.json\n" $(date +"%F %T")
}

Expand Down
2 changes: 1 addition & 1 deletion marc-schema/marc-schema-with-solr-and-extensions.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion marc-schema/marc-schema-with-solr.json

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions src/main/java/de/gwdg/metadataqa/marc/Extractable.java
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
package de.gwdg.metadataqa.marc;

import de.gwdg.metadataqa.marc.definition.MarcVersion;
import de.gwdg.metadataqa.marc.model.SolrFieldType;

import java.util.List;
import java.util.Map;

/**
 * Contract for objects whose content can be flattened into key-value pairs,
 * where each key maps to the list of values collected for it.
 *
 * <p>Interface methods are implicitly {@code public}, so the modifier is
 * omitted, and each signature is declared exactly once.
 */
public interface Extractable {

  /**
   * Extracts the key-value pairs using the implementation's default settings.
   *
   * @return a map from key to the values stored under that key
   */
  Map<String, List<String>> getKeyValuePairs();

  /**
   * Extracts the key-value pairs, naming the keys according to the given field type.
   *
   * @param type the Solr field type that controls how keys are written
   * @return a map from key to the values stored under that key
   */
  Map<String, List<String>> getKeyValuePairs(SolrFieldType type);

  /**
   * Extracts the key-value pairs for a particular MARC version.
   *
   * @param type the Solr field type that controls how keys are written
   * @param marcVersion the MARC version to take into account during extraction
   * @return a map from key to the values stored under that key
   */
  Map<String, List<String>> getKeyValuePairs(SolrFieldType type, MarcVersion marcVersion);
}
5 changes: 4 additions & 1 deletion src/main/java/de/gwdg/metadataqa/marc/cli/MarcToSolr.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import de.gwdg.metadataqa.marc.cli.processor.MarcFileProcessor;
import de.gwdg.metadataqa.marc.cli.utils.RecordIterator;
import de.gwdg.metadataqa.marc.datastore.MarcSolrClient;
import de.gwdg.metadataqa.marc.definition.MarcVersion;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
Expand Down Expand Up @@ -35,6 +36,7 @@ public class MarcToSolr implements MarcFileProcessor, Serializable {
MarcToSolr.class.getCanonicalName()
);
private final Options options;
private final MarcVersion version;
private MarcToSolrParameters parameters;
private MarcSolrClient client;
private Path currentFile;
Expand All @@ -47,6 +49,7 @@ public MarcToSolr(String[] args) throws ParseException {
client = new MarcSolrClient(parameters.getSolrUrl());
client.setTrimId(parameters.getTrimId());
readyToProcess = true;
version = parameters.getMarcVersion();
}

public static void main(String[] args) throws ParseException {
Expand Down Expand Up @@ -80,7 +83,7 @@ public void processRecord(MarcRecord marcRecord, int recordNumber) throws IOExce

try {
Map<String, List<String>> map = marcRecord.getKeyValuePairs(
parameters.getSolrFieldType(), true
parameters.getSolrFieldType(), true, parameters.getMarcVersion()
);
map.put("record_sni", Arrays.asList(marcRecord.asJson()));
client.indexMap(marcRecord.getId(), map);
Expand Down
10 changes: 9 additions & 1 deletion src/main/java/de/gwdg/metadataqa/marc/dao/DataField.java
Original file line number Diff line number Diff line change
Expand Up @@ -322,11 +322,18 @@ public Map<String, List<String>> getKeyValuePairs() {

/**
 * Extracts the key-value pairs of this field for the given field type,
 * delegating to the version-aware overload with {@link MarcVersion#MARC21}.
 *
 * @param type the Solr field type passed through to the version-aware overload
 * @return the key-value pairs produced by the version-aware overload
 */
@Override
public Map<String, List<String>> getKeyValuePairs(SolrFieldType type) {
  final MarcVersion defaultVersion = MarcVersion.MARC21;
  return getKeyValuePairs(type, defaultVersion);
}

@Override
public Map<String, List<String>> getKeyValuePairs(SolrFieldType type,
MarcVersion marcVersion) {
Map<String, List<String>> pairs = new HashMap<>();

DataFieldKeyGenerator keyGenerator = new DataFieldKeyGenerator(
definition, type, getTag()
);
keyGenerator.setMarcVersion(marcVersion);

boolean hasInd1def = (definition != null && definition.getInd1().exists());
if (hasInd1def || !getInd1().equals(" ")) {
Expand All @@ -340,8 +347,9 @@ definition, type, getTag()
pairs.put(keyGenerator.forInd2(), Arrays.asList(value));
}

for (MarcSubfield subfield : subfields)
for (MarcSubfield subfield : subfields) {
pairs.putAll(subfield.getKeyValuePairs(keyGenerator));
}

if (getFieldIndexer() != null) {
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,11 @@ public Map<String, List<String>> getKeyValuePairs() {

/**
 * Extracts the key-value pairs for the given field type, delegating to the
 * version-aware overload with {@link MarcVersion#MARC21}.
 *
 * @param type the Solr field type passed through to the version-aware overload
 * @return the key-value pairs produced by the version-aware overload
 */
@Override
public Map<String, List<String>> getKeyValuePairs(SolrFieldType type) {
  final MarcVersion fallbackVersion = MarcVersion.MARC21;
  return getKeyValuePairs(type, fallbackVersion);
}

@Override
public Map<String, List<String>> getKeyValuePairs(SolrFieldType type, MarcVersion marcVersion) {
Map<String, List<String>> map = new LinkedHashMap<>();

map.put(
Expand Down
12 changes: 9 additions & 3 deletions src/main/java/de/gwdg/metadataqa/marc/dao/MarcRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -331,11 +331,17 @@ public Map<String, List<String>> getKeyValuePairs() {
}

/**
 * Extracts the key-value pairs of this record for the given field type,
 * without deduplication and with {@link MarcVersion#MARC21}.
 *
 * <p>Delegates to the three-argument overload; there must be a single
 * {@code return} here, since a second one would be unreachable.
 *
 * @param type the Solr field type passed through to the three-argument overload
 * @return the key-value pairs produced by the three-argument overload
 */
public Map<String, List<String>> getKeyValuePairs(SolrFieldType type) {
  return getKeyValuePairs(type, false, MarcVersion.MARC21);
}

/**
 * Extracts the key-value pairs of this record for the given field type and
 * MARC version, with deduplication switched off.
 *
 * @param type the Solr field type passed through to the three-argument overload
 * @param marcVersion the MARC version passed through to the three-argument overload
 * @return the key-value pairs produced by the three-argument overload
 */
@Override
public Map<String, List<String>> getKeyValuePairs(SolrFieldType type, MarcVersion marcVersion) {
  final boolean withDeduplication = false;
  return getKeyValuePairs(type, withDeduplication, marcVersion);
}

public Map<String, List<String>> getKeyValuePairs(SolrFieldType type,
boolean withDeduplication) {
boolean withDeduplication,
MarcVersion marcVersion) {
if (mainKeyValuePairs == null) {
mainKeyValuePairs = new LinkedHashMap<>();

Expand All @@ -347,7 +353,7 @@ public Map<String, List<String>> getKeyValuePairs(SolrFieldType type,
mainKeyValuePairs.putAll(controlField.getKeyValuePairs(type));

for (DataField field : datafields) {
Map<String, List<String>> keyValuePairs = field.getKeyValuePairs(type);
Map<String, List<String>> keyValuePairs = field.getKeyValuePairs(type, marcVersion);
for (Map.Entry<String, List<String>> entry : keyValuePairs.entrySet()) {
String key = entry.getKey();
List<String> values = entry.getValue();
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package de.gwdg.metadataqa.marc.definition.general;

import de.gwdg.metadataqa.marc.Extractable;
import de.gwdg.metadataqa.marc.definition.MarcVersion;
import de.gwdg.metadataqa.marc.model.SolrFieldType;

import java.io.Serializable;
Expand Down Expand Up @@ -48,6 +49,11 @@ public Map<String, List<String>> getKeyValuePairs() {

/**
 * Extracts the key-value pairs for the given field type, delegating to the
 * version-aware overload with {@link MarcVersion#MARC21}.
 *
 * @param type the Solr field type passed through to the version-aware overload
 * @return the key-value pairs produced by the version-aware overload
 */
@Override
public Map<String, List<String>> getKeyValuePairs(SolrFieldType type) {
  final MarcVersion standardVersion = MarcVersion.MARC21;
  return getKeyValuePairs(type, standardVersion);
}

@Override
public Map<String, List<String>> getKeyValuePairs(SolrFieldType type, MarcVersion marcVersion) {
Map<String, List<String>> map = new LinkedHashMap<>();
Map<String, String> simpleMap = getMap();
for (Map.Entry<String, String> entry : simpleMap.entrySet()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public static Tag880 getInstance() {
private void initialize() {
tag = "880";
label = "Alternate Graphic Representation";
mqTag = "Alternate Graphic Representation";
mqTag = "AlternateGraphicRepresentation";
cardinality = Cardinality.Repeatable;
descriptionUrl = "https://www.loc.gov/marc/bibliographic/bd880.html";
setCompilanceLevels("A", "A");
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,20 @@
package de.gwdg.metadataqa.marc.utils.keygenerator;

import de.gwdg.metadataqa.marc.MarcSubfield;
import de.gwdg.metadataqa.marc.definition.MarcVersion;
import de.gwdg.metadataqa.marc.definition.structure.DataFieldDefinition;
import de.gwdg.metadataqa.marc.definition.structure.SubfieldDefinition;
import de.gwdg.metadataqa.marc.model.SolrFieldType;

import java.util.regex.Pattern;

public class DataFieldKeyGenerator {
private DataFieldDefinition definition;
private SolrFieldType type;
private String tag;
private String indexTag;
private static final Pattern nonValidSubfieldCode = Pattern.compile("[^0-9a-zA-Z]");
private MarcVersion marcVersion;

public DataFieldKeyGenerator(DataFieldDefinition definition, SolrFieldType type) {
this.definition = definition;
Expand Down Expand Up @@ -82,18 +87,31 @@ public String forInd2() {
}

/**
 * Builds the index key for a subfield instance.
 *
 * <p>The subfield's own definition is used when it has one. Otherwise, if this
 * generator has a field definition, the code is looked up among that
 * definition's version specific subfields for the configured MARC version.
 * When no definition is found at all, the raw subfield code doubles as the
 * code used for indexing.
 *
 * @param subfield the subfield instance to generate a key for
 * @return the key, with the version suffix added by {@code addVersion} when applicable
 */
public String forSubfield(MarcSubfield subfield) {
  String code = subfield.getCode();
  SubfieldDefinition subfieldDefinition = subfield.getDefinition();
  if (subfieldDefinition == null && definition != null) {
    // NOTE(review): marcVersion stays null unless setMarcVersion() was called;
    // confirm getVersionSpecificSubfield() tolerates a null version.
    subfieldDefinition = definition.getVersionSpecificSubfield(marcVersion, code);
  }
  String codeForIndex = (subfieldDefinition != null) ? subfieldDefinition.getCodeForIndex() : code;
  String key = forSubfield(code, codeForIndex);

  return addVersion(subfieldDefinition, key);
}

/**
 * Builds the index key for a subfield definition.
 *
 * <p>The version suffix is added only by {@code addVersion}; appending it here
 * as well would duplicate it in the key.
 *
 * @param subfield the subfield definition to generate a key for
 * @return the key, with the version suffix added by {@code addVersion} when applicable
 */
public String forSubfield(SubfieldDefinition subfield) {
  String key = forSubfield(subfield.getCode(), subfield.getCodeForIndex());
  return addVersion(subfield, key);
}

/**
 * Appends {@code "_" + <version code>} to the key when the subfield definition
 * carries a MARC version and the field type is not {@link SolrFieldType#MARC}.
 *
 * @param subfieldDefinition the definition whose version is consulted; may be null
 * @param key the key generated so far
 * @return the key, suffixed with the version code when the conditions above hold
 */
private String addVersion(SubfieldDefinition subfieldDefinition, String key) {
  if (subfieldDefinition == null) {
    return key;
  }
  MarcVersion version = subfieldDefinition.getMarcVersion();
  if (version == null || type == SolrFieldType.MARC) {
    return key;
  }
  return key + "_" + version.getCode();
}

private String forSubfield(String code, String codeForIndex) {
if (nonValidSubfieldCode.matcher(code).matches())
code = String.format("x%x", (int) code.charAt(0));

String key = "";
switch (type) {
case HUMAN:
Expand Down Expand Up @@ -122,4 +140,8 @@ public String getIndexTag() {
/**
 * Builds the index key for a subfield instance and appends an extra segment,
 * separated by an underscore.
 *
 * @param subfield the subfield instance to generate the base key for
 * @param extra the segment appended after the base key
 * @return the base key and the extra segment joined as {@code base_extra}
 */
public String forSubfield(MarcSubfield subfield, String extra) {
  String baseKey = forSubfield(subfield);
  return String.format("%s_%s", baseKey, extra);
}

/**
 * Sets the MARC version this generator passes to the field definition when it
 * looks up version specific subfields.
 *
 * @param marcVersion the MARC version to store on this generator
 */
public void setMarcVersion(MarcVersion marcVersion) {
this.marcVersion = marcVersion;
}
}
3 changes: 2 additions & 1 deletion src/test/java/de/gwdg/metadataqa/marc/cli/IndexingTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import de.gwdg.metadataqa.api.util.FileUtils;
import de.gwdg.metadataqa.marc.MarcFactory;
import de.gwdg.metadataqa.marc.dao.MarcRecord;
import de.gwdg.metadataqa.marc.definition.MarcVersion;
import de.gwdg.metadataqa.marc.model.SolrFieldType;
import org.junit.Test;

Expand All @@ -19,7 +20,7 @@ public class IndexingTest {
public void testIndexing710() throws IOException, URISyntaxException {
List<String> lines = FileUtils.readLinesFromResource("marctxt/010000011.mrctxt");
MarcRecord marcRecord = MarcFactory.createFromFormattedText(lines);
Map<String, List<String>> index = marcRecord.getKeyValuePairs(SolrFieldType.MIXED);
Map<String, List<String>> index = marcRecord.getKeyValuePairs(SolrFieldType.MIXED, MarcVersion.DNB);
assertEquals(136, index.size());
assertEquals("(DE-576)19168161X",
index.get("7100_AddedCorporateName_authorityRecordControlNumber")
Expand Down
30 changes: 30 additions & 0 deletions src/test/java/de/gwdg/metadataqa/marc/cli/MarcToSolrTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package de.gwdg.metadataqa.marc.cli;

import de.gwdg.metadataqa.marc.dao.DataField;
import de.gwdg.metadataqa.marc.dao.Leader;
import de.gwdg.metadataqa.marc.dao.MarcRecord;
import de.gwdg.metadataqa.marc.definition.MarcVersion;
import de.gwdg.metadataqa.marc.definition.tags.tags76x.Tag787;
import de.gwdg.metadataqa.marc.model.SolrFieldType;
import org.junit.Test;

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

/**
 * Checks that a version specific subfield is indexed under a key that carries
 * the MARC version code.
 */
public class MarcToSolrTest {

  /**
   * Indexes a 787 field with the subfield code "@" for the KBR version and
   * expects the value under the key "787x40_RelatedTo_language_KBR".
   */
  @Test
  public void testVersionSpecificSubfield() {
    MarcRecord marc = new MarcRecord("010000011");
    marc.setLeader(new Leader("00860cam a22002774a 45 0"));

    DataField relatedTo = new DataField(Tag787.getInstance(), " ", " ", "@", "japan");
    marc.addDataField(relatedTo);

    String expectedKey = "787x40_RelatedTo_language_KBR";
    Map<String, List<String>> indexed =
        marc.getKeyValuePairs(SolrFieldType.MIXED, false, MarcVersion.KBR);

    assertTrue(indexed.containsKey(expectedKey));
    assertEquals(Arrays.asList("japan"), indexed.get(expectedKey));
  }
}
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
package de.gwdg.metadataqa.marc.utils.keygenerator;

import de.gwdg.metadataqa.marc.MarcSubfield;
import de.gwdg.metadataqa.marc.definition.MarcVersion;
import de.gwdg.metadataqa.marc.definition.structure.DataFieldDefinition;
import de.gwdg.metadataqa.marc.definition.tags.tags20x.Tag245;
import de.gwdg.metadataqa.marc.model.SolrFieldType;
import org.junit.Test;

import java.util.regex.Pattern;

import static org.junit.Assert.*;

public class DataFieldKeyGeneratorTest {
Expand All @@ -14,8 +17,7 @@ public class DataFieldKeyGeneratorTest {

@Test
public void testMarc() {
DataFieldKeyGenerator generator = new DataFieldKeyGenerator(
definition, SolrFieldType.MARC);
DataFieldKeyGenerator generator = new DataFieldKeyGenerator(definition, SolrFieldType.MARC);
assertNotNull(generator.forInd1());
assertEquals("245ind1", generator.forInd1());
assertEquals("245ind2", generator.forInd2());
Expand All @@ -24,8 +26,7 @@ public void testMarc() {

@Test
public void testHuman() {
DataFieldKeyGenerator generator = new DataFieldKeyGenerator(
definition, SolrFieldType.HUMAN);
DataFieldKeyGenerator generator = new DataFieldKeyGenerator(definition, SolrFieldType.HUMAN);
assertNotNull(generator.forInd1());
assertEquals("Title_titleAddedEntry", generator.forInd1());
assertEquals("Title_nonfilingCharacters", generator.forInd2());
Expand All @@ -34,11 +35,23 @@ public void testHuman() {

/**
 * Verifies the keys generated for indicators and a subfield with the MIXED
 * field type, where the tag-based prefix is followed by the readable name.
 */
@Test
public void testMixed() {
  // A single declaration only: declaring the local twice does not compile.
  DataFieldKeyGenerator generator = new DataFieldKeyGenerator(definition, SolrFieldType.MIXED);
  assertNotNull(generator.forInd1());
  assertEquals("245ind1_Title_titleAddedEntry", generator.forInd1());
  assertEquals("245ind2_Title_nonfilingCharacters", generator.forInd2());
  assertEquals("245a_Title_mainTitle", generator.forSubfield(subfield));
}

/**
 * Verifies that the pattern for non-alphanumeric characters matches the
 * one-character string "@" in full.
 */
@Test
public void testRegex() {
  Pattern invalidCode = Pattern.compile("[^0-9a-zA-Z]");
  boolean atSignMatches = invalidCode.matcher("@").matches();
  assertTrue(atSignMatches);
}

/**
 * Verifies the MIXED key generated for the KBR specific "@" subfield of
 * field 245: the expected key is "245x40_Title_language_KBR".
 */
@Test
public void testAt() {
  DataFieldKeyGenerator generator = new DataFieldKeyGenerator(definition, SolrFieldType.MIXED);
  String expected = "245x40_Title_language_KBR";
  String actual = generator.forSubfield(definition.getVersionSpecificSubfield(MarcVersion.KBR, "@"));
  assertEquals(expected, actual);
}

}

0 comments on commit d0ae69d

Please sign in to comment.