PicaDecoder: add record ids for level 1&2 records

Local system records (level 1) and holding records (level 2) do not store their record id in field `003@ $0` but in field `107F $0` or `203@ $0` (the latter may include an occurrence specification). This commit adds support to `PicaDecoder` for using these ids as record ids in start-record events.
metafacture · Oct 27, 2016 · c955f4d · c955f4d
1 parent 54a3d8c
commit c955f4d
Show file tree

Hide file tree

Showing 2 changed files with 97 additions and 55 deletions.
diff --git a/src/main/java/org/culturegraph/mf/stream/converter/bib/PicaDecoder.java b/src/main/java/org/culturegraph/mf/stream/converter/bib/PicaDecoder.java
@@ -15,6 +15,14 @@
  */
 package org.culturegraph.mf.stream.converter.bib;
 
+import static org.culturegraph.mf.stream.converter.bib.PicaConstants.FIELD_END_MARKER;
+import static org.culturegraph.mf.stream.converter.bib.PicaConstants.FIELD_MARKER;
+import static org.culturegraph.mf.stream.converter.bib.PicaConstants.RECORD_MARKER;
+import static org.culturegraph.mf.stream.converter.bib.PicaConstants.SUBFIELD_MARKER;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
 import org.culturegraph.mf.framework.DefaultObjectPipe;
 import org.culturegraph.mf.framework.StreamReceiver;
 import org.culturegraph.mf.framework.annotations.Description;
@@ -80,11 +88,18 @@
  * is removed). This can be changed by setting
  * {@link #setTrimFieldNames(boolean)} to false.
  * <p>
- * The content of subfield <i>003&#64; $0</i> is used as record id. If
- * {@link #setIgnoreMissingIdn(boolean)} is false and field
- * <i>003&#64; $0</i> is not found in the record a
- * {@link MissingIdException} is thrown otherwise the record identifier is an
- * empty string.
+ * The record id emitted with the <i>start-record</i> event is extracted from
+ * one of the following pica fields:
+ * <ul>
+ *   <li><i>003&#64; $0</i>
+ *   <li><i>107F $0</i>
+ *   <li><i>203&#64; $0</i> (this field may have an optional occurrence marker)
+ * </ul>
+ * The value of the first matching field is used as the record id. The <i>$0</i>
+ * subfield must be the first subfield in the field. If
+ * {@link #setIgnoreMissingIdn(boolean)} is false and no matching field is
+ * found in the record, a {@link MissingIdException} is thrown; otherwise the
+ * record identifier is an empty string.
  * <p>
  * For example, when run on the input
  * <pre>
@@ -120,10 +135,14 @@
 public final class PicaDecoder
 		extends DefaultObjectPipe<String, StreamReceiver> {
 
-	private static final char[] ID_FIELD = {'0', '0', '3', '@', ' ', PicaConstants.SUBFIELD_MARKER, '0'};
+	private static final String START_MARKERS ="(?:^|" + FIELD_MARKER +
+			"|" + FIELD_END_MARKER + "|" + RECORD_MARKER + ")";
+	private static final Pattern ID_FIELDS_PATTERN = Pattern.compile(
+			START_MARKERS + "(?:003@|203@(?:/..)?|107F) " + SUBFIELD_MARKER + "0");
 
 	private static final int BUFFER_SIZE = 1024 * 1024;
 
+	private final Matcher idFieldMatcher = ID_FIELDS_PATTERN.matcher("");
 	private final StringBuilder idBuilder = new StringBuilder();
 	private final PicaParserContext parserContext = new PicaParserContext();
 
@@ -133,17 +152,16 @@ public final class PicaDecoder
 	private boolean ignoreMissingIdn;
 
 	/**
-	 * Controls whether records having no pica subfield <i>003&#64; $0</i>
-	 * (which contains the record identifier <i>IDN</i>) are reported as faulty.
-	 * By default such records are reported by the {@code PicaDecoder} by throwing
+	 * Controls whether records having no record id are reported as faulty. By
+	 * default such records are reported by the {@code PicaDecoder} by throwing
 	 * a {@link MissingIdException}.
 	 * <p>
 	 * The setting can be changed at any time. It becomes effective with the next
 	 * record that is being processed.
 	 * <p>
 	 * <strong>Default value: {@code false}</strong>
 	 *
-	 * @param ignoreMissingIdn if true, missing IDNs do not trigger a
+	 * @param ignoreMissingIdn if true, missing record ids do not trigger a
 	 *                         {@link MissingIdException} but an empty string is
 	 *                         used as record identifier instead.
 	 */
@@ -262,59 +280,35 @@ private boolean isRecordEmpty() {
 		return true;
 	}
 
-	/**
-	 * Searches the record for the sequence specified in {@code ID_FIELD}
-	 * and returns all characters following this sequence until the next
-	 * control character (see {@link PicaConstants}) is found or the end of
-	 * the record is reached. Only the first occurrence of the sequence is
-	 * processed, later occurrences are ignored.
-	 *
-	 * If the sequence is not found in the string or if it is not followed
-	 * by any characters then {@code null} is returned.
-	 *
-	 * @return value of subfield 003@$0 or null if the
-	 *         field is not found or is empty.
-	 */
 	private String extractRecordId() {
+		final int idFromIndex = findRecordId();
+		if (idFromIndex == -1) {
+			return null;
+		}
 		idBuilder.setLength(0);
-
-		int fieldPos = 0;
-		boolean skip = false;
-		for (int i = 0; i < recordLen; ++i) {
-			if (isFieldDelimiter(buffer[i])) {
-				if (idBuilder.length() > 0) {
-					break;
-				}
-				fieldPos = 0;
-				skip = false;
-			} else {
-				if (!skip) {
-					if (fieldPos < ID_FIELD.length) {
-						if (buffer[i] == ID_FIELD[fieldPos]) {
-							fieldPos += 1;
-						} else {
-							skip = true;
-						}
-					} else {
-						if (buffer[i] == PicaConstants.SUBFIELD_MARKER) {
-							break;
-						}
-						idBuilder.append(buffer[i]);
-					}
-				}
+		for (int i = idFromIndex; i < recordLen; ++i) {
+			final char ch = buffer[i];
+			if (isSubfieldDelimiter(ch)) {
+				break;
 			}
+			idBuilder.append(ch);
 		}
+		return idBuilder.toString();
+	}
 
-		if (idBuilder.length() > 0) {
-			return idBuilder.toString();
+	private int findRecordId() {
+		idFieldMatcher.reset(new String(buffer, 0, recordLen));
+		if (!idFieldMatcher.find()) {
+			return -1;
 		}
-		return null;
+		return idFieldMatcher.end();
 	}
 
-	private static boolean isFieldDelimiter(final char ch) {
-		return ch == PicaConstants.RECORD_MARKER
-				|| ch == PicaConstants.FIELD_MARKER
-				|| ch == PicaConstants.FIELD_END_MARKER;
+	private static boolean isSubfieldDelimiter(final char ch) {
+		return ch == RECORD_MARKER
+				|| ch == FIELD_MARKER
+				|| ch == FIELD_END_MARKER
+				|| ch == SUBFIELD_MARKER;
 	}
 
 }
diff --git a/src/test/java/org/culturegraph/mf/stream/converter/bib/PicaDecoderTest.java b/src/test/java/org/culturegraph/mf/stream/converter/bib/PicaDecoderTest.java
@@ -51,6 +51,9 @@ public final class PicaDecoderTest {
 
 	private static final String FIELD_001AT_0_TEST = "001@ " + SUBFIELD_MARKER + "0test";
 	private static final String FIELD_003AT_0_ID = "003@ " + SUBFIELD_MARKER + "0" + RECORD_ID;
+	private static final String FIELD_107F_0_ID = "107F " + SUBFIELD_MARKER + "0" + RECORD_ID;
+	private static final String FIELD_203AT_0_ID = "203@ " + SUBFIELD_MARKER + "0" + RECORD_ID;
+	private static final String FIELD_203AT_01_0_ID = "203@/01 " + SUBFIELD_MARKER + "0" + RECORD_ID;;
 	private static final String FIELD_021A_A_UEBER = "021A " + SUBFIELD_MARKER + "a" + COMPOSED_UTF8;
 	private static final String FIELD_028A = ENTITY_028A + " ";
 
@@ -305,6 +308,51 @@ public void shouldExtractPicaProductionNumberAtRecordEndAsRecordId() {
 		verify(receiver).startRecord(RECORD_ID);
 	}
 
+	@Test
+	public void shouldExtractLocalProductionNumberAsRecordId() {
+		picaDecoder.process(FIELD_107F_0_ID);
+
+		verify(receiver).startRecord(RECORD_ID);
+	}
+
+	@Test
+	public void shouldExtractCopyControlNumberAsRecordId() {
+		picaDecoder.process(FIELD_203AT_0_ID);
+
+		verify(receiver).startRecord(RECORD_ID);
+	}
+
+	@Test
+	public void shouldExtractCopyControlNumberWithOccurrenceAsRecordId() {
+		picaDecoder.process(FIELD_203AT_01_0_ID);
+
+		verify(receiver).startRecord(RECORD_ID);
+	}
+
+	@Test(expected=MissingIdException.class)
+	public void shouldThrowMissingIdExceptionIfNoRecordIdIsFound() {
+		picaDecoder.process(FIELD_001AT_0_TEST);
+		// Exception expected
+	}
+
+	@Test
+	public void shouldIgnoreMatchWithinFieldData() {
+		picaDecoder.setIgnoreMissingIdn(true);
+
+		picaDecoder.process(FIELD_001AT_0_TEST + FIELD_003AT_0_ID);
+
+		verify(receiver).startRecord("");
+	}
+
+	@Test
+	public void shouldIgnoreIncompleteMatch() {
+		picaDecoder.setIgnoreMissingIdn(true);
+
+		picaDecoder.process("003@ " + FIELD_MARKER + FIELD_001AT_0_TEST);
+
+		verify(receiver).startRecord("");
+	}
+
 	@Test
 	public void shouldSkipUnnamedFieldsWithNoSubFields() {
 		// Make sure that the field is skipped because