/
MabDecoder.java
122 lines (106 loc) · 4.4 KB
/
MabDecoder.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
/*
* Copyright 2013, 2014 Deutsche Nationalbibliothek
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.metafacture.biblio;
import org.metafacture.framework.FluxCommand;
import org.metafacture.framework.FormatException;
import org.metafacture.framework.MissingIdException;
import org.metafacture.framework.StreamReceiver;
import org.metafacture.framework.annotations.Description;
import org.metafacture.framework.annotations.In;
import org.metafacture.framework.annotations.Out;
import org.metafacture.framework.helpers.DefaultObjectPipe;
import java.util.regex.Pattern;
/**
 * Parses a raw Mab2 stream (utf-8 encoding assumed). Events are handled by a
 * {@link StreamReceiver}.
 * <p>
 * Each non-blank input string is treated as one record and produces the
 * following events: {@code startRecord} with the content of the leading
 * {@code "001 "} field as id, a literal {@code "Leader"} holding the first 24
 * characters, a literal {@code "type"} holding the last character of that
 * header, one literal or entity per field, and finally {@code endRecord}.
 * Blank input strings are skipped without emitting any event.
 *
 * @see StreamReceiver
 *
 * @author Markus Michael Geipel, Christoph Böhme
 *
 */
@Description("Parses a raw Mab2 stream (UTF-8 encoding expected).")
@In(String.class)
@Out(StreamReceiver.class)
@FluxCommand("decode-mab")
public final class MabDecoder extends DefaultObjectPipe<String, StreamReceiver> {

    /** Character that terminates a field (U+001E). */
    private static final String FIELD_END = "\u001e";

    private static final Pattern FIELD_PATTERN =
            Pattern.compile(FIELD_END, Pattern.LITERAL);

    /** Splits a field's content at the subfield marker (U+001F). */
    private static final Pattern SUBFIELD_PATTERN =
            Pattern.compile("\u001f", Pattern.LITERAL);

    /** Character that marks the end of the record (U+001D). */
    private static final String RECORD_END = "\u001d";

    /** Number of leading characters of a field that form its name. */
    private static final int FIELD_NAME_SIZE = 4;

    /** Number of leading characters of a record that form its header. */
    private static final int HEADER_SIZE = 24;

    private static final String LEADER = "Leader";
    private static final String TYPE = "type";
    private static final String INVALID_FORMAT = "Invalid MAB format";

    /** Name of the field that must directly follow the header and carries the id. */
    private static final String ID_TAG = "001 ";

    /**
     * Creates an instance of {@link MabDecoder}.
     */
    public MabDecoder() {
    }

    /**
     * Decodes a single record and forwards the resulting events to the
     * receiver.
     * <p>
     * Note that {@code startRecord} has already been emitted when a malformed
     * field is detected; in that case the exception is thrown without a
     * matching {@code endRecord}.
     *
     * @param record the raw record; blank strings are ignored
     * @throws FormatException if the record is too short or a field or
     *         subfield is shorter than its mandatory name part
     * @throws MissingIdException if the header is not directly followed by
     *         the {@code "001 "} field
     */
    @Override
    public void process(final String record) {
        assert !isClosed();
        if (record.trim().isEmpty()) {
            return;
        }
        final StreamReceiver receiver = getReceiver();
        receiver.startRecord(extractIdFromRecord(record));
        try {
            receiver.literal(LEADER, record.substring(0, HEADER_SIZE));
            receiver.literal(TYPE, String.valueOf(record.charAt(HEADER_SIZE - 1)));
            for (final String field : FIELD_PATTERN.split(record.substring(HEADER_SIZE))) {
                // The chunk starting with the record terminator is not a field.
                if (!field.startsWith(RECORD_END)) {
                    emitField(receiver, field);
                }
            }
        }
        catch (final IndexOutOfBoundsException e) {
            throw new FormatException(invalidFormatMessage(record), e);
        }
        receiver.endRecord();
    }

    /**
     * Emits one field either as a literal (content without subfield markers)
     * or as an entity containing one literal per subfield.
     *
     * @param receiver the receiver the events are sent to
     * @param field the field text without its terminating field-end character
     * @throws IndexOutOfBoundsException if the field is shorter than its name
     *         or a subfield is empty; translated by the caller
     */
    private static void emitField(final StreamReceiver receiver, final String field) {
        final String fieldName = field.substring(0, FIELD_NAME_SIZE).trim();
        final String[] subFields = SUBFIELD_PATTERN.split(field.substring(FIELD_NAME_SIZE));
        if (subFields.length == 1) {
            receiver.literal(fieldName, subFields[0]);
            return;
        }
        receiver.startEntity(fieldName);
        // Index 0 is the text in front of the first subfield marker; it is
        // deliberately not emitted.
        for (int i = 1; i < subFields.length; ++i) {
            final String subField = subFields[i];
            final String name = subField.substring(0, 1);
            final String value = subField.substring(1);
            receiver.literal(name, value);
        }
        receiver.endEntity();
    }

    /**
     * Returns the content of the {@code "001 "} field that has to follow the
     * header directly.
     *
     * @param record the raw record
     * @return the text between the id tag and the next field-end character
     * @throws MissingIdException if the first field is not the id field
     * @throws FormatException if the record ends before the id tag is
     *         complete or the id field is not terminated
     */
    private static String extractIdFromRecord(final String record) {
        try {
            final int idStart = HEADER_SIZE + ID_TAG.length();
            if (!record.substring(HEADER_SIZE, idStart).equals(ID_TAG)) {
                throw new MissingIdException(record);
            }
            // indexOf yields -1 for an unterminated field, which makes
            // substring throw and is reported as a format error below.
            return record.substring(idStart, record.indexOf(FIELD_END, idStart));
        }
        catch (final IndexOutOfBoundsException e) {
            throw new FormatException(invalidFormatMessage(record), e);
        }
    }

    /**
     * Builds the message used for all format errors so that the explanatory
     * text and the offending record are clearly separated.
     *
     * @param record the raw record that could not be decoded
     * @return the error message
     */
    private static String invalidFormatMessage(final String record) {
        return INVALID_FORMAT + ": [" + record + "]";
    }

}