/
FileParser.java
313 lines (261 loc) · 9.08 KB
/
FileParser.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
/*
* Copyright 2008 Glencoe Software, Inc. All rights reserved.
* Use is subject to license terms supplied in LICENSE.txt
*/
package ome.services.fulltext;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.Reader;
import java.util.Iterator;
import java.util.NoSuchElementException;
import ome.services.messages.ParserOpenFileMessage;
import ome.system.OmeroContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.BeansException;
import org.springframework.context.ApplicationContext;
import org.springframework.context.ApplicationContextAware;
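/**
 * Object which attempts to parse any file given to it. On an exception or
 * empty/missing file, an empty {@link Iterable} should be returned
 * rather than throwing an exception.
 *
 * Subclasses should override the template method {@link #doParse(File)} and
 * follow the contract documented there; {@link #parse(File)} is final and
 * takes care of the null, unreadable, oversized and error cases.
 *
 * @author Josh Moore, josh at glencoesoftware.com
 * @since 3.0-Beta3
 */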
public class FileParser implements ApplicationContextAware {

    private static final Logger log = LoggerFactory.getLogger(FileParser.class);

    /**
     * Context used by {@link #doParse(File)} to publish a
     * {@link ParserOpenFileMessage} for every reader it opens. Assigned by
     * {@link #setApplicationContext(ApplicationContext)}.
     */
    protected OmeroContext context;

    /**
     * Upper bound, compared against {@link File#length()}, above which
     * {@link #parse(File)} skips a file and returns {@link #EMPTY}.
     */
    protected long maxFileSize = 10000L; // default test is 8.8KB

    /**
     * Stores the application context for later event publication.
     *
     * @param arg0
     *            the context; it is cast to {@link OmeroContext}, so any
     *            other implementation results in a
     *            {@link ClassCastException}.
     */
    public void setApplicationContext(ApplicationContext arg0)
            throws BeansException {
        context = (OmeroContext) arg0;
    }

    /**
     * Sets the maximum file size which will be handed to
     * {@link #doParse(File)}. A warning is logged if the value exceeds half
     * of {@link Runtime#maxMemory()}, but the value is applied regardless.
     *
     * @param size
     *            the new limit; must not be null.
     */
    public void setMaxFileSize(Long size) {
        if (size.floatValue() / Runtime.getRuntime().maxMemory() > 0.5) {
            log.warn("Indexer maximum file size is set to more than half of "
                    + "total heap size. Excessively large text files may "
                    + "cause search index corruption. Consider decreasing the "
                    + "maximum file size or increasing the Indexer heap size.");
        }
        this.maxFileSize = size;
    }

    /**
     * {@link Iterable} which returns an empty {@link Iterator}. This is what
     * {@link #parse(File)} returns whenever a file cannot or should not be
     * parsed (null, missing, unreadable or oversized file, a null result
     * from {@link #doParse(File)}, or an exception), and what the
     * <code>wrap</code> methods return for a null argument.
     */
    public static final Iterable<Reader> EMPTY = new Iterable<Reader>() {
        public Iterator<Reader> iterator() {
            return new Iterator<Reader>() {
                public boolean hasNext() {
                    return false;
                }

                public Reader next() {
                    throw new NoSuchElementException();
                }

                public void remove() {
                    throw new UnsupportedOperationException();
                }
            };
        }
    };

    /**
     * Uses {@link #doParse(File)} to obtain the {@link Reader readers} of a
     * file for indexing. If the {@link File} argument is null, missing,
     * unreadable, or larger than {@link #maxFileSize}, then the
     * {@link #EMPTY} {@link Iterable} will be returned. The same holds if a
     * null {@link Iterable} is returned or an {@link Exception} is thrown
     * by {@link #doParse(File)}.
     *
     * The {@link Iterator} returned from the instance should always be
     * completely iterated through so that resources can be released. For
     * example:
     *
     * <pre>
     * for (Reader reader : parse(file)) {
     *     // possibly ignore reader
     * }
     * </pre>
     *
     * @param file
     *            Can be null.
     * @return An {@link Iterable} which is never null.
     */
    public final Iterable<Reader> parse(File file) {

        if (file == null) {
            log.warn("Argument null. Returning EMPTY:");
            return EMPTY;
        }

        // Either condition alone disqualifies the file: doParse() is
        // documented to receive only readable files.
        if (!file.exists() || !file.canRead()) {
            log.debug("empty|unreadable file: {}", file.getAbsoluteFile());
            return EMPTY;
        }

        if (file.length() > this.maxFileSize) {
            log.info("File too large for indexing. Skipping: {}",
                    file.getAbsoluteFile());
            return EMPTY;
        }

        try {
            Iterable<Reader> it = doParse(file);
            if (it == null) {
                log.debug("Implementation returned null.");
                return EMPTY;
            }
            return it;
        } catch (Exception e) {
            // Boundary catch: implementations may throw anything, and the
            // contract of this method is to never propagate.
            log.warn("Implementation threw an exception.", e);
            return EMPTY;
        }
    }

    /**
     * Template method to turn a {@link File} into one or more
     * {@link Reader readers} for indexing.
     *
     * The default implementation opens a single {@link BufferedReader} over
     * the whole file using the platform default charset (see
     * {@link FileReader}), publishes a {@link ParserOpenFileMessage} on the
     * {@link #context} whose <code>close()</code> closes that reader, and
     * returns an {@link Iterable} containing just that reader. If
     * publishing the message fails, the reader is closed before the
     * exception is rethrown.
     *
     * Receives a non-null, {@link File#canRead() readable} {@link File}
     * instance from {@link #parse(File)} and can return a possible null
     * {@link Iterable} or throw an {@link Exception}.
     *
     * In any of the non-successful cases, the {@link #EMPTY} {@link Iterable}
     * will be returned to the consumer by {@link #parse(File)}.
     *
     * @param file
     *            the file to read; not null when called via
     *            {@link #parse(File)}.
     * @return an {@link Iterable} over the readers for the file, or null.
     * @throws Exception
     *             if the file cannot be opened or the open-file message
     *             cannot be published.
     */
    public Iterable<Reader> doParse(File file) throws Exception {
        BufferedReader buffered = new BufferedReader(new FileReader(file));
        try {
            context.publishEvent(new ParserOpenFileMessage(this, buffered) {
                @Override
                public void close() {
                    try {
                        Reader r = (Reader) resource;
                        r.close();
                    } catch (Exception e) {
                        log.debug("Error closing " + resource, e);
                    }
                }
            });
        } catch (Exception e) {
            // Nobody has been told about the reader (this includes a null
            // context), so it must be released here to avoid a leak.
            closeQuietly(buffered);
            throw e;
        }
        Iterator<Reader> it = new SingleIterator(buffered);
        return wrap(it);
    }

    /**
     * Closes the given reader, logging rather than propagating any failure.
     */
    private static void closeQuietly(Reader reader) {
        try {
            reader.close();
        } catch (Exception e) {
            log.debug("Error closing " + reader, e);
        }
    }

    /**
     * Wraps an {@link Iterator} with an {@link Iterable} instance. If the
     * {@link Iterator} is null, the {@link #EMPTY} {@link Iterable} will be
     * returned. The returned {@link Iterable} hands out the very same
     * {@link Iterator} on every call, so it can only be iterated once.
     *
     * @param it
     *            Can be null.
     * @return Will never be null
     */
    public Iterable<Reader> wrap(Iterator<Reader> it) {
        if (it == null) {
            return EMPTY;
        }
        return new IteratorWrapper(it);
    }

    /**
     * Wraps a single {@link Reader} with an {@link Iterable} which yields
     * exactly that reader once. If the {@link Reader} is null, the
     * {@link #EMPTY} {@link Iterable} will be returned.
     *
     * @param r
     *            Can be null.
     * @return Will never be null
     */
    public Iterable<Reader> wrap(Reader r) {
        if (r == null) {
            return EMPTY;
        }
        return wrap(new SingleIterator(r));
    }

    /**
     * {@link Iterator} over exactly one {@link Reader}. The reference to
     * the reader is dropped once it has been handed out.
     */
    private static class SingleIterator implements Iterator<Reader> {

        /** The reader still to be returned, or null once consumed. */
        Reader r;

        SingleIterator(Reader r) {
            this.r = r;
        }

        public boolean hasNext() {
            return r != null;
        }

        public Reader next() {
            if (r == null) {
                // Iterator contract: signal exhaustion rather than
                // silently returning null.
                throw new NoSuchElementException();
            }
            Reader rv = r;
            r = null;
            return rv;
        }

        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Adapts an existing {@link Iterator} to the {@link Iterable} interface.
     * Every call to {@link #iterator()} returns the same underlying
     * instance, so the wrapper is effectively single-use.
     */
    private static class IteratorWrapper implements Iterable<Reader> {

        private final Iterator<Reader> it;

        public IteratorWrapper(Iterator<Reader> it) {
            this.it = it;
        }

        public Iterator<Reader> iterator() {
            return it;
        }
    }

    /**
     * {@link Iterator} which yields the content of a file as {@link String}.
     * Despite its name, the current implementation reads the entire file
     * into a single {@link String} on the first call to {@link #hasNext()}
     * or {@link #next()} and yields only that one element. It is not
     * referenced by the rest of {@link FileParser}.
     */
    private static class OverlappingChunkFileIterator implements
            Iterator<String> {

        /** Size of both the reader's buffer and the copy buffer, in chars. */
        private static final int size = 10000;

        private final long fileSize;

        private final char[] buf;

        /** Element read ahead by {@link #hasNext()}, or null if none. */
        private String next;

        /*
         * will be closed and nulled out when finished.
         */
        private BufferedReader reader;

        /**
         * Opens the file for reading.
         *
         * @throws RuntimeException
         *             if the file length exceeds {@link Integer#MAX_VALUE},
         *             since the content must fit into one {@link String}.
         * @throws Exception
         *             if the file cannot be opened.
         */
        public OverlappingChunkFileIterator(File file) throws Exception {
            this.fileSize = file.length();
            if (fileSize > Integer.MAX_VALUE) {
                throw new RuntimeException(String.format(
                        "%s file is too large for current implementation: %s",
                        file, fileSize));
            }
            this.reader = new BufferedReader(new FileReader(file), size);
            this.buf = new char[size];
        }

        public boolean hasNext() {
            if (next == null) {
                next = doRead();
            }
            return next != null;
        }

        public String next() {
            if (!hasNext()) { // does doRead()
                throw new NoSuchElementException();
            }
            String rv = next;
            next = null;
            return rv;
        }

        public void remove() {
            throw new UnsupportedOperationException();
        }

        /**
         * Intermediate method which parses whole file into a single String.
         * Please see the restriction in the constructor on filesize. The
         * reader is closed afterwards whether or not reading succeeded.
         *
         * @return the file content, or null if the reader has already been
         *         consumed and closed.
         */
        private String doRead() {
            if (reader == null) {
                return null;
            }

            StringBuilder sb = new StringBuilder((int) fileSize);
            try {
                int rv;
                while ((rv = reader.read(buf)) != -1) {
                    sb.append(buf, 0, rv);
                }
            } catch (Exception e) {
                throw new RuntimeException("Error while parsing file", e);
            } finally {
                // Release the file handle on the failure path as well.
                closeReader();
            }
            return sb.toString();
        }

        /**
         * Closes the reader if it is still open and drops the reference so
         * that later reads report exhaustion.
         */
        private void closeReader() {
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception ignored) {
                    // must ignore: nothing useful can be done if close fails
                } finally {
                    reader = null;
                }
            }
        }
    }
}