Skip to content

Commit

Permalink
Add ability to configure Tika, Fixes #74
Browse files Browse the repository at this point in the history
  • Loading branch information
nsoft committed Jan 24, 2019
1 parent 1f6aaa6 commit 591c968
Show file tree
Hide file tree
Showing 2 changed files with 130 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.jesterj.ingest.model.Document;
Expand All @@ -41,6 +42,8 @@ public class TikaProcessor implements DocumentProcessor {
private static final Logger log = LogManager.getLogger();
private String name;
private String suffix;
private int maxLength = -1; // process all text by default
private TikaConfig tikaConfig = TikaConfig.getDefaultConfig();

@Override
public Document[] processDocument(Document document) {
Expand All @@ -50,11 +53,11 @@ public Document[] processDocument(Document document) {
return new Document[]{document};
}
try {
Tika tika = new Tika();
Tika tika = new Tika(tikaConfig);
tika.setMaxStringLength(document.getRawData().length);
Metadata metadata = new Metadata();
try (ByteArrayInputStream bais = new ByteArrayInputStream(rawData)) {
String textContent = tika.parseToString(bais, metadata);
String textContent = tika.parseToString(bais, metadata, maxLength);
document.setRawData(textContent.getBytes(Charset.forName("UTF-8")));
for (String name : metadata.names()) {
document.put(sanitize(name) + plusSuffix(), metadata.get(name));
Expand Down Expand Up @@ -116,6 +119,33 @@ public Builder appendingSuffix(String suffix) {
return this;
}

/**
 * Caps the amount of extracted text, acting as a safety valve against very large
 * documents. When this is never called the limit stays at -1, which means no limit
 * on the amount of data to process with Tika.
 *
 * @param chars The maximum number of characters of extracted text to keep
 * @return This builder for further configuration
 */
public Builder truncatingTextTo(int chars) {
  TikaProcessor target = getObj();
  target.maxLength = chars;
  return this;
}

/**
 * Supplies a Tika configuration as an already parsed XML document. How that document
 * gets loaded (filesystem, classpath, or any other method) is left to the caller.
 *
 * @param config The parsed Tika configuration XML
 * @return This builder for further config
 * @throws TikaException if Tika rejects the supplied configuration
 * @throws IOException if building the Tika configuration fails with an I/O problem
 */
public Builder configuredWith(org.w3c.dom.Document config) throws TikaException, IOException {
  // Fetch the target first so the evaluation order matches a direct field assignment.
  TikaProcessor target = getObj();
  target.tikaConfig = new TikaConfig(config);
  return this;
}

/**
 * Stores the given processor instance in this builder's {@code obj} field.
 *
 * @param obj The processor instance to store
 */
private void setObj(TikaProcessor obj) {
this.obj = obj;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package org.jesterj.ingest.processors;

import com.copyright.easiertest.Mock;
import org.apache.tika.exception.TikaException;
import org.jesterj.ingest.model.Document;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.xml.sax.SAXException;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import static com.copyright.easiertest.EasierMocks.*;
import static org.easymock.EasyMock.aryEq;
import static org.easymock.EasyMock.expect;

/**
 * Unit tests for {@link TikaProcessor}. One test runs the processor with its default Tika
 * configuration against an HTML payload; the other supplies a custom Tika configuration as
 * a parsed XML document and runs it against an XML payload. Both truncate the extracted
 * text to 20 characters and verify the calls made on a mocked {@link Document}.
 */
public class TikaProcessorTest {

  private static final String HTML = "<html><head><title>The title</title></head><body>This is some body text</body></html>";
  private static final String XML = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><root><peer><child>The title</child></peer><peer>This is some body text</peer></root>";
  private static final String XML_CONFIG =
      "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
      "<properties>\n" +
      " <parsers>\n" +
      " <!-- Default Parser for most things, except for 2 mime types, and never\n" +
      " use the Executable Parser -->\n" +
      " <parser class=\"org.apache.tika.parser.DefaultParser\">\n" +
      " <mime-exclude>image/jpeg</mime-exclude>\n" +
      " <mime-exclude>application/pdf</mime-exclude>\n" +
      " <parser-exclude class=\"org.apache.tika.parser.executable.ExecutableParser\"/>\n" +
      " </parser>\n" +
      " <!-- Use a different parser for PDF -->\n" +
      " <parser class=\"org.apache.tika.parser.EmptyParser\">\n" +
      " <mime>application/pdf</mime>\n" +
      " </parser>\n" +
      " <!-- Use a different parser for XML -->\n" +
      " <parser class=\"org.apache.tika.parser.XmlParser\">\n" +
      " <mime>text/xml</mime>\n" +
      " </parser>\n" +
      " </parsers>\n" +
      "</properties>";

  @Mock
  private Document mockDocument;

  public TikaProcessorTest() {
    prepareMocks(this);
  }

  @Before
  public void setUp() {
    reset();
  }

  @After
  public void tearDown() {
    verify();
  }

  /**
   * Default configuration: HTML input is reduced to the first 20 characters of body text
   * and the Tika metadata is written to the document with the "_tk" suffix.
   */
  @Test
  public void testHtml() {
    TikaProcessor proc = new TikaProcessor.Builder().named("foo").appendingSuffix("_tk").truncatingTextTo(20).build();
    // Explicit UTF-8 keeps the fixture and the expectation independent of the platform default charset.
    expect(mockDocument.getRawData()).andReturn(HTML.getBytes(StandardCharsets.UTF_8)).anyTimes();
    mockDocument.setRawData(aryEq("This is some body te".getBytes(StandardCharsets.UTF_8)));
    expect(mockDocument.put("X_Parsed_By_tk", "org.apache.tika.parser.DefaultParser")).andReturn(true);
    expect(mockDocument.put("dc_title_tk", "The title")).andReturn(true);
    expect(mockDocument.put("Content_Encoding_tk", "ISO-8859-1")).andReturn(true);
    expect(mockDocument.put("title_tk", "The title")).andReturn(true);
    expect(mockDocument.put("Content_Type_tk", "text/html; charset=ISO-8859-1")).andReturn(true);

    replay();
    proc.processDocument(mockDocument);
  }

  /**
   * Custom configuration: the Tika config is parsed from {@link #XML_CONFIG} and handed to
   * the builder, then XML input is processed and truncated to 20 characters.
   */
  @Test
  public void testXml() throws ParserConfigurationException, IOException, SAXException, TikaException {
    DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
    ByteArrayInputStream input = new ByteArrayInputStream(XML_CONFIG.getBytes(StandardCharsets.UTF_8));
    org.w3c.dom.Document doc = builder.parse(input);

    TikaProcessor proc = new TikaProcessor.Builder().named("foo").appendingSuffix("_tk").truncatingTextTo(20)
        .configuredWith(doc)
        .build();
    expect(mockDocument.getRawData()).andReturn(XML.getBytes(StandardCharsets.UTF_8)).anyTimes();
    // Exactly 20 characters, matching the truncation limit: three leading spaces precede the text.
    mockDocument.setRawData(aryEq("   The title This is".getBytes(StandardCharsets.UTF_8)));
    expect(mockDocument.put("X_Parsed_By_tk", "org.apache.tika.parser.CompositeParser")).andReturn(true);
    expect(mockDocument.put("Content_Type_tk", "application/xml")).andReturn(true);

    replay();
    proc.processDocument(mockDocument);
  }
}

0 comments on commit 591c968

Please sign in to comment.