Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions java/TikaTextDetection/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ be a line or paragraph of text surrounded by newlines and/or page breaks, a singl
etc. In addition to `PAGE_NUM`, each track will also have a `SECTION_NUM` property. `SECTION_NUM` starts over at 1 on
each page / slide.

Setting `MERGE_LINES = true` will disable text splitting by section and return a single text detection per page.
This option is useful when a document contains an excessive number of sections.

Setting `MERGE_PAGES = true` will disable all text splitting and return a single track containing all text extracted from the document. When enabled, `MERGE_PAGES` overrides `MERGE_LINES`. This option is useful when a document contains an
excessive number of page breaks.

Users can also enable metadata reporting. If enabled by setting the job property `STORE_METADATA = true`, document
metadata will be labeled and stored as the first track. The metadata track will not contain the `PAGE_NUM`, `SECTION_NUM`,
or `TEXT` detection properties. Instead, the track will have a `METADATA` property with a value formatted as a JSON
Expand Down
12 changes: 12 additions & 0 deletions java/TikaTextDetection/plugin-files/descriptor/descriptor.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,18 @@
"description": "Specifies minimum length of detected text before language filtering is applied.",
"type": "INT",
"defaultValue": "20"
},
{
"name": "MERGE_LINES",
"description": "Specifies whether or not to combine detected sections of text into a single detection. When true, all lines within a given page will be combined into one detection output. Enabling this property can avoid an excessive number of detections.",
"type": "BOOLEAN",
"defaultValue": "false"
},
{
"name": "MERGE_PAGES",
"description": "Specifies whether or not to combine all text detections into a single track output. If set to true, all page and section boundaries in the document are ignored during processing. Enabling this option will override `MERGE_LINES`.",
"type": "BOOLEAN",
"defaultValue": "false"
}

]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,9 @@ public class TextExtractionContentHandler extends ToTextContentHandler {
private static final String SECTION_TAG = "p";
private static final String PAGE_LABEL = "page";
private static final String SLIDE_LABEL = "slide-content";


private boolean _mergePages;
private boolean _mergeLines;
private int _pageNumber;

private StringBuilder _allText;
Expand All @@ -49,16 +51,18 @@ public class TextExtractionContentHandler extends ToTextContentHandler {
// Enable to avoid storing metadata/title text from pdf and ppt documents
private boolean _skipTitle = true;

public TextExtractionContentHandler(){
/**
 * Creates a content handler with an empty text buffer and the page counter at zero,
 * then opens the first page via {@code createPage()}.
 *
 * @param mergeLines when true, {@code startElement} does not start a new section
 *                   for section tags
 * @param mergePages when true, {@code startElement} neither starts new sections nor
 *                   advances to a new page
 */
public TextExtractionContentHandler(boolean mergeLines, boolean mergePages) {
    super();
    // Record the merge options first; all state is in place before createPage() runs.
    this._mergeLines = mergeLines;
    this._mergePages = mergePages;
    this._pageNumber = 0;
    this._allText = new StringBuilder();
    createPage();
}

@Override
public void startElement(String uri, String localName, String qName, Attributes atts) {
if (SECTION_TAG.equals(qName)) {
if (SECTION_TAG.equals(qName) && !_mergeLines && !_mergePages) {
newSection();
return;
}
Expand All @@ -75,8 +79,10 @@ public void startElement(String uri, String localName, String qName, Attributes
// If pdf: Discard blank page.
reset();
} else {
_pageNumber++;
createPage();
if (!_mergePages) {
_pageNumber++;
createPage();
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,19 @@ public List<MPFGenericTrack> getDetections(MPFGenericJob mpfGenericJob) throws M
mpfGenericJob.getJobName(), mpfGenericJob.getDataUri(),
mpfGenericJob.getJobProperties().size(), mpfGenericJob.getMediaProperties().size());

Map<String,String> properties = mpfGenericJob.getJobProperties();

// Specify filename for tika parsers here.
File file = new File(mpfGenericJob.getDataUri());

Map<Integer, List<StringBuilder>> pageToSections;
Metadata metadata = new Metadata();
boolean mergeLines = MapUtils.getBooleanValue(properties, "MERGE_LINES", false);
boolean mergePages = MapUtils.getBooleanValue(properties, "MERGE_PAGES", false);
try (FileInputStream inputstream = new FileInputStream(file)) {
// Init parser with custom content handler for parsing text per page (PDF/PPTX).
Parser parser = new AutoDetectParser();
TextExtractionContentHandler handler = new TextExtractionContentHandler();
TextExtractionContentHandler handler = new TextExtractionContentHandler(mergeLines, mergePages);
ParseContext context = new ParseContext();
// Parse file.
// If the format is .pdf or .pptx, output will be divided by page/slide.
Expand All @@ -93,7 +97,7 @@ public List<MPFGenericTrack> getDetections(MPFGenericJob mpfGenericJob) throws M
contentType.equals("application/pdf") ||
contentType.startsWith("application/vnd.openxmlformats-officedocument.presentationml");

Map<String,String> properties = mpfGenericJob.getJobProperties();


// Set language filtering limit.
int charLimit = MapUtils.getIntValue(properties, "MIN_CHARS_FOR_LANGUAGE_DETECTION", 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,52 @@ public void testGetDetectionsPptx() throws MPFComponentDetectionError {
assertSection(tracks.get(22), "11", "2", "Unknown", "End slide test text"); // cannot determine language
}



/**
 * Runs the component on the PPTX sample with {@code MERGE_LINES} enabled and checks
 * that 11 tracks are returned, each spot-checked track reporting section "1" of its page.
 */
@Test
public void testMergeSectionTextPptx() throws MPFComponentDetectionError {
    Map<String, String> jobProps = new HashMap<>();
    jobProps.put("MIN_CHARS_FOR_LANGUAGE_DETECTION", "20");
    jobProps.put("LIST_ALL_PAGES", "true");
    jobProps.put("MERGE_LINES", "true");
    Map<String, String> mediaProps = new HashMap<>();

    String pptxPath = getClass().getResource("/data/test-tika-detection.pptx").getPath();
    List<MPFGenericTrack> detections = tikaComponent.getDetections(
            new MPFGenericJob("TestGenericJob", pptxPath, jobProps, mediaProps));

    assertEquals(11, detections.size());

    // Expected text for page 10, kept as a named value for readability.
    String declarationText =
            "Phrase Test\nFrom the Universal Declaration of Human Rights (1948):\n\n" +
            "Article 1.\n \nAll human beings are born free";

    assertSection(detections.get(0), "1", "1", "English", "Testing Text Detection\nSlide 1");
    assertSection(detections.get(1), "2", "1", "Japanese", "Testing:\n\nジアゼパム");
    assertSection(detections.get(3), "4", "1", "English", "An automobile with a bike races down the street");
    assertSection(detections.get(9), "10", "1", "English", declarationText);
    assertSection(detections.get(10), "11", "1", "Unknown", "End\nEnd slide test text");
}

/**
 * Runs the component on the PPTX sample with {@code MERGE_PAGES} enabled and checks
 * that exactly one track is returned, then passes several expected texts from
 * different parts of the document to {@code assertSection} against that track.
 */
@Test
public void testMergeAllTextPptx() throws MPFComponentDetectionError {
    Map<String, String> jobProps = new HashMap<>();
    jobProps.put("MIN_CHARS_FOR_LANGUAGE_DETECTION", "20");
    jobProps.put("LIST_ALL_PAGES", "true");
    jobProps.put("MERGE_PAGES", "true");
    Map<String, String> mediaProps = new HashMap<>();

    String pptxPath = getClass().getResource("/data/test-tika-detection.pptx").getPath();
    List<MPFGenericTrack> detections = tikaComponent.getDetections(
            new MPFGenericJob("TestGenericJob", pptxPath, jobProps, mediaProps));

    assertEquals(1, detections.size());

    // NOTE(review): presumably assertSection matches on text containment, since one
    // track is checked against several different texts - confirm against the helper.
    String[] expectedTexts = {
            "Testing Text Detection",
            "ジアゼパム",
            "All human beings are born free",
            "End slide test text"
    };
    for (String expectedText : expectedTexts) {
        assertSection(detections.get(0), "1", "1", "English", expectedText);
    }
}

@Test
public void testGetDetectionsOdp() throws MPFComponentDetectionError {
String mediaPath = this.getClass().getResource("/data/test-tika-detection.odp").getPath();
Expand Down