From daeda4860fc20f0b4518533e8c89f8f5fccd2281 Mon Sep 17 00:00:00 2001 From: Howard W Huang <40070840+hhuangMITRE@users.noreply.github.com> Date: Tue, 27 Feb 2024 16:11:50 -0500 Subject: [PATCH] Add Option to Merge Text Sections and Pages in TikaTextDetection. (#351) Co-Authored-By: jrobble --- java/TikaTextDetection/README.md | 6 +++ .../plugin-files/descriptor/descriptor.json | 12 +++++ .../tika/TextExtractionContentHandler.java | 16 +++++-- .../tika/TikaTextDetectionComponent.java | 8 +++- .../tika/TestTikaTextDetectionComponent.java | 46 +++++++++++++++++++ 5 files changed, 81 insertions(+), 7 deletions(-) diff --git a/java/TikaTextDetection/README.md b/java/TikaTextDetection/README.md index 0324b901..d73286ba 100644 --- a/java/TikaTextDetection/README.md +++ b/java/TikaTextDetection/README.md @@ -17,6 +17,12 @@ be a line or paragrah of text surrounded by newlines and/or page breaks, a singl etc. In addition to `PAGE_NUM`, each track will also have a `SECTION_NUM` property. `SECTION_NUM` starts over at 1 on each page / slide. +Setting `MERGE_LINES = true` will disable text splitting by section and return a single text detection per page. +This option is useful whenever a document contains an excessive number of sections. + +Setting `MERGE_PAGES = true` will disable all text splitting behaviors and return a single track with text extracted from the document. When enabled, `MERGE_PAGES` will override `MERGE_LINES` and is useful whenever a document contains an +excessive number of page breaks. + Users can also enable metadata reporting. If enabled by setting the job property `STORE_METADATA = true`, document metadata will be labeled and stored as the first track. Metadata track will not contain the `PAGE_NUM`, `SECTION_NUM`, or `TEXT` detection properties. 
Instead, the track will have a `METADATA` property with a value formatted as a JSON diff --git a/java/TikaTextDetection/plugin-files/descriptor/descriptor.json b/java/TikaTextDetection/plugin-files/descriptor/descriptor.json index d00212c2..1dd4ae1b 100755 --- a/java/TikaTextDetection/plugin-files/descriptor/descriptor.json +++ b/java/TikaTextDetection/plugin-files/descriptor/descriptor.json @@ -44,6 +44,18 @@ "description": "Specifies minimum length of detected text before language filtering is applied.", "type": "INT", "defaultValue": "20" + }, + { + "name": "MERGE_LINES", + "description": "Specifies whether or not to combine detected sections of text into a single detection. When true, all lines within a given page will be combined into one detection output. Enabling this property can avoid an excessive number of detections.", + "type": "BOOLEAN", + "defaultValue": "false" + }, + { + "name": "MERGE_PAGES", + "description": "Specifies whether or not to combine all text detections into a single track output. If set to true, all boundary detections in documents are ignored during processing. 
Enabling this option will override `MERGE_LINES`.", + "type": "BOOLEAN", + "defaultValue": "false" } ] diff --git a/java/TikaTextDetection/src/main/java/org/mitre/mpf/detection/tika/TextExtractionContentHandler.java b/java/TikaTextDetection/src/main/java/org/mitre/mpf/detection/tika/TextExtractionContentHandler.java index 452a826c..8fb4d4a0 100644 --- a/java/TikaTextDetection/src/main/java/org/mitre/mpf/detection/tika/TextExtractionContentHandler.java +++ b/java/TikaTextDetection/src/main/java/org/mitre/mpf/detection/tika/TextExtractionContentHandler.java @@ -38,7 +38,9 @@ public class TextExtractionContentHandler extends ToTextContentHandler { private static final String SECTION_TAG = "p"; private static final String PAGE_LABEL = "page"; private static final String SLIDE_LABEL = "slide-content"; - + + private boolean _mergePages; + private boolean _mergeLines; private int _pageNumber; private StringBuilder _allText; @@ -49,16 +51,18 @@ public class TextExtractionContentHandler extends ToTextContentHandler { // Enable to avoid storing metadata/title text from pdf and ppt documents private boolean _skipTitle = true; - public TextExtractionContentHandler(){ + public TextExtractionContentHandler(boolean mergeLines, boolean mergePages) { super(); _allText = new StringBuilder(); _pageNumber = 0; + _mergeLines = mergeLines; + _mergePages = mergePages; createPage(); } @Override public void startElement(String uri, String localName, String qName, Attributes atts) { - if (SECTION_TAG.equals(qName)) { + if (SECTION_TAG.equals(qName) && !_mergeLines && !_mergePages) { newSection(); return; } @@ -75,8 +79,10 @@ public void startElement(String uri, String localName, String qName, Attributes // If pdf: Discard blank page. 
reset(); } else { - _pageNumber++; - createPage(); + if (!_mergePages) { + _pageNumber++; + createPage(); + } } } } diff --git a/java/TikaTextDetection/src/main/java/org/mitre/mpf/detection/tika/TikaTextDetectionComponent.java b/java/TikaTextDetection/src/main/java/org/mitre/mpf/detection/tika/TikaTextDetectionComponent.java index ad03bfcc..9801caea 100755 --- a/java/TikaTextDetection/src/main/java/org/mitre/mpf/detection/tika/TikaTextDetectionComponent.java +++ b/java/TikaTextDetection/src/main/java/org/mitre/mpf/detection/tika/TikaTextDetectionComponent.java @@ -64,15 +64,19 @@ public List getDetections(MPFGenericJob mpfGenericJob) throws M mpfGenericJob.getJobName(), mpfGenericJob.getDataUri(), mpfGenericJob.getJobProperties().size(), mpfGenericJob.getMediaProperties().size()); + Map properties = mpfGenericJob.getJobProperties(); + // Specify filename for tika parsers here. File file = new File(mpfGenericJob.getDataUri()); Map> pageToSections; Metadata metadata = new Metadata(); + boolean mergeLines = MapUtils.getBooleanValue(properties, "MERGE_LINES", false); + boolean mergePages = MapUtils.getBooleanValue(properties, "MERGE_PAGES", false); try (FileInputStream inputstream = new FileInputStream(file)) { // Init parser with custom content handler for parsing text per page (PDF/PPTX). Parser parser = new AutoDetectParser(); - TextExtractionContentHandler handler = new TextExtractionContentHandler(); + TextExtractionContentHandler handler = new TextExtractionContentHandler(mergeLines, mergePages); ParseContext context = new ParseContext(); // Parse file. // If the format is .pdf or .pptx, output will be divided by page/slide. @@ -93,7 +97,7 @@ public List getDetections(MPFGenericJob mpfGenericJob) throws M contentType.equals("application/pdf") || contentType.startsWith("application/vnd.openxmlformats-officedocument.presentationml"); - Map properties = mpfGenericJob.getJobProperties(); + // Set language filtering limit. 
int charLimit = MapUtils.getIntValue(properties, "MIN_CHARS_FOR_LANGUAGE_DETECTION", 0); diff --git a/java/TikaTextDetection/src/test/java/org/mitre/mpf/detection/tika/TestTikaTextDetectionComponent.java b/java/TikaTextDetection/src/test/java/org/mitre/mpf/detection/tika/TestTikaTextDetectionComponent.java index 7a5026fa..00b3eb72 100755 --- a/java/TikaTextDetection/src/test/java/org/mitre/mpf/detection/tika/TestTikaTextDetectionComponent.java +++ b/java/TikaTextDetection/src/test/java/org/mitre/mpf/detection/tika/TestTikaTextDetectionComponent.java @@ -142,6 +142,52 @@ public void testGetDetectionsPptx() throws MPFComponentDetectionError { assertSection(tracks.get(22), "11", "2", "Unknown", "End slide test text"); // cannot determine language } + + + @Test + public void testMergeSectionTextPptx() throws MPFComponentDetectionError { + String mediaPath = this.getClass().getResource("/data/test-tika-detection.pptx").getPath(); + + Map jobProperties = new HashMap<>(); + Map mediaProperties = new HashMap<>(); + jobProperties.put("MIN_CHARS_FOR_LANGUAGE_DETECTION", "20"); + jobProperties.put("LIST_ALL_PAGES", "true"); + jobProperties.put("MERGE_LINES", "true"); + + MPFGenericJob genericJob = new MPFGenericJob("TestGenericJob", mediaPath, jobProperties, mediaProperties); + List tracks = tikaComponent.getDetections(genericJob); + assertEquals(11 ,tracks.size()); + + assertSection(tracks.get(0), "1", "1", "English", "Testing Text Detection\nSlide 1"); + assertSection(tracks.get(1), "2", "1", "Japanese", "Testing:\n\nジアゼパム"); + assertSection(tracks.get(3), "4", "1", "English", "An automobile with a bike races down the street"); + assertSection(tracks.get(9), "10", "1", "English", + "Phrase Test\nFrom the Universal Declaration of Human Rights (1948):\n\n" + + "Article 1.\n \nAll human beings are born free"); + assertSection(tracks.get(10), "11", "1", "Unknown", "End\nEnd slide test text"); + } + + @Test + public void testMergeAllTextPptx() throws MPFComponentDetectionError { 
+ String mediaPath = this.getClass().getResource("/data/test-tika-detection.pptx").getPath(); + + Map jobProperties = new HashMap<>(); + Map mediaProperties = new HashMap<>(); + jobProperties.put("MIN_CHARS_FOR_LANGUAGE_DETECTION", "20"); + jobProperties.put("LIST_ALL_PAGES", "true"); + jobProperties.put("MERGE_PAGES", "true"); + + MPFGenericJob genericJob = new MPFGenericJob("TestGenericJob", mediaPath, jobProperties, mediaProperties); + List tracks = tikaComponent.getDetections(genericJob); + + assertEquals(1 ,tracks.size()); + + assertSection(tracks.get(0), "1", "1", "English", "Testing Text Detection"); + assertSection(tracks.get(0), "1", "1", "English", "ジアゼパム"); + assertSection(tracks.get(0), "1", "1", "English", "All human beings are born free"); + assertSection(tracks.get(0), "1", "1", "English", "End slide test text"); + } + @Test public void testGetDetectionsOdp() throws MPFComponentDetectionError { String mediaPath = this.getClass().getResource("/data/test-tika-detection.odp").getPath();