From daeda4860fc20f0b4518533e8c89f8f5fccd2281 Mon Sep 17 00:00:00 2001
From: Howard W Huang <40070840+hhuangMITRE@users.noreply.github.com>
Date: Tue, 27 Feb 2024 16:11:50 -0500
Subject: [PATCH] Add Option to Merge Text Sections and Pages in
 TikaTextDetection. (#351)

Co-Authored-By: jrobble <jrobble@mitre.org>
---
 java/TikaTextDetection/README.md              |  6 +++
 .../plugin-files/descriptor/descriptor.json   | 12 +++++
 .../tika/TextExtractionContentHandler.java    | 16 +++++--
 .../tika/TikaTextDetectionComponent.java      |  8 +++-
 .../tika/TestTikaTextDetectionComponent.java  | 46 +++++++++++++++++++
 5 files changed, 81 insertions(+), 7 deletions(-)

diff --git a/java/TikaTextDetection/README.md b/java/TikaTextDetection/README.md
index 0324b901..d73286ba 100644
--- a/java/TikaTextDetection/README.md
+++ b/java/TikaTextDetection/README.md
@@ -17,6 +17,12 @@ be a line or paragrah of text surrounded by newlines and/or page breaks, a singl
 etc. In addition to `PAGE_NUM`, each track will also have a `SECTION_NUM` property. `SECTION_NUM` starts over at 1 on
 each page / slide.
 
+Setting `MERGE_LINES = true` will disable text splitting by section and return a single text detection per page.
+This option is useful whenever a document contains an excessive number of sections.
+
+Setting `MERGE_PAGES = true` will disable all text splitting and return a single track with the text extracted from the
+document. When enabled, `MERGE_PAGES` overrides `MERGE_LINES`. Use it when a document has an excessive number of page breaks.
+
 Users can also enable metadata reporting. If enabled by setting the job property `STORE_METADATA = true`, document
 metadata will be labeled and stored as the first track. Metadata track will not contain the `PAGE_NUM`, `SECTION_NUM`,
 or `TEXT` detection properties. Instead, the track will have a `METADATA` property with a value formatted as a JSON
diff --git a/java/TikaTextDetection/plugin-files/descriptor/descriptor.json b/java/TikaTextDetection/plugin-files/descriptor/descriptor.json
index d00212c2..1dd4ae1b 100755
--- a/java/TikaTextDetection/plugin-files/descriptor/descriptor.json
+++ b/java/TikaTextDetection/plugin-files/descriptor/descriptor.json
@@ -44,6 +44,18 @@
           "description": "Specifies minimum length of detected text before language filtering is applied.",
           "type": "INT",
           "defaultValue": "20"
+        },
+        {
+          "name": "MERGE_LINES",
+          "description": "Specifies whether or not to combine detected sections of text into a single detection. When true, all lines within a given page will be combined into one detection output. Enabling this property can avoid an excessive number of detections.",
+          "type": "BOOLEAN",
+          "defaultValue": "false"
+        },
+        {
+          "name": "MERGE_PAGES",
+          "description": "Specifies whether or not to combine all text detections into a single track output. If set to true, all page and section boundaries in the document are ignored during processing. Enabling this option will override `MERGE_LINES`.",
+          "type": "BOOLEAN",
+          "defaultValue": "false"
         }
 
       ]
diff --git a/java/TikaTextDetection/src/main/java/org/mitre/mpf/detection/tika/TextExtractionContentHandler.java b/java/TikaTextDetection/src/main/java/org/mitre/mpf/detection/tika/TextExtractionContentHandler.java
index 452a826c..8fb4d4a0 100644
--- a/java/TikaTextDetection/src/main/java/org/mitre/mpf/detection/tika/TextExtractionContentHandler.java
+++ b/java/TikaTextDetection/src/main/java/org/mitre/mpf/detection/tika/TextExtractionContentHandler.java
@@ -38,7 +38,9 @@ public class TextExtractionContentHandler extends ToTextContentHandler {
     private static final String SECTION_TAG = "p";
     private static final String PAGE_LABEL = "page";
     private static final String SLIDE_LABEL = "slide-content";
-    
+
+    private boolean _mergePages;
+    private boolean _mergeLines;
     private int _pageNumber;
 
     private StringBuilder _allText;
@@ -49,16 +51,18 @@ public class TextExtractionContentHandler extends ToTextContentHandler {
     // Enable to avoid storing metadata/title text from pdf and ppt documents
     private boolean _skipTitle = true;
 
-    public TextExtractionContentHandler(){
+    public TextExtractionContentHandler(boolean mergeLines, boolean mergePages) {
         super();
         _allText = new StringBuilder();
         _pageNumber = 0;
+        _mergeLines = mergeLines; // when true, startElement no longer opens a new section for each <p> element
+        _mergePages = mergePages; // when true, startElement opens neither new sections nor new pages
         createPage();
     }
 
     @Override
     public void startElement(String uri, String localName, String qName, Attributes atts) {
-        if (SECTION_TAG.equals(qName)) {
+        if (SECTION_TAG.equals(qName) && !_mergeLines && !_mergePages) {
             newSection();
             return;
         }
@@ -75,8 +79,10 @@ public void startElement(String uri, String localName, String qName, Attributes
                 // If pdf: Discard blank page.
                 reset();
             } else {
-                _pageNumber++;
-                createPage();
+                if (!_mergePages) {
+                    _pageNumber++;
+                    createPage();
+                }
             }
         }
     }
diff --git a/java/TikaTextDetection/src/main/java/org/mitre/mpf/detection/tika/TikaTextDetectionComponent.java b/java/TikaTextDetection/src/main/java/org/mitre/mpf/detection/tika/TikaTextDetectionComponent.java
index ad03bfcc..9801caea 100755
--- a/java/TikaTextDetection/src/main/java/org/mitre/mpf/detection/tika/TikaTextDetectionComponent.java
+++ b/java/TikaTextDetection/src/main/java/org/mitre/mpf/detection/tika/TikaTextDetectionComponent.java
@@ -64,15 +64,19 @@ public List<MPFGenericTrack> getDetections(MPFGenericJob mpfGenericJob) throws M
             mpfGenericJob.getJobName(), mpfGenericJob.getDataUri(),
             mpfGenericJob.getJobProperties().size(), mpfGenericJob.getMediaProperties().size());
 
+        Map<String,String> properties = mpfGenericJob.getJobProperties();
+
         // Specify filename for tika parsers here.
         File file = new File(mpfGenericJob.getDataUri());
 
         Map<Integer, List<StringBuilder>> pageToSections;
         Metadata metadata = new Metadata();
+        boolean mergeLines = MapUtils.getBooleanValue(properties, "MERGE_LINES", false);
+        boolean mergePages = MapUtils.getBooleanValue(properties, "MERGE_PAGES", false);
         try (FileInputStream inputstream = new FileInputStream(file)) {
             // Init parser with custom content handler for parsing text per page (PDF/PPTX).
             Parser parser = new AutoDetectParser();
-            TextExtractionContentHandler handler = new TextExtractionContentHandler();
+            TextExtractionContentHandler handler = new TextExtractionContentHandler(mergeLines, mergePages);
             ParseContext context = new ParseContext();
             // Parse file.
             // If the format is .pdf or .pptx, output will be divided by page/slide.
@@ -93,7 +97,7 @@ public List<MPFGenericTrack> getDetections(MPFGenericJob mpfGenericJob) throws M
                 contentType.equals("application/pdf") ||
                 contentType.startsWith("application/vnd.openxmlformats-officedocument.presentationml");
 
-        Map<String,String> properties = mpfGenericJob.getJobProperties();
+
 
         // Set language filtering limit.
         int charLimit = MapUtils.getIntValue(properties, "MIN_CHARS_FOR_LANGUAGE_DETECTION", 0);
diff --git a/java/TikaTextDetection/src/test/java/org/mitre/mpf/detection/tika/TestTikaTextDetectionComponent.java b/java/TikaTextDetection/src/test/java/org/mitre/mpf/detection/tika/TestTikaTextDetectionComponent.java
index 7a5026fa..00b3eb72 100755
--- a/java/TikaTextDetection/src/test/java/org/mitre/mpf/detection/tika/TestTikaTextDetectionComponent.java
+++ b/java/TikaTextDetection/src/test/java/org/mitre/mpf/detection/tika/TestTikaTextDetectionComponent.java
@@ -142,6 +142,52 @@ public void testGetDetectionsPptx() throws MPFComponentDetectionError {
         assertSection(tracks.get(22), "11", "2", "Unknown", "End slide test text"); // cannot determine language
     }
 
+
+    // With MERGE_LINES=true the PPTX fixture is expected to yield 11 tracks (presumably one per slide — confirm).
+    @Test
+    public void testMergeSectionTextPptx() throws MPFComponentDetectionError {
+        String mediaPath = this.getClass().getResource("/data/test-tika-detection.pptx").getPath();
+
+        Map<String, String> jobProperties = new HashMap<>();
+        Map<String, String> mediaProperties = new HashMap<>();
+        jobProperties.put("MIN_CHARS_FOR_LANGUAGE_DETECTION", "20");
+        jobProperties.put("LIST_ALL_PAGES", "true");
+        jobProperties.put("MERGE_LINES", "true"); // option under test: sections are not split within a page
+
+        MPFGenericJob genericJob = new MPFGenericJob("TestGenericJob", mediaPath, jobProperties, mediaProperties);
+        List<MPFGenericTrack> tracks = tikaComponent.getDetections(genericJob);
+        assertEquals(11 ,tracks.size());
+
+        assertSection(tracks.get(0), "1", "1", "English", "Testing Text Detection\nSlide 1"); // NOTE(review): args look like (track, page, section, language, text) — confirm
+        assertSection(tracks.get(1), "2", "1", "Japanese", "Testing:\n\nジアゼパム");
+        assertSection(tracks.get(3), "4", "1", "English", "An automobile with a bike races down the street");
+        assertSection(tracks.get(9), "10", "1", "English",
+                "Phrase Test\nFrom the Universal Declaration of Human Rights (1948):\n\n" + 
+                "Article 1.\n \nAll human beings are born free");
+        assertSection(tracks.get(10), "11", "1", "Unknown", "End\nEnd slide test text");
+    }
+
+    @Test
+    public void testMergeAllTextPptx() throws MPFComponentDetectionError {
+        String mediaPath = this.getClass().getResource("/data/test-tika-detection.pptx").getPath();
+
+        Map<String, String> jobProperties = new HashMap<>();
+        Map<String, String> mediaProperties = new HashMap<>();
+        jobProperties.put("MIN_CHARS_FOR_LANGUAGE_DETECTION", "20");
+        jobProperties.put("LIST_ALL_PAGES", "true");
+        jobProperties.put("MERGE_PAGES", "true"); // option under test
+
+        MPFGenericJob genericJob = new MPFGenericJob("TestGenericJob", mediaPath, jobProperties, mediaProperties);
+        List<MPFGenericTrack> tracks = tikaComponent.getDetections(genericJob);
+        // With MERGE_PAGES=true the whole PPTX fixture is expected to come back as exactly one track.
+        assertEquals(1 ,tracks.size());
+        // NOTE(review): all four checks target track 0 with different snippets, so assertSection presumably matches text by containment — confirm.
+        assertSection(tracks.get(0), "1", "1", "English", "Testing Text Detection");
+        assertSection(tracks.get(0), "1", "1", "English", "ジアゼパム");
+        assertSection(tracks.get(0), "1", "1", "English", "All human beings are born free");
+        assertSection(tracks.get(0), "1", "1", "English", "End slide test text");
+    }
+
     @Test
     public void testGetDetectionsOdp() throws MPFComponentDetectionError {
         String mediaPath = this.getClass().getResource("/data/test-tika-detection.odp").getPath();