openmpf · jrobble · Jul 14, 2022 · Jun 28, 2021 · Jul 7, 2021 · Jul 7, 2021
diff --git a/cpp/OcvFaceDetection/plugin-files/descriptor/descriptor.json b/cpp/OcvFaceDetection/plugin-files/descriptor/descriptor.json
@@ -76,6 +76,17 @@
       "algorithm": "FACECV",
       "properties": []
     },
+    {
+      "name": "OCV FACE DETECTION DERIVATIVE MEDIA ONLY ACTION",
+      "description": "Executes the OpenCV face detection algorithm using the default parameters on derivative media only.",
+      "algorithm": "FACECV",
+      "properties": [
+        {
+          "name": "DERIVATIVE_MEDIA_ONLY",
+          "value": "TRUE"
+        }
+      ]
+    },
     {
       "name": "OCV FACE DETECTION (WITH AUTO-ORIENTATION) ACTION",
       "description": "Executes the OpenCV face detection algorithm and rotates and/or flips media based on EXIF data or video metadata.",
@@ -100,6 +111,13 @@
         "OCV FACE DETECTION ACTION"
       ]
     },
+    {
+      "name": "OCV FACE DETECTION DERIVATIVE MEDIA ONLY TASK",
+      "description": "Performs OpenCV face detection on derivative media only.",
+      "actions": [
+        "OCV FACE DETECTION DERIVATIVE MEDIA ONLY ACTION"
+      ]
+    },
     {
       "name": "OCV FACE DETECTION (WITH AUTO-ORIENTATION) TASK",
       "description": "Executes the OpenCV face detection algorithm and rotates and/or flips media based on EXIF data or video metadata.",

diff --git a/cpp/TesseractOCRTextDetection/plugin-files/descriptor/descriptor.json b/cpp/TesseractOCRTextDetection/plugin-files/descriptor/descriptor.json
@@ -91,7 +91,6 @@
           "type": "BOOLEAN",
           "defaultValue": "true"
         },
-
         {
           "name": "MIN_OSD_TEXT_ORIENTATION_CONFIDENCE",
           "description": "Specifies the minimum confidence value (>= 0) required to use the detected text orientation when OSD automation is enabled. Top rotation confidence is calculated based on comparison of top-detected rotation score against secondary (lower confidence) rotation scores.",
@@ -282,6 +281,17 @@
       "algorithm": "TESSERACTOCR",
       "properties": []
     },
+    {
+      "name": "TESSERACT OCR TEXT DETECTION DERIVATIVE MEDIA ONLY ACTION",
+      "description": "Performs Tesseract OCR on derivative media only.",
+      "algorithm": "TESSERACTOCR",
+      "properties": [
+        {
+          "name": "DERIVATIVE_MEDIA_ONLY",
+          "value": "TRUE"
+        }
+      ]
+    },
     {
       "name": "TESSERACT OCR TEXT DETECTION ACTION SPARSE TEXT",
       "description": "Performs Tesseract OCR with page segmentation mode set to 11 (sparse text).",
@@ -327,6 +337,45 @@
           "value": "3.2"
         }
       ]
+    },
+    {
+      "name": "TESSERACT OCR TEXT DETECTION (WITH FF REGION) DERIVATIVE MEDIA ONLY ACTION",
+      "description": "Performs Tesseract OCR on feed-forward regions on derivative media only.",
+      "algorithm": "TESSERACTOCR",
+      "properties": [
+        {
+          "name": "FEED_FORWARD_TYPE",
+          "value": "REGION"
+        },
+        {
+          "name": "MIN_OSD_TEXT_ORIENTATION_CONFIDENCE",
+          "value": "0"
+        },
+        {
+          "name": "MIN_OSD_PRIMARY_SCRIPT_CONFIDENCE",
+          "value": "0"
+        },
+        {
+          "name": "MIN_OSD_SCRIPT_SCORE",
+          "value": "0"
+        },
+        {
+          "name": "MIN_OSD_SECONDARY_SCRIPT_THRESHOLD",
+          "value": "0.40"
+        },
+        {
+          "name": "MAX_OSD_SCRIPTS",
+          "value": "2"
+        },
+        {
+          "name": "UNSTRUCTURED_TEXT_SCALE",
+          "value": "3.2"
+        },
+        {
+          "name": "DERIVATIVE_MEDIA_ONLY",
+          "value": "TRUE"
+        }
+      ]
     }
   ],
   "tasks": [
@@ -337,6 +386,13 @@
         "TESSERACT OCR TEXT DETECTION ACTION"
       ]
     },
+    {
+      "name": "TESSERACT OCR TEXT DETECTION DERIVATIVE MEDIA ONLY TASK",
+      "description": "Performs Tesseract OCR on derivative media only.",
+      "actions": [
+        "TESSERACT OCR TEXT DETECTION DERIVATIVE MEDIA ONLY ACTION"
+      ]
+    },
     {
       "name": "TESSERACT OCR TEXT DETECTION TASK SPARSE TEXT",
       "description": "Performs Tesseract OCR with page segmentation mode set to 11 (sparse text).",
@@ -350,6 +406,13 @@
       "actions": [
         "TESSERACT OCR TEXT DETECTION (WITH FF REGION) ACTION"
       ]
+    },
+    {
+      "name": "TESSERACT OCR TEXT DETECTION (WITH FF REGION) DERIVATIVE MEDIA ONLY TASK",
+      "description": "Performs Tesseract OCR on feed-forward regions on derivative media only.",
+      "actions": [
+        "TESSERACT OCR TEXT DETECTION (WITH FF REGION) DERIVATIVE MEDIA ONLY ACTION"
+      ]
     }
   ],
   "pipelines": [

diff --git a/java/TikaImageDetection/README.md b/java/TikaImageDetection/README.md
@@ -2,24 +2,35 @@
 
 This directory contains source code for the OpenMPF Tika image detection component.
 
-Extracts images embedded in document formats (.pdf, .ppt, .doc)
-
-For PDF documents, images will be extracted and processed per
-page/slide. Detected images will be reported in the detection property
-(IMAGE_FILES) in the order they were extracted. The first track (with
-detection property PAGE = 1) corresponds to first page of each document by
-default.
-
-The extractor only extracts unique images once per page.
-Images that are reused or duplicated in the PDF are ignored to save processing
-time and avoid infinite recursion. However, modifications were made to the PDF
-parser to allow for tracking of images repeatedly used across multiple pages.
-If an image occurs across two or more pages it will be extracted once from the
-first page then reported in the detection tracks of the following pages whenever
-it found again. For empty pages or pages with no extracted images,
-users can allow empty tracks to be reported by setting ALLOW_EMPTY_PAGES to true.
-
-By default, extracted images are stored in `$MPF_HOME/share/artifacts/<job#>/tika-extracted`.
-Users can set ORGANIZE_BY_PAGE to true to store each image in a sub-directory labeled by
-page number - for example, `page-1` - and images that appear on more than one page will be
-placed in a `common` directory instead.
+This component extracts images embedded in document formats, such as PDF (`.pdf`), PowerPoint (`.pptx`),
+Word (`.docx`), OpenDocument Presentation (`.odp`), and OpenDocument Text (`.odt`) documents.
+
+In general, the Tika parsers will extract unique images, once per track. Each track will contain a `PAGE_NUM` property
+specifying where the image is embedded. Note that page numbers start at 1, not 0, unless otherwise noted.
+
+The path where the extracted image is stored will be reported in the `DERIVATIVE_MEDIA_TEMP_PATH` property for each
+track.
+
+By default, extracted images are stored in `$MPF_HOME/share/tmp/derivative-media/<job#>/<uuid>/tika-extracted`. Users
+can set `ORGANIZE_BY_PAGE` to true to store each image in a sub-directory labeled by page number (for example,
+`page-1`). Images that appear on more than one page will be placed in a `common` directory instead.
+
+Note that the OpenMPF Workflow Manager will move those files to a more persistent storage location reported in
+the JSON output object for each piece of derivative media when the job is complete.
+
+# Format-Specific Behaviors
+
+The following format-specific behaviors were observed using Tika 1.28.1 on Ubuntu 20.04:
+
+- For PDF files, the first page corresponds to a `PAGE_NUM` value of `1`.
+  Multiple pages can be reported in a PDF document and are separated by semicolons.
+  For example, `PAGE_NUM = 1; 2; 4` would indicate the embedded image appears on
+  pages 1, 2, and 4 of a PDF document.
+
+- For non-PDF files, we intentionally set `PAGE_NUM = -1` to indicate that the page number cannot be determined.
+
+- OpenDocument Presentation documents will generate a thumbnail / preview `.png` of the content of the last modified
+  slide, including text, even if it's blank.
+
+- OpenDocument Text documents will generate a thumbnail / preview `.png` of the content of the first page, including
+  text, even if it's blank.
diff --git a/java/TikaImageDetection/plugin-files/descriptor/descriptor.json b/java/TikaImageDetection/plugin-files/descriptor/descriptor.json
@@ -15,8 +15,8 @@
     "providesCollection": {
       "states": [
         "DETECTION",
-        "DETECTION_IMAGE",
-        "DETECTION_IMAGE_TIKA"
+        "DETECTION_MEDIA",
+        "DETECTION_MEDIA_TIKA"
       ],
       "properties": [
         {
@@ -29,13 +29,7 @@
           "name": "SAVE_PATH",
           "description": "Specifies main directory for storing extracted images.",
           "type": "STRING",
-          "defaultValue": "$MPF_HOME/share/artifacts"
-        },
-        {
-          "name": "ALLOW_EMPTY_PAGES",
-          "description": "When true, pages with no images are still stored as tracks with an empty IMAGE_FILES field. When false, empty pages are not stored at all.",
-          "type": "BOOLEAN",
-          "defaultValue": "false"
+          "defaultValue": "$MPF_HOME/share/tmp/derivative-media"
         },
         {
           "name": "ORGANIZE_BY_PAGE",
@@ -70,6 +64,54 @@
       "tasks": [
         "TIKA IMAGE DETECTION TASK"
       ]
+    },
+    {
+      "name": "TIKA IMAGE DETECTION WITH DERIVATIVE MEDIA TESSERACT OCR PIPELINE",
+      "description": "Performs Tika image detection followed by Tika text detection on source media. Performs Tesseract OCR on derivative media.",
+      "tasks": [
+        "TIKA IMAGE DETECTION TASK",
+        "TIKA TEXT DETECTION SOURCE MEDIA ONLY TASK",
+        "TESSERACT OCR TEXT DETECTION DERIVATIVE MEDIA ONLY TASK"
+      ]
+    },
+    {
+      "name": "TIKA IMAGE DETECTION WITH DERIVATIVE MEDIA TESSERACT OCR AND KEYWORD TAGGING PIPELINE",
+      "description": "Performs Tika image detection followed by Tika text detection on source media. Performs Tesseract OCR on derivative media. Keyword tagging is performed on all TEXT results.",
+      "tasks": [
+        "TIKA IMAGE DETECTION TASK",
+        "TIKA TEXT DETECTION SOURCE MEDIA ONLY TASK",
+        "TESSERACT OCR TEXT DETECTION DERIVATIVE MEDIA ONLY TASK",
+        "KEYWORD TAGGING (WITH FF REGION) TASK"
+      ]
+    },
+    {
+      "name": "TIKA IMAGE DETECTION WITH DERIVATIVE MEDIA TESSERACT OCR (WITH EAST REGIONS) AND KEYWORD TAGGING AND MARKUP PIPELINE",
+      "description": "Performs Tika image detection followed by Tika text detection on source media. Performs Tesseract OCR and markup on feed-forward regions from EAST on derivative media. Keyword tagging is performed on all TEXT results.",
+      "tasks": [
+        "TIKA IMAGE DETECTION TASK",
+        "TIKA TEXT DETECTION SOURCE MEDIA ONLY TASK",
+        "EAST TEXT DETECTION DERIVATIVE MEDIA ONLY TASK",
+        "TESSERACT OCR TEXT DETECTION (WITH FF REGION) DERIVATIVE MEDIA ONLY TASK",
+        "KEYWORD TAGGING (WITH FF REGION) TASK",
+        "OCV GENERIC MARKUP DERIVATIVE MEDIA ONLY TASK"
+      ]
+    },
+    {
+      "name": "TIKA IMAGE DETECTION WITH DERIVATIVE MEDIA OCV FACE PIPELINE",
+      "description": "Performs Tika image detection on source media followed by OpenCV face detection on derivative media.",
+      "tasks": [
+        "TIKA IMAGE DETECTION TASK",
+        "OCV FACE DETECTION DERIVATIVE MEDIA ONLY TASK"
+      ]
+    },
+    {
+      "name": "TIKA IMAGE DETECTION WITH DERIVATIVE MEDIA OCV FACE AND MARKUP PIPELINE",
+      "description": "Performs Tika image detection on source media followed by OpenCV face detection and markup on derivative media.",
+      "tasks": [
+        "TIKA IMAGE DETECTION TASK",
+        "OCV FACE DETECTION DERIVATIVE MEDIA ONLY TASK",
+        "OCV GENERIC MARKUP DERIVATIVE MEDIA ONLY TASK"
+      ]
     }
   ]
 }
diff --git a/java/TikaImageDetection/pom.xml b/java/TikaImageDetection/pom.xml
@@ -54,12 +54,27 @@
         <dependency>
             <groupId>org.apache.tika</groupId>
             <artifactId>tika-core</artifactId>
-            <version>1.28.1</version>
+            <version>2.4.1</version>
         </dependency>
         <dependency>
             <groupId>org.apache.tika</groupId>
-            <artifactId>tika-parsers</artifactId>
-            <version>1.28.1</version>
+            <artifactId>tika-parsers-standard-package</artifactId>
+            <version>2.4.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-parser-scientific-module</artifactId>
+            <version>2.4.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-parser-sqlite3-module</artifactId>
+            <version>2.4.1</version>
+        </dependency>
+        <dependency>
+            <groupId>com.github.jai-imageio</groupId>
+            <artifactId>jai-imageio-jpeg2000</artifactId>
+            <version>1.4.0</version>
         </dependency>
         <dependency>
             <groupId>com.fasterxml.jackson.core</groupId>