openmpf · kburkewv · Mar 13, 2025 · Mar 19, 2025 · Apr 3, 2025 · Apr 3, 2025
diff --git a/python/GeminiDetection/Dockerfile b/python/GeminiDetection/Dockerfile
@@ -0,0 +1,67 @@
+# syntax=docker/dockerfile:experimental
+
+#############################################################################
+# NOTICE                                                                    #
+#                                                                           #
+# This software (or technical data) was produced for the U.S. Government    #
+# under contract, and is subject to the Rights in Data-General Clause       #
+# 52.227-14, Alt. IV (DEC 2007).                                            #
+#                                                                           #
+# Copyright 2024 The MITRE Corporation. All Rights Reserved.                #
+#############################################################################
+
+#############################################################################
+# Copyright 2024 The MITRE Corporation                                      #
+#                                                                           #
+# Licensed under the Apache License, Version 2.0 (the "License");           #
+# you may not use this file except in compliance with the License.          #
+# You may obtain a copy of the License at                                   #
+#                                                                           #
+#    http://www.apache.org/licenses/LICENSE-2.0                             #
+#                                                                           #
+# Unless required by applicable law or agreed to in writing, software       #
+# distributed under the License is distributed on an "AS IS" BASIS,         #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+# See the License for the specific language governing permissions and       #
+# limitations under the License.                                            #
+#############################################################################
+
+ARG BUILD_REGISTRY
+ARG BUILD_TAG=latest
+FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG}
+
+# Python 3.9 is installed next to the image's Python 3.8 for the Google Gemini subprocess.
+# After this step /usr/bin will have: python3 -> python3.8, python3.8 and python3.9.
+RUN --mount=type=tmpfs,target=/var/cache/apt \
+    --mount=type=tmpfs,target=/var/lib/apt/lists \
+    --mount=type=tmpfs,target=/tmp \
+    apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
+        libpython3.9 \
+        python3.9 \
+        python3.9-venv \
+        wget
+
+# Create separate venv for Python 3.9 subprocess
+RUN mkdir -p /gemini-subprocess/venv && \
+    python3.9 -m venv /gemini-subprocess/venv && \
+    /gemini-subprocess/venv/bin/pip3 install --no-cache-dir google-genai pillow numpy
+
+# With more than one source, the COPY destination must be a directory ending in "/".
+COPY gemini-process-image.py gemini_component/resource_tracker_monkeypatch.py /gemini-subprocess/
+
+RUN pip3 install --no-cache-dir --upgrade pip && \
+    pip3 install --no-cache-dir tenacity opencv-python
+
+ARG RUN_TESTS=false
+
+RUN --mount=target=.,readwrite \
+    install-component.sh && \
+    if [ "${RUN_TESTS,,}" == true ]; then python tests/test_gemini.py; fi
+
+LABEL org.label-schema.license="Apache 2.0" \
+      org.label-schema.name="OpenMPF Gemini Detection" \
+      org.label-schema.schema-version="1.0" \
+      org.label-schema.url="https://openmpf.github.io" \
+      org.label-schema.vcs-url="https://github.com/openmpf/openmpf-components" \
+      org.label-schema.vendor="MITRE"
diff --git a/python/GeminiDetection/README.md b/python/GeminiDetection/README.md
@@ -0,0 +1,63 @@
+# Overview
+
+This repository contains source code for the OpenMPF Gemini Detection Component.
+
+This component utilizes a config file that contains any number of prompts for any number of object classes. These prompts and the images/video frames are passed to the Google Gemini server to generate responses.
+
+# Job Properties
+
+The following are the properties that can be specified for the component. All properties except for GEMINI_API_KEY and CLASSIFICATION have default values, making them optional to set.
+
+- `GEMINI_API_KEY`: Your API key to send requests to Google Gemini
+- `CLASSIFICATION`: The class of the object(s) in the media. Used to determine the prompt(s). Examples: PERSON and VEHICLE.
+- `PROMPT_CONFIGURATION_PATH`: The path to a JSON file which contains prompts for specified classifications.
+- `JSON_PROMPT_CONFIGURATION_PATH`: The path to a JSON file which contains classes and prompts that specify Gemini to return a JSON object.
+- `ENABLE_JSON_PROMPT_FORMAT`: Enables returning a JSON formatted response from Gemini, using the prompts in the file specified by the `JSON_PROMPT_CONFIGURATION_PATH` job property. Set to false by default.
+- `GENERATE_FRAME_RATE_CAP`: The threshold on the maximum number of frames to process in the video segment within one second of the native video time.
+- `MODEL_NAME`: The model to use for Gemini inference. By default it is set to `"gemma-3-27b-it"`.
+- `GENERATION_MAX_ATTEMPTS`: The maximum number of times the component will attempt to generate valid JSON output.
+
+# Config File
+
+The config file is a JSON formatted file that tells the component which prompts to ask Gemini depending on the class of the object. Users can write their own config file and use it by setting the `PROMPT_CONFIGURATION_PATH` property. The following is an example of the proper syntax to follow:
+
+```json
+[
+    {
+        "classes": [
+            "DOG",
+            "CAT",
+            "HORSE"
+        ],
+        "prompts": [
+            {
+                "detectionProperty": "DESCRIPTION",
+                "prompt": "Describe the animal's color and appearance."
+            }
+        ]
+    },
+    {
+        "classes": [
+            "DOG"
+        ],
+        "prompts": [
+            {
+                "detectionProperty": "DOG BREED",
+                "prompt": "Describe the potential breeds that this dog could contain."
+            }
+        ]
+    }
+]
+```
+
+Note that a class can appear in multiple entries in the JSON, such as `"DOG"` in the example. If you have multiple classes that share a prompt, you can list them together like above and then add more questions for each individual class if you wish to get more specific. 
+
+Also be sure to make each `"detectionProperty"` distinct for a given class so that none of your prompts are overwritten.
+
+# Outputs
+
+Once the responses are generated, they are added onto the `detection_properties` dictionary of the associated `ImageLocation` object. For each prompt, the key is specified by the `"detectionProperty"` field of the config JSON and the value will be the Gemini-generated response.
+
+# TODO
+
+- Add functionality for generic class property detection
diff --git a/python/GeminiDetection/gemini-process-image.py b/python/GeminiDetection/gemini-process-image.py
@@ -0,0 +1,83 @@
+#############################################################################
+# NOTICE                                                                    #
+#                                                                           #
+# This software (or technical data) was produced for the U.S. Government    #
+# under contract, and is subject to the Rights in Data-General Clause       #
+# 52.227-14, Alt. IV (DEC 2007).                                            #
+#                                                                           #
+# Copyright 2024 The MITRE Corporation. All Rights Reserved.                #
+#############################################################################
+
+#############################################################################
+# Copyright 2024 The MITRE Corporation                                      #
+#                                                                           #
+# Licensed under the Apache License, Version 2.0 (the "License");           #
+# you may not use this file except in compliance with the License.          #
+# You may obtain a copy of the License at                                   #
+#                                                                           #
+#    http://www.apache.org/licenses/LICENSE-2.0                             #
+#                                                                           #
+# Unless required by applicable law or agreed to in writing, software       #
+# distributed under the License is distributed on an "AS IS" BASIS,         #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+# See the License for the specific language governing permissions and       #
+# limitations under the License.                                            #
+#############################################################################
+
+import argparse
+import json
+import sys
+import numpy as np
+
+from google import genai
+from multiprocessing.shared_memory import SharedMemory
+from google.genai.errors import ClientError
+from PIL import Image
+
+from resource_tracker_monkeypatch import remove_shm_from_resource_tracker 
+
+def main():
+    parser = argparse.ArgumentParser(description='Sends image and prompt to Gemini Client for processing.')
+
+    parser.add_argument("--model", "-m", type=str, default="gemma-3-27b-it", help="The name of the Gemini model to use.")
+    parser.add_argument("--shm-name", type=str, required=True, help="Shared memory name for image data.")
+    parser.add_argument("--shm-shape", type=str, required=True, help="Shape of the image in shared memory (JSON list).")
+    parser.add_argument("--shm-dtype", type=str, required=True, help="Numpy dtype of the image in shared memory.")
+    parser.add_argument("--prompt", "-p", type=str, required=True, help="The prompt you want to use with the image.")
+    parser.add_argument("--api_key", "-a", type=str, required=True, help="Your API key for Gemini.")
+    args = parser.parse_args()
+
+    remove_shm_from_resource_tracker()
+
+    shm = None
+
+    try:
+        shape = tuple(json.loads(args.shm_shape))
+        dtype = np.dtype(args.shm_dtype)
+        shm = SharedMemory(name=args.shm_name)
+
+        np_img = np.ndarray(shape, dtype=dtype, buffer=shm.buf)
+        image = Image.fromarray(np_img)
+
+        client = genai.Client(api_key=args.api_key)
+        content = client.models.generate_content(model=args.model, contents=[args.prompt, image])
+        print(content.text)
+        sys.exit(0)
+
+    except ClientError as e:
+        if hasattr(e, 'code') and e.code == 429:
+            print("Caught a ResourceExhausted error (429 Too Many Requests)", file=sys.stderr)
+        else:
+            print(e, file=sys.stderr)
+        sys.exit(1)
+
+    except Exception as e:
+        print(e, file=sys.stderr)
+        sys.exit(1)
+
+    finally:
+        if shm:
+            shm.close()
+
+if __name__ == "__main__":
+    main()
diff --git a/python/GeminiDetection/gemini_component/__init__.py b/python/GeminiDetection/gemini_component/__init__.py
@@ -0,0 +1,27 @@
+#############################################################################
+# NOTICE                                                                    #
+#                                                                           #
+# This software (or technical data) was produced for the U.S. Government    #
+# under contract, and is subject to the Rights in Data-General Clause       #
+# 52.227-14, Alt. IV (DEC 2007).                                            #
+#                                                                           #
+# Copyright 2025 The MITRE Corporation. All Rights Reserved.                #
+#############################################################################
+
+#############################################################################
+# Copyright 2025 The MITRE Corporation                                      #
+#                                                                           #
+# Licensed under the Apache License, Version 2.0 (the "License");           #
+# you may not use this file except in compliance with the License.          #
+# You may obtain a copy of the License at                                   #
+#                                                                           #
+#    http://www.apache.org/licenses/LICENSE-2.0                             #
+#                                                                           #
+# Unless required by applicable law or agreed to in writing, software       #
+# distributed under the License is distributed on an "AS IS" BASIS,         #
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  #
+# See the License for the specific language governing permissions and       #
+# limitations under the License.                                            #
+#############################################################################
+
+from .gemini_component import GeminiComponent
diff --git a/python/GeminiDetection/gemini_component/data/json_prompts.json b/python/GeminiDetection/gemini_component/data/json_prompts.json
@@ -0,0 +1,24 @@
+{
+    "classPrompts": [
+        {
+            "classes": [
+                "PERSON"
+            ],
+            "prompts": [
+                "If there is no person visible in the image, produce JSON matching this specification: \"person\": { \"visible_person\": false } Return person. If a person is visible, extract their features and include answers only if 100% confident, if not provide \"unsure\". If an attribute is not visible, set the value to \"not visible\". For clarification, facial features are permanent or consistent traits of the face that do not change with expressions or emotions. Examples include face shape, nose structure, lip shape, eye shape/spacing, jawline, cheekbones, and consistent marks like scars or moles. They do not include temporary expressions (e.g., smiling), emotions (e.g., sadness), or conditions like makeup or lighting. Produce JSON matching this specification: \"person\": { \"visible_person\": true, \"type\": (\"civilian\", \"guard\", \"public figure\"), \"clothing\": array<{\"type\": (ie. \"shirt\", \"pants\", \"dress\", \"t-shirt\", \"shorts\", \"skirt\", etc.), \"color\": string, \"describe\": string}>, \"age_range\": (\"minor/child\", \"adult\", \"elderly\"), \"gender\": string, \"skin_color\": (\"very fair\", \"fair\", \"medium\", \"olive\", \"brown\", \"black\"), \"race\": (\"american indian/alaska native\", \"asian\", \"black/african american\", \"hispanic/latino\", \"native hawaiian/pacific islander\", \"white\"), \"accessories\": array< \"type\": string, \"color\": string, \"describe\": string}>, \"glasses\": {\"type\": string, \"color\": string, \"describe\": string}, \"object_in_hand\": array< \"type\": clothing_enum>, \"color\": string, \"describe\": string}>, \"shoes\": {\"type\": string, \"color\": string, \"describe\": string}, \"head_features\": {\"hair_color\": string, \"bald\": boolean, \"head_cover\": {\"type\": string, \"color\": string, \"describe\": string}}, \"tattoo_features\": {\"location\": string, \"color\": string, \"describe\": string}, \"face_features\": {\"eye_color\": (ie. \"brown\", \"blue\", \"green\", \"hazel\", \"gray\", \"amber\", \"violet\", etc.), \"facial_hair_color\": string, \"facial_features\": string}, \"action_performed\": string, \"background\": {\"type\": string, \"color\": string, \"describe\": string}, \"other_notable_characteristics\": string } Return: person"
+            ]
+        },
+        {
+            "classes": [
+                "VEHICLE",
+                "CAR",
+                "TRUCK",
+                "BUS",
+                "MOTORBIKE"
+            ],
+            "prompts": [
+                "If there is no vehicle visible in the image, produce JSON matching this specification: \"vehicle\": { \"visible_vehicle\": false } Return vehicle. If a vehicle is visible, extract its features and include answers only if 100% confident, if not provide \"unsure\". If an attribute is not visible, set the value to \"not visible\". Produce JSON matching this specification: \"vehicle\": { \"visible_vehicle\": true, \"make\": string, \"type\": string, \"color\": string, \"license_plate_state\": string, \"license_plate_number\": string, \"other_notable_characteristics\": string} Return: vehicle"
+            ]
+        }
+    ]
+}
diff --git a/python/GeminiDetection/gemini_component/data/prompts.json b/python/GeminiDetection/gemini_component/data/prompts.json
@@ -0,0 +1,39 @@
+{
+    "classPrompts": [
+        {
+            "classes": [
+                "PERSON"
+            ],
+            "prompts": [
+                {
+                    "detectionProperty": "CLOTHING",
+                    "prompt": "Describe what this person is wearing"
+                },
+                {
+                    "detectionProperty": "ACTIVITY",
+                    "prompt": "Describe what this person is doing"
+                }
+            ]
+        },
+        {
+            "classes": [
+                "VEHICLE",
+                "CAR",
+                "TRUCK",
+                "BUS"
+            ],
+            "prompts": [
+                {
+                    "detectionProperty": "DESCRIPTION",
+                    "prompt": "Describe this vehicle"
+                }
+            ]
+        }
+    ],
+    "framePrompts": [
+        {
+            "detectionProperty": "LOCATION",
+            "prompt": "Describe the location in this scene"
+        }
+    ]
+}