diff --git a/python/ClipDetection/Dockerfile b/python/ClipDetection/Dockerfile index 681b05d3..0c8f240c 100644 --- a/python/ClipDetection/Dockerfile +++ b/python/ClipDetection/Dockerfile @@ -29,7 +29,7 @@ ARG MODELS_REGISTRY=openmpf/ ARG BUILD_REGISTRY ARG BUILD_TAG=latest -FROM ${MODELS_REGISTRY}openmpf_clip_detection_models:8.0.0 as models +FROM ${MODELS_REGISTRY}openmpf_clip_detection_models:9.0.0 as models FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} COPY --from=models /models/ViT-B-32.pt /models/ViT-B-32.pt diff --git a/python/LlavaDetection/COPYING b/python/LlavaDetection/COPYING new file mode 100644 index 00000000..19dc35b2 --- /dev/null +++ b/python/LlavaDetection/COPYING @@ -0,0 +1,175 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
\ No newline at end of file diff --git a/python/LlavaDetection/Dockerfile b/python/LlavaDetection/Dockerfile new file mode 100644 index 00000000..128ee267 --- /dev/null +++ b/python/LlavaDetection/Dockerfile @@ -0,0 +1,54 @@ +# syntax=docker/dockerfile:experimental + +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +ARG BUILD_REGISTRY +ARG BUILD_TAG=latest +FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} + +RUN --mount=type=tmpfs,target=/var/cache/apt \ + --mount=type=tmpfs,target=/var/lib/apt/lists \ + --mount=type=tmpfs,target=/tmp \ + apt-get update; \ + DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y wget + +RUN pip3 install --upgrade pip + +RUN pip3 install opencv-python ollama + +ARG RUN_TESTS=false + +RUN --mount=target=.,readwrite \ + install-component.sh; \ + if [ "${RUN_TESTS,,}" == true ]; then python tests/test_llava.py; fi + +LABEL org.label-schema.license="Apache 2.0" \ + org.label-schema.name="OpenMPF LLaVA Detection" \ + org.label-schema.schema-version="1.0" \ + org.label-schema.url="https://openmpf.github.io" \ + org.label-schema.vcs-url="https://github.com/openmpf/openmpf-components" \ + org.label-schema.vendor="MITRE" \ No newline at end of file diff --git a/python/LlavaDetection/LICENSE b/python/LlavaDetection/LICENSE new file mode 100644 index 00000000..034a4a8a --- /dev/null +++ b/python/LlavaDetection/LICENSE @@ -0,0 +1,27 @@ +/****************************************************************************** +* Copyright 2024 The MITRE Corporation * +* * +* Licensed under the Apache License, Version 2.0 (the "License"); * +* you may not use this file except in compliance with the License. * +* You may obtain a copy of the License at * +* * +* http://www.apache.org/licenses/LICENSE-2.0 * +* * +* Unless required by applicable law or agreed to in writing, software * +* distributed under the License is distributed on an "AS IS" BASIS, * +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * +* See the License for the specific language governing permissions and * +* limitations under the License. * +******************************************************************************/ + +This project contains content developed by The MITRE Corporation. 
If this code +is used in a deployment or embedded within another project, it is requested +that you send an email to opensource@mitre.org in order to let us know where +this software is being used. + + + +This software makes use of a data model derived from third party software: + +-------------------------------------------------------------------------- + diff --git a/python/LlavaDetection/NOTICE b/python/LlavaDetection/NOTICE new file mode 100644 index 00000000..ae6303f5 --- /dev/null +++ b/python/LlavaDetection/NOTICE @@ -0,0 +1,7 @@ +# NOTICE + +This software (or technical data) was produced for the U.S. Government +under contract, and is subject to the Rights in Data-General Clause +52.227-14, Alt. IV (DEC 2007). + +Copyright 2024 The MITRE Corporation. All Rights Reserved. \ No newline at end of file diff --git a/python/LlavaDetection/README.md b/python/LlavaDetection/README.md new file mode 100644 index 00000000..00cd64cd --- /dev/null +++ b/python/LlavaDetection/README.md @@ -0,0 +1,86 @@ +# Overview + +This repository contains source code for the OpenMPF LLaVA Detection Component. + +This component uses a config file that contains any number of prompts for any number of object classes. These prompts and the images/video frames are passed to an instance of the [LLaVA 34b](https://huggingface.co/liuhaotian/llava-v1.6-34b) model within [Ollama](https://ollama.com) to generate responses. + +The component is built to support multi-stage pipelines in which feed forward tracks from an object detector component are passed in to be queried further. The user can specify custom classes and prompts that coincide with the classes of objects being passed into the component. + +# Job Properties + +The following properties can be specified for the component. Each property has a default value, so none of them need to be specified when processing jobs. + +- `PROMPT_CONFIGURATION_PATH`: Specifies the file path to the prompt config file. +- `JSON_PROMPT_CONFIGURATION_PATH`: Specifies the file path to the JSON output prompt config file. +- `ENABLE_JSON_PROMPT_FORMAT`: Boolean that enables JSON outputs. +- `OLLAMA_SERVER`: The Ollama server `<host>:<port>` to use for inferencing. +- `GENERATE_FRAME_RATE_CAP`: Specifies the maximum number of frames to process every second. + +# Config File + +The config file is a JSON-formatted file that tells the component which prompts to send to LLaVA based on the class of the object. Users can write their own config file and select it by setting the `PROMPT_CONFIGURATION_PATH` property. Class-specific prompts are listed under `"classPrompts"`, and prompts under `"framePrompts"` are applied to whole frames when no feed forward detections are provided. The following is an example of the proper syntax to follow: + +```json +{ + "classPrompts": [ + { + "classes": [ + "DOG", + "CAT", + "HORSE" + ], + "prompts": [ + { + "detectionProperty": "DESCRIPTION", + "prompt": "Describe the animal's color and appearance." + } + ] + }, + { + "classes": [ + "DOG" + ], + "prompts": [ + { + "detectionProperty": "DOG BREED", + "prompt": "Describe the potential breeds that this dog could contain." + } + ] + } + ], + "framePrompts": [ + { + "detectionProperty": "LOCATION", + "prompt": "Describe the location in this scene" + } + ] +} +``` + +Note that a class can appear in multiple entries in the JSON, such as `"DOG"` in the example. If you have multiple classes that share a prompt, you can list them together as shown above and then add more prompts for each individual class if you wish to get more specific. + +Also be sure to make each `"detectionProperty"` distinct for a given class so that none of your prompts are overwritten.
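To make the merging behavior concrete, the sketch below shows how entries like the ones above can be folded into a single per-class lookup table, mirroring what the component does internally in `LlavaComponent._update_prompts`. The `build_class_prompts` helper is illustrative only, not part of the component API:

```python
import json

def build_class_prompts(config_path):
    # Each class maps to a {detectionProperty: prompt} dict; classes are
    # lowercased, and a repeated "detectionProperty" for the same class
    # overwrites the earlier prompt (hence the advice to keep them distinct).
    class_prompts = {}
    with open(config_path) as f:
        config = json.load(f)
    for entry in config['classPrompts']:
        for cls in (c.lower() for c in entry['classes']):
            class_prompts.setdefault(cls, {}).update(
                {p['detectionProperty']: p['prompt'] for p in entry['prompts']})
    return class_prompts

# With the example config above, 'dog' ends up with both the shared
# DESCRIPTION prompt and the DOG-specific DOG BREED prompt.
```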
+ # JSON Config File + +The JSON config file directs LLaVA to return its output as a JSON object whose fields you specify. Users can write their own config file and select it by setting the `JSON_PROMPT_CONFIGURATION_PATH` property. The following is an example of the proper syntax to follow: + + +```json +{ + "classPrompts": [ + { + "classes": [ + "DOG" + ], + "prompts": [ + "Describe the dog in JSON. The JSON should have the following keys: breed, color, size." + ] + }, + { + "classes": [ + "PERSON" + ], + "prompts": [ + "Describe the person in JSON. The JSON should have the following keys: hair_color (if unsure, respond with unsure), clothes, activity." + ] + } + ] +} +``` + +# Outputs + +Once the responses are generated, they are added to the `detection_properties` dictionary of the associated `ImageLocation` object. For each prompt, the key is specified by the `"detectionProperty"` field of the config JSON and the value is the LLaVA-generated response. \ No newline at end of file diff --git a/python/LlavaDetection/llava_component/__init__.py b/python/LlavaDetection/llava_component/__init__.py new file mode 100644 index 00000000..00e4952b --- /dev/null +++ b/python/LlavaDetection/llava_component/__init__.py @@ -0,0 +1,27 @@ +############################################################################# +# NOTICE                                                                    # +#                                                                           # +# This software (or technical data) was produced for the U.S. Government   # +# under contract, and is subject to the Rights in Data-General Clause      # +# 52.227-14, Alt. IV (DEC 2007).                                           # +#                                                                           # +# Copyright 2024 The MITRE Corporation. All Rights Reserved.               # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation                                      # +#                                                                           # +# Licensed under the Apache License, Version 2.0 (the "License");           # +# you may not use this file except in compliance with the License.          # +# You may obtain a copy of the License at                                   # +#                                                                           # +#    http://www.apache.org/licenses/LICENSE-2.0                             # +#                                                                           # +# Unless required by applicable law or agreed to in writing, software       # +# distributed under the License is distributed on an "AS IS" BASIS,         # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  # +# See the License for the specific language governing permissions and       # +# limitations under the License.                                            # +############################################################################# + +from .llava_component import LlavaComponent \ No newline at end of file diff --git a/python/LlavaDetection/llava_component/data/json_prompts.json b/python/LlavaDetection/llava_component/data/json_prompts.json new file mode 100644 index 00000000..d47b8233 --- /dev/null +++ b/python/LlavaDetection/llava_component/data/json_prompts.json @@ -0,0 +1,22 @@ +{ + "classPrompts": [ + { + "classes": [ + "PERSON" + ], + "prompts": [ + "Extract features of the person in JSON format and include answers only if 100% confident for each object; if not, provide unsure. The JSON should have the following keys: visible_person (true or false), person (if visible_person is true then provide person object).
Person objects should have the following attributes: Type (answer with 'Public figure', 'guard', 'civilian', or 'unsure'), clothing (describe clothing color, location, and type (dress, shirt, jacket, scarf, t-shirt, jeans, etc.)), estimated_age_range (answer with 'minor/child', 'adult', 'elderly', or 'unsure'), estimated_gender, estimated_race, accessories (list of accessory objects), visible_glasses (true or false), glasses (if visible_glasses is true, then provide type, color, describe), visible_object_in_hand (true or false), object_in_hand (if visible_object_in_hand is true then provide type, color, describe), person_wearing_shoe (true or false), shoe (if person_wearing_shoe is true then provide type, color, describe), visible_head_hair (boolean), head_features (head_hair_color, bald (boolean), visible_head_cover (boolean), head_cover_type (hat, scarf, hoodie, etc.), visible_tattoo (boolean), tattoo_features (if visible_tattoo is true then provide location, color, describe)), visible_face (true or false), visible_eye (boolean), face_features (if visible_face is true then provide visible_eye (boolean), eye_color, visible_facial_hair, facial_hair_color, facial_features, emotion_of_person), action_performed, background (describe, color, type), other_notable_characteristics. Accessory objects should have the following attributes: type (scarf, hat, etc.), color, describe" + ] + }, + { + "classes": [ + "CAR", + "TRUCK", + "BUS" + ], + "prompts": [ + "Describe the vehicle in JSON. The JSON should have the following keys: make, type, color, decals (list of decal objects), license_plate (describe state and plate number), and other_notable_characteristics." + ] + } + ] +} \ No newline at end of file diff --git a/python/LlavaDetection/llava_component/data/prompts.json b/python/LlavaDetection/llava_component/data/prompts.json new file mode 100644 index 00000000..3a09290b --- /dev/null +++ b/python/LlavaDetection/llava_component/data/prompts.json @@ -0,0 +1,38 @@ +{ + "classPrompts": [ + { + "classes": [ + "PERSON" + ], + "prompts": [ + { + "detectionProperty": "CLOTHING", + "prompt": "Describe what this person is wearing" + }, + { + "detectionProperty": "ACTIVITY", + "prompt": "Describe what this person is doing" + } + ] + }, + { + "classes": [ + "CAR", + "TRUCK", + "BUS" + ], + "prompts": [ + { + "detectionProperty": "DESCRIPTION", + "prompt": "Describe this vehicle" + } + ] + } + ], + "framePrompts": [ + { + "detectionProperty": "LOCATION", + "prompt": "Describe the location in this scene" + } + ] +} \ No newline at end of file diff --git a/python/LlavaDetection/llava_component/llava_component.py b/python/LlavaDetection/llava_component/llava_component.py new file mode 100644 index 00000000..2de81faa --- /dev/null +++ b/python/LlavaDetection/llava_component/llava_component.py @@ -0,0 +1,467 @@ +############################################################################# +# NOTICE                                                                    # +#                                                                           # +# This software (or technical data) was produced for the U.S. Government   # +# under contract, and is subject to the Rights in Data-General Clause      # +# 52.227-14, Alt. IV (DEC 2007).                                           # +#                                                                           # +# Copyright 2024 The MITRE Corporation. All Rights Reserved.               # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation                                      # +#                                                                           # +# Licensed under the Apache License, Version 2.0 (the "License");           # +# you may not use this file except in compliance with the License.
# +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +import time +import os +import cv2 +import base64 +import json +import ollama +import re +import math + +import logging +from typing import Mapping, Iterable + +import mpf_component_api as mpf +import mpf_component_util as mpf_util + +logger = logging.getLogger('LlavaComponent') + +class LlavaComponent: + detection_type = 'CLASS' + + def __init__(self): + self.model = 'llava:34b' + self.host_url = '' + self.client = None + self.class_prompts = dict() + self.json_class_prompts = dict() + self.frame_prompts = dict() + + self.json_limit = 3 + + def get_detections_from_image(self, image_job: mpf.ImageJob) -> Iterable[mpf.ImageLocation]: + logger.info('Received image job: %s', image_job.job_name) + + self.video_process_timer = Timer() + self.video_decode_timer = Timer() + self.frame_count = 0 + + config = JobConfig(image_job.job_properties) + image_reader = mpf_util.ImageReader(image_job) + + if image_job.feed_forward_location is None: + detections = self._get_frame_detections(image_job, [image_reader.get_image(),], config) + elif config.enable_json_prompt_format: + detections = self._get_feed_forward_detections_json(image_job.feed_forward_location, image_reader, config) + else: + detections = self._get_feed_forward_detections(image_job.feed_forward_location, image_reader, config) + + logger.info(f"Job complete. Found {len(detections)} detections.") + return detections + + def get_detections_from_video(self, video_job: mpf.VideoJob) -> Iterable[mpf.VideoTrack]: + logger.info('Received video job: %s', video_job.job_name) + + self.video_process_timer = Timer() + self.video_decode_timer = Timer() + self.frame_count = 0 + + config = JobConfig(video_job.job_properties, video_job.media_properties) + video_capture = mpf_util.VideoCapture(video_job) + + if video_job.feed_forward_track is None: + tracks = self._get_frame_detections(video_job, video_capture, config, is_video_job=True) + elif config.enable_json_prompt_format: + tracks = self._get_feed_forward_detections_json(video_job.feed_forward_track, video_capture, config, is_video_job=True) + else: + tracks = self._get_feed_forward_detections(video_job.feed_forward_track, video_capture, config, is_video_job=True) + + decode_time = self.video_decode_timer.get_seconds_elapsed_from_last_pause() + if decode_time > 0.0: + logger.info("Total frame load time: " + f"{decode_time:0.3f} seconds ({self.frame_count / decode_time:0.3f} frames/second)") + + process_time = self.video_process_timer.get_seconds_elapsed_from_last_pause() + if process_time > 0.0: + logger.info("Total detection and tracking time: " + f"{process_time:0.3f} seconds ({self.frame_count / process_time:0.3f} frames/second)") + + logger.info(f"Job complete. Found {len(tracks)} tracks.") + return tracks + + def _get_frame_detections(self, job, reader, config, is_video_job=False): + # Check if both frame_rate_cap and generate_frame_rate_cap are set > 0. 
If so, raise an exception. + if (mpf_util.get_property(job.job_properties, 'FRAME_RATE_CAP', -1) > 0) and (config.frames_per_second_to_process > 0): + raise mpf.DetectionException( + "Cannot have FRAME_RATE_CAP and GENERATE_FRAME_RATE_CAP both set to values greater than zero on jobs without feed forward detections.", + mpf.DetectionError.INVALID_PROPERTY + ) + + self._update_prompts(config.prompt_config_path, config.json_prompt_config_path) + self._check_client(config.ollama_server) + + tracks = [] + self.frame_count = 0 + self.video_decode_timer = Timer() + self.video_process_timer = Timer() + + self.video_decode_timer.start() + for idx, frame in enumerate(reader): + if (config.frames_per_second_to_process <= 0) or (idx % config.frames_per_second_to_process == 0): + self.video_decode_timer.pause() + self.frame_count += 1 + + height, width, _ = frame.shape + detection_properties = dict() + + self._get_ollama_response(self.frame_prompts, frame, detection_properties, self.video_process_timer) + + img_location = mpf.ImageLocation(0, 0, width, height, -1, detection_properties) + if is_video_job: + tracks.append(mpf.VideoTrack(idx, idx, -1, { idx:img_location }, detection_properties)) + else: + tracks.append(img_location) + + self.video_decode_timer.start() + + if is_video_job: + for track in tracks: + reader.reverse_transform(track) + + return tracks + + def _get_feed_forward_detections_json(self, job_feed_forward, reader, config, is_video_job=False): + self._update_prompts(config.prompt_config_path, config.json_prompt_config_path) + self._check_client(config.ollama_server) + + classification = job_feed_forward.detection_properties["CLASSIFICATION"].lower() + + # Send prompts to Ollama to generate responses + self.frame_count = 0 + self.video_decode_timer = Timer() + self.video_process_timer = Timer() + + prompts_to_use = self.json_class_prompts if config.enable_json_prompt_format else self.class_prompts + if is_video_job: + self.video_decode_timer.start() + frame_indices = { i:frame for i, frame in zip(job_feed_forward.frame_locations.keys(), reader) } + for idx in self._get_frames_to_process(list(frame_indices.keys()), config.frames_per_second_to_process): + self.video_decode_timer.pause() + frame = frame_indices[idx] + ff_location = job_feed_forward.frame_locations[idx] + self.frame_count += 1 + + encoded = self._encode_image(frame) + if classification in prompts_to_use: + for tag, prompt in prompts_to_use[classification].items(): + # Retry until LLaVA produces parseable JSON or the attempt limit is reached. + json_attempts, json_failed = 0, True + while (json_attempts < self.json_limit) and (json_failed): + json_attempts += 1 + response = self._get_ollama_response_json(prompt, encoded) + try: + response = response.split('```json\n')[1].split('```')[0] + response_json = json.loads(response) + self._update_detection_properties(ff_location.detection_properties, response_json) + json_failed = False + except (IndexError, json.JSONDecodeError): + logger.warning(f"LLaVA failed to produce valid JSON output.
Failed {json_attempts} of {self.json_limit} attempts.") + continue + if json_failed: + logger.warning("Using last full LLaVA response instead of parsed JSON output.") + job_feed_forward.detection_properties['FAILED TO PROCESS LLAVA RESPONSE'] = 'TRUE' + job_feed_forward.detection_properties['FULL LLAVA RESPONSE'] = response + + self.video_decode_timer.start() + else: + encoded = self._encode_image(reader.get_image()) + if classification in prompts_to_use: + for tag, prompt in prompts_to_use[classification].items(): + json_attempts, json_failed = 0, True + while (json_attempts < self.json_limit) and (json_failed): + json_attempts += 1 + response = self._get_ollama_response_json(prompt, encoded) + try: + response = response.split('```json\n')[1].split('```')[0] + response_json = json.loads(response) + self._update_detection_properties(job_feed_forward.detection_properties, response_json) + json_failed = False + except (IndexError, json.JSONDecodeError): + logger.warning(f"LLaVA failed to produce valid JSON output. Failed {json_attempts} of {self.json_limit} attempts.") + continue + if json_failed: + logger.warning("Using last full LLaVA response instead of parsed JSON output.") + job_feed_forward.detection_properties['FAILED TO PROCESS LLAVA RESPONSE'] = 'TRUE' + job_feed_forward.detection_properties['FULL LLAVA RESPONSE'] = response + + return [job_feed_forward] + + def _update_detection_properties(self, detection_properties, response_json): + # Flatten LLaVA's nested JSON response into 'LLAVA ...' detection property keys, + # dropping attributes whose corresponding 'VISIBLE ...' flag is negative or unsure. + ignore_words = ['unsure', 'none', 'false', 'no', 'unclear', ''] + key_list = self._get_keys(response_json) + key_vals = dict() + keywords = [] + for key_str in key_list: + split_key = [' '.join(x.split('_')) for x in ('llava' + key_str).split('||')] + key, val = " ".join([s.upper() for s in split_key[:-1]]), split_key[-1] + key_vals[key] = val + + if ('LLAVA VISIBLE PERSON' not in key_vals) or (key_vals['LLAVA VISIBLE PERSON'].strip().lower() not in ignore_words): + # Copy to lists before popping so the dict isn't mutated while being iterated. + for key, val in list(key_vals.items()): + if ('VISIBLE' in key) and (val.strip().lower() in ignore_words): + keywords.append(key.split(' VISIBLE ')[1]) + key_vals.pop(key) + + for keyword in keywords: + pattern = re.compile(fr'\b{keyword}\b') + for key_to_remove in list(filter(pattern.search, key_vals)): + key_vals.pop(key_to_remove) + + for key, val in list(key_vals.items()): + if val.strip().lower() in ignore_words: + key_vals.pop(key) + + detection_properties.update(key_vals) + detection_properties['ANNOTATED BY LLAVA'] = 'TRUE' + + def _get_keys(self, response_json): + if isinstance(response_json, list) or isinstance(response_json, str): + yield f'||{json.dumps(response_json)}' + elif isinstance(response_json, dict): + if self._is_lowest_level(response_json): + yield f'||{json.dumps(response_json)}' + else: + for key, value in response_json.items(): + yield from (f'||{key}{p}' for p in self._get_keys(value)) + + @staticmethod + def _is_lowest_level(response_json): + return all(isinstance(val, str) for val in response_json.values()) + + def _get_feed_forward_detections(self, job_feed_forward, reader, config, is_video_job=False): + self._update_prompts(config.prompt_config_path, config.json_prompt_config_path) + self._check_client(config.ollama_server) + + classification = job_feed_forward.detection_properties["CLASSIFICATION"].lower() + frame_count = 0 + video_decode_timer = Timer() + video_process_timer = Timer() + + if is_video_job: + video_decode_timer.start() + frame_indices = { i:frame for i, frame in zip(job_feed_forward.frame_locations.keys(), reader) } + frames_to_process =
self._get_frames_to_process(list(frame_indices.keys()), config.frames_per_second_to_process) + for idx in frames_to_process: + video_decode_timer.pause() + frame = frame_indices[idx] + ff_location = job_feed_forward.frame_locations[idx] + frame_count += 1 + + if classification in self.class_prompts: + self._get_ollama_response(self.class_prompts[classification], frame, ff_location.detection_properties, video_process_timer) + + video_decode_timer.start() + return [job_feed_forward] + else: + if classification in self.class_prompts: + self._get_ollama_response(self.class_prompts[classification], reader.get_image(), job_feed_forward.detection_properties, video_process_timer) + return [job_feed_forward] + + def _check_client(self, host_url): + try: + if self.client is None or host_url != self.host_url: + self.host_url = host_url + self.client = ollama.Client(host=self.host_url) + except Exception: + raise mpf.DetectionException( + "Could not instantiate Ollama Client. Make sure OLLAMA_SERVER is set correctly.", + mpf.DetectionError.NETWORK_ERROR + ) + + def _encode_image(self, image): + encode_params = [int(cv2.IMWRITE_PNG_COMPRESSION), 9] + _, buffer = cv2.imencode('.png', image, encode_params) + return base64.b64encode(buffer).decode("utf-8") + + def _update_prompts(self, prompt_config_path, json_prompt_config_path): + ''' + Updates self.class_prompts dictionary to have the following format + + { + CLASS1: {TAG1: PROMPT1}, + CLASS2: {TAG2: PROMPT2, TAG3: PROMPT3}, + ... + } + + and self.frame_prompts to be a dict of key, prompt string pairs. + ''' + try: + with open(prompt_config_path, 'r') as f: + data = json.load(f) + class_dicts, frame_dicts = data['classPrompts'], data['framePrompts'] + for class_dict in class_dicts: + classes, prompts = [cls.lower() for cls in class_dict['classes']], class_dict['prompts'] + for cls in classes: + if cls not in self.class_prompts: + self.class_prompts[cls] = dict() + self.class_prompts[cls].update({ dct['detectionProperty']:dct['prompt'] for dct in prompts }) + + for frame_dict in frame_dicts: + self.frame_prompts[frame_dict['detectionProperty']] = frame_dict['prompt'] + + with open(json_prompt_config_path, 'r') as f: + data = json.load(f) + json_class_dicts = data['classPrompts'] + for class_dict in json_class_dicts: + classes, prompts = [cls.lower() for cls in class_dict['classes']], class_dict['prompts'] + for cls in classes: + for idx, prompt in enumerate(prompts): + # Accumulate prompts per class instead of overwriting the dict each iteration. + self.json_class_prompts.setdefault(cls, dict())[f'JSON_{idx}'] = prompt + + except Exception as e: + raise mpf.DetectionException( + f"Invalid JSON structure for component: {e}", + mpf.DetectionError.COULD_NOT_READ_DATAFILE + ) + + def _get_ollama_response_json(self, prompt, encoded_image): + try: + self.video_process_timer.start() + response = self.client.generate(self.model, prompt, images=[encoded_image])['response'] + self.video_process_timer.pause() + return response + except Exception: + raise mpf.DetectionException( + "Could not communicate with Ollama server.", + mpf.DetectionError.NETWORK_ERROR + ) + + def _get_ollama_response(self, prompt_dict, image, detection_properties, video_process_timer): + try: + encoded = self._encode_image(image) + for tag, prompt in prompt_dict.items(): + video_process_timer.start() + detection_properties[tag] = self.client.generate(self.model, prompt, images=[encoded])['response'] + video_process_timer.pause() + detection_properties['ANNOTATED BY LLAVA'] = 'TRUE' + + except Exception: + raise mpf.DetectionException( + "Could not communicate with Ollama server.", +
mpf.DetectionError.NETWORK_ERROR + ) + + def _get_frames_to_process(self, frame_locations: list, skip: int) -> list: + # Greedily pick the available frame index closest to each desired sampling + # point (last pick + skip), ignoring candidates that land too close to the + # previous selection. + if not frame_locations: + return [] + + retval = [] + curr = frame_locations[0] + retval.append(curr) + want = curr + skip + + for i in range(1, len(frame_locations)): + + next_frame = math.inf + if i + 1 < len(frame_locations): + next_frame = frame_locations[i + 1] + + if next_frame < want: + continue + + curr = frame_locations[i] + + curr_delta = abs(want - curr) + next_delta = abs(next_frame - want) + + too_close_to_last = (curr - retval[-1]) <= (skip / 3) + + if curr_delta <= next_delta and not too_close_to_last: + retval.append(curr) + want = curr + skip + continue + + if next_frame != math.inf: + retval.append(next_frame) + want = next_frame + skip + + return retval + + +class JobConfig: + def __init__(self, job_properties: Mapping[str, str], media_properties=None): + self.prompt_config_path = self._get_prop(job_properties, "PROMPT_CONFIGURATION_PATH", "") + if self.prompt_config_path == "": + self.prompt_config_path = os.path.join(os.path.dirname(__file__), 'data', 'prompts.json') + + self.json_prompt_config_path = self._get_prop(job_properties, "JSON_PROMPT_CONFIGURATION_PATH", "") + if self.json_prompt_config_path == "": + self.json_prompt_config_path = os.path.join(os.path.dirname(__file__), 'data', 'json_prompts.json') + + self.enable_json_prompt_format = self._get_prop(job_properties, "ENABLE_JSON_PROMPT_FORMAT", False) + + self.ollama_server = self._get_prop(job_properties, "OLLAMA_SERVER", "llava-detection-server:11434") + if not os.path.exists(self.prompt_config_path): + raise mpf.DetectionException( + f"Invalid path provided for prompt config JSON file: {self.prompt_config_path}", + mpf.DetectionError.COULD_NOT_OPEN_DATAFILE + ) + + generate_frame_rate_cap = self._get_prop(job_properties, "GENERATE_FRAME_RATE_CAP", 1.0) + if (media_properties is not None) and (generate_frame_rate_cap > 0): + # Check if FPS exists.
If not, raise mpf.DetectionError.MISSING_PROPERTY. + try: + # Note: this value is a frame interval (process every Nth frame). + self.frames_per_second_to_process = max(1, math.floor(float(media_properties['FPS']) / generate_frame_rate_cap)) + except (KeyError, ValueError): + raise mpf.DetectionException( + "FPS not found for media.", + mpf.DetectionError.MISSING_PROPERTY + ) + else: + self.frames_per_second_to_process = -1 + + + @staticmethod + def _get_prop(job_properties, key, default_value, accept_values=()): + prop = mpf_util.get_property(job_properties, key, default_value) + if accept_values and (prop not in accept_values): + raise mpf.DetectionException( + f"Property {key} not in list of acceptable values: {accept_values}", + mpf.DetectionError.INVALID_PROPERTY + ) + return prop + +class Timer: + def __init__(self): + self._seconds_elapsed = 0.0 + self._last_start_time = None + + def start(self): + if self._last_start_time is None: + self._last_start_time = time.perf_counter() + + def pause(self): + if self._last_start_time is not None: + self._seconds_elapsed += time.perf_counter() - self._last_start_time + self._last_start_time = None + + def get_seconds_elapsed_from_last_pause(self) -> float: + return self._seconds_elapsed + +EXPORT_MPF_COMPONENT = LlavaComponent diff --git a/python/LlavaDetection/ollama_server/Dockerfile b/python/LlavaDetection/ollama_server/Dockerfile new file mode 100644 index 00000000..3b3222b2 --- /dev/null +++ b/python/LlavaDetection/ollama_server/Dockerfile @@ -0,0 +1,49 @@ +# syntax=docker/dockerfile:experimental + +############################################################################# +# NOTICE                                                                    # +#                                                                           # +# This software (or technical data) was produced for the U.S. Government   # +# under contract, and is subject to the Rights in Data-General Clause      # +# 52.227-14, Alt. IV (DEC 2007).                                           # +#                                                                           # +# Copyright 2024 The MITRE Corporation. All Rights Reserved.               # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation                                      # +#                                                                           # +# Licensed under the Apache License, Version 2.0 (the "License");           # +# you may not use this file except in compliance with the License.
# +# You may obtain a copy of the License at                                   # +#                                                                           # +#    http://www.apache.org/licenses/LICENSE-2.0                             # +#                                                                           # +# Unless required by applicable law or agreed to in writing, software       # +# distributed under the License is distributed on an "AS IS" BASIS,         # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  # +# See the License for the specific language governing permissions and       # +# limitations under the License.                                            # +############################################################################# + +FROM ollama/ollama + +RUN apt-get update && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ENV HF_HUB_ENABLE_HF_TRANSFER=true + +COPY ./run-ollama.sh /tmp/run-ollama.sh +WORKDIR /tmp + +RUN chmod +x run-ollama.sh \ + && ./run-ollama.sh +EXPOSE 11434 + +LABEL org.label-schema.license="Apache 2.0" \ + org.label-schema.name="OpenMPF LLaVA Detection Ollama Server" \ + org.label-schema.schema-version="1.0" \ + org.label-schema.url="https://openmpf.github.io" \ + org.label-schema.vcs-url="https://github.com/openmpf/openmpf-components" \ + org.label-schema.vendor="MITRE" \ No newline at end of file diff --git a/python/LlavaDetection/ollama_server/run-ollama.sh b/python/LlavaDetection/ollama_server/run-ollama.sh new file mode 100644 index 00000000..bd657193 --- /dev/null +++ b/python/LlavaDetection/ollama_server/run-ollama.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +############################################################################# +# NOTICE                                                                    # +#                                                                           # +# This software (or technical data) was produced for the U.S. Government   # +# under contract, and is subject to the Rights in Data-General Clause      # +# 52.227-14, Alt. IV (DEC 2007).                                           # +#                                                                           # +# Copyright 2024 The MITRE Corporation. All Rights Reserved.               # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation                                      # +#                                                                           # +# Licensed under the Apache License, Version 2.0 (the "License");           # +# you may not use this file except in compliance with the License.          # +# You may obtain a copy of the License at                                   # +#                                                                           # +#    http://www.apache.org/licenses/LICENSE-2.0                             # +#                                                                           # +# Unless required by applicable law or agreed to in writing, software       # +# distributed under the License is distributed on an "AS IS" BASIS,         # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  # +# See the License for the specific language governing permissions and       # +# limitations under the License.                                            # +############################################################################# + +set -o errexit -o pipefail -o xtrace + +/bin/ollama serve & + +# Wait for the Ollama server to come up before pulling the model. +while [ "$(ollama list | grep 'NAME')" == "" ]; do + sleep 1 +done + +echo "Pulling LLaVA model..." +ollama pull llava:34b +echo "Done!"
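Once the server container is running, a quick way to confirm it is reachable and that the pull completed is to mirror the component's own client calls. Below is a minimal smoke test, assuming the container's port is published at `localhost:11434` and that a local test image named `dog.jpg` exists (both are assumptions for illustration):

```python
import base64
import ollama

# Point the client at the published Ollama port (assumed to be localhost:11434).
client = ollama.Client(host='localhost:11434')

# Base64-encode a local test image; the component does the same after
# re-encoding each frame to PNG.
with open('dog.jpg', 'rb') as f:
    encoded = base64.b64encode(f.read()).decode('utf-8')

# Same call pattern the component uses in _get_ollama_response.
response = client.generate('llava:34b', 'Describe this image', images=[encoded])
print(response['response'])
```

If the model is still being pulled, the generate call will fail, so wait for run-ollama.sh to print "Done!" first.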
diff --git a/python/LlavaDetection/plugin-files/descriptor/descriptor.json b/python/LlavaDetection/plugin-files/descriptor/descriptor.json new file mode 100644 index 00000000..1a3fc658 --- /dev/null +++ b/python/LlavaDetection/plugin-files/descriptor/descriptor.json @@ -0,0 +1,194 @@ +{ + "componentName": "LlavaDetection", + "componentVersion": "9.0", + "middlewareVersion": "9.0", + "sourceLanguage": "python", + "batchLibrary": "LlavaDetection", + "environmentVariables": [], + "algorithm": { + "name": "LLAVA", + "description": "LLaVA prompt response generation.", + "actionType": "DETECTION", + "trackType": "CLASS", + "outputChangedCounter": 1, + "requiresCollection": { + "states": [] + }, + "providesCollection": { + "states": [ + "DETECTION", + "DETECTION_CLASS", + "DETECTION_CLASS_LLAVA" + ], + "properties": [ + { + "name": "PROMPT_CONFIGURATION_PATH", + "description": "Path to a custom JSON file which contains the classes and associated prompts that will be sent to LLaVA.", + "type": "STRING", + "defaultValue": "" + }, + { + "name": "JSON_PROMPT_CONFIGURATION_PATH", + "description": "Path to a custom JSON file which contains classes and prompts that instruct LLaVA to return a JSON object.", + "type": "STRING", + "defaultValue": "" + }, + { + "name": "ENABLE_JSON_PROMPT_FORMAT", + "description": "Enables returning a JSON formatted response from LLaVA, using the prompts specified by the JSON_PROMPT_CONFIGURATION_PATH job property.", + "type": "BOOLEAN", + "defaultValue": "false" + }, + { + "name": "OLLAMA_SERVER", + "description": "Ollama server <host>:<port> to use for inferencing.", + "type": "STRING", + "defaultValue": "llava-detection-server:11434" + }, + { + "name": "GENERATE_FRAME_RATE_CAP", + "description": "The threshold on the maximum number of frames to process in the video segment within one second of the native video time. If set to a value > 0, then an internal frame interval is calculated as max(1, floor(mediaNativeFPS / GENERATE_FRAME_RATE_CAP)). If set <= 0, property is disabled and every frame is used.
Throws exception if FRAME_RATE_CAP and GENERATE_FRAME_RATE_CAP both set > 0.", + "type": "DOUBLE", + "defaultValue": "1.0" + } + ] + } + }, + "actions": [ + { + "name": "LLAVA DETECTION ACTION", + "description": "Runs LLaVA with prompts for each video frame passed in.", + "algorithm": "LLaVA", + "properties": [ + { + "name": "ARTIFACT_EXTRACTION_POLICY_BEST_DETECTION_PROP_NAMES_LIST", + "value": "ANNOTATED BY LLAVA" + } + ] + }, + { + "name": "LLAVA DETECTION (WITH FF REGION AND JSON PROMPT) ACTION", + "description": "Runs LLaVA with prompts that specify JSON object outputs for the class of the feed forward detection passed in.", + "algorithm": "LLaVA", + "properties": [ + { + "name": "FEED_FORWARD_TYPE", + "value": "REGION" + }, + { + "name": "ENABLE_JSON_PROMPT_FORMAT", + "value": "true" + }, + { + "name": "ARTIFACT_EXTRACTION_POLICY_BEST_DETECTION_PROP_NAMES_LIST", + "value": "ANNOTATED BY LLAVA" + } + ] + }, + { + "name": "LLAVA DETECTION (EXEMPLAR ONLY WITH FF REGION) ACTION", + "description": "Runs LLaVA with prompts for the class of the feed forward detection passed in, only processing for the exemplar of each track.", + "algorithm": "LLaVA", + "properties": [ + { + "name": "FEED_FORWARD_TYPE", + "value": "REGION" + }, + { + "name": "FEED_FORWARD_TOP_QUALITY_COUNT", + "value": "1" + }, + { + "name": "GENERATE_FRAME_RATE_CAP", + "value": "-1" + } + ] + }, + { + "name": "LLAVA DETECTION (EXEMPLAR ONLY WITH FF REGION AND JSON PROMPT) ACTION", + "description": "Runs LLaVA with prompts that specify JSON object outputs for the class of the feed forward detection passed in, only processing for the exemplar of each track.", + "algorithm": "LLaVA", + "properties": [ + { + "name": "FEED_FORWARD_TYPE", + "value": "REGION" + }, + { + "name": "FEED_FORWARD_TOP_QUALITY_COUNT", + "value": "1" + }, + { + "name": "ENABLE_JSON_PROMPT_FORMAT", + "value": "true" + }, + { + "name": "GENERATE_FRAME_RATE_CAP", + "value": "-1" + } + ] + } + ], + "tasks": [ + { + "name": "LLAVA DETECTION TASK", + "description": "Runs LLaVA with prompts for each video frame passed in.", + "actions": [ + "LLAVA DETECTION ACTION" + ] + }, + { + "name": "LLAVA DETECTION (WITH FF REGION AND JSON PROMPT) TASK", + "description": "Runs LLaVA with prompts that specify JSON object outputs for the class of the feed forward detection passed in.", + "actions": [ + "LLAVA DETECTION (WITH FF REGION AND JSON PROMPT) ACTION" + ] + }, + { + "name": "LLAVA DETECTION (EXEMPLAR ONLY WITH FF REGION) TASK", + "description": "Runs LLaVA with prompts for the class of the feed forward detection passed in, only processing for the exemplar of each track.", + "actions": [ + "LLAVA DETECTION (EXEMPLAR ONLY WITH FF REGION) ACTION" + ] + }, + { + "name": "LLAVA DETECTION (EXEMPLAR ONLY WITH FF REGION AND JSON PROMPT) TASK", + "description": "Runs LLaVA with prompts that specify JSON object outputs for the class of the feed forward detection passed in, only processing for the exemplar of each track.", + "actions": [ + "LLAVA DETECTION (EXEMPLAR ONLY WITH FF REGION AND JSON PROMPT) ACTION" + ] + } + ], + "pipelines": [ + { + "name": "LLAVA DETECTION PIPELINE", + "description":"Runs LLaVA with prompts on images and videos at the specified frame rate.", + "tasks": [ + "LLAVA DETECTION TASK" + ] + }, + { + "name": "LLAVA DETECTION (WITH FF REGION FROM TRITON YOLO AND JSON PROMPT) PIPELINE", + "description":"Runs LLaVA with prompts that specify JSON object outputs on images and videos at the specified frame rate.", + "tasks": [ + "OCV TRITON YOLO OBJECT 
DETECTION TASK", + "LLAVA DETECTION (WITH FF REGION AND JSON PROMPT) TASK" + ] + }, + { + "name": "LLAVA DETECTION (EXEMPLAR ONLY WITH FF REGION FROM TRITON YOLO) PIPELINE", + "description": "Runs LLaVA with prompts for the class of the detection passed in, only processing the exemplar from each track.", + "tasks": [ + "OCV TRITON YOLO OBJECT DETECTION TASK", + "LLAVA DETECTION (EXEMPLAR ONLY WITH FF REGION) TASK" + ] + }, + { + "name": "LLAVA DETECTION (EXEMPLAR ONLY WITH FF REGION FROM TRITON YOLO AND JSON PROMPT) PIPELINE", + "description": "Runs LLaVA with prompts that specify JSON object outputs for the class of the detection passed in, only processing the exemplar from each track.", + "tasks": [ + "OCV TRITON YOLO OBJECT DETECTION TASK", + "LLAVA DETECTION (EXEMPLAR ONLY WITH FF REGION AND JSON PROMPT) TASK" + ] + } + ] +} diff --git a/python/LlavaDetection/pyproject.toml b/python/LlavaDetection/pyproject.toml new file mode 100644 index 00000000..bcd2b658 --- /dev/null +++ b/python/LlavaDetection/pyproject.toml @@ -0,0 +1,29 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" diff --git a/python/LlavaDetection/setup.cfg b/python/LlavaDetection/setup.cfg new file mode 100644 index 00000000..a9a592a2 --- /dev/null +++ b/python/LlavaDetection/setup.cfg @@ -0,0 +1,44 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. 
# +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +[metadata] +name = LlavaDetection +version = 9.0 + +[options] +packages = llava_component +install_requires = + mpf_component_api>=9.0 + mpf_component_util>=9.0 + ollama + opencv-python + +[options.entry_points] +mpf.exported_component = + component = llava_component.llava_component:LlavaComponent + +[options.package_data] +llava_component=data/prompts.json, data/json_prompts.json diff --git a/python/LlavaDetection/tests/data/NOTICE b/python/LlavaDetection/tests/data/NOTICE new file mode 100644 index 00000000..cd215b7f --- /dev/null +++ b/python/LlavaDetection/tests/data/NOTICE @@ -0,0 +1,13 @@ +# dog.jpg +# Public domain + +# person.jpg +# Photo crop from 2017 COCO Validation Set + +# test_video.mp4 +# Created from public domain images + +# car.jpg +# Photo by Sven D on Unsplash +# Unsplash License: https://unsplash.com/license +# https://unsplash.com/photos/parked-white-ford-explorer-suv-a4S6KUuLeoM \ No newline at end of file diff --git a/python/LlavaDetection/tests/data/car.jpg b/python/LlavaDetection/tests/data/car.jpg new file mode 100644 index 00000000..2ddd23f1 Binary files /dev/null and b/python/LlavaDetection/tests/data/car.jpg differ diff --git a/python/LlavaDetection/tests/data/custom_json_prompts.json b/python/LlavaDetection/tests/data/custom_json_prompts.json new file mode 100644 index 00000000..ae86e97d --- /dev/null +++ b/python/LlavaDetection/tests/data/custom_json_prompts.json @@ -0,0 +1,20 @@ +{ + "classPrompts": [ + { + "classes": [ + "DOG" + ], + "prompts": [ + "Describe the dog in JSON. The JSON should have the following keys: breed, color, size." + ] + }, + { + "classes": [ + "PERSON" + ], + "prompts": [ + "Describe the person in JSON. The JSON should have the following keys: hair_color (if unsure, respond with unsure), clothes, activity." + ] + } + ] +} \ No newline at end of file diff --git a/python/LlavaDetection/tests/data/custom_prompts.json b/python/LlavaDetection/tests/data/custom_prompts.json new file mode 100644 index 00000000..2dcdef47 --- /dev/null +++ b/python/LlavaDetection/tests/data/custom_prompts.json @@ -0,0 +1,25 @@ +{ + "classPrompts": [ + { + "classes": [ + "DOG" + ], + "prompts": [ + { + "detectionProperty": "DESCRIPTION", + "prompt": "Describe the color and breed of the dog." 
+ } + ] + } + ], + "framePrompts": [ + { + "detectionProperty": "DESCRIPTION", + "prompt": "Describe this image" + }, + { + "detectionProperty": "LOCATION", + "prompt": "Describe the location in this scene" + } + ] +} \ No newline at end of file diff --git a/python/LlavaDetection/tests/data/dog.jpg b/python/LlavaDetection/tests/data/dog.jpg new file mode 100644 index 00000000..ed24dac9 Binary files /dev/null and b/python/LlavaDetection/tests/data/dog.jpg differ diff --git a/python/LlavaDetection/tests/data/outputs/test-ignore-output.txt b/python/LlavaDetection/tests/data/outputs/test-ignore-output.txt new file mode 100644 index 00000000..43c1ebac --- /dev/null +++ b/python/LlavaDetection/tests/data/outputs/test-ignore-output.txt @@ -0,0 +1,11 @@ +```json +{ + "hair_color": "gray", + "clothes": { + "jacket": "blue", + "shirt": "white", + "tie": "yellow" + }, + "activity": "shaking hands with another person" +} +``` \ No newline at end of file diff --git a/python/LlavaDetection/tests/data/outputs/test-json-response-image-output.txt b/python/LlavaDetection/tests/data/outputs/test-json-response-image-output.txt new file mode 100644 index 00000000..dc3a0977 --- /dev/null +++ b/python/LlavaDetection/tests/data/outputs/test-json-response-image-output.txt @@ -0,0 +1,60 @@ +```json +{ + "visible_person": true, + "person": { + "Type": "Public figure", + "clothing": { + "jacket": "gray", + "shirt": "white", + "tie": "gold" + }, + "estimated_age_range": "adult", + "estimated_gender": "male", + "estimated_race": "Caucasian", + "accessories": [ + { + "type": "glasses", + "color": "black" + } + ], + "visible_glasses": true, + "glasses": { + "type": "spectacles", + "color": "black", + "describe": "A pair of black rimmed glasses." + }, + "visible_object_in_hand": false, + "object_in_hand": null, + "person_wearing_shoe": true, + "shoe": { + "type": "business dress shoes", + "color": "black", + "describe": "A pair of black dress shoes." + }, + "visible_head_hair": true, + "head_features": { + "head_hair_color": "graying brown", + "bald": false, + "visible_head_cover": false, + "head_cover_type": null, + "visible_tattoo": false, + "tattoo_features": null + }, + "visible_face": true, + "visible_eye": true, + "face_features": { + "visible_eye": true, + "eye_color": "blue", + "visible_facial_hair": false, + "facial_hair_color": null, + "facial_features": null, + "emotion_of_person": "neutral" + }, + "action_performed": "Shaking hands with another person", + "background": { + "describe": "The background shows an event venue with tables and chairs. The setting suggests a formal or semi-formal gathering." 
+ }, + "other_notable_characteristics": null + } +} +``` \ No newline at end of file diff --git a/python/LlavaDetection/tests/data/outputs/test-json-response-video-output.txt b/python/LlavaDetection/tests/data/outputs/test-json-response-video-output.txt new file mode 100644 index 00000000..a172d9aa --- /dev/null +++ b/python/LlavaDetection/tests/data/outputs/test-json-response-video-output.txt @@ -0,0 +1,7 @@ +```json +{ + "breed": "Collie", + "color": "black and white", + "size": "medium to large" +} +``` \ No newline at end of file diff --git a/python/LlavaDetection/tests/data/outputs/test-video-nth-frame-json-output.txt b/python/LlavaDetection/tests/data/outputs/test-video-nth-frame-json-output.txt new file mode 100644 index 00000000..4b3555f5 --- /dev/null +++ b/python/LlavaDetection/tests/data/outputs/test-video-nth-frame-json-output.txt @@ -0,0 +1,7 @@ +```json +{ + "breed": "Collie", + "color": "Black and white", + "size": "Large" +} +``` \ No newline at end of file diff --git a/python/LlavaDetection/tests/data/person.jpg b/python/LlavaDetection/tests/data/person.jpg new file mode 100644 index 00000000..a6e548e8 Binary files /dev/null and b/python/LlavaDetection/tests/data/person.jpg differ diff --git a/python/LlavaDetection/tests/data/test_video.mp4 b/python/LlavaDetection/tests/data/test_video.mp4 new file mode 100644 index 00000000..1303a1ea Binary files /dev/null and b/python/LlavaDetection/tests/data/test_video.mp4 differ diff --git a/python/LlavaDetection/tests/test_llava.py b/python/LlavaDetection/tests/test_llava.py new file mode 100644 index 00000000..473dfd90 --- /dev/null +++ b/python/LlavaDetection/tests/test_llava.py @@ -0,0 +1,404 @@ +############################################################################# +# NOTICE # +# # +# This software (or technical data) was produced for the U.S. Government # +# under contract, and is subject to the Rights in Data-General Clause # +# 52.227-14, Alt. IV (DEC 2007). # +# # +# Copyright 2024 The MITRE Corporation. All Rights Reserved. # +############################################################################# + +############################################################################# +# Copyright 2024 The MITRE Corporation # +# # +# Licensed under the Apache License, Version 2.0 (the "License"); # +# you may not use this file except in compliance with the License. # +# You may obtain a copy of the License at # +# # +# http://www.apache.org/licenses/LICENSE-2.0 # +# # +# Unless required by applicable law or agreed to in writing, software # +# distributed under the License is distributed on an "AS IS" BASIS, # +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # +# See the License for the specific language governing permissions and # +# limitations under the License. # +############################################################################# + +import sys +import os +import logging +import warnings + +# Add llava_component to path. 
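+# Appending the parent directory lets the tests import the component package
+# straight from the source tree, without installing it first.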
+sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
+from llava_component.llava_component import LlavaComponent
+
+import unittest
+import unittest.mock
+from unittest.mock import MagicMock, Mock
+import mpf_component_api as mpf
+
+logging.basicConfig(level=logging.DEBUG)
+# Set to False to run these tests against a live Ollama server instead of mocks.
+USE_MOCKS = True
+
+class TestLlava(unittest.TestCase):
+
+    def run_patched_job(self, component, job, side_effect_function):
+        if isinstance(job, mpf.ImageJob):
+            detection_func = component.get_detections_from_image
+        elif isinstance(job, mpf.VideoJob):
+            detection_func = component.get_detections_from_video
+        else:
+            raise TypeError("Job must be an mpf.ImageJob or mpf.VideoJob.")
+
+        if not USE_MOCKS:
+            return detection_func(job)
+
+        mock_container = MagicMock()
+        with unittest.mock.patch("ollama.Client", return_value=mock_container):
+            with unittest.mock.patch.object(LlavaComponent, "_encode_image") as _encoded_image_mocked:
+                mock_container.generate = Mock(side_effect=side_effect_function)
+                _encoded_image_mocked.return_value = ""
+
+                results = list(detection_func(job))
+        return results
+
+    def test_image_file(self):
+        ff_loc = mpf.ImageLocation(0, 0, 347, 374, -1, dict(CLASSIFICATION="PERSON"))
+        job = mpf.ImageJob(
+            job_name='test-image',
+            data_uri=self._get_test_file('person.jpg'),
+            job_properties=dict(
+                OLLAMA_SERVER='localhost:11434'
+            ),
+            media_properties={},
+            feed_forward_location=ff_loc
+        )
+        component = LlavaComponent()
+
+        def side_effect_function(model, prompt, images):
+            if prompt == "Describe what this person is wearing":
+                response = "The person in the image is wearing a dark suit with a matching tie. The shirt underneath appears to be light-colored, possibly white or off-white. He has glasses on his face and is smiling as he shakes hands with someone who isn't fully visible in the frame. His attire suggests a formal setting, possibly for business or an event that requires professional dress code."
+            elif prompt == "Describe what this person is doing":
+                response = "The person in the image appears to be shaking someone's hand. They are wearing a suit and tie, which suggests they may be in a professional or formal setting. The context of the photo is not clear from this angle, but it looks like they could be at an event or gathering where such interactions are common."
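+            # Return the canned text in the same {'response': ...} shape that the
+            # component reads from each ollama generate() call.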
+ + return {"response": f"{response}"} + + result = self.run_patched_job(component, job, side_effect_function)[0] + + self.assertTrue("CLOTHING" in result.detection_properties and "ACTIVITY" in result.detection_properties) + self.assertTrue(len(result.detection_properties["CLOTHING"]) > 0 and len(result.detection_properties["ACTIVITY"]) > 0) + + def test_image_file_no_prompts(self): + ff_loc = mpf.ImageLocation(0, 0, 347, 374, -1, dict(CLASSIFICATION="PERSON")) + job = mpf.ImageJob( + job_name='test-image-no-prompts', + data_uri=self._get_test_file('person.jpg'), + job_properties=dict( + PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_prompts.json'), + OLLAMA_SERVER='localhost:11434' + ), + media_properties={}, + feed_forward_location=ff_loc + ) + component = LlavaComponent() + + def side_effect_function(model, prompt, images): + return {"response": ""} + + result = self.run_patched_job(component, job, side_effect_function)[0] + self.assertTrue(len(result.detection_properties) == 1 and result.detection_properties['CLASSIFICATION'] == 'PERSON') + + def test_custom_config(self): + ff_loc = mpf.ImageLocation(0, 0, 900, 1600, -1, dict(CLASSIFICATION="DOG")) + job = mpf.ImageJob( + job_name='test-custom', + data_uri=self._get_test_file('dog.jpg'), + job_properties=dict( + PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_prompts.json'), + OLLAMA_SERVER='localhost:11434' + ), + media_properties={}, + feed_forward_location=ff_loc + ) + component = LlavaComponent() + + def side_effect_function(model, prompt, images): + if prompt == "Describe the color and breed of the dog.": + response = "The dog in the image appears to be a Golden Retriever. The breed is known for its golden-colored fur, which can range from pale blonde to deeper golden shades, often with some darker feathering around the ears and along the tail. This specific dog has a beautiful golden coat that suggests it may be younger or well-groomed. The facial features of Golden Retriever dogs are also quite distinctive, such as their expressive eyes and long, floppy ears. They are medium to large-sized breed with a friendly and intelligent disposition." + + return {"response": f"{response}"} + + result = self.run_patched_job(component, job, side_effect_function)[0] + + self.assertTrue("DESCRIPTION" in result.detection_properties) + self.assertTrue(len(result.detection_properties["DESCRIPTION"]) > 0) + + def test_video_file(self): + warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning) + + ff_track = mpf.VideoTrack(0, 0, -1, {}, {'CLASSIFICATION': 'DOG'}) + ff_track.frame_locations[0] = mpf.ImageLocation(0, 0, 3456, 5184, -1, {'CLASSIFICATION': 'DOG', 'CLASSIFICATION CONFIDENCE LIST': '-1', 'CLASSIFICATION LIST': 'DOG'}) + + job = mpf.VideoJob( + job_name='test-video', + data_uri=self._get_test_file('test_video.mp4'), + start_frame=0, + stop_frame=0, + job_properties=dict( + PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_prompts.json'), + OLLAMA_SERVER='localhost:11434', + GENERATE_FRAME_RATE_CAP='-1' + ), + media_properties={}, + feed_forward_track=ff_track + ) + component = LlavaComponent() + + def side_effect_function(model, prompt, images): + if prompt == "Describe the color and breed of the dog.": + response = "The dog in the image appears to be a Border Collie. The breed is characterized by its black and white color pattern, which you can see here with distinct patches of black fur against a mostly white background. 
Border Collies are known for their intelligent eyes and expressive faces, which they use to work livestock. They also have a double coat that is thick and wavy in texture. In this photo, the dog looks well-groomed and healthy." + + return {"response": f"{response}"} + + result = list(self.run_patched_job(component, job, side_effect_function))[0] + for ff_location in result.frame_locations.values(): + self.assertTrue("DESCRIPTION" in ff_location.detection_properties) + self.assertTrue(len(ff_location.detection_properties['DESCRIPTION']) > 0) + + def test_full_frame_image(self): + job = mpf.ImageJob( + job_name='test-full-frame-image', + data_uri=self._get_test_file('dog.jpg'), + job_properties=dict( + PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_prompts.json'), + OLLAMA_SERVER='localhost:11434' + ), + media_properties={}, + feed_forward_location=None + ) + component = LlavaComponent() + def side_effect_function(model, prompt, images): + if prompt == "Describe this image": + response = "The image shows a medium-sized dog sitting on a couch. The dog appears to be a breed with tan and white fur, likely a mix given the irregular patterns of its coat. It has a scrunched up expression on its face, possibly indicating curiosity or attentiveness towards something off-camera. There is a small animal, potentially another pet such as a cat, in front of the dog's paws. The background is blurred but seems to be an indoor setting with natural light filtering through." + elif prompt == "Describe the location in this scene": + response = "The image shows a dog sitting on a couch indoors. The room has a light wood floor and there's a glimpse of what appears to be an artwork or picture frame hanging on the wall in the background. The focus is on the dog, which suggests that it's either the main subject of the photograph or someone wanted to capture a candid moment with their pet." + + return {"response": f"{response}"} + + results = self.run_patched_job(component, job, side_effect_function) + for result in results: + self.assertTrue("LOCATION" in result.detection_properties and "DESCRIPTION" in result.detection_properties) + self.assertTrue(len(result.detection_properties["LOCATION"]) > 0 and len(result.detection_properties["DESCRIPTION"]) > 0) + + def test_full_frame_video(self): + job = mpf.VideoJob( + job_name='test-full-frame-video', + data_uri=self._get_test_file('test_video.mp4'), + start_frame=0, + stop_frame=14, + job_properties=dict( + PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_prompts.json'), + OLLAMA_SERVER='localhost:11434', + GENERATE_FRAME_RATE_CAP='-1' + ), + media_properties={}, + feed_forward_track=None + ) + component = LlavaComponent() + + def side_effect_function(model, prompt, images): + if prompt == "Describe this image": + response = "This is a photo of a dog with a black, white, and grey coat. The dog appears to be a Border Collie or similar breed known for its distinctive coloring. It's sitting on what looks like a concrete surface outdoors, possibly in a yard or on a patio. The dog has a focused gaze towards the camera, and its mouth is slightly open, suggesting it might be panting or perhaps reacting to the person taking the photo. In the background, there are elements of a fence and vegetation, indicating that this setting could be near a garden or a fenced area. The lighting suggests it's daytime." + elif prompt == "Describe the location in this scene": + response = "The image shows a dog sitting on what appears to be a stone or concrete floor. 
The dog is facing the camera with its mouth open, revealing its teeth and tongue, which could suggest it's panting or smiling. There is a fence in the background, indicating that this might be an outdoor area such as a garden, patio, or a residential backyard. Beyond the fence, there are some plants and trees, suggesting a natural environment. The lighting appears to be diffused, possibly from cloudy weather or shaded by nearby structures or foliage. There's no text visible in the image." + + return {"response": f"{response}"} + + results = self.run_patched_job(component, job, side_effect_function) + for result in results: + self.assertTrue("LOCATION" in result.detection_properties and "DESCRIPTION" in result.detection_properties) + self.assertTrue(len(result.detection_properties["LOCATION"]) > 0 and len(result.detection_properties["DESCRIPTION"]) > 0) + + def test_json_response_image(self): + ff_loc = mpf.ImageLocation(0, 0, 347, 374, -1, dict(CLASSIFICATION="PERSON")) + job = mpf.ImageJob( + job_name='test-json-response-image', + data_uri=self._get_test_file('person.jpg'), + job_properties=dict( + OLLAMA_SERVER='localhost:11434', + ENABLE_JSON_PROMPT_FORMAT='True' + ), + media_properties={}, + feed_forward_location=ff_loc + ) + component = LlavaComponent() + + def side_effect_function(model, prompt, images): + with open(os.path.join(os.path.dirname(__file__), 'data', 'outputs', f"{job.job_name}-output.txt")) as f: + response = f.read() + + return {"response": f"{response}"} + + result = self.run_patched_job(component, job, side_effect_function)[0] + self.assertTrue(len(result.detection_properties) > 1) + + def test_json_response_video(self): + warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning) + + ff_track = mpf.VideoTrack(0, 0, -1, {}, {'CLASSIFICATION': 'DOG'}) + ff_track.frame_locations[0] = mpf.ImageLocation(0, 0, 3456, 5184, -1, {'CLASSIFICATION': 'DOG', 'CLASSIFICATION CONFIDENCE LIST': '-1', 'CLASSIFICATION LIST': 'DOG'}) + + job = mpf.VideoJob( + job_name='test-json-response-video', + data_uri=self._get_test_file('test_video.mp4'), + start_frame=0, + stop_frame=0, + job_properties=dict( + OLLAMA_SERVER='localhost:11434', + ENABLE_JSON_PROMPT_FORMAT='True', + JSON_PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_json_prompts.json'), + GENERATE_FRAME_RATE_CAP='-1' + ), + media_properties={}, + feed_forward_track=ff_track + ) + component = LlavaComponent() + + def side_effect_function(model, prompt, images): + with open(os.path.join(os.path.dirname(__file__), 'data', 'outputs', f"{job.job_name}-output.txt")) as f: + response = f.read() + + return {"response": f"{response}"} + + result = list(self.run_patched_job(component, job, side_effect_function))[0] + self.assertTrue(len(result.frame_locations[0].detection_properties) > 3) + + def test_video_file_nth_frame(self): + warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning) + + ff_track = mpf.VideoTrack(0, 0, -1, {}, {'CLASSIFICATION': 'DOG'}) + for i in range(5): + ff_track.frame_locations[i] = mpf.ImageLocation(0, 0, 3456, 5184, -1, {'CLASSIFICATION': 'DOG', 'CLASSIFICATION CONFIDENCE LIST': '-1', 'CLASSIFICATION LIST': 'DOG'}) + + job = mpf.VideoJob( + job_name='test-video-nth-frame', + data_uri=self._get_test_file('test_video.mp4'), + start_frame=0, + stop_frame=4, + job_properties=dict( + PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_prompts.json'), + OLLAMA_SERVER='localhost:11434', + GENERATE_FRAME_RATE_CAP='1.0' + ), + media_properties={ + 'FPS': 
'2' + }, + feed_forward_track=ff_track + ) + component = LlavaComponent() + + def side_effect_function(model, prompt, images): + if prompt == "Describe the color and breed of the dog.": + response = "The dog in the image appears to be a Border Collie. The breed is characterized by its black and white color pattern, which you can see here with distinct patches of black fur against a mostly white background. Border Collies are known for their intelligent eyes and expressive faces, which they use to work livestock. They also have a double coat that is thick and wavy in texture. In this photo, the dog looks well-groomed and healthy." + + return {"response": f"{response}"} + + result = list(self.run_patched_job(component, job, side_effect_function))[0] + + for i, ff_location in result.frame_locations.items(): + if i % 2 == 0: + self.assertTrue("DESCRIPTION" in ff_location.detection_properties) + self.assertTrue(len(ff_location.detection_properties['DESCRIPTION']) > 0) + else: + self.assertTrue("DESCRIPTION" not in ff_location.detection_properties) + + def test_video_file_nth_frame_json(self): + warnings.filterwarnings(action="ignore", message="unclosed", category=ResourceWarning) + + ff_track = mpf.VideoTrack(0, 0, -1, {}, {'CLASSIFICATION': 'DOG'}) + for i in range(5): + ff_track.frame_locations[i] = mpf.ImageLocation(0, 0, 3456, 5184, -1, {'CLASSIFICATION': 'DOG', 'CLASSIFICATION CONFIDENCE LIST': '-1', 'CLASSIFICATION LIST': 'DOG'}) + + job = mpf.VideoJob( + job_name='test-video-nth-frame-json', + data_uri=self._get_test_file('test_video.mp4'), + start_frame=0, + stop_frame=4, + job_properties=dict( + JSON_PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_json_prompts.json'), + ENABLE_JSON_PROMPT_FORMAT='True', + OLLAMA_SERVER='localhost:11434' + ), + media_properties={ + 'FPS': '2' + }, + feed_forward_track=ff_track + ) + component = LlavaComponent() + + def side_effect_function(model, prompt, images): + with open(os.path.join(os.path.dirname(__file__), 'data', 'outputs', f"{job.job_name}-output.txt")) as f: + response = f.read() + + return {"response": f"{response}"} + + result = list(self.run_patched_job(component, job, side_effect_function))[0] + + for i, ff_location in result.frame_locations.items(): + if i % 2 == 0: + self.assertTrue("LLAVA" in ff_location.detection_properties) + else: + self.assertTrue("LLAVA" not in ff_location.detection_properties) + + def test_ignore_results(self): + ff_loc = mpf.ImageLocation(0, 0, 347, 374, -1, dict(CLASSIFICATION="PERSON")) + job = mpf.ImageJob( + job_name='test-ignore', + data_uri=self._get_test_file('person.jpg'), + job_properties=dict( + OLLAMA_SERVER='localhost:11434', + JSON_PROMPT_CONFIGURATION_PATH=self._get_test_file('custom_json_prompts.json'), + ENABLE_JSON_PROMPT_FORMAT='True' + ), + media_properties={}, + feed_forward_location=ff_loc + ) + component = LlavaComponent() + + def side_effect_function(model, prompt, images): + with open(os.path.join(os.path.dirname(__file__), 'data', 'outputs', f"{job.job_name}-output.txt")) as f: + response = f.read() + + return {"response": f"{response}"} + + result = self.run_patched_job(component, job, side_effect_function)[0] + + for key, value in result.detection_properties.items(): + if key.startswith("LLAVA"): + self.assertTrue(value.strip().lower() != 'unsure') + + def test_get_frames(self): + component = LlavaComponent() + self.assertEqual(component._get_frames_to_process([], 1), []) + self.assertEqual(component._get_frames_to_process([1], 2), [1]) + 
self.assertEqual(component._get_frames_to_process([503], 2), [503]) + self.assertEqual(component._get_frames_to_process([503, 1_000], 5_000), [503]) + self.assertEqual(component._get_frames_to_process([0,1,2,3,4,5], 1), [0,1,2,3,4,5]) + self.assertEqual(component._get_frames_to_process([0,1,2,3,4,5], 2), [0,2,4, 5]) + self.assertEqual(component._get_frames_to_process([0,1,2,3,4,5], 3), [0,3,5]) + self.assertEqual(component._get_frames_to_process([0,1,2,3,4,5,900], 3), [0,3,5,900]) + self.assertEqual(component._get_frames_to_process([4,900,902,905,906,907,908,909,910,911,912,913], 5), [4,900,905,910,913]) + self.assertEqual(component._get_frames_to_process([910,911,912,913,914,915,916,917,918], 6), [910,916]) + self.assertEqual(component._get_frames_to_process([910,911,912,913,914,915,916,917,918,919], 6), [910,916,919]) + self.assertEqual(component._get_frames_to_process([910,911,912,913,914,915,916,917,918,919,920], 6), [910,916,920]) + self.assertEqual(component._get_frames_to_process([910,911,912,913,914,915,916,917,918,919,920,921], 6), [910,916,921]) + self.assertEqual(component._get_frames_to_process([910,911,912,913,914,915,916,917,918,919,920,921,922], 6), [910,916,922]) + self.assertEqual(component._get_frames_to_process([910,911,912,913,914,915,916,917,918,5_000,5_001,10_000], 6), [910,916,5_000,10_000]) + + @staticmethod + def _get_test_file(filename): + return os.path.join(os.path.dirname(__file__), 'data', filename) + +if __name__ == '__main__': + unittest.main(verbosity=2) \ No newline at end of file
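
All of the tests above share one mocking pattern: patch ollama.Client so that generate() is served by a side-effect function keyed on the exact prompt text. A minimal standalone sketch of that pattern, assuming the ollama package is importable as it is in these tests (the name canned_generate and the sample prompt/reply are illustrative, not part of this patch):

import unittest.mock
from unittest.mock import MagicMock, Mock

def canned_generate(model, prompt, images):
    # Map each prompt the component is expected to send to a fixed reply,
    # returned in the same {'response': ...} shape the component parses.
    replies = {'Describe this image': 'A dog sitting on a couch.'}
    return {'response': replies.get(prompt, '')}

mock_client = MagicMock()
mock_client.generate = Mock(side_effect=canned_generate)
with unittest.mock.patch('ollama.Client', return_value=mock_client):
    # Anything that constructs ollama.Client(...) now receives mock_client,
    # so every generate() call returns one of the canned replies above.
    pass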