diff --git a/.gitignore b/.gitignore index 3f7e1f41..f7817793 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ hs_err_pid* *.devcontainer* target/ +venv/ # CMake Files CMakeCache.txt @@ -60,3 +61,6 @@ cmake-build-release/ # Python *.egg-info *.pyc + +*.private +venv \ No newline at end of file diff --git a/python/ClipDetection/Dockerfile b/python/ClipDetection/Dockerfile index b2ac57c3..ce32534a 100644 --- a/python/ClipDetection/Dockerfile +++ b/python/ClipDetection/Dockerfile @@ -29,10 +29,11 @@ ARG MODELS_REGISTRY=openmpf/ ARG BUILD_REGISTRY ARG BUILD_TAG=latest -FROM ${MODELS_REGISTRY}openmpf_clip_detection_models:7.2.0 as models +FROM ${MODELS_REGISTRY}openmpf_clip_detection_models:8.0.0 as models FROM ${BUILD_REGISTRY}openmpf_python_executor_ssb:${BUILD_TAG} COPY --from=models /models/ViT-B-32.pt /models/ViT-B-32.pt +COPY --from=models /models/ViT-L-14.pt /models/ViT-L-14.pt RUN --mount=type=tmpfs,target=/var/cache/apt \ --mount=type=tmpfs,target=/var/lib/apt/lists \ diff --git a/python/ClipDetection/README.md b/python/ClipDetection/README.md index 75d40b37..675443b5 100644 --- a/python/ClipDetection/README.md +++ b/python/ClipDetection/README.md @@ -6,6 +6,8 @@ This repository contains source code for the OpenMPF CLIP detection component. C The following are the properties that can be specified for the component. Each property has a default value and so none of them necessarily need to be specified for processing jobs. +- `MODEL_NAME`: Specifies the CLIP model that is loaded and used by the component. The only supported models are 'ViT-L/14' (the default model) and 'ViT-B/32'. + - `NUMBER_OF_CLASSIFICATIONS`: Specifies how many of the top classifications you want to return. The default value is set to 1, and so you'll only see the classification with the greatest confidence. - `CLASSIFICATION_PATH`: If specified, this allows the user to give the component a file path to their own list of classifications in a CSV file, if the COCO or ImageNet class lists aren't of interest. See below for the formatting that's required for that file. @@ -14,9 +16,9 @@ The following are the properties that can be specified for the component. Each p - `TEMPLATE_PATH`: If specified, this allows the user to give the component a file path to their own list of templates. See below for the formatting that's required for that file. The OpenAI developers admitted that the process of developing templates was a lot of trial and error, so feel free to come up with your own! -- `NUMBER_OF_TEMPLATES`: There are three template files that are included in the component, with the number of templates in each being 1, 7, and 80. The one template is a basic template, while the 7 and 80 come from the OpenAI team when trying to [improve performance](https://github.com/openai/CLIP/blob/main/notebooks/Prompt_Engineering_for_ImageNet.ipynb) on the ImageNet dataset. The default value is 80, while 1 and 7 are the only other valid inputs. Also this property is overridden if a `TEMPLATE_PATH` is specified. +- `TEMPLATE_TYPE`: There are three template files that are included in the component, with the number of templates in each being 1, 7, and 80. The one template is a basic template, while the 7 and 80 come from the OpenAI team when trying to [improve performance](https://github.com/openai/CLIP/blob/main/notebooks/Prompt_Engineering_for_ImageNet.ipynb) on the ImageNet dataset. The default value is 'openai_80', while 'openai_1' and 'openai_7' are the only other valid inputs. 
Also, this property is overridden if a `TEMPLATE_PATH` is specified. -- `ENABLE_CROPPING`: A boolean toggle to specify if the image is to be cropped into 144 images of size 224x224 which cover all areas of the original. By default, this is set to true. This technique is described Section 7 of the paper "[Going deeper with convolutions](https://arxiv.org/abs/1409.4842)" from Szegedy, et al. +- `ENABLE_CROPPING`: A boolean toggle to specify if the image is to be cropped into 144 images of size 224x224 which cover all areas of the original. By default, this is set to true. This technique is described in Section 7 of the paper "[Going deeper with convolutions](https://arxiv.org/abs/1409.4842)" from Szegedy, et al. - `ENABLE_TRITON`: A boolean toggle to specify whether the component should use a Triton inference server to process the image job. By default this is set to false. @@ -24,6 +26,8 @@ The following are the properties that can be specified for the component. Each p - `TRITON_SERVER`: Specifies the Triton server `<host>:<port>` to use for inferencing. By default, this is set to 'clip-detection-server:8001'. +- `DETECTION_FRAME_BATCH_SIZE`: Specifies the batch size when processing video files. By default, this is set to 64. + ## Detection Properties Returned `ImageLocation` objects have the following members in their `detection_properties`: @@ -54,6 +58,42 @@ tench,"tench, Tinca tinca" kite (bird of prey),kite magpie,magpie ```
+# Non-Triton Performance
+The table below shows the performance of this component on an NVIDIA Tesla V100 32GB GPU, for varying batch sizes with both models:
+| Model Name | Batch Size | Total Time (seconds) | Average Time per Batch (seconds) | Average Images per Second |
+|------------|------------|----------------------|----------------------------------|---------------------------|
+| ViT-B/32 | 16 | 38.5732 | 0.04311 | 371.1126 |
+| ViT-B/32 | 32 | 37.3478 | 0.08349 | 383.289 |
+| ViT-B/32 | 64 | 34.6141 | 0.1548 | 413.5598 |
+| ViT-B/32 | 128 | 35.897 | 0.321 | 398.7798 |
+| ViT-B/32 | 256 | 33.5689 | 0.6003 | 426.4364 |
+| ViT-B/32 | 512 | 36.3621 | 1.3006 | 393.6791 |
+| ViT-L/14 | 16 | 108.6101 | 0.1214 | 131.8017 |
+| ViT-L/14 | 32 | 103.8613 | 0.2322 | 137.828 |
+| ViT-L/14 | 64 | 101.1478 | 0.4522 | 141.5256 |
+| ViT-L/14 | 128 | 102.0473 | 0.9125 | 140.2781 |
+| ViT-L/14 | 256 | 99.6637 | 1.7823 | 143.633 |
+| ViT-L/14 | 512 | 105.8889 | 3.7873 | 135.1889 |
+
+# Triton Performance
+The table below shows the performance of this component with Triton on an NVIDIA Tesla V100 32GB GPU, for varying batch sizes:
+| Model Name | Batch Size | VRAM Usage (MiB) | Total Time (seconds) | Average Time per Batch (seconds) | Average Images per Second |
+|------------|------------|------------------|----------------------|----------------------------------|---------------------------|
+| ViT-B/32 | 16 | 1249 | 23.9591 | 0.02678 | 597.4765 |
+| ViT-B/32 | 32 | 1675 | 20.1931 | 0.04514 | 708.9055 |
+| ViT-B/32 | 64 | 1715 | 33.08468 | 0.1479 | 432.6776 |
+| ViT-B/32 | 128 | 1753 | 35.3511 | 0.3161 | 404.9379 |
+| ViT-B/32 | 256 | 1827 | 33.7730 | 0.6040 | 423.8593 |
+| ViT-L/14 | 16 | 1786 | 126.2017 | 0.1411 | 113.4295 |
+| ViT-L/14 | 32 | 2414 | 114.7415 | 0.2565 | 124.7587 |
+| ViT-L/14 | 64 | 2662 | 132.1087 | 0.5906 | 108.3577 |
+| ViT-L/14 | 128 | 3150 | 140.7985 | 1.2590 | 101.6701 |
+| ViT-L/14 | 256 | 3940 | 131.6293 | 2.3540 | 108.7524 |
+
+# Future Research
+* Investigate using the CLIP interrogator for determining text prompts for classification.
+* Investigate methods to automate the generation of text prompts. + * [Context Optimization (CoOp)](http://arxiv.org/abs/2109.01134) and [Conditional Context Optimization (CoCoOp)](http://arxiv.org/abs/2203.05557) models a prompt's context as a set of learnable vectors that can be optimized for the classes you're looking for, with CoCoOp improving on CoOp's ability in classifying to classes unseen by CoOp in training. # Known Issues diff --git a/python/ClipDetection/clip_component/clip_component.py b/python/ClipDetection/clip_component/clip_component.py index bcea3132..c1ac460e 100644 --- a/python/ClipDetection/clip_component/clip_component.py +++ b/python/ClipDetection/clip_component/clip_component.py @@ -28,7 +28,8 @@ import os import csv from pkg_resources import resource_filename -from typing import Mapping, Iterable +from itertools import islice +from typing import Iterable, Mapping from PIL import Image import cv2 @@ -48,82 +49,175 @@ logger = logging.getLogger('ClipComponent') device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') -class ClipComponent(mpf_util.ImageReaderMixin): +class ClipComponent(mpf_util.ImageReaderMixin, mpf_util.VideoCaptureMixin): + detection_type = 'CLASS' def __init__(self): - self._wrapper = ClipWrapper() + self._model_wrappers = {} + + @staticmethod + def _get_prop(job_properties, key, default_value, accept_values=[]): + prop = mpf_util.get_property(job_properties, key, default_value) + if (accept_values != []) and (prop not in accept_values): + raise mpf.DetectionException( + f"Property {key} not in list of acceptible values: {accept_values}", + mpf.DetectionError.INVALID_PROPERTY + ) + return prop + + def _parse_properties(self, job_properties): + model_name = self._get_prop(job_properties, "MODEL_NAME", "ViT-L/14", ["ViT-L/14", "ViT-B/32"]) + batch_size = self._get_prop(job_properties, "DETECTION_FRAME_BATCH_SIZE", 64) + classification_list = self._get_prop(job_properties, "CLASSIFICATION_LIST", 'coco', ['coco', 'imagenet']) + classification_path = os.path.expandvars(self._get_prop(job_properties, "CLASSIFICATION_PATH", '')) + enable_cropping = self._get_prop(job_properties, "ENABLE_CROPPING", True) + enable_triton = self._get_prop(job_properties, "ENABLE_TRITON", False) + include_features = self._get_prop(job_properties, "INCLUDE_FEATURES", False) + num_classifications = self._get_prop(job_properties, "NUMBER_OF_CLASSIFICATIONS", 1) + template_type = self._get_prop(job_properties, "TEMPLATE_TYPE", 'openai_80', ['openai_1', 'openai_7', 'openai_80']) + template_path = os.path.expandvars(self._get_prop(job_properties, "TEMPLATE_PATH", '')) + triton_server = self._get_prop(job_properties, "TRITON_SERVER", 'clip-detection-server:8001') + + return dict( + model_name = model_name, + batch_size = batch_size, + classification_list = classification_list, + classification_path = classification_path, + enable_cropping = enable_cropping, + enable_triton = enable_triton, + include_features = include_features, + num_classifications = num_classifications, + template_type = template_type, + template_path = template_path, + triton_server = triton_server + ) def get_detections_from_image_reader(self, image_job, image_reader): + logger.info("Received image job: %s", image_job) + + kwargs = self._parse_properties(image_job.job_properties) + image = image_reader.get_image() + num_detections = 0 try: - logger.info("received image job: %s", image_job) - image = image_reader.get_image() - detections = self._wrapper.get_classifications((image,), 
image_job.job_properties) + wrapper = self._get_model_wrapper(kwargs['model_name']) + detections = wrapper.get_detections((image,), **kwargs) for detection in detections: yield detection num_detections += 1 - logger.info(f"Job complete. Found {num_detections} detection{'s' if num_detections > 1 else ''}.") + logger.info(f"Job complete. Found {num_detections} detections.") except Exception as e: logger.exception(f'Job failed due to: {e}') raise + @staticmethod + def _batches_from_video_capture(video_capture, batch_size): + frames = [] + for frame in video_capture: + frames.append(frame) + if len(frames) >= batch_size: + yield len(frames), np.stack(frames) + frames = [] + + if len(frames): + padded = np.pad( + array=np.stack(frames), + pad_width=((0, batch_size - len(frames)), (0, 0), (0, 0), (0, 0)), + mode='constant', + constant_values=0 + ) + yield len(frames), padded + + def get_detections_from_video_capture(self, + video_job: mpf.VideoJob, + video_capture: mpf_util.VideoCapture) -> Iterable[mpf.VideoTrack]: + logger.info("Received video job: %s", video_job) + kwargs = self._parse_properties(video_job.job_properties) + + # If processing a video where each frame is cropped into 144 images, the batch size is set to one so that the crops aren't split between batches + batch_size = 1 if kwargs['enable_cropping'] else kwargs['batch_size'] + + batch_gen = self._batches_from_video_capture(video_capture, batch_size) + detections = [] + wrapper = self._get_model_wrapper(kwargs['model_name']) + + for n, batch in batch_gen: + try: + detections += list(islice(wrapper.get_detections(batch, **kwargs), n)) + except Exception as e: + logger.exception(f"Job failed due to: {e}") + raise + + tracks = create_tracks(detections) + logger.info(f"Job complete. Found {len(tracks)} tracks.") + return tracks + + def _get_model_wrapper(self, model_name): + if model_name not in self._model_wrappers: + self._model_wrappers[model_name] = ClipWrapper(model_name) + + return self._model_wrappers[model_name] + class ClipWrapper(object): - def __init__(self): + def __init__(self, model_name='ViT-L/14'): logger.info("Loading model...") - model, _ = clip.load('ViT-B/32', device=device, download_root='/models') + model, _ = clip.load(model_name, device=device, download_root='/models') logger.info("Model loaded.") + self._model = model self._preprocessor = None + self._input_resolution = self._model.visual.input_resolution self._classification_path = '' self._template_path = '' self._classification_list = '' self._templates = None + self._template_type = None self._class_mapping = None self._text_features = None self._inferencing_server = None - self._triton_server_url = None - - def get_classifications(self, images, job_properties: Mapping[str, str]) -> Iterable[mpf.ImageLocation]: - kwargs = self._parse_properties(job_properties) - self._check_template_list(kwargs['template_path'], kwargs['num_templates']) - self._check_class_list(kwargs['classification_path'], kwargs['classification_list']) - self._preprocessor = ImagePreprocessor(kwargs['enable_cropping']) - - for image in images: - image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) - image_width, image_height = image.size - - image = self._preprocessor.preprocess(image).to(device) - - if kwargs['enable_triton']: - if self._inferencing_server is None or kwargs['triton_server'] != self._triton_server_url: - self._inferencing_server = CLIPInferencingServer(kwargs['triton_server']) - self._triton_server_url = kwargs['triton_server'] - - results = 
self._inferencing_server.get_responses(image) - image_tensors= torch.Tensor(np.copy(results)).to(device=device) - image_features = torch.mean(image_tensors, 0) - else: - with torch.no_grad(): - image_features = self._model.encode_image(image).float() - image_features = torch.mean(image_features, 0).unsqueeze(0) - + def get_detections(self, images, **kwargs) -> Iterable[mpf.ImageLocation]: + templates_changed = self._check_template_list(kwargs['template_path'], kwargs['template_type']) + self._check_class_list(kwargs['classification_path'], kwargs['classification_list'], templates_changed) + + self._preprocessor = ImagePreprocessor(kwargs['enable_cropping'], self._input_resolution) + images = [Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) for image in images] + image_sizes = [image.size for image in images] + torch_imgs = torch.stack([self._preprocessor.preprocess(image).squeeze(0) for image in images]).to(device) + if kwargs['enable_cropping']: + torch_imgs = torch_imgs.squeeze(0) + + if kwargs['enable_triton']: + if self._inferencing_server is None or \ + kwargs['triton_server'] != self._inferencing_server.get_url() or \ + kwargs['model_name'] != self._inferencing_server.get_model_name(): + self._inferencing_server = CLIPInferencingServer(kwargs['triton_server'], kwargs['model_name']) + + results = self._inferencing_server.get_responses(torch_imgs) + image_features = torch.Tensor(np.copy(results)).squeeze(0).to(device=device) + else: with torch.no_grad(): - image_features /= image_features.norm(dim=-1, keepdim=True) + image_features = self._model.encode_image(torch_imgs).float() + + with torch.no_grad(): + image_features /= image_features.norm(dim=-1, keepdim=True) + + similarity = (100.0 * image_features @ self._text_features).softmax(dim=-1).to(device) - similarity = (100.0 * image_features @ self._text_features).softmax(dim=-1).to(device) - similarity = torch.mean(similarity, 0) - values, indices = similarity.topk(len(self._class_mapping)) + if kwargs['enable_cropping']: + similarity = torch.mean(similarity, 0).unsqueeze(0) + + values, indices = similarity.topk(len(self._class_mapping)) + for detection_values, detection_indices, image_size in zip(values, indices, image_sizes): classification_list = [] classification_confidence_list = [] count = 0 - for value, index in zip(values, indices): + for value, index in zip(detection_values, detection_indices): if count >= kwargs['num_classifications']: break class_name = self._class_mapping[list(self._class_mapping.keys())[int(index)]] @@ -131,7 +225,7 @@ def get_classifications(self, images, job_properties: Mapping[str, str]) -> Iter classification_list.append(class_name) classification_confidence_list.append(str(value.item())) count += 1 - + classification_list = '; '.join(classification_list) classification_confidence_list = '; '.join(classification_confidence_list) @@ -147,46 +241,13 @@ def get_classifications(self, images, job_properties: Mapping[str, str]) -> Iter yield mpf.ImageLocation( x_left_upper = 0, y_left_upper = 0, - width = image_width, - height = image_height, + width = image_size[0], + height = image_size[1], confidence = float(classification_confidence_list.split('; ')[0]), detection_properties = detection_properties ) - def _parse_properties(self, job_properties): - classification_list = self._get_prop(job_properties, "CLASSIFICATION_LIST", 'coco', ['coco', 'imagenet']) - classification_path = os.path.expandvars(self._get_prop(job_properties, "CLASSIFICATION_PATH", '')) - enable_cropping = 
self._get_prop(job_properties, "ENABLE_CROPPING", True) - enable_triton = self._get_prop(job_properties, "ENABLE_TRITON", False) - include_features = self._get_prop(job_properties, "INCLUDE_FEATURES", False) - num_classifications = self._get_prop(job_properties, "NUMBER_OF_CLASSIFICATIONS", 1) - num_templates = self._get_prop(job_properties, "NUMBER_OF_TEMPLATES", 80, [1, 7, 80]) - template_path = os.path.expandvars(self._get_prop(job_properties, "TEMPLATE_PATH", '')) - triton_server = self._get_prop(job_properties, "TRITON_SERVER", 'clip-detection-server:8001') - - return dict( - classification_list = classification_list, - classification_path = classification_path, - enable_cropping = enable_cropping, - enable_triton = enable_triton, - include_features = include_features, - num_classifications = num_classifications, - num_templates = num_templates, - template_path = template_path, - triton_server = triton_server - ) - - @staticmethod - def _get_prop(job_properties, key, default_value, accep_values=[]): - prop = mpf_util.get_property(job_properties, key, default_value) - if (accep_values != []) and (prop not in accep_values): - raise mpf.DetectionException( - f"Property {key} not in list of acceptible values: {accep_values}", - mpf.DetectionError.INVALID_PROPERTY - ) - return prop - - def _check_template_list(self, template_path, number_of_templates): + def _check_template_list(self, template_path: str, template_type: str) -> bool: if template_path != '': if (not os.path.exists(template_path)): raise mpf.DetectionException( @@ -196,30 +257,33 @@ def _check_template_list(self, template_path, number_of_templates): elif self._template_path != template_path: self._template_path = template_path - try: - logger.info("Updating templates...") - self._templates = self._get_templates_from_file(template_path) - logger.info("Templates updated.") - except: - raise mpf.DetectionException( - f"Could not read templates from {template_path}", - mpf.DetectionError.COULD_NOT_READ_DATAFILE - ) - - elif self._templates == None or number_of_templates != len(self._templates): - if number_of_templates == 80: + try: + logger.info("Updating templates...") + self._templates = self._get_templates_from_file(template_path) + logger.info("Templates updated.") + return True + except: + raise mpf.DetectionException( + f"Could not read templates from {template_path}", + mpf.DetectionError.COULD_NOT_READ_DATAFILE + ) + elif (self._templates == None) or (template_type != self._template_type): + if template_type == 'openai_80': template_filename = 'eighty_templates.txt' - elif number_of_templates == 7: + elif template_type == 'openai_7': template_filename = 'seven_templates.txt' - elif number_of_templates == 1: + elif template_type == 'openai_1': template_filename = 'one_template.txt' template_path = os.path.realpath(resource_filename(__name__, 'data/' + template_filename)) logger.info("Updating templates...") self._templates = self._get_templates_from_file(template_path) + self._template_type = template_type logger.info("Templates updated.") + return True + return False - def _check_class_list(self, classification_path, classification_list): + def _check_class_list(self, classification_path: str, classification_list: str, templates_changed: bool) -> None: if classification_path != "": if (not os.path.exists(classification_path)): raise mpf.DetectionException( @@ -231,7 +295,7 @@ def _check_class_list(self, classification_path, classification_list): self._classification_list = classification_list.lower() classification_path 
= os.path.realpath(resource_filename(__name__, f'data/{self._classification_list}_classification_list.csv')) - if self._classification_path != classification_path: + if self._classification_path != classification_path or templates_changed: self._classification_path = classification_path try: @@ -259,7 +323,7 @@ def _check_class_list(self, classification_path, classification_list): logger.info("Text embeddings created.") @staticmethod - def _get_mapping_from_classifications(classification_path): + def _get_mapping_from_classifications(classification_path: str) -> Mapping[str, str]: with open(classification_path) as csvfile: mapping = {} csvreader = csv.reader(csvfile) @@ -269,7 +333,7 @@ def _get_mapping_from_classifications(classification_path): return mapping @staticmethod - def _get_templates_from_file(template_path): + def _get_templates_from_file(template_path: str) -> Iterable[str]: with open(template_path) as f: return [line.strip() for line in f.readlines()] @@ -277,8 +341,13 @@ class CLIPInferencingServer(object): ''' Class that handles Triton inferencing if enabled. ''' - def __init__(self, triton_server): - self._model_name = 'ip_clip_512' + + MODEL_NAME_ON_SERVER_MAPPING = {'ViT-L/14': 'vit_l_14', 'ViT-B/32': 'vit_b_32'} + + def __init__(self, triton_server: str, model_name: str = 'ViT-L/14'): + self._url = triton_server + self._model_name = model_name + self._model_name_on_server = self.MODEL_NAME_ON_SERVER_MAPPING[model_name] self._input_name = None self._output_name = None self._dtype = None @@ -293,14 +362,22 @@ def __init__(self, triton_server): self._check_triton_server() try: - model_metadata = self._triton_client.get_model_metadata(model_name=self._model_name) + model_metadata = self._triton_client.get_model_metadata(model_name=self._model_name_on_server) except InferenceServerException as e: - logger.exception("Failed to retrieve model metadata.") - raise + raise mpf.DetectionException( + f"Failed to retrieve model metadata for {self._model_name_on_server}: {e}", + mpf.DetectionError.NETWORK_ERROR + ) self._parse_model(model_metadata) - def _parse_model(self, model_metadata): + def get_url(self) -> str: + return self._url + + def get_model_name(self) -> str: + return self._model_name + + def _parse_model(self, model_metadata) -> None: input_metadata = model_metadata.inputs[0] output_metadata = model_metadata.outputs[0] @@ -323,10 +400,10 @@ def get_responses(self, images): responses = [] try: for inputs, outputs in self._get_inputs_outputs(images): - responses.append(self._triton_client.infer(model_name=self._model_name, inputs=inputs, outputs=outputs)) - except Exception: + responses.append(self._triton_client.infer(model_name=self._model_name_on_server, inputs=inputs, outputs=outputs)) + except Exception as e: raise mpf.DetectionException( - f"Inference failed.", + f"Inference failed: {e}", mpf.DetectionError.NETWORK_ERROR ) @@ -336,10 +413,16 @@ def get_responses(self, images): results.append(result) return results - def _check_triton_server(self): - if not self._triton_client.is_server_live(): + def _check_triton_server(self) -> None: + try: + if not self._triton_client.is_server_live(): + raise mpf.DetectionException( + "Server is not live.", + mpf.DetectionError.NETWORK_ERROR + ) + except InferenceServerException as e: raise mpf.DetectionException( - "Server is not live.", + f"Failed to check if server is live: {e}", mpf.DetectionError.NETWORK_ERROR ) @@ -349,9 +432,9 @@ def _check_triton_server(self): mpf.DetectionError.NETWORK_ERROR ) - if not 
self._triton_client.is_model_ready(self._model_name): + if not self._triton_client.is_model_ready(self._model_name_on_server): raise mpf.DetectionException( - f"Model {self._model_name} is not ready.", + f"Model {self._model_name_on_server} is not ready.", mpf.DetectionError.NETWORK_ERROR ) @@ -360,7 +443,8 @@ class ImagePreprocessor(object): Class that handles the preprocessing of images before being sent through the CLIP model. Values from T.Normalize() taken from OpenAI's code for CLIP, https://github.com/openai/CLIP/blob/main/clip/clip.py#L85 ''' - def __init__(self, enable_cropping): + def __init__(self, enable_cropping: bool, image_size: int): + self.image_size = image_size if enable_cropping: self.preprocess = self.crop else: @@ -376,16 +460,16 @@ def crop(self, image): def resize_pad(self, image): width, height = image.width, image.height - resize_ratio = 224 / max(width, height) - new_w, new_h = (int(width * resize_ratio), int(height * resize_ratio)) + resize_ratio = self.image_size / max(width, height) + new_w, new_h = (round(width * resize_ratio), round(height * resize_ratio)) if new_w < new_h: - left = (224 - new_w) // 2 - right = (225 - new_w) // 2 + left = (self.image_size - new_w) // 2 + right = (self.image_size + 1 - new_w) // 2 padding = (left, 0, right, 0) else: - top = (224 - new_h) // 2 - bottom = (225 - new_h) // 2 + top = (self.image_size - new_h) // 2 + bottom = (self.image_size + 1 - new_h) // 2 padding = (0, top, 0, bottom) new_img = T.Compose([ @@ -416,6 +500,27 @@ def _get_crops(imgs): resized = TF.resize(img, 224) crops += five_crops + (resized, TF.hflip(resized)) + tuple([TF.hflip(fcrop) for fcrop in five_crops]) return crops - - + +def create_tracks(detections: Iterable[mpf.ImageLocation]) -> Iterable[mpf.VideoTrack]: + tracks = [] + for idx, detection in enumerate(detections): + if len(tracks) == 0 or tracks[-1].detection_properties["CLASSIFICATION"] != detection.detection_properties["CLASSIFICATION"]: + detection_properties = { "CLASSIFICATION": detection.detection_properties["CLASSIFICATION"] } + frame_locations = { idx: detection } + track = mpf.VideoTrack( + start_frame=idx, + stop_frame=idx, + confidence=detection.confidence, + frame_locations=frame_locations, + detection_properties=detection_properties + ) + tracks.append(track) + else: + tracks[-1].stop_frame = idx + tracks[-1].frame_locations[idx] = detection + if tracks[-1].confidence < detection.confidence: + tracks[-1].confidence = detection.confidence + + return tracks + EXPORT_MPF_COMPONENT = ClipComponent diff --git a/python/ClipDetection/clip_component/data/imagenet_classification_list.csv b/python/ClipDetection/clip_component/data/imagenet_classification_list.csv index 78c8537b..6de2c296 100644 --- a/python/ClipDetection/clip_component/data/imagenet_classification_list.csv +++ b/python/ClipDetection/clip_component/data/imagenet_classification_list.csv @@ -132,7 +132,7 @@ flamingo,flamingo little blue heron,"little blue heron, Egretta caerulea" great egret,"American egret, great white heron, Egretta albus" bittern bird,bittern -crane bird,crane +crane bird,crane bird limpkin,"limpkin, Aramus pictus" common gallinule,"European gallinule, Porphyrio porphyrio" American coot,"American coot, marsh hen, mud hen, water hen, Fulica americana" @@ -515,7 +515,7 @@ cornet,"cornet, horn, trumpet, trump" cowboy boot,cowboy boot cowboy hat,"cowboy hat, ten-gallon hat" cradle,cradle -construction crane,crane +construction crane,construction crane crash helmet,crash helmet crate,crate infant bed,"crib, cot" 
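The video path above relies on a pad-then-trim pattern: `_batches_from_video_capture` zero-pads the final partial batch so every array handed to the model has a fixed batch size, and `islice(..., n)` in `get_detections_from_video_capture` then discards the detections produced for the padding frames. The snippet below is a minimal, self-contained sketch of that pattern on dummy data; the names `batches_with_padding` and `fake_model` are illustrative stand-ins, not part of the component.

```python
from itertools import islice

import numpy as np


def batches_with_padding(frames, batch_size):
    """Yield (real_frame_count, fixed_size_batch), zero-padding the last batch."""
    batch = []
    for frame in frames:
        batch.append(frame)
        if len(batch) == batch_size:
            yield len(batch), np.stack(batch)
            batch = []
    if batch:
        padded = np.pad(
            np.stack(batch),
            pad_width=((0, batch_size - len(batch)), (0, 0), (0, 0), (0, 0)),
            mode='constant')
        yield len(batch), padded


def fake_model(batch):
    # Stand-in for the CLIP forward pass: one score per frame in the batch.
    return [float(frame.mean()) for frame in batch]


frames = [np.full((4, 4, 3), i, dtype=np.float32) for i in range(10)]
scores = []
for real_count, batch in batches_with_padding(frames, batch_size=4):
    # islice drops the outputs produced for the zero-padded frames.
    scores += list(islice(fake_model(batch), real_count))

assert len(scores) == len(frames)
```

Note that the component forces the batch size to 1 when `ENABLE_CROPPING` is true, so the 144 crops generated for a single frame are never split across batches.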
diff --git a/python/ClipDetection/plugin-files/descriptor/descriptor.json b/python/ClipDetection/plugin-files/descriptor/descriptor.json index 745e25b4..38c3a039 100644 --- a/python/ClipDetection/plugin-files/descriptor/descriptor.json +++ b/python/ClipDetection/plugin-files/descriptor/descriptor.json @@ -21,6 +21,12 @@ "DETECTION_CLASS_CLIP" ], "properties": [ + { + "name": "MODEL_NAME", + "description": "Specifies which CLIP model to load for inferencing. The available models are 'ViT-L/14' and 'ViT-B/32'.", + "type": "STRING", + "defaultValue": "ViT-L/14" + }, { "name": "NUMBER_OF_CLASSIFICATIONS", "description": "The number of classifications, N, to be returned. The N highest confidence classifications found by the network will be returned with their associated confidence values. The value must be greater than 0, and less than the size of the model output layer.", "type": "INT", "defaultValue": "1" }, { - "name": "NUMBER_OF_TEMPLATES", - "description": "The number of templates to be used in the text encoder. The current acceptable values are 7 and 80.", - "type": "INT", - "defaultValue": "80" + "name": "TEMPLATE_TYPE", + "description": "The set of templates to be used in the text encoder. The current acceptable values are 'openai_1', 'openai_7', and 'openai_80'.", + "type": "STRING", + "defaultValue": "openai_80" }, { "name": "CLASSIFICATION_LIST", @@ -53,7 +59,7 @@ }, { "name": "ENABLE_CROPPING", - "description": "If true, the image will be cropped into 144 images of size 224x224. The results from each of these images is averaged to get the results. Not available for use on CPU.", + "description": "If true, the image will be cropped into 144 images of size 224x224. The results from each of these images are averaged to get the final result. Not recommended for use on CPU.", "type": "BOOLEAN", "defaultValue": "true" }, { @@ -74,6 +80,12 @@ "description": "Triton server <host>:<port> to use for inferencing.", "type": "STRING", "defaultValue": "clip-detection-server:8001" + }, + { + "name": "DETECTION_FRAME_BATCH_SIZE", + "description": "Number of frames to batch for inference when processing video. GPU VRAM dependent.
If ENABLE_CROPPING is set to true, then the value will be ignored and set to 1.", + "type": "INT", + "defaultValue": "64" } ] } @@ -93,6 +105,10 @@ { "name": "ENABLE_TRITON", "value": "true" + }, + { + "name": "DETECTION_FRAME_BATCH_SIZE", + "value": "32" } ] }, @@ -119,6 +135,10 @@ { "name": "CLASSIFICATION_LIST", "value": "imagenet" + }, + { + "name": "DETECTION_FRAME_BATCH_SIZE", + "value": "32" } ] } diff --git a/python/ClipDetection/tests/data/NOTICE b/python/ClipDetection/tests/data/NOTICE index 9e5de8ce..0cee4e82 100644 --- a/python/ClipDetection/tests/data/NOTICE +++ b/python/ClipDetection/tests/data/NOTICE @@ -7,6 +7,9 @@ # riot.jpg # Public Domain +# test_video.mp4 +# Custom created file from public domain images + # violence_classes.csv # Custom created file for testing CLASSIFICATION_PATH diff --git a/python/ClipDetection/tests/data/test_video.mp4 b/python/ClipDetection/tests/data/test_video.mp4 new file mode 100644 index 00000000..1303a1ea Binary files /dev/null and b/python/ClipDetection/tests/data/test_video.mp4 differ diff --git a/python/ClipDetection/tests/test_clip.py b/python/ClipDetection/tests/test_clip.py index 50171345..8b1a674f 100644 --- a/python/ClipDetection/tests/test_clip.py +++ b/python/ClipDetection/tests/test_clip.py @@ -45,10 +45,10 @@ def test_image_file(self): data_uri=self._get_test_file('dog.jpg'), job_properties=dict( NUMBER_OF_CLASSIFICATIONS = 3, - NUMBER_OF_TEMPLATES = 1, - CLASSIFICATION_LIST = 'coco', - ENABLE_CROPPING='False', - INCLUDE_FEATURES = 'True' + TEMPLATE_TYPE = 'openai_1', + ENABLE_CROPPING ='False', + INCLUDE_FEATURES = 'True', + MODEL_NAME="ViT-B/32" ), media_properties={}, feed_forward_location=None @@ -73,20 +73,22 @@ def test_image_file_custom(self): media_properties={}, feed_forward_location=None ) - result = list(ClipComponent().get_detections_from_image(job))[0] + component = ClipComponent() + result = list(component.get_detections_from_image(job))[0] + self.assertEqual(job.job_properties["NUMBER_OF_CLASSIFICATIONS"], len(self._output_to_list(result.detection_properties["CLASSIFICATION LIST"]))) self.assertTrue("violent scene" in self._output_to_list(result.detection_properties["CLASSIFICATION LIST"])) self.assertEqual("violent scene", result.detection_properties["CLASSIFICATION"]) - + def test_image_file_rollup(self): job = mpf.ImageJob( job_name='test-image-rollup', data_uri=self._get_test_file('dog.jpg'), job_properties=dict( NUMBER_OF_CLASSIFICATIONS = 4, - NUMBER_OF_TEMPLATES = 1, + TEMPLATE_TYPE = 'openai_1', CLASSIFICATION_PATH = self._get_test_file("rollup.csv"), - ENABLE_CROPPING='False' + ENABLE_CROPPING = 'False' ), media_properties={}, feed_forward_location=None @@ -94,6 +96,35 @@ def test_image_file_rollup(self): result = list(ClipComponent().get_detections_from_image(job))[0] self.assertEqual("indoor animal", result.detection_properties["CLASSIFICATION"]) + def test_video_file(self): + job = mpf.VideoJob( + job_name='test-video', + data_uri=self._get_test_file('test_video.mp4'), + start_frame=0, + stop_frame=14, + job_properties=dict( + TEMPLATE_TYPE = 'openai_1', + ENABLE_CROPPING = 'False', + DETECTION_FRAME_BATCH_SIZE = 4 + ), + media_properties={}, + feed_forward_track=None + ) + component = ClipComponent() + results = list(component.get_detections_from_video(job)) + + self.assertEqual(results[0].detection_properties['CLASSIFICATION'], "dog") + self.assertEqual(results[0].start_frame, 0) + self.assertEqual(results[0].stop_frame, 4) + + self.assertEqual(results[1].detection_properties['CLASSIFICATION'], 
"orange") + self.assertEqual(results[1].start_frame, 5) + self.assertEqual(results[1].stop_frame, 9) + + self.assertEqual(results[2].detection_properties['CLASSIFICATION'], "dog") + self.assertEqual(results[2].start_frame, 10) + self.assertEqual(results[2].stop_frame, 14) + @staticmethod def _get_test_file(filename): return os.path.join(os.path.dirname(__file__), 'data', filename) diff --git a/python/ClipDetection/tests/test_clip_triton.py b/python/ClipDetection/tests/test_clip_triton.py index 80113644..8b6481e1 100644 --- a/python/ClipDetection/tests/test_clip_triton.py +++ b/python/ClipDetection/tests/test_clip_triton.py @@ -37,27 +37,25 @@ logging.basicConfig(level=logging.DEBUG) -class TestClip(unittest.TestCase): +class TestClipTriton(unittest.TestCase): def test_image_file(self): job = mpf.ImageJob( - job_name='test-image', + job_name='test-image-triton', data_uri=self._get_test_file('collie.jpg'), job_properties=dict( NUMBER_OF_CLASSIFICATIONS = 10, - NUMBER_OF_TEMPLATES = 80, - CLASSIFICATION_LIST = 'imagenet', - ENABLE_CROPPING='False', - ENABLE_TRITON='True', - TRITON_SERVER='clip-detection-server:8001' + TEMPLATE_TYPE = 'openai_80', + ENABLE_CROPPING = 'False', + ENABLE_TRITON = 'True', + TRITON_SERVER = 'clip-detection-server:8001' ), media_properties={}, feed_forward_location=None ) result = list(ClipComponent().get_detections_from_image(job))[0] - self.assertTrue("collie" in self._output_to_list(result.detection_properties["CLASSIFICATION LIST"]) or "Border collie" in self._output_to_list(result.detection_properties["CLASSIFICATION LIST"])) - - + self.assertTrue("dog" in self._output_to_list(result.detection_properties["CLASSIFICATION LIST"])) + @staticmethod def _get_test_file(filename): return os.path.join(os.path.dirname(__file__), 'data', filename) diff --git a/python/ClipDetection/triton_server/Dockerfile b/python/ClipDetection/triton_server/Dockerfile index 17e27d5c..7656c071 100644 --- a/python/ClipDetection/triton_server/Dockerfile +++ b/python/ClipDetection/triton_server/Dockerfile @@ -28,12 +28,15 @@ ARG MODELS_REGISTRY=openmpf/ -FROM ${MODELS_REGISTRY}openmpf_clip_detection_models:7.2.0 as models +FROM ${MODELS_REGISTRY}openmpf_clip_detection_triton_models:8.0.0 as models FROM nvcr.io/nvidia/tritonserver:22.04-py3 as openmpf_triton_server -COPY --from=models /models/model.pt /models/ip_clip_512/1/model.pt -COPY models/ip_clip_512.config.pbtxt /models/ip_clip_512/config.pbtxt +COPY --from=models /models/vit_b_32.pt /models/vit_b_32/1/vit_b_32.pt +COPY models/vit_b_32.config.pbtxt /models/vit_b_32/config.pbtxt + +COPY --from=models /models/vit_l_14.pt /models/vit_l_14/1/vit_l_14.pt +COPY models/vit_l_14.config.pbtxt /models/vit_l_14/config.pbtxt RUN apt-get update; \ apt-get -y upgrade; \ diff --git a/python/ClipDetection/triton_server/models/ip_clip_512.config.pbtxt b/python/ClipDetection/triton_server/models/vit_b_32.config.pbtxt similarity index 88% rename from python/ClipDetection/triton_server/models/ip_clip_512.config.pbtxt rename to python/ClipDetection/triton_server/models/vit_b_32.config.pbtxt index bdc0d103..32249761 100644 --- a/python/ClipDetection/triton_server/models/ip_clip_512.config.pbtxt +++ b/python/ClipDetection/triton_server/models/vit_b_32.config.pbtxt @@ -1,4 +1,5 @@ -name: "ip_clip_512" +name: "vit_b_32" +default_model_filename: "vit_b_32.pt" backend: "pytorch" max_batch_size: 2048 input [ diff --git a/python/ClipDetection/triton_server/models/vit_l_14.config.pbtxt b/python/ClipDetection/triton_server/models/vit_l_14.config.pbtxt new file mode 
100644 index 00000000..3431bfa7 --- /dev/null +++ b/python/ClipDetection/triton_server/models/vit_l_14.config.pbtxt @@ -0,0 +1,28 @@
+name: "vit_l_14"
+default_model_filename: "vit_l_14.pt"
+backend: "pytorch"
+max_batch_size: 2048
+input [
+  {
+    name: "image_input"
+    data_type: TYPE_FP32
+    dims: [3, 224, 224]
+  }
+]
+output [
+  {
+    name: "feature_vector__0"
+    data_type: TYPE_FP32
+    dims: [768]
+  }
+]
+parameters [
+  {
+    key: "INFERENCE_MODE"
+    value: {string_value: "true"}
+  },
+  {
+    key: "ENABLE_NVFUSER"
+    value: {string_value: "true"}
+  }
+]
\ No newline at end of file
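The new `vit_l_14.config.pbtxt` mirrors the `vit_b_32` config, but the two encoders have different embedding widths: CLIP's ViT-B/32 image encoder produces 512-dimensional feature vectors while ViT-L/14 produces 768-dimensional ones, which is what the output `dims` above must declare for the similarity computation against the locally encoded text features to work. The script that produced the traced `.pt` files is not part of this change; the following is only a rough sketch, assuming the served models are the CLIP visual encoders, of how one could check the input resolution and feature dimension that each config needs. It assumes the OpenAI `clip` package is installed; `clip.load` will download the weights if they are not already cached.

```python
import clip
import torch

# Check the values each Triton config must declare:
#   input dims  -> [3, input_resolution, input_resolution]
#   output dims -> [feature dimension of the encoder]
for model_name in ('ViT-B/32', 'ViT-L/14'):
    model, _ = clip.load(model_name, device='cpu')
    visual = model.visual.float().eval()

    resolution = visual.input_resolution  # 224 for both models
    dummy = torch.zeros(1, 3, resolution, resolution)
    with torch.no_grad():
        features = visual(dummy)

    # Prints e.g. "ViT-L/14 224 768" and "ViT-B/32 224 512".
    print(model_name, resolution, features.shape[-1])
```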