microsoft · fepegar · Sep 16, 2022 · Sep 14, 2022 · Sep 14, 2022 · Sep 15, 2022
diff --git a/hi-ml-multimodal/src/health_multimodal/image/inference_engine.py b/hi-ml-multimodal/src/health_multimodal/image/inference_engine.py
@@ -8,6 +8,7 @@
 from typing import Callable, Tuple
 
 import torch
+import torch.nn.functional as F
 from torchvision.transforms import Compose
 
 from health_multimodal.image.data.io import load_image
@@ -56,7 +57,7 @@ def load_and_transform_input_image(self, image_path: Path, transform: Callable)
 
     @torch.no_grad()
     def get_patch_embeddings_from_image(self, image_path: Path) -> Tuple[torch.Tensor, TypeShape2D]:
-        """Compute image embeddings in the joint latent space, preserving the image grid.
+        """Compute image patch embeddings in the joint latent space, preserving the image grid.
 
         :param image_path: Path to the image to compute embeddings for.
         :return: A tuple containing the image patch embeddings and
@@ -67,3 +68,19 @@ def get_patch_embeddings_from_image(self, image_path: Path) -> Tuple[torch.Tenso
         assert projected_img_emb.shape[0] == 1
 
         return projected_img_emb[0], img_shape
+
+    @torch.no_grad()
+    def get_projected_global_embedding_from_image(self, image_path: Path) -> torch.Tensor:
+        """Embed a single image as one l2-normalised vector in the joint latent space.
+
+        :param image_path: Path to the image file to load, transform and embed.
+        :return: 1-D tensor of shape [joint_feature_dim,] holding the normalised global embedding.
+        """
+        image_batch, _ = self.load_and_transform_input_image(image_path, self.transform)
+        model_output = self.model.forward(image_batch)
+        global_emb = F.normalize(model_output.projected_global_embedding, dim=-1)
+
+        assert global_emb.shape[0] == 1  # exactly one image went through the model
+        assert global_emb.ndim == 2  # a batch axis followed by the feature axis
+
+        return global_emb[0]
diff --git a/hi-ml-multimodal/src/health_multimodal/vlp/inference_engine.py b/hi-ml-multimodal/src/health_multimodal/vlp/inference_engine.py
@@ -27,6 +27,24 @@ def __init__(self,
         self.image_inference_engine = image_inference_engine
         self.text_inference_engine = text_inference_engine
 
+    def get_similarity_score_from_raw_data(self,
+                                           image_path: Path,
+                                           query_text: str) -> float:
+        """Score how well a text phrase matches an image using their global embeddings.
+
+        :param image_path: Path to the input chest X-ray, either a DICOM or JPEG file.
+        :param query_text: Input radiology text phrase.
+        :return: Similarity of the global image embedding and the text embedding, as a Python float.
+        """
+        image_engine, text_engine = self.image_inference_engine, self.text_inference_engine
+        assert not image_engine.model.training
+        assert not text_engine.model.training
+
+        img_emb = image_engine.get_projected_global_embedding_from_image(image_path)
+        txt_emb = text_engine.get_embeddings_from_prompt(query_text)
+
+        return (img_emb @ txt_emb.t()).item()
+
     def get_similarity_map_from_raw_data(self,
                                          image_path: Path,
                                          query_text: str,

diff --git a/hi-ml-multimodal/test_multimodal/image/test_image_inference_engine.py b/hi-ml-multimodal/test_multimodal/image/test_image_inference_engine.py
@@ -0,0 +1,38 @@
+#  -------------------------------------------------------------------------------------------
+#  Copyright (c) Microsoft Corporation. All rights reserved.
+#  Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
+#  -------------------------------------------------------------------------------------------
+
+import tempfile
+from pathlib import Path
+
+import pytest
+import torch
+from PIL import Image
+
+from health_multimodal.image import ImageModel, ResnetType, ImageInferenceEngine
+from health_multimodal.image.data.transforms import create_chest_xray_transform_for_inference
+
+
+@pytest.mark.parametrize("height", (400, 500, 650))
+def test_image_inference_engine(height: int) -> None:
+    """Check that the global embedding of a dummy image has the expected shape and unit l2 norm."""
+
+    joint_feature_size = 128
+    resize = 512
+    center_crop_size = 480
+    width = 600
+
+    image_inference = ImageInferenceEngine(
+        image_model=ImageModel(img_model_type=ResnetType.RESNET50.value, joint_feature_size=joint_feature_size),
+        transform=create_chest_xray_transform_for_inference(resize=resize, center_crop_size=center_crop_size))
+
+    # A NamedTemporaryFile cannot be reopened by name while still open on Windows, so use a directory instead
+    with tempfile.TemporaryDirectory() as temp_dir:
+        image_path = Path(temp_dir) / 'dummy.jpg'
+        Image.new('RGB', (width, height)).save(image_path)
+
+        # The global embedding must be a 1-D vector of the joint feature size with unit l2 norm
+        image_embedding = image_inference.get_projected_global_embedding_from_image(image_path)
+        assert image_embedding.shape == (joint_feature_size,)
+        assert torch.allclose(torch.norm(image_embedding), torch.tensor([1.00]))
diff --git a/hi-ml-multimodal/test_multimodal/vlp/test_vlp_inference_engine.py b/hi-ml-multimodal/test_multimodal/vlp/test_vlp_inference_engine.py
@@ -66,3 +66,8 @@ def test_vlp_inference(height: int, query_text: str) -> None:
 
         similarity_map = img_txt_inference._get_similarity_map_from_embeddings(image_embedding, text_embedding)
         assert similarity_map.shape == expected_image_embedding_size
+
+        # Test global similarity score
+        sim_score = img_txt_inference.get_similarity_score_from_raw_data(image_path=image_path, query_text=query_text)
+        assert isinstance(sim_score, float)
+        assert sim_score >= -1 and sim_score <= 1