# MoZuMa overview


# Downloading images


In [None]:
import torch
from mozuma.torch.callbacks import TorchRunnerCallbackType
from mozuma.torch.datasets import ListDatasetIndexed, TorchDataset
from mozuma.torch.modules import TorchModel
from mozuma.torch.runners import TorchInferenceRunner
from mozuma.torch.options import TorchRunnerOptions


def run_torch_model_inference(
    model: TorchModel,
    callbacks: "list[TorchRunnerCallbackType]",
    dataset: "TorchDataset | None" = None,
) -> None:
    """Run inference for a PyTorch model on CPU and feed the results to callbacks.

    Args:
        model: The model to run.
        callbacks: Callbacks handed to the runner; they receive the model outputs.
        dataset: Dataset to run the model on. When ``None``, the notebook's
            images are used, indexed by their URL (``IMAGE_URLS`` /
            ``images_objects``).
    """
    # Compare against None explicitly: `dataset or ...` would also throw away a
    # caller-supplied dataset that happens to be falsy (e.g. an empty dataset
    # defining __len__) and silently run on the default images instead.
    if dataset is None:
        dataset = ListDatasetIndexed(indices=IMAGE_URLS, objects=images_objects)

    runner = TorchInferenceRunner(
        model=model,
        dataset=dataset,
        callbacks=callbacks,
        options=TorchRunnerOptions(
            device=torch.device("cpu"), data_loader_options={"batch_size": 20}
        ),
    )
    runner.run()


## Generic function to compute features


In [None]:
from mozuma.callbacks import CollectFeaturesInMemory


def collect_features(
    model: TorchModel, dataset: "TorchDataset | None" = None
) -> npt.NDArray[np.float64]:
    """Run a model and return the features it produced.

    Args:
        model: The model to run.
        dataset: Dataset to run the model on. When ``None``, the default image
            dataset of ``run_torch_model_inference`` is used.

    Returns:
        The features gathered by the ``CollectFeaturesInMemory`` callback.

    Note:
        The return annotation is evaluated when the function is defined, so it
        uses ``np.float64`` rather than the ``np.float_`` alias, which was
        removed in NumPy 2.0.
    """
    features = CollectFeaturesInMemory()
    run_torch_model_inference(model=model, callbacks=[features], dataset=dataset)
    if dataset is None:
        # Sanity check for the default dataset: the collected indices must
        # match IMAGE_URLS exactly, so that position i in the returned features
        # can be mapped back to IMAGE_URLS[i].
        assert features.indices == IMAGE_URLS, features.indices
    return features.features


## Find an image from a text query

<p style="text-align: center;">A dog at the beach</p>


In [None]:
# TODO: add noise with dog images
from mozuma.models.clip.pretrained import (
    torch_clip_image_encoder,
    torch_clip_text_encoder,
)
from mozuma.torch.datasets import ListDataset

# See https://mozuma.github.io/mozuma/examples/overview/
# Encode the default image dataset with the CLIP ViT-B/32 image encoder
clip_image_features = collect_features(model=torch_clip_image_encoder("ViT-B/32"))
# Encode the text query with the CLIP ViT-B/32 text encoder
clip_text_features = collect_features(
    model=torch_clip_text_encoder("ViT-B/32"),
    dataset=ListDataset(["a dog at the beach"]),
)
# Display the image ranked first against the text query.
# arg_rank_by_cosine_similarity is defined outside this cell; presumably it
# returns image positions ordered by decreasing similarity and take=1 keeps
# only the best one — TODO confirm.
display_images(
    arg_rank_by_cosine_similarity(clip_text_features, clip_image_features, take=1), img_width=500
)


## Generic function to compute bounding boxes


In [None]:
from mozuma.callbacks import CollectBoundingBoxesInMemory


def collect_bbox(
    model: TorchModel,
) -> "tuple[npt.NDArray[np.str_], npt.NDArray[np.float_], npt.NDArray[np.float_]]":
    """Run a detection model on the default images and flatten its output.

    Args:
        model: The detection model to run.

    Returns:
        A tuple ``(indices, boxes, features)`` with one entry per detected box:
        the index of the image the box belongs to, the vertically stacked box
        arrays, and the vertically stacked box features.

    Raises:
        ValueError: If the model returned no features for an image.
    """
    collector = CollectBoundingBoxesInMemory()
    run_torch_model_inference(model=model, callbacks=[collector])
    # The collected indices must line up with the default image list
    assert collector.indices == IMAGE_URLS, collector.indices

    # One entry per box for the indices, one array per image for the rest
    owner_per_box: "list[str]" = []
    boxes_per_image: "list[npt.NDArray[np.float_]]" = []
    features_per_image: "list[npt.NDArray[np.float_]]" = []
    for image_index, detections in zip(collector.indices, collector.bounding_boxes):
        image_boxes = detections.bounding_boxes
        # Repeat the image index once for each box found in that image
        owner_per_box.extend([image_index] * len(image_boxes))
        boxes_per_image.append(image_boxes)
        image_features = detections.features
        if image_features is None:
            raise ValueError("This model does not returned features")
        features_per_image.append(image_features)

    flat_indices = np.array(owner_per_box, dtype=str)
    flat_boxes = np.vstack(boxes_per_image)
    flat_features = np.vstack(features_per_image)
    return flat_indices, flat_boxes, flat_features


## Find images with similar objects


In [None]:
from mozuma.models.vinvl.pretrained import torch_vinvl_detector

# Run the VinVL detector on the default images. collect_bbox returns, for every
# detected box, the index of its image, the box array and the box features.
bbox_indices, bbox_boxes, bbox_features = collect_bbox(model=torch_vinvl_detector())


In [None]:
from scipy.spatial.distance import cdist

# Find the bounding box of the paddle in the kayak image: among the boxes
# detected in that image, pick the one closest to the reference box below.
# NOTE(review): presumably (x1, y1, x2, y2) pixel coordinates — confirm.
paddle_coordinates = np.array([899.95416, 581.6102, 1105.5442, 640.5274])
# Boolean mask over all detected boxes selecting those of the kayak image,
# computed once and reused for the boxes and the features.
kayak_box_mask = bbox_indices == KAYAK_IMAGE
# cdist returns a (1, n_kayak_boxes) distance matrix; argmin over it gives the
# position of the nearest box within the kayak image's boxes.
paddle_box_index = np.argmin(
    cdist(paddle_coordinates[np.newaxis], bbox_boxes[kayak_box_mask])
)
paddle_bounding_box = bbox_boxes[kayak_box_mask][paddle_box_index]
paddle_features = bbox_features[kayak_box_mask][paddle_box_index]

# Show the selected crop of the kayak image
display_crops(
    [IMAGE_URLS.index(KAYAK_IMAGE)], paddle_bounding_box[np.newaxis], img_width=500
)


In [None]:
# Finding similar objects: rank all detected boxes by cosine similarity to the
# paddle features and keep 12 of them. The first result is dropped with [1:] —
# presumably because it is the paddle box itself, which is part of
# bbox_features (TODO confirm against arg_rank_by_cosine_similarity).
top_matching_objects = arg_rank_by_cosine_similarity(paddle_features, bbox_features, take=12)[1:]
# bbox_indices holds the image URL of each box, so this maps boxes to images
top_matching_objects_image_urls = bbox_indices[top_matching_objects]

# Show each matching crop: position of its image in IMAGE_URLS plus its box
display_crops([IMAGE_URLS.index(img_url) for img_url in top_matching_objects_image_urls], bbox_boxes[top_matching_objects, :], img_width=500)


In [None]:
# Similarity scores of the 12 best matches: np.sort orders ascending, so the
# last 12 values are the largest.
# NOTE(review): assumes cosine_similarity returns a 1-D array of scores — confirm.
np.sort(cosine_similarity(paddle_features, bbox_features))[-12:]

## Places + kayak: find a similar place that contains a paddle

In [None]:
from mozuma.models.densenet.pretrained import torch_densenet_places365

# Features of the default images computed with the pretrained DenseNet
# Places365 model (presumably scene/place features, going by the model name).
densenet_places_features = collect_features(model=torch_densenet_places365())

In [None]:
# Show the sea cave image, looked up by its position in IMAGE_URLS
display_images([IMAGE_URLS.index(SEA_CAVE_IMAGE)], img_width=500)
# TODO add more cave images

In [None]:
# Find images with an object that looks like a paddle with threshold 0.5 on cosine similarity
paddle_objects = cosine_similarity(paddle_features, bbox_features) > 0.5
# bbox_indices holds the image URL of each box, so this is the set of images
# that contain at least one paddle-like box
image_urls_with_paddles = set(bbox_indices[paddle_objects])

# Rank the images by places365 similarity to the sea cave image and display the
# first ranked image that also contains a paddle-like object.
# NOTE(review): next() has no default, so StopIteration is raised when no
# ranked image contains a paddle-like object.
# NOTE(review): the sea cave image itself is not excluded from the ranking —
# confirm this is intended.
display_images([next(
    img_idx
    for img_idx in arg_rank_by_cosine_similarity(
        densenet_places_features[IMAGE_URLS.index(SEA_CAVE_IMAGE)],
        densenet_places_features,
    )
    if IMAGE_URLS[img_idx] in image_urls_with_paddles
)], img_width=500)
