## Setup Conda Env

In [None]:
!conda create -n cleanenv python=3.10
!conda activate cleanenv
!pip install fiftyone umap-learn numba numpy

In [None]:
!pip install numpy==1.23.5 ultralytics transformers opencv-python torch torchvision huggingface open_clip_torch 

In [None]:
import torch
import cv2
import os
import glob
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image
import subprocess
from ultralytics import YOLO, SAM
import matplotlib.pyplot as plt
import open_clip
from transformers import CLIPProcessor, CLIPModel
import fiftyone as fo
import fiftyone.brain as fob
import fiftyone.core.labels as fol

## Import Models

In [3]:
# Load the project's detection checkpoint and bind it to `yolo_model`.
file_path = "./Benthic-Mapping-highlight/benthic_mapping/model.pt"
yolo_model = YOLO(model=file_path)

In [None]:
# Load the remaining models.
# NOTE: this rebinds `yolo_model` to the stock "yolov8n.pt" weights, replacing
# the checkpoint loaded in the previous cell. The stock YOLO and the BioCLIP
# model were loaded for experimentation; later cells reload the custom
# checkpoint before detecting and use `clip_model` for embeddings.
device = "cuda" if torch.cuda.is_available() else "cpu"

yolo_model = YOLO("yolov8n.pt")

# BioCLIP: switch to inference mode, then move to the selected device.
bio_model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms('hf-hub:imageomics/bioclip')
bio_model = bio_model.eval().to(device)

# CLIP processor + model. The model is put in inference mode but is not moved
# to `device` in this cell.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14-336")
clip_model.eval()

## Establish embedding functions

In [5]:
def get_clip_embeddings(image):
    """Return the L2-normalised CLIP image features for ``image``.

    Relies on the module-level ``clip_processor`` and ``clip_model``.

    Args:
        image (PIL.Image.Image): Input image.

    Returns:
        torch.Tensor: Image features divided by their L2 norm along dim 1.
    """
    model_inputs = clip_processor(images=image, return_tensors="pt", padding=True)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        features = clip_model.get_image_features(**model_inputs)
        l2_norm = features.norm(p=2, dim=1, keepdim=True)
        unit_features = features / l2_norm
    return unit_features

def extract_objects_with_embeddings(yolo_model,frame):
    """Detect objects in a frame and embed each detected crop with CLIP.

    Args:
        yolo_model: Callable detector; ``yolo_model(frame)[0].boxes`` must
            yield boxes exposing ``xyxy``.
        frame (numpy.ndarray): Input frame in BGR format.

    Returns:
        list: One ``(embedding, (x1, y1, x2, y2))`` tuple per non-empty crop.
        Empty when nothing is detected.
    """
    results = yolo_model(frame)
    objects = []
    for box in results[0].boxes:
        x1, y1, x2, y2 = box.xyxy[0].int().tolist()
        cropped = frame[y1:y2, x1:x2]
        # Skip degenerate boxes that produce an empty crop.
        if cropped.shape[0] == 0 or cropped.shape[1] == 0:
            continue
        # OpenCV stores BGR; convert to RGB before handing a PIL image to CLIP.
        pil_image = Image.fromarray(cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB))
        emb = get_clip_embeddings(pil_image)
        objects.append((emb, (x1, y1, x2, y2)))
    # Return only after every box was visited (returning inside the loop
    # stopped after the first box and yielded None when there were no boxes).
    return objects
    

In [6]:
def detect_objects(image, model=None):
    """Detect objects in an image and return their bounding boxes.

    Args:
        image (numpy.ndarray): Input image in BGR format.
        model: Optional, already-loaded detector to reuse. When ``None`` the
            project checkpoint is loaded from disk, as before.

    Returns:
        list: One dictionary per detection with the single key ``'bbox'``,
        an integer tuple ``(x_min, y_min, x_max, y_max)``.
    """
    if model is None:
        # Load only the checkpoint that is actually used; the stock
        # "yolov8n.pt" load that preceded it was discarded immediately.
        file_path = "./Benthic-Mapping-highlight/benthic_mapping/model.pt"
        model = YOLO(file_path)

    detections = []
    # The model returns one result per input image.
    for result in model(image):
        for box in result.boxes:
            x1, y1, x2, y2 = box.xyxy[0].tolist()
            detections.append({
                "bbox": (int(x1), int(y1), int(x2), int(y2))
            })
    print(detections)
    return detections

In [7]:
def get_clip_embeddings_for_detected_objects(cv2image, yolo_model, clip_model):
    """Detect objects in an image and return a CLIP embedding for each one.

    Preprocessing uses the module-level ``clip_processor``.

    Args:
        cv2image (np.ndarray): The input image in BGR format (from OpenCV).
        yolo_model: Initialized Ultralytics YOLO model.
        clip_model: Initialized CLIP model exposing ``get_image_features``.

    Returns:
        list: One dictionary per usable detection, containing:
              'bbox': (x1, y1, x2, y2) as Python ints, clamped to the image
                      bounds (i.e. the region that was actually embedded).
              'clip_embedding': CPU torch.Tensor with the L2-normalised CLIP
                      embedding of the cropped object.
              Empty when nothing is detected or every box is degenerate.
    """
    # 1. Detect objects (verbose=False keeps console output down).
    yolo_results = yolo_model(cv2image, verbose=False)
    if not yolo_results:
        return []

    img_h, img_w = cv2image.shape[:2]
    object_embeddings_data = []

    # Results are for a single image, so only the first element is used.
    for box in yolo_results[0].boxes:
        x1, y1, x2, y2 = (int(v) for v in box.xyxy[0].cpu().numpy().astype(int))

        # 2. Clamp to the image so slicing cannot wrap around or run past the
        #    edge. A box that is inverted/empty before clamping is still
        #    inverted/empty afterwards, so one check covers both cases.
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(img_w, x2), min(img_h, y2)
        if x1 >= x2 or y1 >= y2:
            continue

        cropped_object_bgr = cv2image[y1:y2, x1:x2]

        # 3. BGR (OpenCV) -> RGB -> PIL, then CLIP preprocessing.
        cropped_object_rgb = cv2.cvtColor(cropped_object_bgr, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(cropped_object_rgb)
        inputs = clip_processor(images=pil_image, return_tensors="pt", padding=True)

        # 4. Embed the crop and L2-normalise it.
        with torch.no_grad():
            outputs = clip_model.get_image_features(**inputs)
        object_embedding = outputs / outputs.norm(p=2, dim=1, keepdim=True)

        object_embeddings_data.append({
            'bbox': (x1, y1, x2, y2),
            'clip_embedding': object_embedding.cpu(),
        })

    return object_embeddings_data


## Run script to load initial input image with bounding boxes

In [None]:
# Load the custom detector and the query image, draw every detection with its
# index, and compute per-object CLIP embeddings for the query image.
file_path = "./Benthic-Mapping-highlight/benthic_mapping/model.pt"
yolo_model = YOLO(file_path) # Instantiate your model
im_path = './Benthic-Mapping-highlight/benthic_mapping/data/Images/PS2222_20220601T162145Z_FWD_ROV01_IMG_0151.JPG'
similar_objects = []
SIMILARITY_THRESHOLD = 0.7
frame_idx = 0

query_image = cv2.imread(im_path)
# cv2.imread returns None instead of raising when the file cannot be read.
if query_image is None:
    raise FileNotFoundError(f"Could not read query image: {im_path}")
query_image = cv2.resize(query_image, (1024, 1024), interpolation=cv2.INTER_CUBIC)

# get_clip_embeddings is documented to take a PIL image; convert from OpenCV's
# BGR layout to RGB first, as the per-object crop path does.
query_embedding = get_clip_embeddings(
    Image.fromarray(cv2.cvtColor(query_image, cv2.COLOR_BGR2RGB))
)

detections = detect_objects(query_image)
output_image = query_image.copy()

# Label styling is the same for every detection.
font = cv2.FONT_HERSHEY_SIMPLEX
font_scale = 0.6
thickness = 2
text_color = (0, 255, 0)  # Green, matches the box

for i, detection in enumerate(detections):
    print(detection)
    x_min, y_min, x_max, y_max = detection["bbox"]
    cv2.rectangle(output_image, (x_min, y_min), (x_max, y_max), color=(0, 255, 0), thickness=2)

    # The label is the detection index, so a query object can be picked by number.
    label = str(i)
    (text_width, text_height), _ = cv2.getTextSize(label, font, font_scale, thickness)

    # Place the text just above the box, or inside it when there is no room above.
    text_x = x_min
    text_y = y_min - 5 if y_min - 5 > text_height else y_min + text_height + 5
    cv2.putText(output_image, label, (text_x, text_y), font, font_scale, text_color, thickness)

# Matplotlib expects RGB.
frame_rgb = cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)

plt.figure(figsize=(20, 20))
plt.subplot(1, 2, 1)
plt.imshow(frame_rgb)
plt.axis('off')
plt.title("Initial Input Image with Detections")
plt.show()
object_embeddings = get_clip_embeddings_for_detected_objects(query_image, yolo_model, clip_model)


## Iterate through directory to run object detection and similarity on other images

In [None]:
# Embed every detected object in every image of `folder_path`, record the
# embeddings in `embedding_df`, and collect objects similar to the query object.
columns = ['media_id', 'frame_num', 'bbox', 'embedding_vec']
media_id =  '4291234'

file_path = "./Benthic-Mapping-highlight/benthic_mapping/model.pt"
yolo_model = YOLO(file_path) # Instantiate your model

SIMILARITY_THRESHOLD = 0.6
# The query object is picked by its index among the query-image detections.
query_embedding = object_embeddings[4]['clip_embedding']
folder_path = './Benthic-Mapping-highlight/benthic_mapping/data/Images/'

# Get all .JPG files in the folder
jpg_files = glob.glob(os.path.join(folder_path, '*.JPG'))
print(jpg_files)

# Start from a clean slate so re-running the cell does not duplicate matches.
similar_objects = []
# Rows are collected in a list and turned into a DataFrame once at the end,
# which avoids growing the DataFrame one row at a time.
rows = []

for image_idx, image_path in enumerate(jpg_files):
    print(image_idx)
    print(f"Processing: {image_path}")

    query_image = cv2.imread(image_path)
    media_id = os.path.basename(image_path)
    print(media_id)
    # cv2.imread returns None instead of raising when the file cannot be read.
    if query_image is None:
        print(f"Could not read image, skipping: {image_path}")
        continue

    objects = get_clip_embeddings_for_detected_objects(query_image, yolo_model, clip_model)
    # The helper returns an empty list (never None) when nothing is detected.
    if not objects:
        print("No objects detected.")
        continue

    for detection in objects:
        emb, bbox = detection['clip_embedding'], detection['bbox']
        rows.append({
            'media_id': media_id,
            'frame_num': 'image',
            'bbox': bbox,
            'embedding_vec': emb
        })
        # Calculate cosine similarity
        similarity = cosine_similarity(query_embedding.numpy(), emb.numpy())[0][0]

        if similarity > SIMILARITY_THRESHOLD:
            similar_objects.append((media_id, bbox, similarity))
            print(f'Object added with similarity: {similarity}')
            # Optional: annotate the in-memory image with the match.
            x1, y1, x2, y2 = bbox
            cv2.rectangle(query_image, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(query_image, f"Similarity: {similarity:.2f}", (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)

embedding_df = pd.DataFrame(rows, columns=columns)

## Set up the FiftyOne demo and the embedding DataFrame

In [None]:
# Build a FiftyOne dataset with one sample per detection in `embedding_df`.
display(embedding_df)
image_dir = './Benthic-Mapping-highlight/benthic_mapping/data/Images/'
# Recreate the dataset from scratch on every run.
if "df-object-embeddings" in fo.list_datasets():
    fo.delete_dataset("df-object-embeddings")

dataset = fo.Dataset("df-object-embeddings")

df = embedding_df.copy()
# image_path -> (height, width), or None when the image is missing/unreadable.
# Caching avoids decoding the same image once per detection.
image_sizes = {}
for idx, row in df.iterrows():
    image_path = os.path.join(image_dir, row["media_id"])
    if image_path not in image_sizes:
        image = cv2.imread(image_path) if os.path.exists(image_path) else None
        image_sizes[image_path] = None if image is None else image.shape[:2]
    if image_sizes[image_path] is None:
        continue

    h, w = image_sizes[image_path]
    x1, y1, x2, y2 = row["bbox"]

    # Convert absolute corners to relative [x, y, w, h].
    x = x1 / w
    y = y1 / h
    box_w = (x2 - x1) / w
    box_h = (y2 - y1) / h

    # Create a new sample for this single detection
    sample = fo.Sample(filepath=image_path)

    detection = fol.Detection(
        label="object",
        bounding_box=[x, y, box_w, box_h],
        embedding=np.array(row["embedding_vec"]).squeeze().tolist()
    )

    sample["detections"] = fol.Detections(detections=[detection])

    dataset.add_sample(sample)



In [None]:
# Compute a UMAP visualization of the per-detection embeddings and open the app.
# The same key is used for the brain run and for the panel that points at it
# (the panel previously referenced "img_viz", which this notebook never computes).
BRAIN_KEY = "clip_embedding"

fob.compute_visualization(
    dataset,
    patches_field="detections",
    embeddings="embedding",            # field in each detection
    brain_key=BRAIN_KEY,
    method="umap",
    thumbnails=True
)
# NOTE(review): this panel is built but not passed to the app below, and no
# cell in this notebook computes a "uniqueness" field — confirm intent.
embeddings_panel = fo.Panel(
    type="Embeddings",
    state=dict(brainResult=BRAIN_KEY, colorByField="uniqueness"),
)

session = fo.launch_app(dataset)

## Print photos above threshold for similarity

In [None]:
# Show and save every match whose similarity exceeds 0.92.
folder_path = './Benthic-Mapping-highlight/benthic_mapping/data/Images/'
output_dir = 'images'
# exist_ok replaces the separate exists-then-create check.
os.makedirs(output_dir, exist_ok=True)

for i, (media_id, bbox, sim) in enumerate(similar_objects):
    print(f"Match {i + 1} - Frame: {media_id}, BBox: {bbox}, Similarity: {sim:.2f}")
    if sim > .92:
        image_path = os.path.join(folder_path, media_id)
        frame = cv2.imread(image_path)
        # cv2.imread returns None instead of raising when the file cannot be read.
        if frame is None:
            print(f"Unable to read image {image_path}.")
            continue

        x_min, y_min, x_max, y_max = bbox

        # Crop (and copy) before drawing so the crop does not contain the outline.
        cropped_region = frame[y_min:y_max, x_min:x_max].copy()
        if cropped_region.size == 0:
            print(f"Empty crop for bbox {bbox} in {image_path}.")
            continue

        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color=(0, 255, 0), thickness=2)

        # Matplotlib expects RGB.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        cropped_region_rgb = cv2.cvtColor(cropped_region, cv2.COLOR_BGR2RGB)

        # Left: full image with the box. Right: the cropped object.
        plt.figure(figsize=(10, 5))
        plt.subplot(1, 2, 1)
        plt.imshow(frame_rgb)
        plt.axis('off')
        plt.title(f"Frame {media_id} with Bounding Box")

        plt.subplot(1, 2, 2)
        plt.imshow(cropped_region_rgb)
        plt.axis('off')
        plt.title(f"Cropped Region (Similarity: {sim:.2f})")

        plt.tight_layout()
        plt.show()

        # Save the annotated image to the "images" directory.
        output_file = os.path.join(output_dir, f"frame_{media_id}_match_{i+1}.jpg")
        cv2.imwrite(output_file, frame)

In [None]:
# Show and save every match whose similarity lies strictly between 0.65 and 0.67.
folder_path = './Benthic-Mapping-highlight/benthic_mapping/data/Images/'
output_dir = 'images'
# exist_ok replaces the separate exists-then-create check.
os.makedirs(output_dir, exist_ok=True)

for i, (media_id, bbox, sim) in enumerate(similar_objects):
    print(f"Match {i + 1} - Frame: {media_id}, BBox: {bbox}, Similarity: {sim:.2f}")
    if .65 < sim < .67:
        image_path = os.path.join(folder_path, media_id)
        frame = cv2.imread(image_path)
        # cv2.imread returns None instead of raising when the file cannot be read.
        if frame is None:
            print(f"Unable to read image {image_path}.")
            continue

        x_min, y_min, x_max, y_max = bbox

        # Crop (and copy) before drawing so the crop does not contain the outline.
        cropped_region = frame[y_min:y_max, x_min:x_max].copy()
        if cropped_region.size == 0:
            print(f"Empty crop for bbox {bbox} in {image_path}.")
            continue

        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color=(0, 255, 0), thickness=2)

        # Matplotlib expects RGB.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        cropped_region_rgb = cv2.cvtColor(cropped_region, cv2.COLOR_BGR2RGB)

        # Left: full image with the box. Right: the cropped object.
        plt.figure(figsize=(10, 5))
        plt.subplot(1, 2, 1)
        plt.imshow(frame_rgb)
        plt.axis('off')
        plt.title(f"Frame {media_id} with Bounding Box")

        plt.subplot(1, 2, 2)
        plt.imshow(cropped_region_rgb)
        plt.axis('off')
        plt.title(f"Cropped Region (Similarity: {sim:.2f})")

        plt.tight_layout()
        plt.show()

        # Save the annotated image to the "images" directory.
        output_file = os.path.join(output_dir, f"frame_{media_id}_match_{i+1}.jpg")
        cv2.imwrite(output_file, frame)

In [None]:
# Show and save every match whose similarity lies strictly between 0.84 and 0.86.
folder_path = './Benthic-Mapping-highlight/benthic_mapping/data/Images/'
output_dir = 'images'
# exist_ok replaces the separate exists-then-create check.
os.makedirs(output_dir, exist_ok=True)

for i, (media_id, bbox, sim) in enumerate(similar_objects):
    print(f"Match {i + 1} - Frame: {media_id}, BBox: {bbox}, Similarity: {sim:.2f}")
    if .84 < sim < .86:
        image_path = os.path.join(folder_path, media_id)
        frame = cv2.imread(image_path)
        # cv2.imread returns None instead of raising when the file cannot be read.
        if frame is None:
            print(f"Unable to read image {image_path}.")
            continue

        x_min, y_min, x_max, y_max = bbox

        # Crop (and copy) before drawing so the crop does not contain the outline.
        cropped_region = frame[y_min:y_max, x_min:x_max].copy()
        if cropped_region.size == 0:
            print(f"Empty crop for bbox {bbox} in {image_path}.")
            continue

        cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color=(0, 255, 0), thickness=2)

        # Matplotlib expects RGB.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        cropped_region_rgb = cv2.cvtColor(cropped_region, cv2.COLOR_BGR2RGB)

        # Left: full image with the box. Right: the cropped object.
        plt.figure(figsize=(10, 5))
        plt.subplot(1, 2, 1)
        plt.imshow(frame_rgb)
        plt.axis('off')
        plt.title(f"Frame {media_id} with Bounding Box")

        plt.subplot(1, 2, 2)
        plt.imshow(cropped_region_rgb)
        plt.axis('off')
        plt.title(f"Cropped Region (Similarity: {sim:.2f})")

        plt.tight_layout()
        plt.show()

        # Save the annotated image to the "images" directory.
        output_file = os.path.join(output_dir, f"frame_{media_id}_match_{i+1}.jpg")
        cv2.imwrite(output_file, frame)

## Optional (video input): extract a specific frame to use as the query image for comparison. Skip this section if the query is a still image.

In [None]:

# Read the video up to frame 2700, use that frame as the query image,
# highlight detection #3 on it, and compute per-object CLIP embeddings.
file_path = "./Benthic-Mapping-highlight/benthic_mapping/model.pt"
yolo_model = YOLO(file_path) # Instantiate your model
video_path = '/tmp/PS2222_20220601T175323Z_FWD_ROV01_HD.mp4'
cap = cv2.VideoCapture(video_path)
similar_objects = []
SIMILARITY_THRESHOLD = 0.7
frame_idx = 0

try:
    while cap.isOpened():
        ret, frame = cap.read()
        # Stop on a failed read; otherwise `frame` would be None further down.
        if not ret:
            print("End of video or error reading frame.")
            break
        if frame_idx == 2700:
            # Save the query frame as a temporary image file and read it back.
            temp_image_path = "first_frame.jpg"
            cv2.imwrite(temp_image_path, frame)

            query_image = cv2.imread(temp_image_path)
            query_image = cv2.resize(query_image, (1024, 1024), interpolation=cv2.INTER_CUBIC)
            # get_clip_embeddings is documented to take a PIL image; convert
            # from OpenCV's BGR layout to RGB first.
            query_embedding = get_clip_embeddings(
                Image.fromarray(cv2.cvtColor(query_image, cv2.COLOR_BGR2RGB))
            )

            detections = detect_objects(query_image)
            output_image = query_image.copy()
            for i, detection in enumerate(detections):
                # Only detection #3 is outlined.
                if i == 3:
                    print(detection)
                    x_min, y_min, x_max, y_max = detection["bbox"]
                    cv2.rectangle(output_image, (x_min, y_min), (x_max, y_max), color=(0, 255, 0), thickness=2)
            # Matplotlib expects RGB.
            frame_rgb = cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)

            plt.figure(figsize=(20, 20))
            plt.subplot(1, 2, 1)
            plt.imshow(frame_rgb)
            plt.axis('off')
            plt.title("First Frame")
            plt.show()
            object_embeddings = get_clip_embeddings_for_detected_objects(query_image, yolo_model, clip_model)
            # Nothing after the query frame is used, so stop decoding here.
            break
        frame_idx += 1
finally:
    # Release the capture even if something above raises.
    cap.release()

## Iterate through video and find similar objects

In [None]:
# Sample the video, embed every detected object on the sampled frames, record
# the embeddings in `embedding_df`, and collect objects similar to
# `query_embedding` (which must already be set by an earlier cell).
columns = ['media_id', 'frame_num', 'bbox', 'embedding_vec']
media_id =  '4291234'

video_path = './Benthic-Mapping-highlight/benthic_mapping/data/Images/PS2222_20220601T175323Z_FWD_ROV01_HD.mp4'
file_path = "./Benthic-Mapping-highlight/benthic_mapping/model.pt"
yolo_model = YOLO(file_path) # Instantiate your model
cap = cv2.VideoCapture(video_path)
similar_objects = []
SIMILARITY_THRESHOLD = 0.5
FRAME_STRIDE = 50      # process every 50th frame
MAX_FRAME_IDX = 5000   # last frame index that is processed
frame_idx = 0
# Rows are collected in a list and turned into a DataFrame once at the end,
# which avoids growing the DataFrame one row at a time.
rows = []

try:
    while cap.isOpened():
        # No frame past MAX_FRAME_IDX is processed, so stop decoding there.
        if frame_idx > MAX_FRAME_IDX:
            break
        print(f'reading frame {frame_idx}')
        ret, frame = cap.read()
        if not ret:
            print("End of video or error reading frame.")
            break
        if frame_idx % FRAME_STRIDE == 0:
            objects = get_clip_embeddings_for_detected_objects(frame, yolo_model, clip_model)
            # The helper returns an empty list (never None) when nothing is
            # detected. No `continue` here: it would skip the increment below.
            if not objects:
                print("No objects detected.")
            for detection in objects:
                emb, bbox = detection['clip_embedding'], detection['bbox']
                rows.append({
                    'media_id': media_id,
                    'frame_num': frame_idx,
                    'bbox': bbox,
                    'embedding_vec': emb
                })
                # Calculate cosine similarity
                similarity = cosine_similarity(query_embedding.numpy(), emb.numpy())[0][0]

                if similarity > SIMILARITY_THRESHOLD:
                    similar_objects.append((frame_idx, bbox, similarity))
                    print(f'Object added with similarity: {similarity}')
                    # Optional: annotate the in-memory frame with the match.
                    x1, y1, x2, y2 = bbox
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                    cv2.putText(frame, f"Similarity: {similarity:.2f}", (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
        frame_idx += 1
finally:
    # Release the capture even if something above raises.
    cap.release()

embedding_df = pd.DataFrame(rows, columns=columns)

## Output similar objects/frames from video

In [None]:
import os

# Show and save every video match whose similarity exceeds 0.81.
output_dir = 'images'
# exist_ok replaces the separate exists-then-create check.
os.makedirs(output_dir, exist_ok=True)

# Open the video once and seek per match instead of reopening it every time.
video_capture = cv2.VideoCapture(video_path)
try:
    for i, (frame_num, bbox, sim) in enumerate(similar_objects):
        print(f"Match {i + 1} - Frame: {frame_num}, BBox: {bbox}, Similarity: {sim:.2f}")

        if sim > .81:
            # Seek to the specific frame and decode it.
            video_capture.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            success, frame = video_capture.read()

            if not success:
                print(f"Unable to read frame {frame_num} from video.")
                continue

            x_min, y_min, x_max, y_max = bbox

            # Crop (and copy) before drawing so the crop does not contain the outline.
            cropped_region = frame[y_min:y_max, x_min:x_max].copy()
            if cropped_region.size == 0:
                print(f"Empty crop for bbox {bbox} in frame {frame_num}.")
                continue

            cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color=(0, 255, 0), thickness=2)

            # Matplotlib expects RGB.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            cropped_region_rgb = cv2.cvtColor(cropped_region, cv2.COLOR_BGR2RGB)

            # Left: full frame with the box. Right: the cropped object.
            plt.figure(figsize=(10, 5))
            plt.subplot(1, 2, 1)
            plt.imshow(frame_rgb)
            plt.axis('off')
            plt.title(f"Frame {frame_num} with Bounding Box")

            plt.subplot(1, 2, 2)
            plt.imshow(cropped_region_rgb)
            plt.axis('off')
            plt.title(f"Cropped Region (Similarity: {sim:.2f})")

            plt.tight_layout()
            plt.show()

            # Save the annotated frame to the "images" directory.
            output_file = os.path.join(output_dir, f"frame_{frame_num}_match_{i+1}.jpg")
            cv2.imwrite(output_file, frame)
finally:
    # Release the capture even if something above raises.
    video_capture.release()

## Querying Dataset Using Embeddings

In [None]:
# Show the value currently bound to `query_embedding`.
display(query_embedding)

In [None]:
from IPython.display import display, HTML


def display_scrollable_df(df, max_height=300, max_width=1000):
    """Render ``df`` as an HTML table inside a scrollable box.

    The table is emitted first, followed by a ``<style>`` block that targets
    the ``scroll-table`` class assigned to it.

    Args:
        df (pandas.DataFrame): Frame to show (up to 1000 rows and 1000 columns).
        max_height (int): Maximum height of the box in pixels.
        max_width (int): Maximum width of the box in pixels.
    """
    table_markup = df.to_html(
        notebook=True,
        max_rows=1000,
        max_cols=1000,
        border=0,
        classes='scroll-table',
    )
    display(HTML(table_markup))

    styles = f"""
    <style>
    .scroll-table {{
        display: block;
        overflow: auto;
        max-height: {max_height}px;
        max-width: {max_width}px;
        border: 1px solid #ccc;
    }}
    </style>
    """
    display(HTML(styles))


display_scrollable_df(embedding_df)

In [None]:
# Compute cosine similarities
# Score every stored embedding against `query_embedding`, then keep the rows
# whose score exceeds the threshold.
def _similarity_to_query(embedding):
    """Return the cosine similarity between ``embedding`` and ``query_embedding``."""
    return cosine_similarity(embedding, query_embedding)[0][0]


embedding_df['similarity'] = embedding_df['embedding_vec'].apply(_similarity_to_query)

threshold = 0.94
is_match = embedding_df['similarity'] > threshold
filtered_df = embedding_df[is_match]

display_scrollable_df(filtered_df)

In [None]:
# Show the first row of `filtered_df`: the full image with its box, and the crop.
folder_path = './Benthic-Mapping-highlight/benthic_mapping/data/Images/'
# Fail with a clear message instead of an IndexError from `.iloc[0]`.
if filtered_df.empty:
    raise ValueError("filtered_df is empty: no detection passed the similarity threshold.")

media_id = filtered_df['media_id'].iloc[0]
image_path = os.path.join(folder_path, media_id)
frame = cv2.imread(image_path)
# cv2.imread returns None instead of raising when the file cannot be read.
if frame is None:
    raise FileNotFoundError(f"Unable to read image {image_path}.")

bbox = filtered_df['bbox'].iloc[0]
x_min, y_min, x_max, y_max = bbox

# Crop (and copy) before drawing so the crop does not contain the outline.
cropped_region = frame[y_min:y_max, x_min:x_max].copy()

cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color=(0, 255, 0), thickness=2)

# Matplotlib expects RGB.
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
cropped_region_rgb = cv2.cvtColor(cropped_region, cv2.COLOR_BGR2RGB)

# Left: full image with the box. Right: the cropped object.
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.imshow(frame_rgb)
plt.axis('off')
plt.title("Frame with Bounding Box")

plt.subplot(1, 2, 2)
plt.imshow(cropped_region_rgb)
plt.axis('off')

plt.tight_layout()
plt.show()

In [16]:
# Embed a free-text query with CLIP and L2-normalise the result.
text_query = "orange coral"

inputs = clip_processor(text=[text_query], return_tensors="pt", padding=True)
# Inference only: no autograd graph is needed.
with torch.no_grad():
    raw_text_features = clip_model.get_text_features(**inputs)
    text_norm = raw_text_features.norm(p=2, dim=-1, keepdim=True)
    text_features = raw_text_features / text_norm

In [None]:
# Score every stored object embedding against the text query embedding.
image_matrix = np.stack(embedding_df["embedding_vec"].values)
text_vector = text_features.cpu().numpy()
print(text_vector.shape)
print(image_matrix.shape)

# cosine_similarity needs 2-D inputs: (1, D) for the text and (N, D) for the
# images. Explicit reshapes are used instead of a bare squeeze(), which would
# also drop the row axis when there is exactly one stored embedding.
text_vector = text_vector.reshape(1, -1)
image_matrix = image_matrix.reshape(len(image_matrix), -1)

# Compute similarity
similarities = cosine_similarity(text_vector, image_matrix)[0]
embedding_df["text_similarity"] = similarities

In [None]:
# Print the `top_k` rows with the highest text similarity.
top_k = 5
ranked = embedding_df.sort_values("text_similarity", ascending=False)
top_results = ranked.head(top_k)
print(top_results[["media_id", "text_similarity"]])

In [None]:
# Show one row of `embedding_df`, chosen by position: the full image with its
# box, and the crop.
folder_path = './Benthic-Mapping-highlight/benthic_mapping/data/Images/'
# Positional row to display (NOTE(review): presumably picked by hand from the
# top-k output above — update it after re-running the pipeline).
ROW_POSITION = 948
if ROW_POSITION >= len(embedding_df):
    raise IndexError(
        f"ROW_POSITION={ROW_POSITION} is out of range for embedding_df with {len(embedding_df)} rows."
    )

media_id = embedding_df['media_id'].iloc[ROW_POSITION]
image_path = os.path.join(folder_path, media_id)
frame = cv2.imread(image_path)
# cv2.imread returns None instead of raising when the file cannot be read.
if frame is None:
    raise FileNotFoundError(f"Unable to read image {image_path}.")

bbox = embedding_df['bbox'].iloc[ROW_POSITION]
x_min, y_min, x_max, y_max = bbox

# Crop (and copy) before drawing so the crop does not contain the outline.
cropped_region = frame[y_min:y_max, x_min:x_max].copy()

cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color=(0, 255, 0), thickness=2)

# Matplotlib expects RGB.
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
cropped_region_rgb = cv2.cvtColor(cropped_region, cv2.COLOR_BGR2RGB)

# Left: full image with the box. Right: the cropped object.
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.imshow(frame_rgb)
plt.axis('off')
plt.title("Frame with Bounding Box")

plt.subplot(1, 2, 2)
plt.imshow(cropped_region_rgb)
plt.axis('off')

plt.tight_layout()
plt.show()