In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision.transforms as T
import torch.nn.functional as F
from torchvision.io import read_image, ImageReadMode
from PIL import Image, ImageDraw, ImageFilter
import os
import json
import pickle
import numpy as np

class RefCocoG_Dataset(Dataset):
    """
    RefCOCOg referring-expression dataset.

    Each sample is made of an image, the ground-truth bounding box of one annotated
    object and the list of sentences that refer to that object.

    The annotations of all the splits are merged once and cached on the class
    (`full_annotations`), so building the datasets of the other splits does not
    read the annotation files again.

    NOTE(review): the cache is not keyed by `root_dir` or by the file names, hence
    every instance created after the first one reuses the files of the first one.
    """

    # Cache shared by every instance: ann_id -> {'image': ref entry, 'annotation': instance entry}
    full_annotations = None

    def __init__(self, root_dir, annotations_f, instances_f, split='train', transform=None, target_transform=None) -> None:
        """
        Args:
            root_dir: dataset folder, containing the `annotations` and `images` sub-folders.
            annotations_f: name of the pickle file with the referring expressions.
            instances_f: name of the JSON file with the image and instance descriptions.
            split: value of the `split` field of the samples to keep.
            transform: optional callable applied to the loaded PIL image.
            target_transform: optional callable applied to the bounding box.
        """
        super().__init__()

        self.root_dir = root_dir
        self.annotations_f = annotations_f
        self.instances_f = instances_f

        self.split = split

        self.transform = transform
        self.target_transform = target_transform

        self.get_annotations()

        # Freeze the iteration order once: __getitem__ then maps an index to its
        # annotation id in O(1) instead of rebuilding the list of keys at every access
        self.annotation_ids = list(self.annotations)
        self.image_names = [
            self.annotations[ann_id]['image']['actual_file_name']
            for ann_id in self.annotation_ids
        ]

    def _select_split(self, annotations):
        """Return the entries of `annotations` whose image belongs to `self.split`."""
        return {
            ann_id: entry
            for ann_id, entry in annotations.items()
            if entry['image']['split'] == self.split
        }

    def get_annotations(self):
        """
        Fill `self.annotations` with the samples of `self.split`.

        The first call reads both annotation files, joins them on the annotation id
        and stores the result in `RefCocoG_Dataset.full_annotations`; later calls
        only filter that cache.
        """
        if RefCocoG_Dataset.full_annotations:
            self.annotations = self._select_split(RefCocoG_Dataset.full_annotations)
            return

        annotations_dir = os.path.join(self.root_dir, 'annotations')

        # Load pickle data
        # SECURITY: pickle.load can execute arbitrary code, only use trusted annotation files
        with open(os.path.join(annotations_dir, self.annotations_f), 'rb') as file:
            self.data = pickle.load(file)

        # Load instances
        with open(os.path.join(annotations_dir, self.instances_f), 'rb') as file:
            self.instances = json.load(file)

        # Match data between the two files and build the actual dataset
        images_actual_file_names = {
            image['id']: image['file_name'] for image in self.instances['images']
        }

        self.annotations = {}
        for image in self.data:
            entry = self.annotations.setdefault(image['ann_id'], {})
            entry['image'] = image
            entry['image']['actual_file_name'] = images_actual_file_names[image['image_id']]

        # Instances without a referring expression are ignored
        for annotation in self.instances['annotations']:
            if annotation['id'] in self.annotations:
                self.annotations[annotation['id']]['annotation'] = annotation

        # Keep only samples from the given split
        RefCocoG_Dataset.full_annotations = self.annotations
        self.annotations = self._select_split(self.annotations)

    def __len__(self):
        """Number of samples (one per annotated object) in the split."""
        return len(self.image_names)

    def corner_size_to_corners(self, bounding_box):
        """
        Transform (top_left_x, top_left_y, width, height) bounding box representation
        into (top_left_x, top_left_y, bottom_right_x, bottom_right_y)
        """

        top_left_x, top_left_y, width, height = bounding_box[0], bounding_box[1], bounding_box[2], bounding_box[3]

        return [top_left_x, top_left_y, top_left_x + width, top_left_y + height]

    def __getitem__(self, idx):
        """
        Return the sample at position `idx` as the list [image, bounding_box, prompts].

        `image` is an RGB PIL image (or the output of `transform`), `bounding_box`
        is [x1, y1, x2, y2] (or the output of `target_transform`) and `prompts` is
        the list of sentences referring to the object.
        """
        image_name = self.image_names[idx]
        image_id = self.annotation_ids[idx]
        entry = self.annotations[image_id]

        # Load the image file as a PIL image; the context manager releases the
        # file handle as soon as the pixels have been converted
        with Image.open(os.path.join(self.root_dir, 'images', image_name)) as image_file:
            image = image_file.convert('RGB')

        # Get the captions for the image
        prompts = [prompt['sent'] for prompt in entry['image']['sentences']]

        # Get the bounding box for the prompts for the image
        bounding_box = self.corner_size_to_corners(entry['annotation']['bbox'])

        # Apply the transforms if given
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            bounding_box = self.target_transform(bounding_box)

        # Return the sample as a list
        return [image, bounding_box, prompts]

In [None]:
# One dataset per split, all built from the same annotation files
dataset_train, dataset_val, dataset_test = [
    RefCocoG_Dataset('refcocog', 'refs(umd).p', 'instances.json', split=split_name)
    for split_name in ('train', 'val', 'test')
]

# Order matters: get_data reads the splits as [train, val, test]
dataset_splits = [dataset_train, dataset_val, dataset_test]

In [None]:
# Number of cached annotations, followed by the size of the train / val / test splits
tuple(
    len(collection)
    for collection in (
        RefCocoG_Dataset.full_annotations,
        dataset_train.annotations,
        dataset_val.annotations,
        dataset_test.annotations,
    )
)

In [None]:
def collate_differently_sized_prompts(batch):
    """
    Collate function that gathers the fields of the samples into plain lists.

    Nothing is stacked or converted, so samples may carry a different number of
    prompts each.

    Args:
        batch: sequence of samples, each indexable as (image, bounding box, prompts).

    Returns:
        Tuple (images, bounding boxes, prompts) of three lists aligned by sample.
    """
    images, bboxes, prompts = (
        [sample[field] for sample in batch]
        for field in range(3)
    )

    return images, bboxes, prompts

def get_data(dataset_splits, batch_size=64, test_batch_size=256, num_workers=0):
    """
    Build the train, validation and test data loaders.

    Args:
        dataset_splits: sequence holding the train, validation and test datasets, in this order.
        batch_size: batch size of the train loader.
        test_batch_size: batch size of the validation and test loaders.
        num_workers: forwarded to every loader.

    Returns:
        Tuple (train_loader, val_loader, test_loader). Only the train loader
        shuffles its samples and drops the last incomplete batch.
    """
    training_data, validation_data, test_data = dataset_splits[0], dataset_splits[1], dataset_splits[2]

    # Options common to the three loaders
    shared_options = dict(
        collate_fn=collate_differently_sized_prompts,
        num_workers=num_workers,
    )

    train_loader = DataLoader(training_data, batch_size, shuffle=True, drop_last=True, **shared_options)
    val_loader = DataLoader(validation_data, test_batch_size, shuffle=False, **shared_options)
    test_loader = DataLoader(test_data, test_batch_size, shuffle=False, **shared_options)

    return train_loader, val_loader, test_loader

In [None]:
# Same batch size for training and evaluation
train_loader, val_loader, test_loader = get_data(
    dataset_splits,
    batch_size=64,
    test_batch_size=64,
    num_workers=0,
)

In [None]:
# Use the first GPU when CUDA is available, the CPU otherwise.
# `device` is a torch.device on both branches (never a bare string), so code
# using it can rely on attributes such as `.type` and `.index` in either case.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# One YOLOv5-small replica per visible GPU, or a single replica on `device` without CUDA
if torch.cuda.is_available():
    yolo_devices = [f'cuda:{i}' for i in range(torch.cuda.device_count())]
else:
    yolo_devices = [device]

yolo_models = [
    torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True).to(yolo_device)
    for yolo_device in yolo_devices
]

In [None]:
import clip

# clip_backbone = 'ViT-B/32'
# clip_backbone = 'ViT-L/14'
# clip_backbone = 'ViT-L/14@336px'
# clip_backbone = 'RN50x16'
# clip_backbone = 'RN50x64'
# clip_backbone = 'RN101'

# CLIP backbones used for scoring (their similarities are averaged by the model)
clip_backbones = [
    'RN50x64',
    # 'RN50x16',
    # 'ViT-B/16',
    # 'ViT-L/14@336px',
    # 'ViT-B/32',
]

# Devices receiving a replica of every backbone
if torch.cuda.is_available():
    clip_devices = [f'cuda:{i}' for i in range(torch.cuda.device_count())]
else:
    clip_devices = [device]

# backbone name -> list of models / preprocessing functions, one per device
models = {backbone_name: [] for backbone_name in clip_backbones}
preprocesses = {backbone_name: [] for backbone_name in clip_backbones}

for clip_backbone in clip_backbones:
    for clip_device in clip_devices:
        model, preprocess = clip.load(clip_backbone, device=clip_device)

        models[clip_backbone].append(model)
        preprocesses[clip_backbone].append(preprocess)

In [None]:
# next(models['RN50x16'][0].parameters()).device

In [None]:
import torch

def cosine_similarity(a: torch.Tensor, b: torch.Tensor):
    """
    Cosine Similarity

    Normalizes both tensors a and b along their last dimension and returns
    <b, a.T> (inner product), so entry [i, j] is the similarity between b[i]
    and a[j].

    All-zero vectors are given similarity 0 with everything, instead of the NaN
    that dividing by their zero norm would produce.

    Returns:
        The similarity tensor, moved to the CPU.
    """

    a_length = a.norm(dim=-1, keepdim=True)
    b_length = b.norm(dim=-1, keepdim=True)

    # Clamp the norms to the smallest positive normal number of their dtype:
    # it only affects (near-)zero vectors and is representable in half precision
    a_norm = a / a_length.clamp_min(torch.finfo(a_length.dtype).tiny)
    b_norm = b / b_length.clamp_min(torch.finfo(b_length.dtype).tiny)

    similarity = (b_norm @ a_norm.T)

    return similarity.cpu()

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.patches as patches

def visualise_scores(scores: torch.Tensor, images, texts: list[str]):
    """
    Show every image once per text, with the matching score in the title.

    Args:
        scores: scores indexed as [text index, image index].
        images: images accepted by `Axes.imshow`.
        texts: prompts the images were scored against.
    """
    pairs = [
        (text_idx, image_idx, caption, picture)
        for text_idx, caption in enumerate(texts)
        for image_idx, picture in enumerate(images)
    ]

    # One figure per (text, image) pair
    for text_idx, image_idx, caption, picture in pairs:
        _, axis = plt.subplots()
        axis.imshow(picture)
        axis.set_title(f'Score: {scores[text_idx, image_idx]} / Prompt: {caption}')

In [None]:
# Collect every training caption, prefixed with "This is ".
# NOTE(review): iterating the loader also loads every image although only the
# captions are used, and the train loader drops its last incomplete batch.
all_prompts = []

for _images, _gt_bounding_boxes, prompts in train_loader:
    all_prompts.extend(
        'This is ' + prompt
        for sample in prompts
        for prompt in sample
    )

In [None]:
import numpy as np

# Draw the prompts from a dedicated, seeded generator.
# Assigning to `np.random.seed` would replace NumPy's seeding function with an
# integer rather than seed anything, so the seed is given to the generator itself.
# NOTE(review): the sample is reproducible only if `all_prompts` is built in the
# same order on every run.
rng = np.random.default_rng(42)
randomly_sampled_examples = rng.choice(all_prompts, 5000)

In [None]:
# backbone name -> list with the text features of each chunk of prompts
text_features_full = {}

# Tokenise once: the same tokens are fed to every backbone
text = clip.tokenize(randomly_sampled_examples).to(device)

# The prompts are encoded in 10 chunks rather than in a single pass
indices = range(text.shape[0])
batches = np.array_split(indices, 10)

for clip_backbone in clip_backbones:
    # Only the first replica of the backbone is used here
    encoder = models[clip_backbone][0]

    with torch.no_grad():
        text_features_full[clip_backbone] = [
            encoder.encode_text(text[batch]) for batch in batches
        ]

In [None]:
# Merge the per-chunk features into a single tensor per backbone (in place).
# NOTE(review): meant to run once after the encoding cell; a second run would
# hand already-concatenated tensors to torch.cat.
for backbone_name in clip_backbones:
    feature_chunks = text_features_full[backbone_name]
    text_features_full[backbone_name] = torch.cat(feature_chunks, dim=0)

In [None]:
import torch
import torch.nn as nn
import clip
import numpy as np

import matplotlib.pyplot as plt
import matplotlib.patches as patches

class CirclesModel(nn.Module):
    """
    Referring-expression localiser combining YOLO region proposals with CLIP scoring.

    For every image YOLO proposes bounding boxes; each proposal is rendered as
    `visual_augmentation` "visual prompts" (the full image with the proposal
    highlighted); CLIP scores every visual prompt against every text prompt and
    the proposal with the best score is returned for each text prompt.
    """

    def __init__(self, device=None, models=None, preprocesses=None, yolo_models=None, text_features_bias=None) -> None:
        """
        Args:
            device: initial device; overwritten by `forward` with the device of its `indices`.
            models: dict backbone name -> list of CLIP models, indexed by device index.
            preprocesses: dict backbone name -> list of CLIP preprocessing callables,
                indexed by device index.
            yolo_models: list of YOLO models, indexed by device index.
            text_features_bias: optional dict backbone name -> tensor of additional text
                features (or None) appended to the prompts' features before the scores
                are centred.

        Raises:
            ValueError: if the CLIP models / preprocesses or the YOLO models are missing.
        """
        super().__init__()

        if device:
            self.device = device
        else:
            self.device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

        if not models or not preprocesses:
            raise ValueError('Models and preprocesses for CLIP model should be provided')

        self.models = models
        self.preprocesses = preprocesses

        self.clip_backbones = list(self.models.keys())

        if not yolo_models:
            raise ValueError('Models for YOLO should be provided')
        self.yolo_models = yolo_models

        if not text_features_bias:
            # No bias given: disable it explicitly for every backbone.
            # The entries are stored on the instance and the argument is left untouched.
            self.text_features_bias = {backbone: None for backbone in self.clip_backbones}
        else:
            self.text_features_bias = text_features_bias

        # Number of visual prompts generated for each proposal
        self.visual_augmentation = 2

        self.transform_to_tensor = T.Compose([
            transforms.ToTensor()
        ])

    def forward(self, indices, images, prompts_list):
        """
        Predict one bounding box for every prompt of every selected image.

        Args:
            indices: integer tensor selecting the entries of `images` and `prompts_list`
                to process; its device selects the model replicas to use.
            images: list of PIL images.
            prompts_list: for each image, the list of its prompts.

        Returns:
            Tensor with one row (x1, y1, x2, y2) per (image, prompt) pair, in order.
            The row is all zeros when YOLO proposes no box for the image.
        """
        self.device = indices.device
        # The replicas are indexed by CUDA device ordinal
        if indices.is_cuda and self.device.index is not None:
            self.device_index = self.device.index
        else:
            self.device_index = 0

        # -- Getting the right data and moving it to the correct device --

        # Images remain on the CPU because they are PIL Images, not Tensors
        # Converting to Tensors leads to errors with YOLO
        images = [images[i] for i in indices]

        prompts_list = [prompts_list[i] for i in indices]
        prompts_list = self.update_prompts_with_this_is(prompts_list)
        prompts_tensor = [clip.tokenize(prompt_list).to(self.device) for prompt_list in prompts_list]

        # -- Actual processing --

        bounding_boxes = self.get_bounding_boxes(images)

        # One predicted bounding box per prompt, image after image
        overall_outputs = []

        with torch.no_grad():
            for idx, prompts_tensor_for_sample in enumerate(prompts_tensor):
                # Full images with one proposal highlighted, `visual_augmentation` per proposal
                image_crops = self.get_visual_prompts(images[idx], bounding_boxes.pred[idx])

                similarities = []
                for backbone in self.clip_backbones:
                    model = self.models[backbone][self.device_index]
                    preprocess = self.preprocesses[backbone][self.device_index]

                    preprocessed_image_crops = torch.stack(
                        [preprocess(image) for image in image_crops]
                    ).to(self.device)

                    # Encode the visual prompts a proposal at a time
                    visual_features = torch.cat([
                        model.encode_image(pic_batch)
                        for pic_batch in preprocessed_image_crops.split(self.visual_augmentation)
                    ])

                    text_features = model.encode_text(prompts_tensor_for_sample)

                    bias = self.text_features_bias.get(backbone)
                    if bias is not None:
                        text_features = torch.cat([text_features, bias.to(self.device)])

                    # Rows: prompts followed by the bias texts; columns: visual prompts
                    similarities.append(cosine_similarity(visual_features, text_features))

                # Average the similarities over the backbones
                similarity = torch.stack(similarities).mean(dim=0)

                # Centre the score of every visual prompt on its mean over all the
                # texts, then keep only the rows of the actual prompts
                average = similarity.mean(dim=0)
                scores = (similarity - average)[:len(prompts_tensor_for_sample)]

                # Score of a proposal = best score among its visual prompts, which are
                # stored next to each other along the columns
                final_scores = (
                    scores
                    .to(torch.get_default_dtype())
                    .reshape(scores.shape[0], scores.shape[1] // self.visual_augmentation, self.visual_augmentation)
                    .max(dim=-1)
                    .values
                )

                _, max_indices = final_scores.max(dim=-1)

                proposals = bounding_boxes.xyxy[idx]
                for max_idx in max_indices:
                    if len(proposals) == 0:
                        # No region proposal for this image: output an empty box with
                        # the dtype of the real proposals so that the rows can be stacked
                        predicted_box = torch.zeros(4, dtype=proposals.dtype, device=self.device)
                    else:
                        predicted_box = proposals[max_idx, 0:4].clone().detach().to(self.device)
                    overall_outputs.append(predicted_box)

        return torch.stack(overall_outputs)

    def get_prompts(self, sample):
        """Return the sentences of a dataset annotation entry."""
        return [prompt['sent'] for prompt in sample['image']['sentences']]

    def update_prompts_with_this_is(self, prompts):
        """Prefix every prompt of every sample with 'This is '."""
        return [['This is ' + prompt for prompt in sample] for sample in prompts]

    def get_bounding_boxes(self, pil_images):
        """Run the YOLO replica of the current device on the images and return its results."""
        bounding_boxes = self.yolo_models[self.device_index](pil_images)
        return bounding_boxes

    @staticmethod
    def _box_corners(bounding_box):
        """Return the first four entries of a proposal tensor as a tuple of Python numbers."""
        return (bounding_box[0].item(), bounding_box[1].item(), bounding_box[2].item(), bounding_box[3].item())

    @staticmethod
    def _dark_overlay(image_alpha, mask, alpha, blur_radius):
        """
        Build the RGBA layer used to darken what lies outside `mask`.

        The layer holds the original pixels where `mask` is set and black elsewhere;
        its opacity is `alpha * (255 - m)`, `m` being the blurred mask.
        """
        overlay = Image.new('RGB', image_alpha.size, (0, 0, 0))
        overlay.paste(image_alpha.convert('RGB'), mask=mask)
        blurred_mask = mask.filter(ImageFilter.GaussianBlur(radius=blur_radius))
        overlay.putalpha(blurred_mask.point(lambda x: alpha * (255 - x)))
        return overlay.convert('RGBA')

    @staticmethod
    def _mark_on_background(image, background, bbox, stroke_color, stroke_width):
        """
        Paste the rectangle `bbox` of `image` onto `background` (modified in place)
        and outline the ellipse inscribed in `bbox`.
        """
        mask = Image.new('L', image.size, 0)
        ImageDraw.Draw(mask).rectangle(bbox, fill=255)
        background.paste(image, mask=mask)
        ImageDraw.Draw(background).ellipse(bbox, outline=stroke_color, width=stroke_width)
        return background

    def draw_circle(self, image_alpha, bounding_box):
        """Return a copy of the RGBA image with a red ellipse drawn in the bounding box."""
        new_img = Image.new('RGBA', image_alpha.size, (0, 0, 0, 0))
        draw = ImageDraw.Draw(new_img)
        draw.ellipse(self._box_corners(bounding_box), outline='red', width=2)

        return Image.alpha_composite(image_alpha, new_img)

    def draw_circle_and_darken(self, image_alpha, bounding_box):
        """Outline the ellipse inscribed in the box and darken what lies outside it."""
        bbox = self._box_corners(bounding_box)

        circle_mask = Image.new('L', image_alpha.size, 0)
        ImageDraw.Draw(circle_mask).ellipse(bbox, fill=255)

        overlay = self._dark_overlay(image_alpha, circle_mask, alpha=0.15, blur_radius=10)
        darkened_image = Image.alpha_composite(image_alpha, overlay)

        ImageDraw.Draw(darkened_image).ellipse(bbox, outline='red', width=4)

        return darkened_image

    def draw_circle_small_and_darken(self, image_alpha, bounding_box):
        """Same as `draw_circle_and_darken`, with a stronger darkening and a thinner outline."""
        bbox = self._box_corners(bounding_box)

        circle_mask = Image.new('L', image_alpha.size, 0)
        ImageDraw.Draw(circle_mask).ellipse(bbox, fill=255)

        overlay = self._dark_overlay(image_alpha, circle_mask, alpha=0.4, blur_radius=10)
        darkened_image = Image.alpha_composite(image_alpha, overlay)

        ImageDraw.Draw(darkened_image).ellipse(bbox, outline='red', width=3)

        return darkened_image

    def draw_circle_small_and_blur_and_darken(self, image_alpha, bounding_box):
        """
        Outline the ellipse inscribed in the box, darken and slightly blur what lies
        outside it, and keep the original pixels inside it.
        """
        bbox = self._box_corners(bounding_box)

        circle_mask = Image.new('L', image_alpha.size, 0)
        ImageDraw.Draw(circle_mask).ellipse(bbox, fill=255)

        overlay = self._dark_overlay(image_alpha, circle_mask, alpha=0.3, blur_radius=10)
        blurred_background = image_alpha.filter(ImageFilter.GaussianBlur(radius=1.5))
        darkened_image = Image.alpha_composite(blurred_background, overlay)
        # Restore the sharp original pixels inside the ellipse
        darkened_image.paste(image_alpha.convert('RGB'), mask=circle_mask)

        ImageDraw.Draw(darkened_image).ellipse(bbox, outline='red', width=2)

        return darkened_image

    def draw_rectangle_and_darken(self, image_alpha, bounding_box):
        """Outline the bounding box itself and darken what lies outside it."""
        bbox = self._box_corners(bounding_box)

        rectangle_mask = Image.new('L', image_alpha.size, 0)
        ImageDraw.Draw(rectangle_mask).rectangle(bbox, fill=255)

        overlay = self._dark_overlay(image_alpha, rectangle_mask, alpha=0.15, blur_radius=15)
        darkened_image = Image.alpha_composite(image_alpha, overlay)

        ImageDraw.Draw(darkened_image).rectangle(bbox, outline='red', width=2)

        return darkened_image

    def get_highlighted_bounding_boxes(self, image, bounding_boxes):
        """
        Return one copy of the image per bounding box, with that box highlighted.

        Bounding boxes in the form:
        [top left x, top left y, bottom right x, bottom right y, confidence, category]

        The image itself is returned (as the only element) when there is no box.
        """
        image_alpha = image.convert('RGBA')

        highlighted_bounding_boxes = [
            self.draw_circle_small_and_blur_and_darken(image_alpha, bounding_box)
            for bounding_box in bounding_boxes
        ]

        if len(highlighted_bounding_boxes) == 0:
            highlighted_bounding_boxes.append(image)

        return highlighted_bounding_boxes

    def get_image_with_marker(self, image, bbox, stroke_color='red', stroke_width=1):
        """Return a copy of the image with the ellipse inscribed in `bbox` outlined."""
        result = image.copy()
        draw = ImageDraw.Draw(result)
        draw.ellipse(bbox, outline=stroke_color, width=stroke_width)

        return result

    def get_image_with_marker_and_blur(self, image, bbox, stroke_color='red', stroke_width=1, blur_radius=1):
        """Blur the image outside the rectangle `bbox` and outline the inscribed ellipse."""
        background = image.filter(ImageFilter.GaussianBlur(radius=blur_radius))

        return self._mark_on_background(image, background, bbox, stroke_color, stroke_width)

    def get_image_with_marker_and_grayscale(self, image, bbox, stroke_color='red', stroke_width=1):
        """Turn the image to grayscale outside the rectangle `bbox` and outline the inscribed ellipse."""
        background = image.convert('L').convert('RGB')

        return self._mark_on_background(image, background, bbox, stroke_color, stroke_width)

    def get_image_with_marker_and_blur_grayscale(self, image, bbox, stroke_color='red', stroke_width=1, blur_radius=1):
        """Blur and turn to grayscale the image outside the rectangle `bbox` and outline the inscribed ellipse."""
        background = image.filter(ImageFilter.GaussianBlur(radius=blur_radius)).convert('L').convert('RGB')

        return self._mark_on_background(image, background, bbox, stroke_color, stroke_width)

    def get_visual_prompts(self, image, bounding_boxes):
        """
        Render the visual prompts of an image.

        Every bounding box yields `visual_augmentation` consecutive images: the
        blurred variant followed by the blurred grayscale variant. Without any
        bounding box, the untouched image is returned `visual_augmentation` times.
        """
        # Must match the number of variants generated for each box below
        self.visual_augmentation = 2
        visual_prompts = []

        stroke_color = 'red'
        stroke_width = 3
        blur_radius = 20

        for bounding_box in bounding_boxes:
            bounding_box = self._box_corners(bounding_box)

            visual_prompts.extend([
                self.get_image_with_marker_and_blur(image, bounding_box, stroke_color=stroke_color, stroke_width=stroke_width, blur_radius=blur_radius),
                self.get_image_with_marker_and_blur_grayscale(image, bounding_box, stroke_color=stroke_color, stroke_width=stroke_width, blur_radius=blur_radius),
            ])

        if len(visual_prompts) == 0:
            # If no region proposal, return the whole image.
            # It is inserted as many times as each region would
            # be augmented to ensure consistency in the algorithm
            for _ in range(self.visual_augmentation):
                visual_prompts.append(image)

        return visual_prompts

    def get_cropped_bounding_boxes(self, image, bounding_boxes):
        """
        Return the crop of every bounding box, tinted according to its horizontal position.

        Bounding boxes in the form:
        [top left x, top left y, bottom right x, bottom right y, confidence, category]

        The tint is read from the module-level `overlay_colors` list: first colour
        when the centre of the box is in the left half of the image, second colour
        in the right half, last colour when it is exactly in the middle.
        The image itself is returned (as the only element) when there is no box.
        """
        cropped_bounding_boxes = []

        image_width, _ = image.size

        for bounding_box in bounding_boxes:
            left, top, right, bottom = self._box_corners(bounding_box)

            cropped_img = image.crop((left, top, right, bottom))

            # Horizontal centroid: (min + (max - min) / 2) / dimension
            centroid_x = (left + (right - left) / 2) / image_width

            if centroid_x < 0.5:
                overlay_color = overlay_colors[0]
            elif centroid_x > 0.5:
                overlay_color = overlay_colors[1]
            else:
                overlay_color = overlay_colors[-1]

            overlay = Image.new('RGBA', cropped_img.size, overlay_color)
            cropped_bounding_boxes.append(Image.alpha_composite(cropped_img.convert('RGBA'), overlay))

        if len(cropped_bounding_boxes) == 0:
            cropped_bounding_boxes.append(image)

        return cropped_bounding_boxes

# Semi-transparent RGBA tints read by CirclesModel.get_cropped_bounding_boxes
overlay_colors = [
    (255, 0, 0, 128),   # Red, alpha = 0.5
    (0, 255, 0, 128),   # Green, alpha = 0.5
    (0, 0, 255, 128),   # Blue, alpha = 0.5
    (0, 0, 0, 0),       # None
]

circles_model = CirclesModel(
    models=models,
    preprocesses=preprocesses,
    yolo_models=yolo_models,
    text_features_bias=text_features_full,
)

# Wrap the model for multi-GPU execution when more than one GPU is visible
if torch.cuda.device_count() > 1:
    circles_model = torch.nn.DataParallel(circles_model)

In [None]:
from torchvision.ops import box_iou

def iou_metric(bounding_boxes, ground_truth_bounding_boxes):
    """
    Localization Accuracy Metric

    Intersection over Union (IoU) is a common metric measure for localization accuracy.

    Args:
        bounding_boxes: tensor of predicted boxes, one (x1, y1, x2, y2) row per box.
        ground_truth_bounding_boxes: a single (x1, y1, x2, y2) box or a sequence of
            such boxes.

    Returns:
        The result of `box_iou`: one row per predicted box, one column per
        ground-truth box.
    """

    # Always a 2-D (boxes, 4) tensor, placed on the device of the predictions so
    # that the metric does not depend on a global device
    ground_truth_bounding_boxes = (
        torch.as_tensor(ground_truth_bounding_boxes)
        .reshape(-1, 4)
        .to(bounding_boxes.device)
    )

    return box_iou(bounding_boxes, ground_truth_bounding_boxes)

def cosine_similarity_metric(bounding_boxes, ground_truth_bounding_boxes):
    """
    Cosine Similarity Metric

    Cosine similarity is a common metric measure for semantic similarity.

    Args:
        bounding_boxes: tensor of predicted boxes.
        ground_truth_bounding_boxes: ground-truth box(es), as a tensor or as nested
            sequences of numbers.

    Returns:
        The result of `cosine_similarity(bounding_boxes, ground_truth_bounding_boxes)`.
    """

    # Placed on the device of the predictions so that the metric does not depend
    # on a global device
    ground_truth_bounding_boxes = torch.as_tensor(ground_truth_bounding_boxes).to(bounding_boxes.device)

    return cosine_similarity(bounding_boxes, ground_truth_bounding_boxes)

### Tests

In [None]:
idx = 17 # man on the beach with frisbee
# idx = 20 # motorbikes
# idx = 22 # cows on a beach
# idx = 25 # three oranges and a banana
# idx = 32 # guy with a horse and two busses
# idx = 33 # luggage
# idx = 34 # man on bed in front of a window
# idx = 35 # two zebras
# idx = 36 # two horses
# idx = 38 # chairs around a table with some sweets on top
# idx = 39 # two monitors
# idx = 42 # folks playing wii
# idx = 43 # yellow vehicle and surfboard
# idx = 44 # two women playing tennis
# idx = 45 # woman with a thing of bananas
# idx = 46 # industrial kitchen stove
# idx = 47 # two guys, one has a beard
# idx = 49 # girl eating pizza
# idx = 50 # vertical fork
# idx = 51 # sandwiches
# idx = 54 # computer on the right
# idx = 57 # woman playing tennis

# Load the first test batch a single time (the test loader does not shuffle)
# and read the three fields of the selected sample from it
first_test_batch = next(iter(test_loader))
img, bbox_gt, prompt = (field[idx] for field in first_test_batch)

img

In [None]:
# Referring expressions and ground-truth bounding box of the selected sample
prompt, bbox_gt

In [None]:
# prompt = ['the man on the right with a red overlay', 'the man with a blue shirt']#, 'a photo of a man who is about to throw a frisbee'] # idx == 17

# prompt = ['the red motorcycle with a blue overlay'] # idx == 20
# prompt = ['a red & black color bike in ftont of the three guys'] # idx == 20

# prompt = ['the smaller animal'] # idx == 22

# prompt = ['the orange closest to the banana',
#     'orange with a green overlay',
#  'orange between other oranges and a banana',
    # 'A photo of a orange',
    # 'A photo of a dining table',
    # 'A photo of a banana'
    # ] # idx == 25

# prompt = ['the orange closest to the banana with a red overlay']

# prompt = ['near zebra with a red overlay', 'zebra eating grass with a red overlay'] # idx == 35

# prompt = ['the man with glasses'] # idx == 32

# prompt = [
#     'a man with beard wearing blue shirt with his friend',
#     'a man with a beard',
# ] # idx == 47

# prompt = ['the right computer in the right hand picture with a green overlay',
#  'the computer on the right in the right hand picture with a green overlay'] # idx == 54

# prompt = ['the woman on the right']# with a green overlay',
#  'the girl with the racket in the photo on the right with a green overlay'] # idx == 57

In [None]:
# transform = T.Compose([
#     T.Resize(size=224, interpolation=T.InterpolationMode.BICUBIC, max_size=None, antialias='warn'),
#     T.CenterCrop(size=(224, 224)),
#     T.ToTensor(),
# ])

# img_tensor = transform(img)
# print(img_tensor.shape)
# res = yolo_models[0](torch.stack([img_tensor]))

# Run the first YOLO model directly on the selected image
res = yolo_models[0](img)
# res.pred[0].cpu().numpy()[:, -1]
# Predictions for this image as a NumPy array
# (presumably one row per proposal: x1, y1, x2, y2, confidence, class — TODO confirm)
res.pred[0].cpu().numpy() 

In [None]:
# Draw a "spotlight" for every detection: the ellipse inscribed in the box
# stays sharp, everything outside is blurred and darkened, and the ellipse is
# outlined in red. One figure is produced per detection.
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw, ImageFilter

alpha = 0.2  # opacity of the black overlay applied outside the ellipse

# Loop-invariant: the blurred RGBA copy of the image does not depend on the box,
# and `Image.alpha_composite` returns a new image without modifying its inputs.
blurred_background = img.convert('RGBA').filter(ImageFilter.GaussianBlur(radius=2))

for res_pred in res.pred[0]:
    # The first four values of a prediction row are used as the ellipse's
    # bounding box; hand PIL plain Python floats rather than a NumPy array.
    bbox = res_pred[0:4].cpu().tolist()

    # White ellipse on black: selects the region that must stay untouched.
    circle_mask = Image.new('L', img.size, 0)
    ImageDraw.Draw(circle_mask).ellipse(bbox, fill=255)

    # Black overlay, `alpha` opaque outside the ellipse and fading to fully
    # transparent inside it (soft edge from the blurred mask).
    blurred_mask = circle_mask.filter(ImageFilter.GaussianBlur(radius=10))
    darkened_image = Image.new('RGB', img.size, (0, 0, 0))
    darkened_image.paste(img, mask=circle_mask)
    # Explicit int() so the 8-bit lookup table holds integers regardless of how
    # the installed Pillow version coerces float entries.
    darkened_image.putalpha(blurred_mask.point(lambda x: int(alpha * (255 - x))))
    darkened_image = Image.alpha_composite(blurred_background, darkened_image.convert('RGBA'))

    # Restore the original (sharp) pixels inside the ellipse and outline it.
    darkened_image.paste(img, mask=circle_mask)
    ImageDraw.Draw(darkened_image).ellipse(bbox, outline='red', width=4)

    fig, ax = plt.subplots()
    ax.imshow(darkened_image)

In [None]:
# Display the prompt entry of the selected sample.
prompt

In [None]:
# Single-sample forward pass: one index, one image, one prompt entry.
# NOTE(review): wrapping `range(1)` in a list appears to give `indices` the
# shape (1, 1), whereas the batch cells build a 1-D index tensor; confirm
# which layout `circles_model` expects.
single_sample_index = [range(1)]
indices = torch.tensor(single_sample_index).to(device)

# To experiment, replace `[prompt]` with a hand-written prompt list, e.g.
# ['the woman on the left, the girl with the racket in the photo on the left']
outputs = circles_model(indices, [img], [prompt])

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Overlay the ground-truth box and the model's boxes on the selected image.
fig, ax = plt.subplots()
ax.imshow(img)

# One colour per box, in order; the first colour goes to the ground truth.
# Boxes beyond the number of colours are not drawn (zip stops at the shorter).
colors = ['r', 'b', 'g', 'yellow', 'orange']

# `as_tensor` accepts a list/array as well as an existing tensor, without the
# copy-construct warning that `torch.tensor(<tensor>)` emits.
gt_box = torch.as_tensor(bbox_gt).unsqueeze(0).to(device)
gt_and_outputs = torch.cat([gt_box, outputs])

for bbox_idx, (box, color) in enumerate(zip(gt_and_outputs, colors)):
    # Boxes are read as (x_min, y_min, x_max, y_max). Convert to plain Python
    # floats so matplotlib does not have to deal with 0-d tensors.
    x_min, y_min, x_max, y_max = box.detach().cpu().tolist()[:4]

    # Rectangle parameters: (x, y) of the top-left corner, width, height.
    # The ground truth (first box) gets a thicker outline.
    rect = patches.Rectangle(
        (x_min, y_min),
        x_max - x_min,
        y_max - y_min,
        linewidth=2 if bbox_idx == 0 else 1,
        edgecolor=color,
        facecolor='none',
    )
    ax.add_patch(rect)

ax.set_title(prompt)

### To compute average cosine similarity between embeddings

In [None]:
# Take the first test batch and run the model on two consecutive samples of it.
images, gt_bounding_boxes, prompts = next(iter(test_loader))

idx_to_use = 3
sample_pair = slice(idx_to_use, idx_to_use + 2)  # samples idx_to_use and idx_to_use + 1

# One index per image in the batch; only the first two are handed to the model.
indices = torch.tensor([*range(len(images))]).to(device)
outputs = circles_model(indices[:2], images[sample_pair], prompts[sample_pair])

In [None]:
# Show each sample that was fed to the model, followed by the region selected
# for each of its prompts.
#
# `outputs` is flat: one box per prompt, concatenated in sample order (the
# metrics cell regroups it the same way, by `len(prompts[i])`). It is therefore
# consumed with a running offset so that every box is cropped from the image it
# belongs to; cropping all boxes from `images[idx_to_use]` would apply the
# second sample's boxes to the first sample's picture.
output_offset = 0
for sample_idx in range(idx_to_use, min(idx_to_use + 2, len(images))):
    sample_image = images[sample_idx]
    sample_prompts = prompts[sample_idx]
    sample_outputs = outputs[output_offset:output_offset + len(sample_prompts)]

    fig, ax = plt.subplots()
    ax.imshow(sample_image)
    ax.set_title(f'sample {sample_idx}')

    for prompt_text, output in zip(sample_prompts, sample_outputs):
        fig, ax = plt.subplots()
        # PIL expects the crop box as a 4-item sequence of numbers.
        ax.imshow(sample_image.crop(tuple(output.cpu().tolist())))
        ax.set_title(str(prompt_text))

    output_offset += len(sample_prompts)

In [None]:
# Run the model over the test loader and collect its raw outputs, one entry
# per batch, in `overall_outputs`.
#
# MAX_BATCHES caps how many batches are processed: 1 keeps the cost of a quick
# single-batch run; set it to None to go through the whole loader.
MAX_BATCHES = 1

overall_outputs = []

for batch_idx, (images, gt_bounding_boxes, prompts) in enumerate(test_loader):
    print(f'-- Batch index: {batch_idx} --')

    indices = torch.tensor(list(range(len(images)))).to(device)
    outputs = circles_model(indices, images, prompts)

    # Store the result BEFORE the early exit: with the append placed after an
    # unconditional `break` it is never reached and the list stays empty.
    overall_outputs.append(outputs)

    # Checked at the end of the iteration so that no extra batch is loaded.
    if MAX_BATCHES is not None and batch_idx + 1 >= MAX_BATCHES:
        break

In [None]:
# Flatten the collected per-batch outputs into a single array of Python
# scalars, then average them while ignoring NaN entries.
cos_sim_cpu = np.array([
    cos_sim_val.item()
    for out in overall_outputs
    for cos_sim_val in out
])
np.nanmean(cos_sim_cpu)

### To compute standard metrics

In [None]:
from torchvision.ops import boxes as box_ops

# Evaluate the model on the whole test loader, collecting the values returned
# by `iou_metric` and `cosine_similarity_metric` for every sample.
IoUs = []
cosine_similarities = []

for batch_idx, (images, gt_bounding_boxes, prompts) in enumerate(test_loader):
    print(f'-- Batch index: {batch_idx} --')

    indices = torch.tensor(list(range(len(images)))).to(device)
    outputs = circles_model(indices, images, prompts)

    # There is one output bounding box for each prompt given in input. Each
    # sample's "prompt" is actually a list of prompts, so it can contain an
    # arbitrary number of prompts and there is one box for each of them.
    # `outputs` is flat, so it is split back into per-sample groups by walking
    # through it with a running offset.
    outputs_grouped_by_sample = []
    outputs_idx = 0
    for sample_idx in range(len(images)):
        n_prompts = len(prompts[sample_idx])
        outputs_grouped_by_sample.append(outputs[outputs_idx:outputs_idx + n_prompts])
        outputs_idx += n_prompts

    # Score every sample's boxes against that sample's ground truth.
    for output_bboxes, gt_bboxes in zip(outputs_grouped_by_sample, gt_bounding_boxes):
        result_ious = iou_metric(output_bboxes, gt_bboxes)
        result_cosine_similarity = cosine_similarity_metric(output_bboxes, gt_bboxes)

        IoUs.extend(result_ious)
        cosine_similarities.extend(result_cosine_similarity)

In [None]:
# Summarise the IoU list: how many values are exactly 0, how many reach the
# 0.5 threshold, the total count, and the fraction reaching the threshold.
counter = sum(1 for iou in IoUs if iou == 0)
counter_threshold = sum(1 for iou in IoUs if iou >= 0.5)

counter, counter_threshold, len(IoUs), counter_threshold / len(IoUs)

In [None]:
def _item_or_zero(value):
    """Return `value.item()` for a tensor; every non-tensor entry counts as 0."""
    if torch.is_tensor(value):
        return value.item()
    return 0


# Convert the collected metric values to NumPy arrays of Python scalars and
# average each of them, ignoring NaN entries.
IoUs_to_cpu = np.array(list(map(_item_or_zero, IoUs)))
cosine_similarities_to_cpu = np.array(list(map(_item_or_zero, cosine_similarities)))

mIoU = np.nanmean(IoUs_to_cpu)
m_cos_sim = np.nanmean(cosine_similarities_to_cpu)

print('--- Metrics ---')
print(f'Mean Intersection over Union (mIoU): {mIoU}')
print(f'Mean Cosine Similarity: {m_cos_sim}')

In [None]:
# Persist the per-value metrics, one value per line, as CSV files.
# `np.savetxt` does not create missing directories, so make sure the target
# folder exists first (a no-op when it is already there).
os.makedirs('outcomes', exist_ok=True)

np.savetxt('outcomes/iou_using_2_visual_augmentation_only_RN50x64.csv', IoUs_to_cpu, delimiter=',')
np.savetxt('outcomes/cossim_using_2_visual_augmentation_only_RN50x64.csv', cosine_similarities_to_cpu, delimiter=',')

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches

# Which sample of the last evaluated batch, and which of its prompts, to show.
output_idx = 0
prompt_idx = 1
# Samples can carry a single prompt: an index past the end would draw nothing
# and make the title lookup raise IndexError, so fall back to the last prompt.
prompt_idx = min(prompt_idx, len(prompts[output_idx]) - 1)

# Loading the image
img = images[output_idx]

# Preparing the output and displaying the image
fig, ax = plt.subplots()
ax.imshow(img)

colors = ['r', 'b', 'g']

# Only the box predicted for the selected prompt is drawn.
selected_boxes = outputs_grouped_by_sample[output_idx][prompt_idx:prompt_idx + 1]

for box, color in zip(selected_boxes, colors):
    # Boxes are read as (x_min, y_min, x_max, y_max); convert to plain Python
    # floats so matplotlib does not have to deal with 0-d tensors.
    x_min, y_min, x_max, y_max = box.detach().cpu().tolist()[:4]

    # Rectangle parameters: (x, y) of the top-left corner, width, height.
    rect = patches.Rectangle(
        (x_min, y_min),
        x_max - x_min,
        y_max - y_min,
        linewidth=1,
        edgecolor=color,
        facecolor='none',
    )
    ax.add_patch(rect)

ax.set_title(prompts[output_idx][prompt_idx])

In [None]:
# Build one `torch.cuda.device` object per CUDA device that PyTorch reports,
# then display the list (empty when no CUDA device is available).
gpu_count = torch.cuda.device_count()
available_gpus = list(map(torch.cuda.device, range(gpu_count)))
available_gpus