# OWL-ViT predictions
* Runs OWL-ViT on each image to obtain its predicted labels, which are later used by ProbCover.

In [1]:
from torchvision import datasets
import torch
import numpy as np
import requests
from PIL import Image
import torch
from transformers import OwlViTProcessor, OwlViTForObjectDetection

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local COCO train2017 locations: the image directory and the instance-annotation JSON.
path2data = "/home/ubuntu/mmdetection_od/mmdetection/data/coco/images/train2017"
path2json = "/home/ubuntu/mmdetection_od/mmdetection/data/coco/annotations/instances_train2017.json"

In [3]:
# Build the torchvision COCO detection dataset from the image directory and
# annotation file configured above; indexing it yields (image, annotations) pairs.
train_dataset = datasets.CocoDetection(root=path2data, annFile=path2json)

loading annotations into memory...
Done (t=15.71s)
creating index...
index created!


In [4]:
# Extract the indices of the annotated images.
# The text file is read one entry per line; each entry is kept as a stripped
# string here and converted to int in a later cell.
with open('/home/ubuntu/master_thesis/covering_lens/TypiClust/deep-al/pycls/datasets/annotated_train_images_indices_2017.txt', 'r') as f:
    lines = f.readlines()

# Skip blank lines (e.g. a trailing empty line at the end of the file): they
# would otherwise be stored as '' and make the later int() conversion raise
# ValueError.
stripped_lines = (line.strip() for line in lines)
indices_annotated_images = [entry for entry in stripped_lines if entry]

In [5]:
# Cast every index string to an integer, then restrict the full COCO training
# set to exactly those positions.
int_indices_annotated_images = list(map(int, indices_annotated_images))
dataset = torch.utils.data.Subset(
    train_dataset,
    int_indices_annotated_images,
)

In [6]:
# Sanity check: number of images selected by the annotated-index subset.
len(dataset)

117266

In [7]:
from tqdm import tqdm

# NOTE: these two lists are never filled in this cell; they are kept only in
# case other cells still refer to them.
image_features = []
image_file_names = []

processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")

# Text queries passed to OWL-ViT: the MS COCO class names.
# NOTE(review): the first query is 'human', whereas COCO itself names this
# category 'person' -- confirm that downstream code expects 'human'.
texts = [
    'human', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
    'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat',
    'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack',
    'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple',
    'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
    'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

# Position in `dataset` at which processing starts; set to 0 to run over
# every image instead of only the tail of the subset.
START_IDX = 100000
# Detections scoring below this value are discarded.
score_threshold = 0.1

# Maps image file name -> {'label': [...], 'score': [...]} holding the class
# names and rounded confidences of the detections that pass the threshold.
dict_image = {}
for idx in tqdm(range(START_IDX, len(dataset))):
    # Get the image and its annotations.
    img, target = dataset[idx]

    # The file name is resolved through the image id stored on the first
    # annotation, so this assumes every image in `dataset` has at least one
    # annotation (an empty `target` would raise IndexError here).
    image_id = target[0]['image_id']
    image_info = train_dataset.coco.loadImgs(image_id)[0]
    image_file_name = image_info['file_name']

    inputs = processor(text=texts, images=img, return_tensors="pt")

    # Inference only: disabling autograd avoids building a computation graph
    # for every forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

        # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
        target_sizes = torch.Tensor([img.size[::-1]])
        # Convert outputs (bounding boxes and class logits) to COCO API.
        # NOTE(review): newer transformers releases provide
        # `post_process_object_detection` for this step -- confirm which one
        # the installed version supports before switching.
        results = processor.post_process(outputs=outputs, target_sizes=target_sizes)

    # A single image is processed per forward pass, so its predictions are
    # the first (and only) entry of `results`.
    scores, labels = results[0]["scores"], results[0]["labels"]

    # Keep only the detections at or above the threshold, preserving their
    # original order; labels are mapped back to their text query.
    keep = scores >= score_threshold
    dict_image[image_file_name] = {
        'label': [texts[label] for label in labels[keep].tolist()],
        'score': [round(score, 3) for score in scores[keep].tolist()],
    }


100%|██████████| 10/10 [00:08<00:00,  1.17it/s]


In [8]:
import pandas as pd
# Turn the nested prediction dict into a table: each image file name becomes a
# column, and the inner keys ('label', 'score') become the two rows.
df_mscoco=pd.DataFrame(dict_image)

In [9]:
# Preview the prediction table (one column per image file).
df_mscoco.head()

Unnamed: 0,000000496053.jpg,000000496058.jpg,000000496059.jpg,000000496064.jpg,000000496065.jpg,000000496073.jpg,000000496078.jpg,000000496081.jpg,000000496089.jpg,000000496090.jpg,...,000000581887.jpg,000000581899.jpg,000000581900.jpg,000000581903.jpg,000000581904.jpg,000000581906.jpg,000000581909.jpg,000000581913.jpg,000000581921.jpg,000000581929.jpg
label,"[tv, tv, tv, handbag, human, tv, remote, remot...","[surfboard, human, surfboard, boat, surfboard,...","[traffic light, traffic light, bus, traffic li...","[microwave, microwave, clock, clock, clock, bo...",[],[train],"[bird, sports ball]","[truck, horse, horse, horse]",[],[],...,"[traffic light, car, car, car, truck, car, car...",[],"[chair, chair, handbag, chair, motorcycle]",[bottle],"[clock, handbag]",[],"[boat, boat, boat, boat]","[donut, donut, donut]","[human, snowboard]","[horse, horse, horse, horse]"
score,"[0.468, 0.478, 0.389, 0.126, 0.1, 0.469, 0.143...","[0.142, 0.104, 0.207, 0.127, 0.124, 0.389, 0.1...","[0.39, 0.407, 0.244, 0.104, 0.186, 0.137, 0.12...","[0.138, 0.16, 0.13, 0.102, 0.112, 0.135, 0.134...",[],[0.18],"[0.114, 0.134]","[0.129, 0.103, 0.189, 0.262]",[],[],...,"[0.354, 0.106, 0.18, 0.127, 0.185, 0.125, 0.17...",[],"[0.101, 0.125, 0.161, 0.184, 0.3]",[0.113],"[0.534, 0.118]",[],"[0.109, 0.107, 0.2, 0.18]","[0.276, 0.242, 0.128]","[0.129, 0.464]","[0.654, 0.145, 0.6, 0.111]"


In [10]:
# write dataframe to a csv file
# NOTE: index=False means the 'label' / 'score' row names are not written, so
# a reader of predictions.csv has to rely on the row order to tell them apart.
df_mscoco.to_csv('predictions.csv', index=False)