In [1]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.78-py3-none-any.whl.metadata (35 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Downloading ultralytics-8.3.78-py3-none-any.whl (921 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m921.5/921.5 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.14-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.78 ultralytics-thop-2.0.14


In [2]:
import os
import glob
import cv2
import torch
import numpy as np
from ultralytics import YOLO
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from torchvision.ops import box_iou
from PIL import Image
import json
from google import genai
import matplotlib.pyplot as plt

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


# Parse data path

In [3]:
keyframe_dir = '/kaggle/input/vqa-dataset/vqa_dataset/images'
all_image_paths = {}
for obj in sorted(os.listdir(keyframe_dir)):
    all_image_paths[obj] =  []
for data_part in sorted(all_image_paths.keys()):
    data_part_path = f'{keyframe_dir}/{data_part}'
    if os.path.isdir(data_part_path):
        image_path = sorted(os.listdir(data_part_path))
        keyframe_paths = [f'{data_part_path}/{image}' for image in image_path]
        all_image_paths[data_part] = keyframe_paths

In [4]:
len(all_image_paths.keys())

32

# Helper function

In [5]:
class VisualEncoding:
    def __init__(self,
                classes = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
                           'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
                           'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
                           'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
                           'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
                           'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
                           'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork',
                           'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
                           'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
                           'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
                           'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
                           'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
                           'scissors', 'teddy bear', 'hair drier', 'toothbrush'),
                colors = ('black', 'blue', 'brown', 'green', 'grey', 'orange_', 'pink', 'purple',
                          'red', 'white', 'yellow'),
                row_str = ["0", "1", "2", "3", "4", "5", "6"],
                col_str = ["a", "b", "c", "d", "e", "f", "g"]):
        
        self.classes = classes
        self.colors = colors
        self.classes2idx = dict()
        for i, class_ in enumerate(classes):
            self.classes2idx[class_] = i
        self.n_row = len(row_str)
        self.n_col = len(col_str)

        x_pts = np.linspace(0, 1, self.n_row+1)
        y_pts = np.linspace(0, 1, self.n_col+1)

        self.grid_bboxes = []
        self.grid_labels = []
        for i in range(self.n_row):
            for j in range(self.n_col):
                label = col_str[j] + row_str[i]
                self.grid_bboxes.append([x_pts[j], y_pts[i], x_pts[j+1], y_pts[i+1]])
                self.grid_labels.append(label)

        self.grid_bboxes = np.array(self.grid_bboxes)

In [6]:
import numpy as np

def compute_iou(box1, box2):
    """Compute IoU (Intersection over Union) between two bounding boxes."""
    x1, y1, x2, y2 = box1
    x1_p, y1_p, x2_p, y2_p = box2

    # Compute intersection
    xi1, yi1 = max(x1, x1_p), max(y1, y1_p)
    xi2, yi2 = min(x2, x2_p), min(y2, y2_p)
    intersection = max(0, xi2 - xi1) * max(0, yi2 - yi1)

    # Compute union
    area_box1 = (x2 - x1) * (y2 - y1)
    area_box2 = (x2_p - x1_p) * (y2_p - y1_p)
    union = area_box1 + area_box2 - intersection

    # Compute IoU
    return intersection / union if union > 0 else 0

def filter_overlapping_boxes(bboxes, labels, iou_threshold=0.7):
    """Remove objects that have the same label and overlap more than the given IoU threshold."""
    keep = []
    for i in range(len(bboxes)):
        discard = False
        for j in range(len(keep)):
            if labels[i] == labels[keep[j]]:  # Same label
                iou = compute_iou(bboxes[i], bboxes[keep[j]])
                if iou > iou_threshold:
                    discard = True
                    break
        if not discard:
            keep.append(i)
    
    return bboxes[keep], labels[keep]

# Run inference

In [7]:
!pip install -U -q "google-genai"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.7/130.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = YOLO('yolo11l.pt')  # pretrained YOLOv8n model

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11l.pt to 'yolo11l.pt'...


100%|██████████| 49.0M/49.0M [00:00<00:00, 208MB/s]


In [9]:

bs = 64
encoder = VisualEncoding()
count_object = {}
for obj in encoder.classes:
    count_object[obj] = 0
id2img = {}
img2id = {}
img_id = 0
annotations = []
not_found = []
for key, sub_image_paths in tqdm(all_image_paths.items()):
    video_ids = sorted(all_image_paths.keys())
    video_encoded_bboxes, video_encoded_classes, video_encoded_numbers = [], [], []
    for i in tqdm(range(0, len(sub_image_paths), bs)):
        # Support batchsize inferencing
        image_paths = sub_image_paths[i:i+bs]
        results = model(image_paths, conf=0.7, device=device, verbose=False)  # return a list of Results objects
        for j, result in enumerate(image_paths):
            id2img[img_id] = f'{image_paths[j].split("/")[-2]}/{image_paths[j].split("/")[-1]}'
            img2id[f'{image_paths[j].split("/")[-2]}/{image_paths[j].split("/")[-1]}'] = img_id

        for j, result in enumerate(results):
            bboxes = result.boxes.xyxy.cpu().numpy().copy().astype(int)
            labels = result.boxes.cls.cpu().numpy().copy().astype(int)
            filtered_bboxes, filtered_labels = filter_overlapping_boxes(bboxes, labels)
            
            if len(filtered_labels) == 0:
                not_found.append(image_paths[j])
                continue
            id2img[img_id] = f'{image_paths[j].split("/")[-2]}/{image_paths[j].split("/")[-1]}'
            img2id[f'{image_paths[j].split("/")[-2]}/{image_paths[j].split("/")[-1]}'] = img_id
            
            for label in filtered_labels:
                name = encoder.classes[label]
                count_object[name] += 1
            annotate = []
            for obj in encoder.classes:
                if count_object[obj] > 0 and count_object[obj] < 10:
                    annotate.append({
                        "question": f"How many {obj} in image?",
                        "answer": count_object[obj]
                    })
                    annotate.append({
                        "question": f"Are there any {obj} in image?",
                        "answer": f"Yes"
                    })
                    if len(filtered_labels) == 1:
                        annotate.append({
                            "question": "What is the name of the object in the image?",
                            "answer": obj
                        })
                else:
                    if np.random.uniform(0, 1) < 0.01:
                        annotate.append({
                            "question": f"How many {obj} in image?",
                            "answer": 0
                        })
                    if np.random.uniform(0, 1) < 0.02:
                        annotate.append({
                            "question": f"Are there any {obj} in image?",
                            "answer": "No"
                        })
                count_object[obj] = 0
            annotations.append({
                "image_id": img_id,
                "annotations": annotate
            })
            img_id+=1
            
import pandas as pd

pd.DataFrame({"id": id2img.keys(), "img": id2img.values()}).to_csv("id2img.csv", index=False)
pd.DataFrame({"img": img2id.keys(), "id": img2id.values()}).to_csv("img2id.csv", index=False)
with open(f'id2image.json', 'w') as f:
    f.write(json.dumps(id2img))
with open(f'image2id.json', 'w') as f:
    f.write(json.dumps(img2id))
with open(f'annotations.json', 'w') as f:
    f.write(json.dumps(annotations))
with open(f'error_image.json', 'w') as f:
    f.write(json.dumps(not_found))

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]



  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

In [10]:
# !rm /kaggle/working/yolo11l.pt

In [11]:
pd.read_csv("/kaggle/working/id2img.csv")

Unnamed: 0,id,img
0,0,airplane/14712178869_d91526ac2d_b.jpg
1,1,airplane/222143315_1b5f110ccf.jpg
2,2,airplane/234542399_6ae38148c0.jpg
3,3,airplane/aircraft-delta-wing-stealth-bomber-ra...
4,4,airplane/aircraft-double-decker-airport-army-a...
...,...,...
23006,23006,zebra/pexels-photo-995514.jpeg
23007,23007,zebra/pexels-photo-9986766.jpeg
23008,23008,zebra/zebra-print-border-frame-design.jpg
23009,23009,zebra/zebra-s4M-printer-front-1.jpg


In [12]:
with open(f'image2id.json', 'r') as f:
    img2id = json.load(f)

In [13]:
len(img2id.keys())

27661

In [14]:
with open(f'annotations.json', 'r') as f:
    annotations = json.load(f)

In [15]:
len(annotations)

23011