In [None]:
import requests
from PIL import Image, ImageDraw, ImageFont
import torch
import numpy as np
import matplotlib.pyplot as plt

from transformers import OwlViTProcessor, OwlViTForObjectDetection

# Load the OWL-ViT processor and object-detection model from one shared checkpoint name.
OWLVIT_CHECKPOINT = "google/owlvit-base-patch32"
processor = OwlViTProcessor.from_pretrained(OWLVIT_CHECKPOINT)
model = OwlViTForObjectDetection.from_pretrained(OWLVIT_CHECKPOINT)



In [None]:
# =========================== Add your Token here ===========================
# Paste the API token after the '=' below; do not commit a filled-in token.
# NOTE(review): this sets API_TOKEN, while the OpenAI client cell further down
# reads LAS_API_TOKEN from the environment — confirm which name is intended.

%env API_TOKEN=

# ===========================================================================

In [54]:
import os
from openai import OpenAI
import base64
import requests
#from IPython.display import Image


# Build the OpenAI client used for the GPT-4o vehicle descriptions.
# The key is looked up under LAS_API_TOKEN first and then under API_TOKEN (the
# variable set by this notebook's token cell). If neither holds a non-empty
# value, None is passed so the SDK applies its own default lookup
# (the OPENAI_API_KEY environment variable) instead of sending an empty key.
client = OpenAI(
    api_key=os.environ.get("LAS_API_TOKEN") or os.environ.get("API_TOKEN") or None,
)

# Function to encode the image
def encode_image(image_path):
  """Return the contents of the file at ``image_path`` as a base64 string.

  The file is read in binary mode; the base64 bytes are decoded as UTF-8 so
  the result is a ``str``.
  """
  with open(image_path, "rb") as handle:
    raw_bytes = handle.read()
  encoded = base64.b64encode(raw_bytes)
  return encoded.decode('utf-8')

In [78]:
# Run OWL-ViT on one dashcam frame and collect the detections for the text queries.
jpg_file = "dashcam/rgb-images/sample_images/image1.jpg"
botsort_file = "dashcam/rgb-images/sample_images_oldModel/image1.jpg"
vehicle_image = Image.open(jpg_file)
texts = [["a photo of a car", "a photo of a motor bike"]]
inputs = processor(text=texts, images=vehicle_image, return_tensors="pt")

# Inference only: disable autograd so no gradient graph is built for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# Target image sizes (height, width) to rescale box predictions [batch_size, 2].
# PIL's `size` is (width, height), hence the reversal.
target_sizes = torch.Tensor([vehicle_image.size[::-1]])
# Turn the raw model outputs into per-image boxes, scores and label indices,
# keeping only predictions scoring above the threshold.
results = processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes)

i = 0  # Retrieve predictions for the first image for the corresponding text queries
text = texts[i]
boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]

In [84]:
# Convert the detector tensors to plain Python values, drop the dashboard
# detection, and ask GPT-4o for a short description of each remaining crop.
location_in_frame = []  # never filled in this cell; kept so the name stays defined

# Boxes whose bottom edge (y1) is at or beyond this pixel row are discarded.
# NOTE(review): 1200 looks specific to this camera's frame size — confirm
# before using other footage.
DASHBOARD_Y_LIMIT = 1200

boxes_new = [[round(loc, 2) for loc in box.tolist()] for box in boxes]
scores_new = [round(score.item(), 3) for score in scores]
# `labels` holds indices into the text queries of the first image.
labels_new = [text[label] for label in labels]

outputs = [
    {'boxes': box, 'scores': score, 'labels': label}
    for box, score, label in zip(boxes_new, scores_new, labels_new)
]

# Exclude the case where the detector picks up the dashboard, then order the
# remaining boxes by bottom edge, largest first (nearest the dashboard first).
# No cap is applied on the number of boxes: every kept box costs one GPT-4o call.
filtered_list = [d for d in outputs if d['boxes'][3] < DASHBOARD_Y_LIMIT]
sorted_list = sorted(filtered_list, key=lambda d: d['boxes'][3], reverse=True)

# Send each crop to GPT-4o and store the answer on the detection dict.
for detection in sorted_list:
    x0, y0, x1, y1 = detection['boxes']
    cropped_image = vehicle_image.crop((x0, y0, x1, y1))
    # The crop is written to disk because encode_image() reads from a file path;
    # the file is overwritten on every iteration.
    cropped_image.save('image_crop.jpg', quality=95)

    # Getting the base64 string
    base64_image = encode_image("image_crop.jpg")

    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "user", "content": [
                {"type": "text", "text": "what colour and what type of motor vehicle is this, four words or less"},
                {"type": "image_url", "image_url":
                 {"url": f"data:image/jpeg;base64,{base64_image}"}
                }
            ]}],
        model="gpt-4o",
    )
    detection['lmm_description'] = chat_completion.choices[0].message.content



In [85]:
# Print every kept detection dict (box, score, label, description), one per line.
for detection in sorted_list:
    print(detection)

{'boxes': [214.63, 765.04, 847.25, 1074.26], 'scores': 0.301, 'labels': 'a photo of a car', 'lmm_description': 'Green autorickshaw.'}
{'boxes': [1221.19, 811.12, 1473.65, 1016.53], 'scores': 0.217, 'labels': 'a photo of a car', 'lmm_description': 'Yellow sedan taxi'}
{'boxes': [862.17, 802.39, 1201.69, 1013.84], 'scores': 0.298, 'labels': 'a photo of a car', 'lmm_description': 'Yellow taxi cab.'}
{'boxes': [1221.7, 809.84, 1465.12, 992.51], 'scores': 0.149, 'labels': 'a photo of a car', 'lmm_description': 'Yellow sedan taxi'}
{'boxes': [1088.32, 803.21, 1205.94, 889.21], 'scores': 0.186, 'labels': 'a photo of a car', 'lmm_description': 'Brown SUV'}
{'boxes': [914.01, 801.21, 1161.29, 888.07], 'scores': 0.156, 'labels': 'a photo of a car', 'lmm_description': 'Yellow taxi cab.'}
{'boxes': [1173.08, 794.93, 1270.72, 874.43], 'scores': 0.107, 'labels': 'a photo of a car', 'lmm_description': 'Black sedan.'}
{'boxes': [1091.03, 792.67, 1196.95, 858.73], 'scores': 0.361, 'labels': 'a photo of

In [86]:
def plot_bounding_box(image, sorted_list):
    """Draw each detection's box and description on a copy of ``image`` and show it.

    Args:
        image: PIL image to annotate. It is left unmodified; drawing happens on
            a copy so the caller can still crop clean pixels from the original.
        sorted_list: iterable of dicts with a ``'boxes'`` entry
            ``[x0, y0, x1, y1]`` and, optionally, an ``'lmm_description'``
            string that is drawn at the box's top-left corner.

    Returns:
        None. The annotated image is displayed with matplotlib.
    """
    colour = (255, 188, 0)

    # Annotate a copy: ImageDraw writes into the image it is given.
    annotated = image.copy()
    draw = ImageDraw.Draw(annotated)

    # Load the label font once instead of once per box. Fall back to Pillow's
    # built-in font when "arial" cannot be found on this machine.
    try:
        font = ImageFont.truetype(font="arial", size=34)
    except OSError:
        font = ImageFont.load_default()

    for detection in sorted_list:
        x0, y0, x1, y1 = detection['boxes']
        draw.rectangle(((x0, y0), (x1, y1)), outline=colour, width=2)
        # The description may be absent (or None) if the GPT-4o step was skipped.
        label = detection.get('lmm_description') or ''
        if label:
            draw.text((x0, y0), label, colour, font=font)

    plt.figure(figsize=(36, 18))
    plt.imshow(np.array(annotated))
    plt.show()

In [None]:
# Draw the kept detections and their descriptions on the dashcam frame and display it
plot_bounding_box(vehicle_image,sorted_list)


In [None]:
# Display the image stored at `botsort_file` on a large figure.
image = Image.open(botsort_file)
fig, ax = plt.subplots(figsize=(36, 18))
ax.imshow(np.array(image))
plt.show()

In [None]:
#Three edge cases
#1 - unknown objects e.g. fuel tanks, tuk-tuks
# solved with this approach
#2 - temporary obfuscation
#3 - class confusion e.g. truck112,car937,truck114,car948
# solved with this approach