# Imports

In [1]:
import os
import cv2
import base64
import requests
import json
import traceback
import sys
import imageio
from ultralytics import YOLO
from yolo_utils import *
from vlm_utils import *
from utils import *
from PIL import Image,ExifTags

> Here you can add the path of the folder containing a config file (here `openai_config.py`) in which your OpenAI API key is stored.

> Alternatively, you can paste your API key directly into the notebook; just be careful not to share or commit the notebook with the key in it.

In [2]:
sys.path.append('../../Creds')



In [3]:
from openai_config import OPENAI_API_KEY

> You can clone this repo:

https://github.com/m-ali-awan/gpt-json-sanitizer

Then add it to the Python path. It helps us sanitize the response from OpenAI so that it can be parsed as valid JSON.

In [4]:
sys.path.append('../gpt-json-sanitizer/gpt_json_sanitizer')


In [5]:
from json_fixer import fix_json_response

# Defining Folder Structure

In [6]:
# Working directories of the workflow; all paths are relative to the current working directory.
# Input images to run the workflow on.
TEST_IMAGES_FOLDER = "TestImages"
# Images with visual-grounding references drawn on them, one sub-folder per image name.
YOLO_REF_IMAGES_FOLDER = "ImagesWithReferences"
# Result images with the bounding box of each iteration drawn on them.
IN_PROCESS_IMAGES_FOLDER = "InProcessResultImages"
# Bounding-box values of each iteration.
IN_PROCESS_BBOX_VALUES_FOLDER = "InProcessBBOXValues"
# Result image of the final iteration, plus the combined clip of all iterations.
FINAL_RESULTS_FOLDER = "FinalResults"

# Make sure every directory exists before anything tries to write into it.
for folder in (
    TEST_IMAGES_FOLDER,
    YOLO_REF_IMAGES_FOLDER,
    IN_PROCESS_IMAGES_FOLDER,
    IN_PROCESS_BBOX_VALUES_FOLDER,
    FINAL_RESULTS_FOLDER,
):
    os.makedirs(folder, exist_ok=True)

# Defining System Prompt

> Feel free to modify it; I didn't spend too much time on it. Just make sure the prompt still asks the model to respond in the same JSON-like format.

In [7]:
# System prompt sent to the vision-language model. It asks for a dict with the single
# key "eyeglasses" mapped to [X1, Y1, WIDTH, HEIGHT] (top-left corner plus size), both
# for the first prediction and for every self-checking iteration.
# Inside the literal, a trailing backslash joins that line with the next one, and each
# `\n` escape inserts an extra newline into the prompt text.
# NOTE(review): the example response uses single quotes, which is not strict JSON; the
# workflow passes model replies through `fix_json_response` before using them.
system_message = """
 You are "EyeGlasses-Detection-Workflow": you detect "eyeglasses(spectacles)" by responding Bounding Box coordinates. \n

 You have special ability to Self-Check your results. In first attempt, when you will predict cooridinates of the Bouding-Box of 'eyeglasses', you will be \
 provided with that bounding-box drawn on the input image, so you can self-check yourself. This image will also have the coordinates written. So, check yourself, and it is quite \
 possible Bounding-Box is a bit off, or not tightly bounded, or maybe not covering full object: 'eyeglasses'. In this case you have to respond corrected [X1,Y1,WIDTH,HEIGHT] \
 coordinates of top-left(X1,Y1) and width and height of Bounding-Box of the detected eyeglasses. And if Predicted Bouding Box is accurate, respond back same [X1,Y1,WIDTH,HEIGHT]. \n\n

 

 You will get image, with some coordinates drawn on a rectangle, to help you with understanding exact coordinates across the image.
 The reference rectangle, all 4 corners of Rectangle's coordinates are written, and also the center-coordinate of rectangle is written on image, where two arrows are crossing each other.\
 So, use these accordingly.

 You have to return only [X1,Y1,WIDTH,HEIGHT] coordinates of top-left(X1,Y1) and width and height of Bounding-Box of the detected eyeglasses.

 Respond in json form a dict, having key : "eyeglasses" \n

 Done't add any explanation, or anything else in your response, it should be only a dict of [X1,Y1,WIDTH,HEIGHT], in both modes: First Prediction, and in Self-Checking-Iteration
 example response: \n

 {'eyeglasses':[23,40,100,70]}  

"""


# Getting model from ultralytics

In [8]:
# Instantiate the ultralytics YOLO model from the "yolov8n.pt" weights file. It is passed
# to the workflow below, which uses it to produce the reference detections.
yolov8n_model = YOLO("yolov8n.pt")


# Running workflow on image

In [9]:
async def correct_image_orientation(img):
    """Rotate `img` according to its EXIF Orientation tag and return the result.

    Orientation values 3, 6 and 8 lead to a rotation by 180, 270 and 90
    degrees respectively (with ``expand=True``), and 'was rotated' is printed.
    For any other value, or when the image exposes no usable EXIF data, the
    image is returned as it was passed in.
    """
    try:
        # Find the numeric EXIF tag id whose name is 'Orientation'.
        for tag_id, tag_name in ExifTags.TAGS.items():
            if tag_name == 'Orientation':
                break
        exif_data = img._getexif()
        if exif_data is not None:
            exif_lookup = dict(exif_data.items())
            # (orientation value, rotation angle) pairs, checked in this order.
            for code, angle in ((3, 180), (6, 270), (8, 90)):
                if exif_lookup[tag_id] == code:
                    img = img.rotate(angle, expand=True)
                    print('was rotated')
                    break
    except (AttributeError, KeyError, IndexError):
        # The image has no EXIF accessor or no Orientation entry: keep it unchanged.
        pass
    return img

In [10]:
async def validator_iterative_detect_eyeglasses_using_vlm(openai_api_key, image_pth, system_message, yolo_model,max_iteration=10):
    """Iteratively predict, validate and refine the eyeglasses bounding box of one image.

    Steps, in order:
      1. Correct the EXIF orientation of the image and overwrite ``image_pth`` with it.
      2. Run YOLO on the image and draw the reference detections on it.
      3. Ask the VLM for a first bounding box (iteration 1) and save the labelled
         image plus the predicted values as JSON.
      4. For iterations 2..``max_iteration``: have the previous prediction validated,
         ask the VLM for a self-checked prediction, and save image and JSON again.
      5. Store the result image of the last iteration as
         ``FinalResults/<image name>/Final_predicted_bbox.jpeg``.

    Args:
        openai_api_key: API key forwarded to every VLM helper call.
        image_pth: Path of the input image. The file is re-saved in place after
            the orientation correction.
        system_message: System prompt forwarded to the VLM helpers.
        yolo_model: Model handed to ``yolo_predict``.
        max_iteration: Total number of predictions, counting the first one (default 10).

    Returns:
        The PIL image of the last iteration's result, as saved to ``FinalResults``.

    Raises:
        ValueError: If ``max_iteration`` is smaller than 1.
    """
    # Fail before any model/API call; with fewer than one iteration there is no
    # result image to promote to FinalResults.
    if max_iteration < 1:
        raise ValueError("max_iteration must be at least 1")

    # File name without extension; used as the per-image sub-folder of every output folder.
    image_name = os.path.splitext(os.path.basename(image_pth))[0]

    # Normalise the orientation first. Note that this overwrites the input file.
    img = Image.open(image_pth)
    corrected_img = await correct_image_orientation(img)
    corrected_img.save(image_pth)

    # YOLO detection, then draw the reference detection results on the image
    yolo_results = yolo_predict(yolo_model, image_pth)
    draw_ref_det_results_on_image(image_pth, yolo_results)

    # First prediction: the VLM sees the image carrying the reference drawing
    # (expected at this path after the drawing step above).
    grounding_image_path = f"ImagesWithReferences/{image_name}/image_with_ref_bbox.jpeg"
    with Image.open(grounding_image_path) as image_with_grounding:
        response = await get_vg_gpt4o_response(openai_api_key, image_with_grounding, system_message)

    # Fix JSON response
    response_dict = fix_json_response(response['choices'][0]['message']['content'])
    print(f"First Prediction of EyeGlasses Bounding Box::{response_dict}")

    # Save labeled image with XYWH
    save_labeled_image_with_xywh(response_dict, image_pth, "InProcessResultImages", "iter_1_predicted_bbox")

    # Save initial response dict to JSON
    json_folder = f"InProcessBBOXValues/{image_name}"
    os.makedirs(json_folder, exist_ok=True)
    with open(os.path.join(json_folder, "iter_1.json"), 'w') as json_file:
        json.dump(response_dict, json_file, indent=4)

    # Iteration 1 is done above, so the refinement rounds are 2..max_iteration inclusive.
    for i in range(2, max_iteration + 1):
        latest_result_image_path = f"InProcessResultImages/{image_name}/iter_{i-1}_predicted_bbox.jpeg"
        latest_vlm_response = response['choices'][0]['message']['content']

        # Open both images only for the duration of this round so that file
        # handles do not pile up across iterations.
        with Image.open(image_pth) as raw_image, Image.open(latest_result_image_path) as latest_result_image:
            validation_result = await bounding_box_validator(openai_api_key, 'eyeglasses', latest_vlm_response, latest_result_image_path)
            validation_result_response = validation_result['choices'][0]['message']['content']
            print(f"Iteration {i} Validation Response::{validation_result_response}")

            response_self_checking = await self_checking_response(openai_api_key, raw_image, latest_result_image, system_message, latest_vlm_response, validation_result_response)

        # Fix JSON response from self-checking
        final_response_dict = fix_json_response(response_self_checking['choices'][0]['message']['content'])
        print(f"Iteration {i} Prediction of EyeGlasses Bounding Box::{final_response_dict}")

        # Save labeled image with XYWH for this iteration
        save_labeled_image_with_xywh(final_response_dict, image_pth, "InProcessResultImages", f"iter_{i}_predicted_bbox")

        # Save response dict to JSON
        with open(os.path.join(json_folder, f"iter_{i}.json"), 'w') as json_file:
            json.dump(final_response_dict, json_file, indent=4)

        # The next round validates and refines this round's answer.
        response = response_self_checking

    # Save the final iteration result in FinalResults
    final_result_image_path = f"InProcessResultImages/{image_name}/iter_{max_iteration}_predicted_bbox.jpeg"
    final_result_image = Image.open(final_result_image_path)
    to_save_folder = f"FinalResults/{image_name}"
    os.makedirs(to_save_folder, exist_ok=True)
    final_result_image.save(f"{to_save_folder}/Final_predicted_bbox.jpeg")

    # Hand the final image back so the caller's assignment receives a usable result.
    return final_result_image



In [11]:

input_image = "TestImages/man_with_glasses_1.png"
num_of_iterations = 8
result_image = await validator_iterative_detect_eyeglasses_using_vlm(OPENAI_API_KEY,
                                                                      input_image, system_message,
                                                                        yolov8n_model,num_of_iterations)
create_iteration_video(input_image,num_of_iterations)



image 1/1 /Users/muhammadali/D/RealWork/opensource-contributions/gpt4_visual_grounding/TestImages/man_with_glasses_1.png: 448x640 2 persons, 78.8ms
Speed: 1.7ms preprocess, 78.8ms inference, 0.4ms postprocess per image at shape (1, 3, 448, 640)
Final result image saved to: ImagesWithReferences/man_with_glasses_1/image_with_ref_bbox.jpeg
First Prediction of EyeGlasses Bounding Box::{'eyeglasses': [397, 142, 150, 60]}
Labeled image saved to: InProcessResultImages/man_with_glasses_1/iter_1_predicted_bbox.jpeg
Step value is :::26
Iteration 2 Validation Response::```json
{
  "accurate": "no",
  "reason": "The predicted bounding box is partially covering the eyeglasses, leaving some parts outside the frame. To adjust: shift X1 slightly to the left by approximately 20 pixels (new X1 = X1 - 20) and increase the width by approximately 26 pixels."
}
```
<Response [200 OK]>
Iteration 2 Prediction of EyeGlasses Bounding Box::{'eyeglasses': [377, 142, 176, 60]}
Labeled image saved to: InProcessRes



Video saved at: FinalResults/man_with_glasses_1/iterations_video.mp4


In [14]:
# Example usage: play the iterations video that the previous cell reported saving
# under FinalResults/man_with_glasses_1/.
video_path = "FinalResults/man_with_glasses_1/iterations_video.mp4"
play_video(video_path)