<h2><b>Importing libraries</b></h2>

In [None]:
import base64
import getpass
import json
import os

import cv2
import numpy as np
from openai import OpenAI

<h2><b>Importing example image</b></h2>

In [None]:
# Load the textbook page that will be analysed by the model.
example_img = cv2.imread('example.png')

# cv2.imread reports failure by returning None instead of raising, so fail
# here with a clear message rather than with an AttributeError on `.shape`.
if example_img is None:
    raise FileNotFoundError("Could not read image 'example.png'")

# numpy image arrays are indexed (row, column), i.e. (height, width).
height, width = example_img.shape[:2]

<h2><b>Tool function & Tool description</b></h2>

In [None]:
# Tool (function-calling) schema advertised to the model for crop_image.
# The coordinates are declared as integers because they are used directly as
# array slice indices; fractional values cannot index pixels.
cropping_tool = {
    "type": "function",
    "function": {
        "name": "crop_image",
        "description": "Crops out part of the image in shape of rectangle, at chosen coordinates.",
        "parameters": {
            "type": "object",
            "properties": {
                "startX": {
                    "type": "integer",
                    "description": "x coordinate of the rectangle top-left point"
                },
                "endX": {
                    "type": "integer",
                    "description": "x coordinate of the rectangle bottom-right point"
                },
                "startY": {
                    "type": "integer",
                    "description": "y coordinate of the rectangle top-left point"
                },
                "endY": {
                    "type": "integer",
                    "description": "y coordinate of the rectangle bottom-right point"
                }
            },
            # Strict mode needs every property listed as required and no
            # additional properties allowed.
            "required": ["startX", "endX", "startY", "endY"],
            "additionalProperties": False
        },
        "strict": True
    }
}

In [None]:
def _clamped_span(start, end, limit):
    """Return (low, high) integer slice bounds ordered and clamped to [0, limit]."""
    low, high = sorted((int(round(start)), int(round(end))))
    return max(0, min(low, limit)), max(0, min(high, limit))


def crop_image(image, sets_of_coordinates):
    """Crop rectangular regions out of an image.

    Args:
        image: Image array indexed as ``image[y, x]`` and exposing ``.shape``
            (for example the numpy array returned by ``cv2.imread``).
        sets_of_coordinates: Iterable of ``[startX, endX, startY, endY]``
            sequences describing each rectangle in pixel coordinates.

    Returns:
        A list with one cropped region per coordinate set, in input order.
        A rectangle lying completely outside the image yields an empty array.
    """
    height, width = image.shape[:2]
    cropped_elements = []

    for coordinates in sets_of_coordinates:
        start_x, end_x, start_y, end_y = coordinates[:4]
        # Coordinates may arrive as floats, be negative, exceed the image or
        # have their corners swapped. Raw slicing would then raise a
        # TypeError, wrap around from the far edge, or return nothing, so the
        # bounds are normalised first.
        left, right = _clamped_span(start_x, end_x, width)
        top, bottom = _clamped_span(start_y, end_y, height)
        cropped_elements.append(image[top:bottom, left:right])

    return cropped_elements

<h2><b>Model setup</b></h2>

In [None]:
# Read the API key interactively instead of hard-coding it in the notebook.
key = getpass.getpass("Enter API key for OpenAI:")
client = OpenAI(api_key=key)

# Model name and token limit passed to the chat completion request below.
Model = "gpt-4o-mini"
Max_tokens = 500

<h2><b>Initial system message & two-shot examples</b></h2>

In [None]:
# Instructions given to the model before the few-shot examples.
System_message = {
    "role": "developer",
    "content": [
        {
            "type": "text",
            # Adjacent string literals are concatenated by Python. Backslash
            # continuations inside one literal would embed each line's
            # indentation in the prompt text.
            "text": (
                "You will be provided with an image of textbook page for learning Luxembourgish "
                "and the dimensions of the image (Width, Height). "
                "Your task is to analyze it, and decide if there are parts of the image "
                "(pictures, drawings, etc.) that can be useful during the learning process of Luxembourgish. "
                "If you notice the relevant part of the image, call the crop_image tool with the correct "
                "parameters (x and y positions of top-left and bottom-right points). "
                "In the response, explain your decision."
            )
        }
    ]
}

In [None]:
def _read_image(path):
    """Load an image with OpenCV, raising if the file cannot be read."""
    image = cv2.imread(path)
    # cv2.imread returns None on failure instead of raising.
    if image is None:
        raise FileNotFoundError(f"Could not read image '{path}'")
    return image


def _encode_jpeg(image):
    """Encode an image as JPEG and return the encoded buffer."""
    success, encoded = cv2.imencode(".jpg", image)
    if not success:
        raise ValueError("JPEG encoding of the image failed")
    return encoded


example_image1 = _read_image("example1.jpg")
example_image2 = _read_image("example2.jpg")

# Text sent alongside each example image.
# NOTE(review): presumably "<width> <height>" of the example images - confirm
# that these values match the actual files.
example_data1 = "498 736" 
example_data2 = "507 734"

buffer1 = _encode_jpeg(example_image1)
buffer2 = _encode_jpeg(example_image2)

example1_image_bytes = buffer1.tobytes()
example2_image_bytes = buffer2.tobytes()

# Base64 text is what gets embedded in the data URL of the image message.
example1_image_base64 = base64.b64encode(example1_image_bytes).decode("utf-8")
example2_image_base64 = base64.b64encode(example2_image_bytes).decode("utf-8")

In [None]:
def _example_user_message(encoded_image, dimensions_text):
    """Build a few-shot user turn: the page image followed by its dimensions text."""
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
    }
    text_part = {"type": "text", "text": dimensions_text}
    return {"role": "user", "content": [image_part, text_part]}


Example_message1 = _example_user_message(example1_image_base64, example_data1)
Example_message2 = _example_user_message(example2_image_base64, example_data2)

In [None]:
# Few-shot assistant turns. Each one explains the decision and carries the
# crop_image call in `tool_calls`, which is how an assistant-issued tool call
# is represented in the chat history.
Example_response1 = {
    "role": "assistant",
    "content": "The set of 4 consecutive images correspond to different birthday celebrations and are highly relevant to one of the exercises.",
    "tool_calls": [
        {
            "id": "call_example1",
            "type": "function",
            "function": {
                "name": "crop_image",
                "arguments": "{\"startX\": 74, \"endX\": 188, \"startY\": 98, \"endY\": 484}",
            },
        }
    ],
}
Example_response2 = {
    "role": "assistant",
    "content": "The image shows a man walking next to the bakery. The clock on the left indicates that the picture is relevant to the exercise.",
    "tool_calls": [
        {
            "id": "call_example2",
            "type": "function",
            "function": {
                "name": "crop_image",
                "arguments": "{\"startX\": 182, \"endX\": 258, \"startY\": 32, \"endY\": 134}",
            },
        }
    ],
}

# Tool results answering the calls above. A "tool" message must reference the
# id of the call it answers. The variable names are kept so that the message
# list built later in the notebook still resolves.
Example_function1 = {
    "role": "tool",
    "tool_call_id": "call_example1",
    "content": "Image cropped successfully.",
}
Example_function2 = {
    "role": "tool",
    "tool_call_id": "call_example2",
    "content": "Image cropped successfully.",
}

<h2><b>Encoding the input data for the GPT model</b></h2>

In [None]:
# Image dimensions as "<width> <height>", sent as the text part of the request.
data_string = " ".join((str(width), str(height)))

# Encode the image as JPEG, then as base64 text for embedding in a data URL.
_, buffer = cv2.imencode(".jpg", example_img)
image_bytes = buffer.tobytes()

image_base64 = str(base64.b64encode(image_bytes), "utf-8")

<h2><b>Calling the GPT model to analyze the image</b></h2>

In [None]:
# The actual request: the page image followed by its dimensions text.
User_message = {
    "role": "user",
    "content": [
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}},
        {"type": "text", "text": data_string},
    ],
}

# Conversation order: instructions, the two worked examples, then the new page.
response = client.chat.completions.create(
    model=Model,
    messages=[
        System_message,
        Example_message1,
        Example_response1,
        Example_function1,
        Example_message2,
        Example_response2,
        Example_function2,
        User_message,
    ],
    max_tokens=Max_tokens,
    tools=[cropping_tool],
    tool_choice="required",
)

# NOTE(review): with tool_choice="required" the textual content may be None
# when the model only returns tool calls - confirm.
print(response.choices[0].message.content)

<h2><b>Capturing the tool calls and saving the results</b></h2>

In [None]:
# Collect the rectangle requested by every tool call. `tool_calls` may be
# None, in which case nothing is cropped.
coordinate_sets = []

for tool_call in response.choices[0].message.tool_calls or []:
    args = json.loads(tool_call.function.arguments)  # Parsing the json arguments
    coordinate_sets.append([args["startX"], args["endX"], args["startY"], args["endY"]])

# Crop all rectangles in one call so that every tool call contributes a
# result; assigning inside the loop would keep only the last crop.
cropped_elements = crop_image(example_img, coordinate_sets)

In [None]:
# cv2.imwrite does not create missing directories, so make sure the output
# folder exists before writing.
output_dir = "cropped_images"
os.makedirs(output_dir, exist_ok=True)

for index, element in enumerate(cropped_elements, start=1):
    # An empty crop cannot be encoded as an image; report and skip it.
    if element.size == 0:
        print(f"Skipping crop {index}: the selected region is empty.")
        continue

    output_path = os.path.join(output_dir, f"cropped_image{index}.jpg")
    # imwrite signals failure through its return value rather than raising.
    if not cv2.imwrite(output_path, element):
        raise IOError(f"Could not write '{output_path}'")