<h2><b>Importing libraries</b></h2>

In [None]:
import base64
import getpass
import json

import chromadb
import cv2
from chromadb.utils.data_loaders import ImageLoader
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
from openai import OpenAI

<h2><b>Importing example image</b></h2>

In [None]:
# Load the example image from disk as an OpenCV array.
image = cv2.imread('example.png')

# cv2.imread does not raise on a missing or unreadable file; it returns None,
# which would otherwise surface on the next line as an opaque AttributeError.
if image is None:
    raise FileNotFoundError("Could not read image file 'example.png'")

# The first two axes of the array are rows (height) and columns (width).
height, width = image.shape[:2]

<h2><b>Initializing chromaDB collection</b></h2>

In [None]:
# Chroma client created with default settings.
chroma_client = chromadb.Client()

# CLIP embeds both the stored images and the text queries used for retrieval.
embedding_function = OpenCLIPEmbeddingFunction()
data_loader = ImageLoader()

# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when a collection with the same name already exists for the client.
collection = chroma_client.get_or_create_collection(
    name="image_collection",
    embedding_function=embedding_function,
    data_loader=data_loader,
)

<h2><b>Preparing images</b></h2>

In [None]:
images_num = 4

# Map each collection id ("id0", "id1", ...) to the relative path of its image
# ("images/image1.png", ...). Forward slashes are accepted on Windows as well,
# whereas the previous backslash-separated paths only resolved on Windows.
images_mapping = {
    f"id{i}": f"images/image{i + 1}.png" for i in range(images_num)
}

# Load the images in id order. The loop variable is deliberately not called
# `image`, so the example image already held in `image` is not overwritten.
images = []
for image_path in images_mapping.values():
    loaded_image = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when a file cannot be read;
    # fail here with the offending path rather than later inside collection.add.
    if loaded_image is None:
        raise FileNotFoundError(f"Could not read image file {image_path!r}")
    images.append(loaded_image)

<h2><b>Adding data to the collection</b></h2>

In [None]:
# Store one embedding per prepared image, all tagged with the same lesson
# metadata (chapter, topic and page are stored as strings).
#
# OpenCV decodes images in BGR channel order.
# NOTE(review): the CLIP embedding function is expected to interpret arrays as
# RGB (it goes through PIL), so the channels are swapped before embedding to
# avoid embedding colour-inverted pictures - confirm against the installed
# chromadb version.
collection.add(
    ids=[f"id{i}" for i in range(images_num)],
    images=[cv2.cvtColor(images[i], cv2.COLOR_BGR2RGB) for i in range(images_num)],
    metadatas=[
        {"chapter": "6", "topic": "Famill a Frenn", "page": "83"}
        for _ in range(images_num)
    ],
)

<h2><b>Model setup</b></h2>

In [None]:
# Model name and response-length budget passed to the completion request.
Max_tokens = 500
Model = "gpt-4o-mini"

# Ask for the API key interactively so it is never written into the notebook,
# then build the OpenAI client from it.
key = getpass.getpass("Enter API key for OpenAI:")
client = OpenAI(api_key=key)

<h2><b>Tool function & description</b></h2>

In [None]:
# Tool schema advertised to the model for the retrieve_image function.
# "strict": True together with "additionalProperties": False makes the model
# return exactly the four listed arguments.
retrieve_tool = {
    "type": "function",
    "function": {
        "name": "retrieve_image",
        "description": "Retrieve the image from the textbook, best matching given description.",
        "parameters": {
            "type": "object",
            "properties": {
                "image_description": {
                    "type": "string",
                    "description": "Text description of the image from the textbook."
                },
                # Chapter and page are whole numbers. "integer" (rather than
                # "number") stops the model from returning values such as 6.0,
                # which would not match the stored metadata once stringified.
                "chapter": {
                    "type": "integer",
                    "description": "Number of the chapter in which the image is placed."
                },
                "topic": {
                    "type": "string",
                    "description": "Name of the topic in which the image is placed."
                },
                "page": {
                    "type": "integer",
                    "description": "Number of the page in which the image is placed."
                }
            },
            "required": ["image_description", "chapter", "topic", "page"],
            "additionalProperties": False
        },
        "strict": True
    }
}

In [None]:
def retrieve_image(image_description, chapter, topic, page):
    """Return the stored image that best matches a text description.

    The collection is queried with ``image_description`` and the single
    closest match is looked up in ``images_mapping`` and read from disk.

    Args:
        image_description: Text used as the similarity query.
        chapter: Chapter value to match exactly against the metadata.
        topic: Topic value to match exactly against the metadata.
        page: Page value to match exactly against the metadata.

        A metadata argument that is ``None`` or an empty string is left out
        of the filter instead of being matched literally, so a caller that
        cannot name the topic still gets a result.

    Returns:
        The image as returned by ``cv2.imread``, or ``None`` when the query
        has no match, the matched id has no known path, or the file cannot
        be read.
    """
    # Build one equality condition per metadata field that was actually given.
    conditions = [
        {field: {"$eq": value}}
        for field, value in (("chapter", chapter), ("topic", topic), ("page", page))
        if value not in (None, "")
    ]

    # "$and" needs at least two operands, so a single condition is passed on
    # its own and no condition at all means "no metadata filter".
    if len(conditions) > 1:
        where = {"$and": conditions}
    elif conditions:
        where = conditions[0]
    else:
        where = None

    result_id = collection.query(
        query_texts=[image_description],
        n_results=1,
        where=where,
    )

    # "ids" holds one list of matches per query text; only one text is sent.
    matched_ids = result_id["ids"][0] if result_id["ids"] else []
    if not matched_ids:
        return None

    # An id without a known path must not reach cv2.imread, which rejects None.
    result_path = images_mapping.get(matched_ids[0])
    if result_path is None:
        return None

    return cv2.imread(result_path)

<h2><b>Encoding the input data for the GPT model</b></h2>

In [None]:
# A failed read leaves None in `image`; stop with a clear message instead of
# letting the encoder fail with a low-level OpenCV error.
if image is None:
    raise ValueError("No image is loaded in `image`; re-run the image loading cell first")

# JPEG-encode the image in memory; imencode reports failure through its first
# return value rather than by raising, so that flag has to be checked.
encoded_ok, buffer = cv2.imencode(".jpg", image)
if not encoded_ok:
    raise RuntimeError("OpenCV could not encode the image as JPEG")

image_bytes = buffer.tobytes()

# Base64 text form, ready to be embedded in a data URL.
image_base64 = base64.b64encode(image_bytes).decode("utf-8")

<h2><b>Initial system message & one-shot example</b></h2>

In [None]:
# Instructions given to the model before the conversation starts.
System_message = {
    "role": "developer",
    "content": [
        {
            "type": "text",
            # Adjacent string literals are joined by the parser. The previous
            # backslash continuations inside one literal kept each following
            # line's indentation, leaving long runs of spaces in the prompt.
            "text": (
                "You're the tutor chatbot, which carries on a conversation with a human student about the lesson about the topic \"Famill a Frenn\", on the page 83, from the chapter 6 of the Luxembourgish textbook. "
                "Your task is to provide the best education to the human student. "
                "If you decide that during the conversation the picture from the book would be beneficial to the student's learning process, use the retrieve_image tool and describe the image you want to retrieve from it. "
                "You'll be provided with the image of the currently discussed page from the textbook for learning Luxembourgish."
            )
        }
    ]
}

In [None]:
# Image attached to the one-shot example question.
example_image = cv2.imread("example.jpg")

# cv2.imread returns None instead of raising when the file cannot be read.
if example_image is None:
    raise FileNotFoundError("Could not read image file 'example.jpg'")

# imencode reports failure through its first return value, not by raising.
encoded_ok, buffer = cv2.imencode(".jpg", example_image)
if not encoded_ok:
    raise RuntimeError("OpenCV could not encode the example image as JPEG")

example_image_bytes = buffer.tobytes()

# Base64 text form, ready to be embedded in a data URL.
example_image_base64 = base64.b64encode(example_image_bytes).decode("utf-8")

In [None]:
# One-shot example, user turn: a question about exercise three together with
# the example image, passed inline as a base64 JPEG data URL.
# NOTE: the backslash continuation inside the text literal keeps the next
# line's leading spaces as part of the prompt text.
Example_message = dict(
    role="user",
    content=[
        dict(
            type="text",
            text="I am currently working on the third exercise \"Wie schreift d'Annonce.\", but I have a problem with determining what kind of announcement does announcement number 3 represent.\
                     Provide me with both textual explanation and the image I am describing.",
        ),
        dict(
            type="image_url",
            image_url=dict(url="data:image/jpeg;base64," + example_image_base64),
        ),
    ],
)

In [None]:
# One-shot example, assistant turn: the textual answer plus the retrieve_image
# call the model is expected to make alongside it. The call is expressed as a
# real "tool_calls" entry so the example has the same shape as the responses
# read later from message.tool_calls. The example arguments carry the lesson's
# actual topic, because an empty topic can never match the stored metadata.
Example_response = {
    "role": "assistant",
    "content": "In announcement number 3, the text reads:\
                \"Léif Bomi Christine!\
                Mir gratuléieren dir ganz häerzlech fir deng 70 Joer a wënschen dir alles Guddes! Deng Kanner, Enkelkanner an déi ganz Famill.\"\
                Explanation:\
                The phrase \"70 Joer\" indicates that it is a birthday announcement.\
                The term \"Bomi\" (which means grandmother) shows that the announcement is dedicated to an elderly family member.\
                The senders of the message are \"Deng Kanner, Enkelkanner an déi ganz Famill\" (your children, grandchildren, and the whole family), which confirms that this is a family celebration.\
                Answer to the Exercise:\
                For Exercise 3 (\"Wie schreift d'Annonce?\"), you need to match the announcement to its sender. Based on the text, the correct answer is:\
                \"dem Christine seng Kanner an Enkelkanner\" (Christine's children and grandchildren).\
                Let me know if you need further clarification!",
    "tool_calls": [
        {
            "id": "call_example_retrieve_image",
            "type": "function",
            "function": {
                "name": "retrieve_image",
                "arguments": "{\"image_description\": \"An elderly woman with short white hair and glasses, celebrating her 70th birthday.\", \"chapter\": 6, \"topic\": \"Famill a Frenn\", \"page\": 83}"
            }
        }
    ]
}

# One-shot example, tool turn: the result that answers the call above. A tool
# message must reference the id of the call it responds to. The name
# Example_function is kept because the request's message list refers to it.
Example_function = {
    "role": "tool",
    "tool_call_id": "call_example_retrieve_image",
    "content": "Image retrieved and shown to the student."
}

<h2><b>Calling the GPT model to answer the student's question</b></h2>

In [None]:
# The student's actual question: text about exercise two plus the page image,
# passed inline as a base64 JPEG data URL.
# NOTE: the backslash continuation inside the text literal keeps the next
# line's leading spaces as part of the prompt text.
User_message = dict(
    role="user",
    content=[
        dict(
            type="text",
            text="I am currently working on the second exercise \"Wat feire si? Kraizt un.\", but I have a problem with determining what kind of announcement does announcement number 4 represent.\
                     Provide me with both textual explanation and the image I am describing.",
        ),
        dict(
            type="image_url",
            image_url=dict(url="data:image/jpeg;base64," + image_base64),
        ),
    ],
)

# Conversation sent to the model: instructions, the one-shot example
# (question, answer, function message) and finally the real question.
conversation = [
    System_message,
    Example_message,
    Example_response,
    Example_function,
    User_message,
]

# Single completion request with the retrieve_image tool made available.
response = client.chat.completions.create(
    model=Model,
    messages=conversation,
    max_tokens=Max_tokens,
    tools=[retrieve_tool],
)

# Show the textual part of the model's answer.
print(response.choices[0].message.content)

<h2><b>Capturing the tool calls and displaying the results</b></h2>

In [None]:
# Tool calls requested by the model in its answer, if any.
tool_calls = response.choices[0].message.tool_calls

# Truthiness covers both "no tool calls" (None) and an empty list, which would
# otherwise raise IndexError on the [0] below.
if tool_calls:
    # Only the first requested call is handled.
    tool_call = tool_calls[0]

    # The arguments arrive as a JSON-encoded string.
    args = json.loads(tool_call.function.arguments)

    # The collection stores chapter and page as integer-like strings ("6",
    # "83"). Going through int() first makes a float such as 6.0 become "6"
    # rather than "6.0", which would never match the metadata filter.
    image = retrieve_image(
        args["image_description"],
        str(int(args["chapter"])),
        args["topic"],
        str(int(args["page"])),
    )

    if image is not None:
        cv2.imshow("retrieved_image", image)

        # Block until a key is pressed, then close the preview window.
        cv2.waitKey(0)
        cv2.destroyAllWindows()

In [None]:
# Manual check of the retrieval function with hand-written arguments,
# independent of the model's answer.
image = retrieve_image(
    "An elderly woman with short white hair and glasses, celebrating her 70th birthday.",
    "6",
    "Famill a Frenn",
    "83",
)

# Show the match, if there is one, until a key is pressed.
if image is not None:
    cv2.imshow("retrieved_image", image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()