In [1]:
import base64
import mimetypes

from google.colab import userdata
from openai import OpenAI

In [2]:
def encode_image(image_path: str) -> str:
    """Read a file in binary mode and return its contents as base64 text.

    Args:
        image_path: Path of the file to read.

    Returns:
        The base64 encoding of the file's bytes, decoded to a ``str`` so it
        can be embedded in a data URL for the OpenAI API.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()

    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

def multimodal_agent(
    image_path: str,
    text_input: str,
    model: str = "gpt-4o-mini",
    max_tokens: int = 300,
) -> str:
    """
    Ask an OpenAI chat model a question about a local image.

    The image file is base64-encoded with ``encode_image``, wrapped in a
    ``data:`` URL, and sent together with the text prompt as a single user
    message through ``client.chat.completions.create``.

    Args:
        image_path: Path of the image file to send.
        text_input: Text prompt sent alongside the image.
        model: Name of the chat model to call. Defaults to "gpt-4o-mini";
            "gpt-4o" is the documented alternative.
        max_tokens: Value passed as ``max_tokens`` to the API call.

    Returns:
        The ``content`` of the first choice's message in the API response.

    Raises:
        OSError: If the image file cannot be opened for reading.
    """
    # Fetch API key from Colab secrets
    openai_api_key = userdata.get("OPEN_AI_KEY")
    client = OpenAI(api_key=openai_api_key)

    # Encode image
    image_b64 = encode_image(image_path)

    # Declare the MIME type that matches the file name instead of always
    # claiming JPEG, so PNG/WebP/GIF inputs are labelled correctly. Fall back
    # to the previous "image/jpeg" when the extension is unknown or not an
    # image type.
    guessed_type = mimetypes.guess_type(image_path)[0]
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/jpeg"
    data_url = f"data:{mime_type};base64,{image_b64}"

    # Send request
    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": text_input},
                    {
                        "type": "image_url",
                        "image_url": {"url": data_url}
                    }
                ]
            }
        ],
        max_tokens=max_tokens,
    )

    return response.choices[0].message.content


if __name__ == "__main__":
    # Demo run: describe a sample image stored in the Colab filesystem.
    user_text = "What is happening in this picture?"
    img_path = "/content/sample image.jpg"  # Replace with your image path

    output = multimodal_agent(image_path=img_path, text_input=user_text)
    print(f"Output: {output}")

Output: In the picture, a person is cleaning a hospital room. They are wearing blue scrubs and yellow gloves, using a mop to clean the floor. A hospital bed is in the room, along with a cleaning cart labeled "CAUTION." The setting appears to be a clinical environment, indicating an effort to maintain cleanliness and hygiene.
