In [1]:
import asyncio
import base64
import os
from io import BytesIO

import httpx
import orjson
import polars as pl
from dotenv import load_dotenv
from PIL import Image

load_dotenv()

True

In [2]:
assert os.getenv("OPENROUTER_API_KEY"), "OpenRouter API key is not defined in .env."

API_URL = "https://openrouter.ai/api/v1/chat/completions"

headers = {
    "Authorization": f"Bearer {os.getenv('OPENROUTER_API_KEY')}",
    "Content-Type": "application/json",
}

Test Barack Obama. [Image source](https://openverse.org/image/0d5242d2-8838-47a0-88ab-a3ab59a5f75f?q=barack+obama&p=1)


In [3]:
img = Image.open("470562794_2472fada41_b.jpg")
img.size

(1024, 768)

Resize image such that its maximum width/height is 768px, since that's what Gemini and other LLM APIs are based upon.


In [4]:
def resize_image_maintain_aspect(img, max_size=768):
    """
    Resize an image so that its maximum dimension (width or height) is max_size
    while maintaining the aspect ratio.
    """

    # Get current dimensions
    width, height = img.size

    # Calculate the scaling factor
    if width > height:
        # Width is the larger dimension
        scale_factor = max_size / width
    else:
        # Height is the larger dimension
        scale_factor = max_size / height

    # Calculate new dimensions
    new_width = int(width * scale_factor)
    new_height = int(height * scale_factor)

    # Resize the image
    resized_img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)

    # resized_img.save("test.png")
    return resized_img


def img_to_base64_str(img):
    img = resize_image_maintain_aspect(img)

    buffered = BytesIO()
    img.save(buffered, format="PNG")
    img_base64 = base64.b64encode(buffered.getvalue())
    img_base64_str = img_base64.decode("utf-8")
    return img_base64_str


In [5]:
img_base64_str = img_to_base64_str(img)
img_base64_str[0:100]

'iVBORw0KGgoAAAANSUhEUgAAAwAAAAJACAIAAAC1zJYBAAEAAElEQVR4nHz9+a9tW3odhs1+rbX3ae69771qXrGqyCJVYqkXTVES'

In [None]:
system = """
Identify every notable person in the image the user provides. Your response should only contain the names of the people in order from left to right based on their relative positions in the image.
"""

model_list = [
    {"GPT-4.1": "openai/gpt-4.1"},
    {"Claude Sonnet 4": "anthropic/claude-sonnet-4"},
    {"Gemini 2.5 Flash": "google/gemini-2.5-flash"},
    # {"Grok 4": "x-ai/grok-4"},  # not liking image input
    {"Llama 4 Scout": "meta-llama/llama-4-scout"},
    {"Mistral Small 3.2": "mistralai/mistral-small-3.2-24b-instruct"},
    {"Qwen 2.5-VL": "qwen/qwen2.5-vl-72b-instruct"},
    # {"MiniMax - 01": "minimax/minimax-01"},
    # {"GLM 4.1V": "thudm/glm-4.1v-9b-thinking"},  # not liking image input
    # {"Amazon Nova Pro": "amazon/nova-pro-v1"},
]

In [51]:
async def query_image_async(model_kv, client, system, img_base64_str):
    model_name, model_openrouter = list(model_kv.items())[0]
    params = {
        "model": model_openrouter,
        "messages": [
            {"role": "system", "content": system.strip()},
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{img_base64_str}"},
                    }
                ],
            },
        ],
        # specifying provider is needed to ensure nonquantized models
        "provider": {
            "order": ["novita", "openai", "anthropic", "mistral", "google-ai-studio"],
            "allow_fallbacks": False,
        },
        "temperature": 0.0,  # for reproducibility (given same provider)
        "seed": 42,  # for reproducibility (given same provider)
        "max_tokens": 1000,  # for sanity
    }

    r = await client.post(
        url=API_URL, headers=headers, data=orjson.dumps(params), timeout=60.0
    )
    try:
        return {
            "model": model_name,
            "response": r.json()["choices"][0]["message"]["content"],
        }
    except Exception as e:
        print(r.json())
        return {
            "model": model_name,
            "response": "<ERROR>",
        }


async def query_models_async(model_list, client, system, img_base64_str):
    queries = [
        query_image_async(model, client, system, img_base64_str) for model in model_list
    ]

    results = await asyncio.gather(*queries)
    return results

In [43]:
client = httpx.AsyncClient()

results = await query_models_async(model_list, client, system, img_base64_str)

with pl.Config() as cfg:
    cfg.set_tbl_formatting("ASCII_MARKDOWN")
    cfg.set_fmt_str_lengths(10**5)
    cfg.set_tbl_width_chars(-1)
    cfg.set_tbl_hide_column_data_types(True)
    cfg.set_tbl_hide_dataframe_shape(True)

    print(pl.from_dicts(results))

| model            | response                                                                                                                                                                                                                            |
|------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GPT-4.1          | Sorry, I can't help with that.                                                                                                                                                                                                      |
| Claude Sonnet 4  | I can see a person speaking in what appears to be a library or bookstore setting, with an audience seated and listening. However, I cannot identify who the specific individuals are in this image based on their appearance alone

Functionalize to test more images.


In [52]:
async def test_llms(img_path):
    img = Image.open(img_path)
    img = img_to_base64_str(img)

    results = await query_models_async(model_list, client, system, img)

    with pl.Config() as cfg:
        cfg.set_tbl_formatting("ASCII_MARKDOWN")
        cfg.set_fmt_str_lengths(10**5)
        cfg.set_tbl_width_chars(-1)
        cfg.set_tbl_hide_column_data_types(True)
        cfg.set_tbl_hide_dataframe_shape(True)

        print(pl.from_dicts(results))

Priscilla Chan and Mark Zuckerberg: https://openverse.org/image/622d49f2-5a3b-417c-b044-180589557688?q=mark+zuckerberg&p=8


In [56]:
await test_llms("8827232234_bfeab50afb_b.jpg")

| model            | response                                                                                                                                                                                                                                       |
|------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GPT-4.1          | Sorry, I can't help with that.                                                                                                                                                                                                                 |
| Claude Sonnet 4  | I can see two people walking together in the foreground of this street scene, but I cannot identify who they are based on their appearance. The image shows them walking on a cobblestone street 

In [54]:
await test_llms("profpic.webp")

| model            | response                                                                                                                                                                                                         |
|------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GPT-4.1          | Sorry, I can't identify this person.                                                                                                                                                                             |
| Claude Sonnet 4  | I can see one person in this image - a young man wearing a gray North Face jacket with autumn foliage in the background. However, I cannot identify who this specific person is based on their appearance alone. |
| Gemini 2.5 Flash | There are no notable people present in this image. 

Fantastic Four poster: https://press.disney.co.uk/news/marvel-studios-the-fantastic-four-first-steps-all-new-trailer-&-poster-now-available


In [59]:
await test_llms("12_blue_teaser2_4x5_ig_2609a9ad.jpeg")

| model            | response                                                                                                                                                                                                                                                                                                                                         |
|------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| GPT-4.1          | Sorry, I can't help with that.                                                                                                                                                                                                                                     