# Structured JSON Output from Image Analysis
Run this within the virtual environment **(env_ollama)**!

In [1]:
!which python

/home/matthias/Desktop/MachineLearning/Ollama_Udemy/env_ollama/bin/python


This notebook uses **ChatGPT's improved version** of the code, because the original course version didn't work as expected.

In [2]:
from ollama import chat
from pydantic import BaseModel, Field
import json
from typing import List

# Schema for the structured answer requested from the vision model.
# Its JSON schema (model_json_schema()) is passed to the chat call as `format`,
# and the model's reply is validated against this class.
# NOTE: documented with comments rather than a docstring on purpose: pydantic
# copies a class docstring into the generated schema's "description", which
# would change what is sent to the model.
class ImageDescription(BaseModel):
    summary: str  # required; the prompt asks for at most one sentence
    scene: str  # required; the prompt asks for at most two sentences
    # Optional in the payload; defaults to a fresh empty list when missing.
    colors: List[str] = Field(default_factory=list)

path = "/home/matthias/Desktop/MachineLearning/Ollama_Udemy/Images/paris.jpg"

def call_ollama(num_predict: int) -> str:
    """Ask the vision model to describe the image at ``path`` as JSON.

    Sends a single user message with the image attached and requests output
    constrained to the JSON schema of ``ImageDescription``.

    Args:
        num_predict: Value forwarded as the ``num_predict`` option, i.e. the
            output budget the model may use for its reply.

    Returns:
        The reply text with surrounding whitespace stripped, or an empty
        string when the response carries no content.
    """
    resp = chat(
        model="llama3.2-vision",
        # Constrain the reply to the pydantic model's JSON schema.
        format=ImageDescription.model_json_schema(),
        messages=[
            {
                "role": "user",
                "content": (
                    "Return ONLY a single JSON object matching the schema.\n"
                    "Be concise to fit the output limit:\n"
                    "- summary: max 1 sentence\n"
                    "- scene: max 2 sentences\n"
                    "- colors: 3 to 8 simple color words\n"
                    "No markdown, no commentary."
                ),
                "images": [path],
            }
        ],
        options={
            "temperature": 0,
            "num_predict": num_predict,
        },
    )
    # The message content is optional in the client's response type; treat a
    # missing reply as empty text so the caller gets a parse failure instead
    # of an AttributeError from calling .strip() on None.
    return (resp.message.content or "").strip()

def parse_json_or_raise(raw: str) -> ImageDescription:
    """Turn the model's raw reply into a validated ``ImageDescription``.

    The reply is first parsed as-is. If that fails, the text between the
    first ``{`` and the last ``}`` is parsed instead, which tolerates stray
    text around the JSON object.

    Args:
        raw: Text returned by the model.

    Returns:
        The validated ``ImageDescription``.

    Raises:
        ValueError: If ``raw`` contains no ``{...}`` span at all. Errors from
            parsing or validating an extracted span (``json.JSONDecodeError``,
            pydantic ``ValidationError``) propagate unchanged; both are
            ``ValueError`` subclasses.
    """
    # 1) Try direct JSON parse
    try:
        return ImageDescription.model_validate_json(raw)
    except ValueError:
        # pydantic's ValidationError derives from ValueError, so this covers
        # both malformed JSON and schema mismatches while letting unrelated
        # bugs (e.g. a non-string argument) surface instead of being hidden.
        pass
    # 2) Try extracting the largest {...} block
    start = raw.find("{")
    end = raw.rfind("}")
    if start != -1 and end != -1 and end > start:
        candidate = raw[start:end+1]
        return ImageDescription.model_validate(json.loads(candidate))
    # 3) Nothing usable
    raise ValueError(f"Model did not return valid JSON. First 300 chars:\n{raw[:300]}")

# Try with increasing budgets: presumably a reply cut off by the output limit
# is not valid JSON, so a larger budget may succeed where a smaller one failed.
image_description = None
last_raw = None
last_error = None
for budget in (1024, 2048, 4096):
    last_raw = call_ollama(num_predict=budget)
    try:
        image_description = parse_json_or_raise(last_raw)
    except ValueError as err:
        # Parse/validation failures (json.JSONDecodeError and pydantic's
        # ValidationError are ValueError subclasses) trigger a retry; any
        # other exception is a real bug and should not cost more model calls.
        last_error = err
    else:
        break

if image_description is None:
    # If we still fail, show what we got to debug, and keep the last parse
    # error attached as the cause of the failure.
    raise RuntimeError(
        "Could not parse valid JSON from the model even after retries.\n"
        f"Last raw output (first 500 chars):\n{last_raw[:500]}"
    ) from last_error

print(image_description.model_dump_json(indent=2))

{
  "summary": "A breathtaking sunset scene with the iconic Eiffel Tower in the background, surrounded by a vibrant cityscape.",
  "scene": "The Eiffel Tower stands tall amidst a bustling city, with a serene river flowing through the foreground. The sky is painted with hues of orange and pink, as the sun sets behind the tower.",
  "colors": [
    "orange",
    "pink",
    "blue",
    "yellow",
    "green",
    "brown",
    "gray",
    "white"
  ]
}


![Paris](../../Images/paris.jpg)

$\checkmark$