In [2]:
import os
from azure.ai.vision.imageanalysis import ImageAnalysisClient
from azure.core.credentials import AzureKeyCredential

In [3]:
# Connection settings for Azure AI Vision come from the environment so that
# no secret is stored in the notebook itself.
VISION_ENDPOINT = os.getenv("AZURE_VISION_ENDPOINT")
VISION_API_KEY = os.getenv("AZURE_VISION_KEY")

# os.getenv returns None for an unset variable; stop here with a message that
# names the variables instead of passing None into the client constructors.
if not VISION_ENDPOINT or not VISION_API_KEY:
    raise RuntimeError(
        "Set the AZURE_VISION_ENDPOINT and AZURE_VISION_KEY environment "
        "variables before creating the Image Analysis client."
    )

# Shared client used by the analysis cells below.
client = ImageAnalysisClient(
    endpoint=VISION_ENDPOINT,
    credential=AzureKeyCredential(VISION_API_KEY),
)

In [20]:
def _bbox_to_dict(box) -> dict:
    """Flatten an object exposing x / y / width / height into a plain dict."""
    return {"x": box.x, "y": box.y, "w": box.width, "h": box.height}


def analyze_image(image_bytes: bytes, analysis_client=None) -> dict:
    """
    Run Azure Vision analysis on an image and condense the result into a dict.

    Requested features: DenseCaptions, Read (OCR), Objects, People, Tags.

    Args:
        image_bytes: Raw bytes of the image to analyze.
        analysis_client: Optional object exposing
            ``analyze(image_data=..., visual_features=..., gender_neutral_caption=...)``.
            When omitted, the module-level ``client`` is used.

    Returns:
        A dict containing only the sections the service returned data for:
          - "dense_captions": list of {"text", "confidence", "bbox"}
          - "tags":           list of {"tag", "confidence"}
          - "landmarks":      tags whose name contains "landmark"
                              (present whenever "tags" is present; may be empty)
          - "ocr_text":       all recognized lines joined with newlines
          - "objects":        list of {"name", "confidence", "bbox"}
          - "people":         list of {"confidence", "bbox"[, "celebrity_name"]}
        Each "bbox" is {"x", "y", "w", "h"}.
    """
    active_client = client if analysis_client is None else analysis_client

    # Feature names exactly as passed to the service.
    # NOTE(review): Image Analysis 4.0 does not appear to offer a dedicated
    # celebrity or landmark feature; the "landmarks" and "celebrity_name"
    # fields below are best-effort heuristics - confirm against the service docs.
    features = [
        "DenseCaptions",
        "Read",
        "Objects",
        "People",
        "Tags",
    ]

    result = active_client.analyze(
        image_data=image_bytes,
        visual_features=features,
        gender_neutral_caption=False,
    )

    summary = {}

    # ---------------------------------------------------------
    # Dense Captions
    # ---------------------------------------------------------
    dense_captions = getattr(result, "dense_captions", None)
    if dense_captions:
        summary["dense_captions"] = [
            {
                "text": cap.text,
                "confidence": cap.confidence,
                "bbox": _bbox_to_dict(cap.bounding_box),
            }
            for cap in dense_captions.list
        ]

    # ---------------------------------------------------------
    # Tags (and the landmark heuristic derived from them)
    # ---------------------------------------------------------
    tags = getattr(result, "tags", None)
    if tags:
        summary["tags"] = [
            {"tag": tag.name, "confidence": tag.confidence}
            for tag in tags.list
        ]

        # Heuristic only: any tag whose name mentions "landmark".
        summary["landmarks"] = [
            {"name": tag.name, "confidence": tag.confidence}
            for tag in tags.list
            if "landmark" in tag.name.lower()
        ]

    # ---------------------------------------------------------
    # OCR (Read)
    # ---------------------------------------------------------
    read = getattr(result, "read", None)
    if read and read.blocks:
        summary["ocr_text"] = "\n".join(
            line.text for block in read.blocks for line in block.lines
        )

    # ---------------------------------------------------------
    # Object Detection (Objects)
    # ---------------------------------------------------------
    objects = getattr(result, "objects", None)
    if objects:
        summary["objects"] = []
        for obj in objects.list:
            # The first tag is used as the object's label; fall back to a
            # generic name when the object carries no tag.
            top_tag = obj.tags[0] if obj.tags else None
            summary["objects"].append({
                "name": top_tag.name if top_tag else "object",
                "confidence": top_tag.confidence if top_tag else None,
                "bbox": _bbox_to_dict(obj.bounding_box),
            })

    # ---------------------------------------------------------
    # People
    # ---------------------------------------------------------
    people = getattr(result, "people", None)
    if people:
        summary["people"] = []
        for person in people.list:
            entry = {
                "confidence": person.confidence,
                "bbox": _bbox_to_dict(person.bounding_box),
            }
            # Only emitted if the result object happens to carry a name.
            person_name = getattr(person, "name", None)
            if person_name:
                entry["celebrity_name"] = person_name
            summary["people"].append(entry)

    return summary

In [21]:
# Analyze sample01.jpg and print the condensed summary.
# The path is built with os.path.join so it resolves on any OS, not only
# where "\" is the path separator.
with open(os.path.join("sample_images", "sample01.jpg"), "rb") as f:
    res = analyze_image(f.read())
    print(res)

{'dense_captions': [{'text': 'a group of people holding signs', 'confidence': 0.9424399137496948, 'bbox': {'x': 0, 'y': 0, 'w': 1000, 'h': 580}}, {'text': 'a woman taking a selfie', 'confidence': 0.9134508371353149, 'bbox': {'x': 783, 'y': 188, 'w': 209, 'h': 369}}, {'text': 'a man holding a cellphone', 'confidence': 0.7785478234291077, 'bbox': {'x': 384, 'y': 158, 'w': 233, 'h': 348}}, {'text': 'a man in a blue suit', 'confidence': 0.7847445607185364, 'bbox': {'x': 79, 'y': 192, 'w': 294, 'h': 269}}, {'text': 'a woman holding a sign', 'confidence': 0.8923084735870361, 'bbox': {'x': 533, 'y': 140, 'w': 217, 'h': 280}}, {'text': 'a man with a beard', 'confidence': 0.811130702495575, 'bbox': {'x': 0, 'y': 122, 'w': 143, 'h': 224}}, {'text': 'a man in a hat and leather jacket', 'confidence': 0.6910561919212341, 'bbox': {'x': 830, 'y': 31, 'w': 163, 'h': 269}}, {'text': 'a woman speaking into a microphone', 'confidence': 0.7274485230445862, 'bbox': {'x': 242, 'y': 149, 'w': 148, 'h': 322}}

In [22]:
# Analyze sample02.jpg and print the condensed summary.
# The path is built with os.path.join so it resolves on any OS, not only
# where "\" is the path separator.
with open(os.path.join("sample_images", "sample02.jpg"), "rb") as f:
    res = analyze_image(f.read())
    print(res)

{'dense_captions': [{'text': 'a group of jockeys riding horses on grass', 'confidence': 0.8134883046150208, 'bbox': {'x': 0, 'y': 0, 'w': 1920, 'h': 1079}}, {'text': 'a jockey riding a horse', 'confidence': 0.7614299058914185, 'bbox': {'x': 809, 'y': 224, 'w': 465, 'h': 727}}, {'text': 'a group of jockeys riding horses on grass', 'confidence': 0.812589168548584, 'bbox': {'x': 62, 'y': 100, 'w': 1769, 'h': 921}}, {'text': 'a group of horses on a track', 'confidence': 0.6950061321258545, 'bbox': {'x': 300, 'y': 148, 'w': 625, 'h': 885}}, {'text': 'a jockey on a horse', 'confidence': 0.7583951354026794, 'bbox': {'x': 440, 'y': 10, 'w': 311, 'h': 544}}, {'text': 'a jockey riding a horse', 'confidence': 0.7707228064537048, 'bbox': {'x': 64, 'y': 527, 'w': 310, 'h': 409}}, {'text': 'a jockey riding a horse in a race', 'confidence': 0.7399764060974121, 'bbox': {'x': 1181, 'y': 171, 'w': 555, 'h': 782}}, {'text': 'a jockey on a horse', 'confidence': 0.7769850492477417, 'bbox': {'x': 873, 'y': 

In [23]:
# Analyze sample04.jpg and print the condensed summary.
# The path is built with os.path.join so it resolves on any OS, not only
# where "\" is the path separator.
with open(os.path.join("sample_images", "sample04.jpg"), "rb") as f:
    res = analyze_image(f.read())
    print(res)

{'dense_captions': [{'text': 'a man in a suit holding up his hand', 'confidence': 0.8234245181083679, 'bbox': {'x': 0, 'y': 0, 'w': 960, 'h': 1165}}, {'text': 'a man in a suit holding up his hand', 'confidence': 0.8233250379562378, 'bbox': {'x': 21, 'y': 43, 'w': 916, 'h': 1097}}, {'text': "close-up of a black and white photo of a man's head", 'confidence': 0.7811080813407898, 'bbox': {'x': 522, 'y': 476, 'w': 162, 'h': 351}}, {'text': 'a man in a suit and tie', 'confidence': 0.8933911323547363, 'bbox': {'x': 204, 'y': 458, 'w': 670, 'h': 696}}, {'text': 'close-up of a man with a mustache', 'confidence': 0.9058632850646973, 'bbox': {'x': 372, 'y': 72, 'w': 289, 'h': 391}}, {'text': 'a black and white photo of a lamp', 'confidence': 0.7153570652008057, 'bbox': {'x': 826, 'y': 167, 'w': 129, 'h': 528}}, {'text': "a close-up of a man's lips", 'confidence': 0.8710940480232239, 'bbox': {'x': 476, 'y': 322, 'w': 114, 'h': 79}}, {'text': 'a close-up of a hand', 'confidence': 0.889273703098297