In [12]:
# vision.py
from azure.cognitiveservices.vision.computervision import ComputerVisionClient
from msrest.authentication import CognitiveServicesCredentials
from azure.cognitiveservices.vision.computervision.models import OperationStatusCodes
import os
from io import BytesIO
import time


In [9]:
# Connection settings are read from the environment so that no credentials
# are stored in the notebook itself.
endpoint = os.environ.get("AZURE_VISION_ENDPOINT")
key = os.environ.get("AZURE_VISION_KEY")

# Shared Computer Vision client used by the analysis functions below.
client = ComputerVisionClient(
    endpoint,
    CognitiveServicesCredentials(key),
)

In [3]:
def analyze_image_bytes(image_bytes, poll_interval=0.5, timeout=60.0):
    """Caption, tag and OCR an image using the module-level ``client``.

    Args:
        image_bytes: Raw bytes of an image file.
        poll_interval: Seconds to sleep between polls of the Read operation.
        timeout: Maximum number of seconds to keep polling the Read
            operation. Pass ``float("inf")`` to wait indefinitely.

    Returns:
        dict with the keys:
          - "captions": list of {"text", "confidence"}
          - "tags": list of {"name", "confidence"}
          - "ocr_text": every recognised line followed by a newline, or ""
            when the Read operation finished without succeeding.

    Raises:
        TimeoutError: if the Read operation is still pending after
            ``timeout`` seconds.
    """
    stream = BytesIO(image_bytes)

    # Captions
    desc = client.describe_image_in_stream(stream, max_candidates=3)
    captions = [
        {"text": c.text, "confidence": c.confidence}
        for c in (desc.captions or [])
    ]

    # The same stream is reused for every call, so rewind it each time.
    stream.seek(0)
    tags_res = client.tag_image_in_stream(stream)
    tags = [
        {"name": t.name, "confidence": t.confidence}
        for t in (tags_res.tags or [])
    ]

    # OCR: the Read API is asynchronous. Submitting returns an
    # Operation-Location header whose last path segment is the operation id.
    stream.seek(0)
    ocr = client.read_in_stream(stream, raw=True)
    operation_id = ocr.headers["Operation-Location"].split("/")[-1]

    # Poll with a deadline so a stuck operation cannot hang the kernel forever.
    deadline = time.monotonic() + timeout
    while True:
        result = client.get_read_result(operation_id)
        if result.status not in ('notStarted', 'running'):
            break
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Read operation {operation_id} still pending after {timeout} seconds"
            )
        time.sleep(poll_interval)

    ocr_text = ""
    if result.status == 'succeeded':
        ocr_text = "".join(
            line.text + "\n"
            for page in result.analyze_result.read_results
            for line in page.lines
        )
    return {"captions": captions, "tags": tags, "ocr_text": ocr_text}

In [17]:
def _read_ocr_text(stream, poll_interval, timeout):
    """Submit ``stream`` to the asynchronous Read API and return its text.

    Returns every recognised line followed by a newline, or "" when the
    operation finishes in any state other than succeeded.

    Raises:
        TimeoutError: if the operation is still pending after ``timeout``
            seconds.
    """
    submitted = client.read_in_stream(stream, raw=True)
    # The operation id is the last path segment of the Operation-Location header.
    op_id = submitted.headers["Operation-Location"].split("/")[-1]

    # Poll with a deadline so a stuck operation cannot hang the kernel forever.
    deadline = time.monotonic() + timeout
    while True:
        read_res = client.get_read_result(op_id)
        if read_res.status not in ("notStarted", "running"):
            break
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Read operation {op_id} still pending after {timeout} seconds"
            )
        time.sleep(poll_interval)

    if read_res.status != OperationStatusCodes.succeeded:
        return ""
    return "".join(
        line.text + "\n"
        for page in read_res.analyze_result.read_results
        for line in page.lines
    )


def analyze_image_bytes3(
    image_bytes: bytes,
    poll_interval: float = 0.5,
    timeout: float = 60.0,
) -> dict:
    """Caption, tag, OCR and landmark-detect an image using the module-level ``client``.

    Args:
        image_bytes: Raw bytes of an image file.
        poll_interval: Seconds to sleep between polls of the Read operation.
        timeout: Maximum number of seconds to keep polling the Read
            operation. Pass ``float("inf")`` to wait indefinitely.

    Returns:
        dict with the keys:
          - "captions": list of {"text", "confidence"}
          - "tags": list of {"name", "confidence"}
          - "ocr_text": every recognised line followed by a newline, or ""
            when the Read operation finished without succeeding
          - "landmarks": list of {"name", "confidence"}

    Raises:
        TimeoutError: if the Read operation is still pending after
            ``timeout`` seconds.
    """
    stream = BytesIO(image_bytes)

    # Captions
    desc = client.describe_image_in_stream(stream, max_candidates=3)
    captions = [
        {"text": c.text, "confidence": c.confidence}
        for c in (desc.captions or [])
    ]

    # The same stream is reused for every call, so rewind it each time.
    stream.seek(0)
    tag_res = client.tag_image_in_stream(stream)
    tags = [
        {"name": t.name, "confidence": t.confidence}
        for t in (tag_res.tags or [])
    ]

    # OCR (Read API, asynchronous)
    stream.seek(0)
    ocr_text = _read_ocr_text(stream, poll_interval, timeout)

    # Celebrity detection (domain "celebrities") is disabled, so the result
    # carries no "celebrities" key.

    # Landmark detection (domain "landmarks")
    stream.seek(0)
    land_res = client.analyze_image_by_domain_in_stream("landmarks", stream)
    detected = (
        land_res.result.get("landmarks", [])
        if land_res and land_res.result
        else []
    )
    landmarks = [
        {"name": lm.get("name"), "confidence": lm.get("confidence")}
        for lm in detected
    ]

    return {
        "captions": captions,
        "tags": tags,
        "ocr_text": ocr_text,
        "landmarks": landmarks,
    }

In [18]:
# Build the path with os.path.join: a hard-coded "\\" separator only resolves
# on Windows.
sample_path = os.path.join("sample_images", "sample01.jpg")
with open(sample_path, "rb") as f:
    image_bytes = f.read()

# The file is already closed here, before the service calls start.
res = analyze_image_bytes3(image_bytes)
print(res)

{'captions': [{'text': 'a group of people holding signs', 'confidence': 0.6494020819664001}], 'tags': [{'name': 'clothing', 'confidence': 0.9988632798194885}, {'name': 'human face', 'confidence': 0.9978727102279663}, {'name': 'person', 'confidence': 0.9968821406364441}, {'name': 'woman', 'confidence': 0.9683936834335327}, {'name': 'text', 'confidence': 0.956762969493866}, {'name': 'man', 'confidence': 0.9565277099609375}, {'name': 'smile', 'confidence': 0.9443103075027466}, {'name': 'newspaper', 'confidence': 0.8428946137428284}, {'name': 'outdoor', 'confidence': 0.8321356773376465}, {'name': 'group', 'confidence': 0.7874609231948853}, {'name': 'sign', 'confidence': 0.7766597867012024}, {'name': 'people', 'confidence': 0.7492111921310425}], 'ocr_text': 'TRUMI\nTRUMP\nWe WeTAFAGAIN!\nMAKE AMERICA GREAT AGAIN\nTRUMP\nthe silent majority\nMEINERICA GREAT AGAIN!\nT\nSTANDS WITH\nTRUMP\nSUME\nNEM\nWent may\nRUMP\nS W\nAMERI GREAT AGAIN!\nTR\nSTAN\nTRUMP\nthe site\nRUI\nARE AMERICA GREAT AGA

In [19]:
# Build the path with os.path.join: a hard-coded "\\" separator only resolves
# on Windows.
sample_path = os.path.join("sample_images", "sample02.jpg")
with open(sample_path, "rb") as f:
    image_bytes = f.read()

# The file is already closed here, before the service calls start.
res = analyze_image_bytes3(image_bytes)
print(res)

{'captions': [{'text': 'a group of jockeys racing horses', 'confidence': 0.5538452863693237}], 'tags': [{'name': 'horse', 'confidence': 0.9770833253860474}, {'name': 'stadium', 'confidence': 0.9752413034439087}, {'name': 'outdoor', 'confidence': 0.9523265957832336}, {'name': 'building', 'confidence': 0.9427610039710999}, {'name': 'rein', 'confidence': 0.9391779899597168}, {'name': 'bridle', 'confidence': 0.9323909282684326}, {'name': 'stallion', 'confidence': 0.9239025712013245}, {'name': 'horse tack', 'confidence': 0.9214749336242676}, {'name': 'horse supplies', 'confidence': 0.9210265874862671}, {'name': 'flat racing', 'confidence': 0.9156216979026794}, {'name': 'halter', 'confidence': 0.9028061628341675}, {'name': 'mare', 'confidence': 0.8987149596214294}, {'name': 'equestrianism', 'confidence': 0.8742722868919373}, {'name': 'saddle', 'confidence': 0.8724848031997681}, {'name': 'horse racing', 'confidence': 0.870876133441925}, {'name': 'jockey', 'confidence': 0.8687554597854614}, {'