In [None]:
# Strip generated descriptions from memes.json, rewriting the file in place.
# The imports live in this cell so it also runs on a fresh kernel; otherwise
# these names are only imported by a cell further down the notebook.
import json
from pathlib import Path
from typing import Any, Dict

# Load JSON
input_path = Path("../memes.json")
output_path = input_path  # deliberate: the source file itself is overwritten
with input_path.open("r", encoding="utf-8") as f:
    memes: Dict[str, Dict[str, Any]] = json.load(f)

# Remove descriptions
for meme in memes.values():
    # Remove image description
    if "image_description" in meme:
        del meme["image_description"]

    # Remove text descriptions from text options
    for opt in meme.get("text_options", []):
        if "description" in opt:
            del opt["description"]

# Write updated file
with output_path.open("w", encoding="utf-8") as f:
    json.dump(memes, f, ensure_ascii=False, indent=2)

print(f"Removed descriptions from memes. Wrote: {output_path}")

Removed descriptions from memes. Wrote: ../memes.json


In [52]:
!pip install pillow



In [None]:
# If needed: %pip install pillow

import json
from pathlib import Path
from typing import Dict, Any, List, Tuple
from PIL import Image, ImageDraw, ImageFont

# Resolve the project root relative to the notebook's working directory rather
# than a user-specific absolute path; this matches the "../memes.json" and
# "../memes_images" locations the other cells read from.
# NOTE(review): assumes the notebook runs from a direct child folder of the
# project root - confirm before running elsewhere.
base_dir = Path("..").resolve()
images_dir = base_dir / "memes_images"
output_dir = base_dir / "annotated_meme_images"
input_path = base_dir / "memes.json"

# Make sure the destination for annotated images exists.
output_dir.mkdir(parents=True, exist_ok=True)


def load_font_for_box(box_w: int, box_h: int):
    """Pick a font sized for a label inside a ``box_w`` x ``box_h`` box.

    The point size is 60% of the box's shorter side, truncated to an int and
    kept within 12..128. The candidate TrueType files are tried in order; if
    none of them loads, ``ImageFont.load_default()`` is returned instead.
    """
    point_size = int(min(box_w, box_h) * 0.6)
    if point_size > 128:
        point_size = 128
    if point_size < 12:
        point_size = 12

    candidates = ("DejaVuSans-Bold.ttf", "Arial.ttf", "Helvetica.ttf")
    for font_file in candidates:
        try:
            font = ImageFont.truetype(font_file, size=point_size)
        except Exception:
            # This candidate could not be loaded; move on to the next one.
            pass
        else:
            return font
    return ImageFont.load_default()


def text_size(draw, text: str, font) -> Tuple[int, int]:
    """Return the (width, height) of ``text`` when drawn with ``font``.

    Measures through ``draw.textbbox`` using a stroke width of 2. If that
    raises for any reason, the result of ``draw.textsize`` is returned instead.
    """
    try:
        box = draw.textbbox((0, 0), text, font=font, stroke_width=2)
        x0, y0, x1, y1 = box[0], box[1], box[2], box[3]
        measured = (x1 - x0, y1 - y0)
    except Exception:
        measured = draw.textsize(text, font=font)
    return measured


def normalize_box_to_expected_pixels(
    pos: Dict[str, Any], expected_w: float, expected_h: float
) -> Tuple[float, float, float, float]:
    """Convert a text-box position into pixels of the expected (display) space.

    ``pos`` may give ``left`` / ``top`` / ``width`` / ``height`` as pixels,
    fractions (0..1) or percents (0..100). The unit is guessed from the
    largest of the four values:

    * largest <= 1.0: fractions of ``expected_w`` / ``expected_h``;
    * largest <= 100.0 and at least one value is not a whole number: percents;
    * anything else: already pixels, returned unchanged.

    Missing keys and ``None`` values (JSON ``null``) count as 0, so a single
    incomplete record does not raise.

    Args:
        pos: Position mapping read from a text option.
        expected_w: Width of the space the position is expressed in.
        expected_h: Height of the space the position is expressed in.

    Returns:
        ``(left, top, width, height)`` as floats in expected-space pixels.
    """

    def _coord(key: str) -> float:
        # A null coordinate is handled like an absent one instead of letting
        # float(None) raise TypeError.
        value = pos.get(key)
        return 0.0 if value is None else float(value)

    left = _coord("left")
    top = _coord("top")
    width = _coord("width")
    height = _coord("height")

    values = (left, top, width, height)
    mx = max(values)

    if mx <= 1.0:
        # Treat as fractions of expected size
        left *= expected_w
        width *= expected_w
        top *= expected_h
        height *= expected_h
    elif mx <= 100.0 and any(not v.is_integer() for v in values):
        # Heuristic: treat as percentages if values look like percents (and not all ints)
        left = (left / 100.0) * expected_w
        width = (width / 100.0) * expected_w
        top = (top / 100.0) * expected_h
        height = (height / 100.0) * expected_h
    # else: assume pixels already relative to expected_w/expected_h

    return left, top, width, height


def annotate_image(image_path: Path, meme: Dict[str, Any], out_path: Path) -> None:
    """Draw every text option's box and index onto an RGB copy of a meme image.

    For each entry of ``meme["text_options"]`` that has an ``updated_position``
    dict, a red rectangle is drawn at that position and the entry's list index
    is written in the middle of it. The annotated image is saved to
    ``out_path``.

    Args:
        image_path: Image file to annotate.
        meme: Meme record. Reads ``text_options`` and, optionally, ``width`` /
            ``height`` (the size the positions are expressed in; the real
            image size is used when they are missing or falsy).
        out_path: Destination of the annotated image.

    Returns:
        None. Nothing is written when ``text_options`` is missing, empty or
        not a list, or when the image cannot be opened. Open and save failures
        are reported with ``print`` instead of being raised.
    """
    text_options: List[Dict[str, Any]] = meme.get("text_options", [])
    if not isinstance(text_options, list) or not text_options:
        return

    try:
        # Image.open is lazy and keeps the file open; the context manager
        # releases the handle as soon as the RGB copy has been made.
        with Image.open(image_path) as source:
            img = source.convert("RGB")
    except Exception as e:
        print(f"Skip (open error): {image_path.name}: {e}")
        return

    im_w, im_h = img.size
    expected_w = float(meme.get("width", im_w) or im_w)
    expected_h = float(meme.get("height", im_h) or im_h)

    # Scale from expected (CSS/display) space -> actual image pixels
    sx = im_w / max(expected_w, 1.0)
    sy = im_h / max(expected_h, 1.0)

    draw = ImageDraw.Draw(img)

    def clamp(v, lo, hi):
        return max(lo, min(hi, v))

    for idx, opt in enumerate(text_options):
        pos = opt.get("updated_position")
        if not isinstance(pos, dict):
            continue

        # Normalize to expected/display pixels, then scale to actual image pixels
        left, top, width, height = normalize_box_to_expected_pixels(
            pos, expected_w, expected_h
        )
        left = int(round(left * sx))
        top = int(round(top * sy))
        width = int(round(width * sx))
        height = int(round(height * sy))

        # Clamp to image bounds; the box is kept at least one pixel wide/tall
        left = clamp(left, 0, im_w - 1)
        top = clamp(top, 0, im_h - 1)
        right = clamp(left + max(width, 1), 1, im_w)
        bottom = clamp(top + max(height, 1), 1, im_h)

        # Draw rectangle
        draw.rectangle([(left, top), (right, bottom)], outline=(255, 0, 0), width=3)

        # Center index label
        label = str(idx)
        font = load_font_for_box(right - left, bottom - top)
        tw, th = text_size(draw, label, font)
        tx = left + ((right - left) - tw) // 2
        ty = top + ((bottom - top) - th) // 2

        # White text with a black outline stays readable on any background
        draw.text(
            (tx, ty),
            label,
            font=font,
            fill=(255, 255, 255),
            stroke_width=2,
            stroke_fill=(0, 0, 0),
        )

    try:
        img.save(out_path)
    except Exception as e:
        print(f"Skip (save error): {out_path.name}: {e}")


# Read the meme catalogue, then annotate every meme whose image file exists.
memes: Dict[str, Dict[str, Any]] = json.loads(input_path.read_text(encoding="utf-8"))

count = 0
for meme in memes.values():
    filename = meme.get("filename")
    if filename:
        src = images_dir / filename
        if src.exists():
            # Annotated copies keep the original file name.
            dst = output_dir / filename
            annotate_image(src, meme, dst)
            count += 1
        else:
            print(f"Missing image: {src}")

print(f"Annotated {count} images -> {output_dir}")

Annotated 1935 images -> /Users/o_sho/Documents/Coding.nosync/Personal/mcp test/annotated_meme_images


In [70]:
# Re-annotate a single meme ("This-Is-Fine") to inspect its boxes in isolation.
meme = memes["This-Is-Fine"]
filename = meme.get("filename")
if not filename:
    print("Missing filename for This-Is-Fine")
    count = 0
else:
    src = images_dir / filename
    if not src.exists():
        print(f"Missing image: {src}")
        count = 0
    else:
        dst = output_dir / filename
        annotate_image(src, meme, dst)
        count = 1

expected_w, expected_h: 580.0 282.0
im_w, im_h: 580 282
left, top, right, bottom: 203 152 340 183
left, top, right, bottom: 8 8 192 61


In [None]:
import os
import json
import mimetypes
import google.generativeai as genai


def analyze_meme_with_gemini(
    image_path: str,
    image_width: int,
    image_height: int,
    text_options: list,
    model: str = "gemini-2.5-pro",
    temperature: float = 0.4,
    api_key: "str | None" = None,
):
    """
    Ask Gemini to describe a meme image and each of its text regions.

    Input:
      image_path: local path to image
      image_width, image_height: image dimensions
      text_options: list of dicts like:
          [{"position": {"left": 4, "top": 208, "width": 386, "height": 74}, "description": ""}, ...]
      model: name of the Gemini model to use
      temperature: sampling temperature passed in the generation config
      api_key: Gemini API key; when omitted, the GEMINI_API_KEY and then the
          GOOGLE_API_KEY environment variables are used
    Output:
      dict with:
        - "image_description": detailed description of the meme
        - "text_descriptions": list of per-box descriptions
    Raises:
      RuntimeError: if no API key is passed and neither environment variable is set
    """

    # Never keep the key in the notebook: take it from the caller or the
    # environment, and fail before any I/O when none is available.
    resolved_key = (
        api_key
        or os.environ.get("GEMINI_API_KEY")
        or os.environ.get("GOOGLE_API_KEY")
    )
    if not resolved_key:
        raise RuntimeError(
            "No Gemini API key: pass api_key= or set GEMINI_API_KEY / GOOGLE_API_KEY."
        )

    # Guess the MIME type from the file name; default to JPEG when unknown.
    mime, _ = mimetypes.guess_type(image_path)
    mime = mime or "image/jpeg"

    with open(image_path, "rb") as f:
        img_bytes = f.read()

    genai.configure(api_key=resolved_key)
    model_obj = genai.GenerativeModel(model)

    # JSON schema for rigid structure
    schema = {
        "type": "object",
        "properties": {
            "image_description": {"type": "string"},
            "text_descriptions": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "index": {"type": "integer"},
                        "position": {
                            "type": "object",
                            "properties": {
                                "left": {"type": "number"},
                                "top": {"type": "number"},
                                "width": {"type": "number"},
                                "height": {"type": "number"},
                            },
                            "required": ["left", "top", "width", "height"],
                        },
                        "description": {"type": "string"},
                    },
                    "required": ["index", "position", "description"],
                },
            },
        },
        "required": ["image_description", "text_descriptions"],
    }

    prompt = f"""
You are an expert meme analyst. Describe the provided meme image in detail.

For the image:
- Explain what is happening, who or what is depicted, and any pop culture reference very briefly.
- Describe the characters and how they’re commonly referred to.
- Explain the situation and how this meme is typically used culturally.
- Write the description as natural, expressive text (embedding-ready).
- Share only what is relavant to the cultural and meme usage of the image.
- Keep it concise (3-4 sentences)

For each text region:
- Use the provided pixel positions as context - pay very close attention to the position of the text to ensure that you are describing the correct text region.
- Describe what kind of text typically goes there and why.
- Mention spatial or contextual clues if relevant.
- Explain the text's relevance to the meme
- Output all results in a single structured JSON following the schema.
"""

    # Image size and text boxes are sent as a JSON part after the image.
    user_data = {
        "image_size": {"width": image_width, "height": image_height},
        "text_options": text_options,
    }

    generation_config = {
        "response_mime_type": "application/json",
        "response_schema": schema,
        "temperature": temperature,
    }

    resp = model_obj.generate_content(
        contents=[
            {
                "role": "user",
                "parts": [
                    {"text": prompt},
                    {"inline_data": {"mime_type": mime, "data": img_bytes}},
                    {"text": json.dumps(user_data)},
                ],
            }
        ],
        generation_config=generation_config,
    )

    # The response is requested as JSON, so its text is parsed directly.
    return json.loads(resp.text)

In [None]:
# Run the Gemini analysis on one meme ("Distracted-Boyfriend").
# The catalogue is read as UTF-8 explicitly, like the other cells that open
# memes.json, instead of relying on the platform default encoding.
with open("../memes.json", "r", encoding="utf-8") as f:
    memes = json.load(f)

meme = memes["Distracted-Boyfriend"]

# Construct full image path from filename
image_path = os.path.join("../memes_images", meme["filename"])

output = analyze_meme_with_gemini(
    image_path,
    meme["width"],
    meme["height"],
    meme["text_options"],
)

E0000 00:00:1760930392.002102 2546074 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [44]:
output

{'image_description': 'The "Distracted Boyfriend" meme is a stock photo showing a man looking back at another woman while his girlfriend looks on in disgust. The man represents a person or entity, his girlfriend symbolizes a current responsibility or sensible choice, and the woman in red represents a new, tempting, but often unwise, alternative. This format is widely used to humorously critique poor decision-making, infidelity to a brand or idea, or the allure of new trends over existing commitments.',
 'text_descriptions': [{'index': 1,
   'position': {'left': 589, 'top': 259, 'width': 174, 'height': 133},
   'description': 'This text label is placed over the girlfriend, who has a shocked and disapproving expression. It represents the current, sensible, or responsible option that is being neglected or betrayed in favor of something new.'},
  {'index': 2,
   'position': {'left': 370, 'top': 211, 'width': 208, 'height': 102},
   'description': 'This text label is placed on the man looki