# Annotate & Ingest — Manual Labeling Tool

Label indexed image crops and send annotations directly to Qdrant.

**Workflow:** Run cells top-to-bottom. For each crop:
- Type the text you see → Enter
- Type a topic tag → Enter (or just Enter to leave the crop untagged)
- Enter with no text at the text prompt = skip crop
- Type `q` at the text prompt = stop annotating

In [None]:
import base64
import io
from datetime import datetime, timezone
from pathlib import Path

import matplotlib.pyplot as plt
from PIL import Image
from qdrant_client.models import SparseVector

from rag.client import get_client
from rag.config import settings
from rag.embeddings.bge_embedder import embed_dense, embed_sparse
from rag.schema import ensure_collection_exists
from ocr_agent.tools import _apply_high_contrast

# Directory searched for source images. The path is relative, so it resolves
# against the kernel's current working directory.
INPUT_DIR = Path("../data/input")

# Shared client used by every cell below for scroll / set_payload /
# update_vectors calls.
client = get_client()
# NOTE(review): presumably creates the collection when it is missing —
# confirm against rag.schema.
ensure_collection_exists(client)
print("Connected.")

In [None]:
def fetch_unannotated(limit: int = 20) -> list:
    """Fetch up to ``limit`` points whose ``confirmed_text`` is missing or empty.

    The collection is scrolled page by page and filtered on the client, so
    the returned points follow the scroll order.

    Args:
        limit: Maximum number of points to return. Values <= 0 yield ``[]``.

    Returns:
        A list of at most ``limit`` scrolled points (payload included,
        vectors excluded).
    """
    results = []
    if limit <= 0:
        return results

    # Use a fixed page size. Sizing pages by the remaining quota shrinks to
    # one point per round trip once the quota is nearly filled, which is
    # very slow when most of the collection is already annotated.
    page_size = 100
    offset = None
    while True:
        points, offset = client.scroll(
            collection_name=settings.COLLECTION_NAME,
            limit=page_size,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        for p in points:
            # A point may have no payload at all; treat it as unannotated.
            if not (p.payload or {}).get("confirmed_text", ""):
                results.append(p)
                if len(results) >= limit:
                    return results
        # An empty page or a missing next-page offset means the scan is done.
        if not points or offset is None:
            return results


def load_crop(source_image_id: str, region: dict) -> Image.Image | None:
    """Crop ``region`` out of the local source image, when both are usable.

    Looks in ``INPUT_DIR`` for ``<source_image_id><ext>``, trying each known
    extension in order, and crops the first file that exists.

    Args:
        source_image_id: File stem of the source image.
        region: Mapping with the keys ``x``, ``y``, ``w`` and ``h`` that
            define the crop box.

    Returns:
        The RGB crop, or ``None`` when the region is incomplete or no
        matching file exists. This function does not look at any base64
        data; a fallback is up to the caller.
    """
    # Validate the region up front: callers may pass an empty dict when the
    # payload has no coordinates, which must not raise.
    try:
        x, y, w, h = (region[key] for key in ("x", "y", "w", "h"))
    except (KeyError, TypeError):
        return None

    for ext in (".jpeg", ".jpg", ".png", ".tiff", ".bmp"):
        path = INPUT_DIR / f"{source_image_id}{ext}"
        if path.exists():
            # The context manager closes the file handle; convert() returns
            # an independent in-memory image, so the crop stays valid.
            with Image.open(path) as img:
                return img.convert("RGB").crop((x, y, x + w, y + h))
    return None


def load_crop_from_b64(b64: str) -> Image.Image:
    """Turn a base64-encoded image string into an RGB image."""
    raw_bytes = base64.b64decode(b64)
    buffer = io.BytesIO(raw_bytes)
    decoded = Image.open(buffer)
    return decoded.convert("RGB")


def display_crop(crop: Image.Image, idx: int, total: int):
    """Plot the crop next to its high-contrast version.

    Args:
        crop: Image to display.
        idx: Zero-based position of the crop; shown one-based in the titles.
        total: Number of crops in the current session.
    """
    # Each panel is (title, image, extra imshow keyword arguments).
    panels = (
        (f"Original  [{idx+1}/{total}]", crop, {}),
        (f"Enhanced  [{idx+1}/{total}]", _apply_high_contrast(crop), {"cmap": "gray"}),
    )
    _, axes = plt.subplots(1, 2, figsize=(16, 4))
    for axis, (title, image, imshow_kwargs) in zip(axes, panels):
        axis.imshow(image, **imshow_kwargs)
        axis.set_title(title)
        axis.axis("off")
    plt.tight_layout()
    plt.show()


def update_annotation(
    point_id: int | str, text: str, topic_tag: str, chunk_type: str
) -> int | str:
    """Store a human annotation on a Qdrant point.

    Embeds ``text``, replaces the point's ``text_dense`` / ``text_sparse``
    vectors, then writes the annotation payload.

    Args:
        point_id: Id of the point to update.
        text: Confirmed transcription. It is also written to
            ``raw_ocr_text``.
        topic_tag: Optional tag; an empty string stores an empty tag list.
        chunk_type: Label stored under ``chunk_type``.

    Returns:
        ``point_id``, unchanged.
    """
    dense_vecs = embed_dense([text])
    sparse_vecs = embed_sparse([text])

    # Write the vectors first. The payload's non-empty ``confirmed_text`` is
    # what marks a point as annotated, so setting it last means a failed
    # vector update leaves the point in the unannotated queue instead of
    # marked done with stale vectors.
    client.update_vectors(
        collection_name=settings.COLLECTION_NAME,
        points=[
            {
                "id": point_id,
                "vector": {
                    "text_dense": dense_vecs[0],
                    "text_sparse": sparse_vecs[0],
                },
            }
        ],
    )

    client.set_payload(
        collection_name=settings.COLLECTION_NAME,
        payload={
            "confirmed_text": text,
            "raw_ocr_text": text,
            "confidence_score": 1.0,
            "from_human_review": True,
            "chunk_type": chunk_type,
            "topic_tags": [topic_tag] if topic_tag else [],
            "annotated_at": datetime.now(timezone.utc).isoformat(),
        },
        points=[point_id],
    )
    return point_id


# Confirm the helper cell ran and show the absolute image directory in use.
print(f"Helpers loaded. Images dir: {INPUT_DIR.resolve()}")

In [None]:
# Fetch unannotated crops and run the interactive annotation loop.
# For each crop: show it, prompt for text and an optional topic tag, then
# write the annotation. A crop that cannot be loaded is skipped rather than
# aborting the session.
crops = fetch_unannotated(limit=30)
print(f"Found {len(crops)} unannotated crops.\n")

annotated_count = 0

for idx, point in enumerate(crops):
    # A point may have no payload, or a null region; normalise both to {}.
    payload = point.payload or {}
    source_id = payload.get("source_image_id", "?")
    region = payload.get("region_coords") or {}

    # Try loading crop from local file, fall back to stored base64
    try:
        crop_img = load_crop(source_id, region)
    except (KeyError, OSError):
        # Incomplete region coordinates or an unreadable local file.
        crop_img = None
    if crop_img is None:
        b64 = payload.get("image_crop_base64", "")
        if b64:
            try:
                crop_img = load_crop_from_b64(b64)
            except (ValueError, OSError):
                # Invalid base64 or bytes that are not a readable image.
                crop_img = None
    if crop_img is None:
        print(f"  [{idx+1}/{len(crops)}] Cannot load crop for {source_id}, skipping.")
        continue

    display_crop(crop_img, idx, len(crops))
    print(f"  Source: {source_id}  Region: y={region.get('y')}, h={region.get('h')}")

    text = input(f"  [{idx+1}/{len(crops)}] Text (Enter=skip, q=stop): ").strip()
    # "q" is reserved as the stop command, so it cannot be entered as text.
    if text.lower() == "q":
        print("Stopping.")
        break
    if not text:
        print("  Skipped.")
        continue

    tag = input("  Topic tag (Enter=skip): ").strip()

    # Auto chunk_type from the word count: <=2 word, <=6 phrase, else sentence.
    wc = len(text.split())
    chunk_type = "word" if wc <= 2 else ("phrase" if wc <= 6 else "sentence")

    pid = update_annotation(point.id, text, tag, chunk_type)
    annotated_count += 1
    print(f"  Annotated: {pid} ({chunk_type})\n")

print(f"\nDone. Annotated {annotated_count} crops.")

In [None]:
# Stats: scan the whole collection once and report annotation progress.
# The total is counted during the scan rather than read from the collection
# info, whose points_count is optional and approximate. This keeps
# "Unannotated" consistent with the points actually inspected.
total = 0
annotated = 0
tags: dict[str, int] = {}
offset = None
while True:
    points, offset = client.scroll(
        collection_name=settings.COLLECTION_NAME,
        limit=100, offset=offset, with_payload=True, with_vectors=False,
    )
    total += len(points)
    for p in points:
        # A point may have no payload, and topic_tags may be stored as null.
        p_payload = p.payload or {}
        if p_payload.get("confirmed_text", ""):
            annotated += 1
            for t in p_payload.get("topic_tags") or []:
                tags[t] = tags.get(t, 0) + 1
    if not points or offset is None:
        break

print(f"Collection: {settings.COLLECTION_NAME}")
print(f"Total points:  {total}")
print(f"  Annotated:   {annotated}")
print(f"  Unannotated: {total - annotated}")
if tags:
    print("\nTopic tags:")
    # Most frequent tags first.
    for tag, count in sorted(tags.items(), key=lambda x: -x[1]):
        print(f"  {tag}: {count}")