# Planogram Compliance with Roboflow CLIP API

This notebook uses Roboflow's workflow API to:
1. Detect and crop products from shelf images
2. Generate CLIP embeddings for each crop
3. Match crops against card thumbnail database using cosine similarity

In [2]:
# Install required dependencies
!pip install inference-sdk pillow numpy matplotlib

Collecting inference-sdk
  Using cached inference_sdk-0.62.4-py3-none-any.whl.metadata (20 kB)
Collecting pillow
  Using cached pillow-12.0.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (8.8 kB)
Collecting numpy
  Using cached numpy-2.3.5-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting matplotlib
  Using cached matplotlib-3.10.8-cp311-cp311-macosx_11_0_arm64.whl.metadata (52 kB)
Collecting requests<3.0.0,>=2.32.0 (from inference-sdk)
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json~=0.6.0 (from inference-sdk)
  Using cached dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting opencv-python<=4.10.0.84,>=4.8.1.78 (from inference-sdk)
  Using cached opencv_python-4.10.0.84-cp37-abi3-macosx_11_0_arm64.whl.metadata (20 kB)
Collecting pillow
  Using cached pillow-11.3.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.0 kB)
Collecting supervision>=0.26 (from inference-sdk)
  Downloading supervision-0.27.0-py3-none-any.whl.

In [12]:
import os
import numpy as np
from pathlib import Path
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from inference_sdk import InferenceHTTPClient

# Configuration
API_KEY = "W7ZLH2wzvGvK9yzBOpAv"
WORKSPACE = "sonic-eyes-ventures"
WORKFLOW_ID = "custom-workflow-4"
MIN_CROP_SIZE = 20  # Minimum width/height in pixels

# Paths
TEST_IMAGE = "test.png"
CARD_THUMBNAILS_DIR = "card_thumbnails (1)"

# Initialize Roboflow client
client = InferenceHTTPClient(
    api_url="https://serverless.roboflow.com",
    api_key=API_KEY
)

print("✓ Roboflow client initialized")

✓ Roboflow client initialized


## Step 1: Process Shelf Image (test.png)
Run the shelf image through Roboflow workflow to detect products and get CLIP embeddings

In [13]:
# Run workflow on test.png to get crop detections and embeddings
print(f"Processing {TEST_IMAGE}...")

shelf_result = client.run_workflow(
    workspace_name=WORKSPACE,
    workflow_id=WORKFLOW_ID,
    images={"image": TEST_IMAGE}
)

print(f"\n✓ Workflow completed")
print(f"\nRaw result structure:")
print(shelf_result)

Processing test.png...

✓ Workflow completed

Raw result structure:
[{'model': [[0.5006752014160156, 0.42468011379241943, -0.2923490107059479, -0.025572679936885834, 0.21318945288658142, -0.2589154839515686, 0.5769736766815186, 0.2872065603733063, 0.16146627068519592, 0.3118032217025757, 0.30420446395874023, 0.032862454652786255, -0.03863456845283508, -0.17639613151550293, 0.06438212096691132, -0.19098708033561707, -0.4248603582382202, 0.32134389877319336, 0.3941648602485657, -0.36425602436065674, 0.04175768792629242, 0.11021264642477036, -0.11569178104400635, 0.1273941695690155, -0.147600457072258, 0.3234085440635681, 0.1222507581114769, -0.20629657804965973, -0.4248005747795105, -0.11779582500457764, 0.4497871696949005, 0.11980880051851273, -0.15969493985176086, 0.08671273291110992, -0.7794615030288696, -0.06081423908472061, -0.6399855613708496, -0.24207955598831177, 0.11417797952890396, 0.935333251953125, -0.03641737997531891, -0.09097038209438324, 0.3869969844818115, -0.64194381237

In [14]:
# Extract crops and embeddings from workflow result
# Note: Adjust based on actual workflow output structure

shelf_crops = []

# Parse the workflow result to extract:
# - Bounding boxes (x, y, width, height)
# - CLIP embeddings for each crop
# This structure depends on your workflow output format

# Example parsing (adjust based on actual output):
for output in shelf_result:
    if 'predictions' in output or 'detections' in output:
        # Extract detection data
        detections = output.get('predictions', output.get('detections', []))
        
        for detection in detections:
            # Get bounding box
            x = detection.get('x', 0)
            y = detection.get('y', 0)
            width = detection.get('width', 0)
            height = detection.get('height', 0)
            
            # Filter crops that are too small
            if width >= MIN_CROP_SIZE and height >= MIN_CROP_SIZE:
                # Get CLIP embedding (adjust key based on workflow output)
                embedding = detection.get('clip_embedding', detection.get('embedding', None))
                
                if embedding is not None:
                    shelf_crops.append({
                        'bbox': (x, y, width, height),
                        'embedding': np.array(embedding),
                        'confidence': detection.get('confidence', 1.0)
                    })

print(f"✓ Found {len(shelf_crops)} valid crops (>{MIN_CROP_SIZE}px)")
if shelf_crops:
    print(f"  Sample crop shape: {shelf_crops[0]['bbox']}")
    print(f"  Embedding dimension: {shelf_crops[0]['embedding'].shape}")

✓ Found 0 valid crops (>20px)


## Step 2: Process Card Thumbnails (Option A - Roboflow API)
Generate CLIP embeddings for all card thumbnails using Roboflow API

In [None]:
# Get all card thumbnail paths
card_dir = Path(CARD_THUMBNAILS_DIR)
card_paths = sorted(list(card_dir.glob('*.jpg')) + list(card_dir.glob('*.png')))

print(f"Found {len(card_paths)} card thumbnails")

# Process each card through Roboflow to get CLIP embeddings
card_embeddings = []
card_names = []

for i, card_path in enumerate(card_paths):
    if i % 10 == 0:
        print(f"Processing card {i+1}/{len(card_paths)}...")
    
    try:
        # Run workflow on card thumbnail
        card_result = client.run_workflow(
            workspace_name=WORKSPACE,
            workflow_id=WORKFLOW_ID,
            images={"image": str(card_path)},
            use_cache=True
        )
        
        # Extract embedding from result
        # Adjust based on your workflow output structure
        embedding = None
        
        for output in card_result:
            # Try to get the whole image embedding or first detection
            if 'embedding' in output:
                embedding = output['embedding']
                break
            elif 'clip_embedding' in output:
                embedding = output['clip_embedding']
                break
            elif 'predictions' in output and len(output['predictions']) > 0:
                embedding = output['predictions'][0].get('embedding')
                break
        
        if embedding is not None:
            card_embeddings.append(np.array(embedding))
            card_names.append(card_path.name)
    
    except Exception as e:
        print(f"  Warning: Failed to process {card_path.name}: {e}")
        continue

# Convert to numpy array for efficient similarity computation
card_embeddings = np.stack(card_embeddings, axis=0)

print(f"\n✓ Successfully processed {len(card_names)} cards")
print(f"  Embeddings shape: {card_embeddings.shape}")

## Option B: Local CLIP Embeddings (Alternative - Commented Out)
Use local CLIP model to generate embeddings instead of Roboflow API

In [None]:
# # OPTION B: Generate card embeddings locally with CLIP
# # Uncomment this cell to use local CLIP model instead of Roboflow API
# 
# !pip install "torch>=2.2" "transformers>=4.45" pillow
# 
# import torch
# from transformers import CLIPProcessor, CLIPModel
# 
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# MODEL_ID = "openai/clip-vit-base-patch32"
# 
# processor = CLIPProcessor.from_pretrained(MODEL_ID)
# model = CLIPModel.from_pretrained(MODEL_ID).to(DEVICE).eval()
# 
# @torch.inference_mode()
# def embed_image_local(path: str) -> np.ndarray:
#     img = Image.open(path).convert("RGB")
#     inputs = processor(images=img, return_tensors="pt").to(DEVICE)
#     out = model.get_image_features(**inputs)
#     vec = torch.nn.functional.normalize(out.squeeze(0), dim=0)
#     return vec.cpu().numpy()
# 
# # Generate embeddings for all card thumbnails
# card_dir = Path(CARD_THUMBNAILS_DIR)
# card_paths = sorted(list(card_dir.glob('*.jpg')) + list(card_dir.glob('*.png')))
# 
# card_embeddings = []
# card_names = []
# 
# for i, card_path in enumerate(card_paths):
#     if i % 50 == 0:
#         print(f"Processing {i}/{len(card_paths)}...")
#     
#     try:
#         embedding = embed_image_local(str(card_path))
#         card_embeddings.append(embedding)
#         card_names.append(card_path.name)
#     except Exception as e:
#         print(f"Warning: Failed to process {card_path.name}: {e}")
#         continue
# 
# card_embeddings = np.stack(card_embeddings, axis=0)
# print(f"✓ Generated {len(card_names)} embeddings locally")
# print(f"  Shape: {card_embeddings.shape}")

## Step 3: Match Crops to Cards using Cosine Similarity

In [None]:
# Normalize embeddings for cosine similarity
# (If Roboflow already returns normalized vectors, this may not be needed)
def normalize(embeddings):
    norms = np.linalg.norm(embeddings, axis=-1, keepdims=True)
    return embeddings / (norms + 1e-8)

card_embeddings_norm = normalize(card_embeddings)

# Match each crop to the best card
matches = []

for i, crop in enumerate(shelf_crops):
    # Normalize crop embedding
    crop_embedding = normalize(crop['embedding'].reshape(1, -1))
    
    # Compute cosine similarity with all cards
    similarities = card_embeddings_norm @ crop_embedding.T
    similarities = similarities.squeeze()
    
    # Find best match
    best_idx = int(np.argmax(similarities))
    best_score = float(similarities[best_idx])
    
    # Get top 3 matches
    top3_idx = np.argsort(-similarities)[:3]
    top3_matches = [(card_names[idx], float(similarities[idx])) for idx in top3_idx]
    
    matches.append({
        'crop_id': i,
        'bbox': crop['bbox'],
        'best_match': card_names[best_idx],
        'similarity': best_score,
        'top3': top3_matches
    })
    
    print(f"\nCrop {i} @ {crop['bbox']}:")
    print(f"  Best: {card_names[best_idx]} (score: {best_score:.4f})")
    print(f"  Top 3: {top3_matches}")

print(f"\n✓ Matched {len(matches)} crops to cards")

## Step 4: Visualize Results

In [None]:
# Load and display the shelf image with matches
shelf_img = Image.open(TEST_IMAGE)

fig, ax = plt.subplots(1, 1, figsize=(16, 10))
ax.imshow(shelf_img)

# Draw bounding boxes with labels
for match in matches:
    x, y, w, h = match['bbox']
    
    # Convert from center coords to corner coords if needed
    # (Adjust based on your bbox format)
    x1 = x - w/2
    y1 = y - h/2
    
    # Draw rectangle
    rect = patches.Rectangle(
        (x1, y1), w, h,
        linewidth=2,
        edgecolor='lime',
        facecolor='none'
    )
    ax.add_patch(rect)
    
    # Add label with card name and similarity score
    label = f"{match['best_match']}\n{match['similarity']:.3f}"
    ax.text(
        x1, y1 - 5,
        label,
        bbox=dict(boxstyle='round', facecolor='lime', alpha=0.8),
        fontsize=8,
        verticalalignment='bottom'
    )

ax.axis('off')
plt.title(f"Planogram Compliance - {len(matches)} Products Detected", fontsize=16)
plt.tight_layout()
plt.show()

print(f"\n✓ Visualization complete!")

## Summary Statistics

In [None]:
# Print summary statistics
if matches:
    similarities = [m['similarity'] for m in matches]
    
    print("=" * 50)
    print("SUMMARY")
    print("=" * 50)
    print(f"Total crops detected: {len(shelf_crops)}")
    print(f"Valid crops (>{MIN_CROP_SIZE}px): {len(matches)}")
    print(f"Card database size: {len(card_names)}")
    print(f"\nSimilarity Scores:")
    print(f"  Mean: {np.mean(similarities):.4f}")
    print(f"  Median: {np.median(similarities):.4f}")
    print(f"  Min: {np.min(similarities):.4f}")
    print(f"  Max: {np.max(similarities):.4f}")
    print(f"\nConfidence levels:")
    print(f"  High (>0.9): {sum(s > 0.9 for s in similarities)}")
    print(f"  Medium (0.7-0.9): {sum(0.7 <= s <= 0.9 for s in similarities)}")
    print(f"  Low (<0.7): {sum(s < 0.7 for s in similarities)}")
else:
    print("No matches found. Check workflow output structure.")