# 🤖 Distinguishing Humans from Humanoids (Photo & Video)

This cookbook demonstrates how to use the **multimodal capabilities** of Gemini Robotics-ER 1.5 (`gemini-robotics-er-1.5-preview`) to distinguish between biological humans and humanoid robots.

We will visualize the **Before** (Raw Input) and **After** (Safety Annotation) states.

In [None]:
# 1. Setup & API Key
# Using %pip as recommended to avoid environment issues
%pip install -q -U google-genai pillow matplotlib python-dotenv

from google import genai
from google.genai import types
import os
import json
import re
import PIL.Image
import PIL.ImageDraw
import PIL.ImageFont
from IPython.display import display, Markdown
from dotenv import load_dotenv

# Pull variables from a local .env file into the environment, then look up the key
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")

if api_key:
    # Key present: build the module-level client that analyze_and_visualize calls
    client = genai.Client(api_key=api_key)
    print("‚úÖ Client initialized with gemini-robotics-er-1.5-preview.")
else:
    # No key: report it; `client` is left undefined in this case
    print("‚ùå Error: GEMINI_API_KEY not found in .env file.")

## 📸 Part 1: Image Analysis (Visualization)

We will prompt the model to return bounding boxes, then draw them to show the "After" state.

In [None]:
def extract_json(text):
    """Return the JSON payload of a model response, without markdown fencing.

    The first fenced code block in *text* is unwrapped whether it is tagged
    ```json (any letter case) or is a bare ``` fence, and its content is
    returned with surrounding whitespace removed. When *text* contains no
    fenced block it is returned unchanged so the caller can try to parse it
    directly.
    """
    # The language tag is optional: models sometimes emit a bare ``` fence or
    # an upper-case ```JSON tag, both of which wrap otherwise valid JSON.
    match = re.search(r'```(?:json)?\s*(.*?)\s*```', text, re.DOTALL | re.IGNORECASE)
    if match:
        return match.group(1)
    return text

def _label_color(label):
    # Choose the outline color for a detection label: red for humans, green otherwise.
    text = str(label or "").upper()
    # "HUMANOID_ROBOT" contains the substring "HUMAN", so robot labels must be
    # ruled out before the human check or every box would be drawn red.
    if "HUMANOID" in text or "ROBOT" in text:
        return "green"
    return "red" if "HUMAN" in text else "green"


def _scale_box(box, width, height):
    # Convert [ymin, xmin, ymax, xmax] normalized to 0-1000 into pixel (left, top, right, bottom).
    ymin, xmin, ymax, xmax = box
    # Sort each axis so an inverted box from the model cannot make draw.rectangle fail.
    left, right = sorted((int(xmin / 1000 * width), int(xmax / 1000 * width)))
    top, bottom = sorted((int(ymin / 1000 * height), int(ymax / 1000 * height)))
    return left, top, right, bottom


def analyze_and_visualize(image_path):
    """Display an image, ask the model to classify bipedal figures, and display the annotated result.

    The raw image is shown first ("before"). Its bytes are then sent with a
    detection prompt to the `gemini-robotics-er-1.5-preview` model through the
    module-level `client`. The response is parsed as a JSON list of
    ``{"box_2d": [ymin, xmin, ymax, xmax], "label": ...}`` entries with
    coordinates normalized to 0-1000; each box is drawn on the image (red for
    humans, green otherwise) and the annotated image is shown ("after").

    Args:
        image_path: Path of the image file to analyze.

    Returns:
        None. Progress and errors are reported with ``print``; a missing file,
        an API failure, or an unparseable response ends the call early.
    """
    import mimetypes  # local import: only needed to label the uploaded bytes

    if not os.path.exists(image_path):
        print(f"‚ùå Error: {image_path} not found.")
        return

    print("üñºÔ∏è BEFORE: Raw Input Image")
    img = PIL.Image.open(image_path)
    display(img)
    
    # Convert to bytes for new API
    with open(image_path, 'rb') as f:
        image_bytes = f.read()

    # Derive the MIME type from the file name so PNG/WebP inputs are not
    # mislabeled; fall back to JPEG when the extension is unknown.
    mime_type = mimetypes.guess_type(image_path)[0] or 'image/jpeg'

    # --- GEMINI ROBOTICS CALL ---
    print("üß† Analyzing with Gemini Robotics ER 1.5...")
    
    prompt = """
    Analyze this scene for safety classification. 
    
    1. DETECT all bipedal figures in the image.
    2. For each figure, CLASSIFY as either 'HUMAN' or 'HUMANOID_ROBOT'.
    3. Return bounding boxes as a JSON array with labels.
    
    Output format: 
    [{"box_2d": [ymin, xmin, ymax, xmax], "label": "HUMAN (confidence)"}, ...]
    normalized to 0-1000.
    """
    
    try:
        response = client.models.generate_content(
            model="gemini-robotics-er-1.5-preview",
            contents=[
                types.Part.from_bytes(
                    data=image_bytes,
                    mime_type=mime_type,
                ),
                prompt
            ],
            config=types.GenerateContentConfig(
                temperature=0.5,
                thinking_config=types.ThinkingConfig(thinking_budget=1024) # Enable thinking for reasoning
            )
        )
    except Exception as e:
        # Notebook boundary: report any SDK/network failure instead of raising.
        print(f"‚ùå API Error: {e}")
        return
    
    # --- PARSE & DRAW ---
    try:
        text_output = response.text
        print(text_output) # Debug output
        
        data = json.loads(extract_json(text_output))
        # A single detection may come back as one object rather than an array.
        if isinstance(data, dict):
            data = [data]

        draw = PIL.ImageDraw.Draw(img)
        width, height = img.size
        
        for item in data:
            if not isinstance(item, dict):
                continue
            box = item.get('box_2d')
            # Skip entries without a usable 4-value box rather than aborting the whole drawing.
            if not isinstance(box, (list, tuple)) or len(box) != 4:
                continue

            label = item.get('label')
            label_text = "" if label is None else str(label)
            
            # Un-normalize coordinates
            left, top, right, bottom = _scale_box(box, width, height)
            color = _label_color(label_text)
            
            draw.rectangle([left, top, right, bottom], outline=color, width=4)
            # Keep the caption inside the image when the box touches the top edge.
            draw.text((left, max(top - 15, 0)), label_text, fill=color)
            
        print("\nüéØ AFTER: Annotated Safety View")
        display(img)
        
    except Exception as e:
        print(f"‚ö†Ô∏è Could not visualize: {e}")
        print(response.text)

# RUN VISUALIZATION
# Relative path to the sample image — presumably resolved against the
# notebook's working directory; verify before running from elsewhere.
image_path = "../assets/test1.jpg"
# Shows the raw image, queries the model, then shows the annotated image.
analyze_and_visualize(image_path)