# 🤖 Distinguishing Humans from Humanoids (Photo & Video)

This cookbook demonstrates how to use Gemini 1.5 Pro's **multimodal capabilities** to distinguish between biological humans and humanoid robots. 

We will visualize the **Before** (Raw Input) and **After** (Safety Annotation) states.

In [None]:
# 1. Setup & API Key
!pip install -q -U google-generativeai pillow matplotlib

import google.generativeai as genai
import os
import json
import re
import PIL.Image
import PIL.ImageDraw
import PIL.ImageFont
from IPython.display import display, Markdown

# 🔑 ENTER YOUR API KEY HERE
# Replace the value of _api_key with your key. If it is left as the
# placeholder, a GEMINI_API_KEY already exported in the environment is used
# instead of being overwritten with the dummy value.
_API_KEY_PLACEHOLDER = "YOUR_API_KEY_HERE"
_api_key = "YOUR_API_KEY_HERE"
if _api_key == _API_KEY_PLACEHOLDER:
    _api_key = os.environ.get("GEMINI_API_KEY", _API_KEY_PLACEHOLDER)
os.environ["GEMINI_API_KEY"] = _api_key
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

## 📸 Part 1: Image Analysis (Visualization)

We will prompt the model to return bounding boxes, then draw them to show the "After" state.

In [None]:
def extract_json(text):
    """Return the JSON payload of a model reply without its Markdown fence.

    Args:
        text: Raw reply text, possibly wrapping the JSON in a ``` code fence.

    Returns:
        The content of the first fenced block, with surrounding whitespace
        removed. The fence may be tagged ``json`` in any letter case or be
        untagged. When no fenced block is present, ``text`` is returned
        unchanged.
    """
    # The language tag is optional and case-insensitive: replies are not
    # guaranteed to use exactly "```json".
    match = re.search(
        r'```(?:json)?\s*(.*?)\s*```', text, re.DOTALL | re.IGNORECASE
    )
    if match:
        return match.group(1)
    return text

def _label_color(label):
    """Return the outline colour for a classification label.

    Green is used only when the label names a robot; every other label,
    including unrecognised ones, gets the red caution colour.
    """
    normalized = str(label).strip().upper()
    # "HUMANOID_ROBOT" contains the substring "HUMAN", so a substring test
    # for "HUMAN" would paint robots red. Test for the robot markers instead.
    is_robot = "ROBOT" in normalized or "HUMANOID" in normalized
    return "green" if is_robot else "red"


def _to_pixel_box(box, width, height):
    """Scale a [ymin, xmin, ymax, xmax] box (0-1000) to pixel (x0, y0, x1, y1)."""
    ymin, xmin, ymax, xmax = box
    # Sort each axis so a box returned with swapped corners can still be drawn.
    x0, x1 = sorted(int(v / 1000 * width) for v in (xmin, xmax))
    y0, y1 = sorted(int(v / 1000 * height) for v in (ymin, ymax))
    return x0, y0, x1, y1


def analyze_and_visualize(image_path, model_name='models/gemini-1.5-pro-latest'):
    """Show an image, ask Gemini to classify bipedal figures, and draw the result.

    The raw image is displayed first ("before"). The model is then prompted
    for bounding boxes labelled 'HUMAN' or 'HUMANOID_ROBOT', and an annotated
    copy is displayed ("after"): red boxes for humans, green for robots.

    Args:
        image_path: Path of the image file to analyse.
        model_name: Gemini model identifier passed to ``genai.GenerativeModel``.

    Returns:
        None. Output is printed and displayed. If the file is missing, an
        error is printed and the function returns early. If the reply cannot
        be parsed or drawn, the error and the raw reply are printed instead
        of an annotated image.
    """
    if not os.path.exists(image_path):
        print(f"❌ Error: {image_path} not found.")
        return

    print("🖼️ BEFORE: Raw Input Image")
    img = PIL.Image.open(image_path)
    display(img)

    # --- GEMINI CALL ---
    print("🧠 Analyzing...")
    model = genai.GenerativeModel(model_name)
    
    prompt = """
    Analyze this scene for safety classification. 
    
    1. DETECT all bipedal figures in the image.
    2. For each figure, CLASSIFY as either 'HUMAN' or 'HUMANOID_ROBOT'.
    3. Return 2D Bounding Boxes [ymin, xmin, ymax, xmax] normalized 0-1000.
       
    Output format: JSON list of objects 
    [ 
      { "box_2d": [y,x,y,x], "type": "HUMAN", "confidence": 0.99 },
      { "box_2d": [y,x,y,x], "type": "HUMANOID_ROBOT", "confidence": 0.99 }
    ]
    """
    
    response = model.generate_content([prompt, img])

    # --- PARSE & DRAW ---
    # Kept outside the handler's reach: if reading response.text is what
    # failed, the handler must not read it a second time.
    raw_text = None
    try:
        raw_text = response.text
        data = json.loads(extract_json(raw_text))
        if isinstance(data, dict):
            # Tolerate a single object where a list was requested.
            data = [data]

        # Draw on a copy so the "before" image stays untouched. Other modes
        # (e.g. greyscale) are converted to RGB so red and green stay distinct.
        if img.mode in ("RGB", "RGBA"):
            annotated = img.copy()
        else:
            annotated = img.convert("RGB")
        draw = PIL.ImageDraw.Draw(annotated)
        width, height = annotated.size

        for item in data:
            label = str(item['type'])
            x0, y0, x1, y1 = _to_pixel_box(item['box_2d'], width, height)

            # Color Logic: Red = Human (Caution), Green = Robot (Safe)
            color = _label_color(label)

            draw.rectangle([x0, y0, x1, y1], outline=color, width=4)
            # Clamp so the label of a box touching the top edge stays visible.
            draw.text((x0, max(0, y0 - 15)), label, fill=color)

        print("\n🎯 AFTER: Annotated Safety View")
        display(annotated)

    except Exception as e:
        # Deliberate best-effort boundary for a notebook cell: whatever goes
        # wrong while parsing or drawing, show the error and the raw reply.
        print(f"⚠️ Could not visualize (Raw output below): {e}")
        print(raw_text if raw_text is not None else response)

# RUN VISUALIZATION
# The path is relative to the current working directory. If the file is not
# there, analyze_and_visualize prints an error and returns without calling
# the model.
image_path = "../assets/test1.jpg"
analyze_and_visualize(image_path)