# Triton API Person Detection

This notebook demonstrates person detection using YOLOv11 deployed on OpenShift AI with Triton Inference Server via direct HTTP/REST API calls.

## Prerequisites:
- ONNX model deployed on OpenShift AI with Triton runtime
- InferenceService endpoint accessible
- Network connectivity to OpenShift cluster

In [None]:
import sys
sys.path.insert(0, '..')

import cv2
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import time
import requests
import urllib3

# Disable SSL warnings for self-signed certificates
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

from src.detection.visualizer import draw_detections, create_detection_summary

%matplotlib inline
plt.rcParams['figure.figsize'] = (12, 8)

print("Libraries imported successfully")

## 1. Configure Triton Endpoint

Update the endpoint URL to match your OpenShift AI deployment.

In [None]:
# Endpoint of the deployed InferenceService (replace with your own deployment).
# Look it up with: oc get inferenceservice -n train-detection
ENDPOINT_URL = "https://train-detection-model-train-detection.apps.cluster-rk6mx.rk6mx.sandbox492.opentlc.com"

# IMPORTANT: this must be the Triton model name, not the InferenceService name.
#   InferenceService name: "train-detection-model" (from oc get inferenceservice)
#   Triton model name:     "yolo11n" (from the S3 directory structure)
MODEL_NAME = "yolo11n"

# Minimum person score for a prediction to count as a detection
CONF_THRESHOLD = 0.25

# REST routes derived from the endpoint and model name
HEALTH_URL = ENDPOINT_URL + "/v2/health/ready"
METADATA_URL = ENDPOINT_URL + "/v2/models/" + MODEL_NAME
INFER_URL = METADATA_URL + "/infer"

print(f"Endpoint URL: {ENDPOINT_URL}")
print(f"Model Name: {MODEL_NAME}")
print("\nAPI Endpoints:")
for _label, _url in (
    ("Health", HEALTH_URL),
    ("Metadata", METADATA_URL),
    ("Inference", INFER_URL),
):
    print(f"  {_label}: {_url}")

## 2. Helper Functions for Inference

Functions for preprocessing images, calling the Triton API, and postprocessing the results.

In [None]:
def preprocess_image(image_path, target_size=640):
    """Load an image from disk and turn it into a batched model input tensor.

    The image is stretched to ``target_size`` x ``target_size`` with a plain
    resize: the aspect ratio is NOT preserved and no letterbox padding is
    added. It is then converted from BGR to RGB, scaled to [0, 1] as float32
    and rearranged from HWC to CHW with a leading batch axis.

    Args:
        image_path: Path of the image file (anything ``str()`` can convert).
        target_size: Side length, in pixels, of the square model input.

    Returns:
        Tuple ``(tensor, orig_shape)`` where ``tensor`` is the float32 array
        with a leading batch axis of size 1 and ``orig_shape`` is the
        ``(height, width)`` of the image as read from disk.

    Raises:
        ValueError: If the image cannot be read.
    """
    bgr = cv2.imread(str(image_path))
    if bgr is None:
        raise ValueError(f"Could not read image: {image_path}")

    # (height, width) of the source image, needed later to rescale the boxes
    orig_shape = bgr.shape[:2]

    # Plain stretch to a square, then switch OpenCV's BGR order to RGB
    square_rgb = cv2.cvtColor(
        cv2.resize(bgr, (target_size, target_size)),
        cv2.COLOR_BGR2RGB,
    )

    # float32 in [0, 1], HWC -> CHW
    chw = (square_rgb.astype(np.float32) / 255.0).transpose(2, 0, 1)

    # Leading batch axis: [3, S, S] -> [1, 3, S, S]
    # (required when the model has max_batch_size > 0)
    return chw[np.newaxis, ...], orig_shape


def _nms_keep_indices(corners, scores, iou_threshold):
    """Return the indices kept by greedy non-maximum suppression.

    Args:
        corners: Array of shape (N, 4) holding ``[x1, y1, x2, y2]`` boxes.
        scores: Array of shape (N,) with one score per box.
        iou_threshold: A box is dropped when its IoU with an already kept,
            higher-scoring box is strictly greater than this value.

    Returns:
        List of int indices into ``corners``/``scores``, best score first.
    """
    x1, y1, x2, y2 = corners.T
    areas = np.clip(x2 - x1, 0, None) * np.clip(y2 - y1, 0, None)

    # Stable sort on negated scores: descending score, ties keep input order
    order = np.argsort(-scores, kind="stable")

    kept = []
    while order.size:
        best, rest = order[0], order[1:]
        kept.append(int(best))

        inter_w = np.clip(
            np.minimum(x2[best], x2[rest]) - np.maximum(x1[best], x1[rest]), 0, None
        )
        inter_h = np.clip(
            np.minimum(y2[best], y2[rest]) - np.maximum(y1[best], y1[rest]), 0, None
        )
        inter = inter_w * inter_h
        union = areas[best] + areas[rest] - inter
        # Guard against 0/0 for degenerate (zero-area) boxes
        iou = inter / np.maximum(union, 1e-12)

        order = rest[iou <= iou_threshold]
    return kept


def postprocess_yolo_output(output_data, orig_shape, conf_threshold=0.25, input_size=640,
                            iou_threshold=0.7):
    """Turn a raw YOLO output tensor into a list of person detections.

    Accepted layouts for ``output_data`` (list or array):
      * ``[1, 84, N]`` or ``[84, N]``  - channels first
      * ``[N, 84]``                    - one prediction per row
      * flat 1-D sequence of ``84 * N`` values in row-major ``[84, N]`` order,
        which is how a JSON inference response carries its ``data`` field

    84 = 4 box values (center x, center y, width, height) + 80 class scores.
    Only class 0 (person in COCO) is evaluated.

    Args:
        output_data: Raw model output in one of the layouts above.
        orig_shape: ``(height, width)`` of the original image.
        conf_threshold: Predictions whose person score is not strictly above
            this value are discarded.
        input_size: Side length of the square model input; boxes are rescaled
            from this size to ``orig_shape`` independently per axis.
        iou_threshold: IoU threshold for non-maximum suppression. Pass
            ``None`` to skip suppression and return every prediction above
            ``conf_threshold`` in prediction order.
            NOTE(review): 0.7 is meant to mirror the Ultralytics predict
            default - confirm it matches the local detector's setting.

    Returns:
        List of dicts with keys ``bbox`` (``[x1, y1, x2, y2]`` in original
        image pixels), ``confidence``, ``class_id`` and ``class_name``.
        With suppression enabled the list is ordered by descending confidence.

    Raises:
        ValueError: If the data cannot be interpreted as a 2-D prediction
            table (wrong rank, or flat length not a multiple of 84).
    """
    num_channels = 84  # 4 box values + 80 class scores

    preds = np.asarray(output_data, dtype=np.float64)
    if preds.size == 0:
        return []

    if preds.ndim == 1:
        # Flat payload: restore the channels-first [84, N] layout
        if preds.size % num_channels:
            raise ValueError(
                f"Flat output of length {preds.size} is not a multiple of {num_channels}"
            )
        preds = preds.reshape(num_channels, -1)
    elif preds.ndim == 3:
        preds = preds[0]  # Remove batch dim

    if preds.ndim != 2:
        raise ValueError(f"Unsupported output shape: {preds.shape}")

    # Bring channels-first data to one prediction per row: [N, 84]
    if preds.shape[0] == num_channels:
        preds = preds.T

    # Column 4 is the score of class 0 (person)
    person_scores = preds[:, 4]
    confident = person_scores > conf_threshold
    if not confident.any():
        return []

    scores = person_scores[confident]
    cx, cy, w, h = preds[confident, :4].T

    # Scale boxes from model input size to original image size
    scale_x = orig_shape[1] / input_size
    scale_y = orig_shape[0] / input_size

    # Center format -> corner format, already in original image pixels
    corners = np.stack(
        [
            (cx - w / 2) * scale_x,
            (cy - h / 2) * scale_y,
            (cx + w / 2) * scale_x,
            (cy + h / 2) * scale_y,
        ],
        axis=1,
    )

    if iou_threshold is None:
        selected = range(len(scores))
    else:
        # Raw YOLO output contains many overlapping boxes per object;
        # keep only the best-scoring box of each overlapping group.
        selected = _nms_keep_indices(corners, scores, iou_threshold)

    return [
        {
            'bbox': corners[i].tolist(),
            'confidence': float(scores[i]),
            'class_id': 0,
            'class_name': 'person'
        }
        for i in selected
    ]


def infer_call(image_path, conf_threshold=0.25, debug=False):
    """Run one inference request against the configured endpoint.

    The image is preprocessed with ``preprocess_image``, posted to
    ``INFER_URL`` as a KServe V2 JSON request, and the first output tensor of
    the response is passed to ``postprocess_yolo_output``.

    Args:
        image_path: Path of the image to analyse.
        conf_threshold: Confidence threshold forwarded to post-processing.
        debug: When true, print request and response details.

    Returns:
        Dict with ``detections`` (list of detection dicts),
        ``inference_time_ms`` (wall-clock time of the whole call, including
        pre- and post-processing) and ``orig_shape`` (``(height, width)``).

    Raises:
        requests.HTTPError: If the server answers with a status other than 200.
        KeyError / IndexError: If the response carries no output tensor.
    """
    start_time = time.time()

    # Preprocess
    img_data, orig_shape = preprocess_image(image_path)

    # Prepare request (KServe V2 format): tensor data is sent flattened,
    # the "shape" field tells the server how to interpret it
    request_data = {
        "inputs": [{
            "name": "images",
            "shape": list(img_data.shape),
            "datatype": "FP32",
            "data": img_data.flatten().tolist()
        }]
    }

    if debug:
        print("=== DEBUG: Request ===")
        print(f"Image shape: {img_data.shape}")
        print(f"Request shape: {request_data['inputs'][0]['shape']}")
        print(f"Data length: {len(request_data['inputs'][0]['data'])}")
        print(f"URL: {INFER_URL}")

    # Send request
    # NOTE(security): verify=False disables TLS certificate validation. It is
    # kept for clusters with self-signed certificates; do not use it against
    # endpoints you do not control.
    response = requests.post(
        INFER_URL,
        json=request_data,
        verify=False,
        timeout=30
    )

    # Handle errors
    if response.status_code != 200:
        print(f"=== ERROR {response.status_code} ===")
        print(f"Response: {response.text}")
        response.raise_for_status()
        # raise_for_status() only raises for 4xx/5xx; any other non-200
        # answer carries no usable inference result either
        raise requests.HTTPError(
            f"Unexpected status code {response.status_code} from {INFER_URL}",
            response=response
        )

    # Parse response
    result = response.json()

    if debug:
        print("\n=== DEBUG: Response ===")
        print(f"Status: {response.status_code}")
        print(f"Outputs: {len(result.get('outputs', []))}")
        if result.get('outputs'):
            print(f"Output shape: {result['outputs'][0].get('shape', 'N/A')}")
            print(f"Output data length: {len(result['outputs'][0].get('data', []))}")

    first_output = result["outputs"][0]
    output_data = np.asarray(first_output["data"], dtype=np.float32)

    # The JSON response carries the tensor as a flat list; restore the
    # dimensions the server declared so post-processing sees a real tensor
    declared_shape = first_output.get("shape")
    if declared_shape:
        output_data = output_data.reshape(declared_shape)

    # Postprocess
    detections = postprocess_yolo_output(output_data, orig_shape, conf_threshold)

    inference_time = (time.time() - start_time) * 1000

    return {
        'detections': detections,
        'inference_time_ms': inference_time,
        'orig_shape': orig_shape
    }

print("Helper functions defined successfully")

## 3. Health Check

In [None]:
# Probe the readiness route; any failure prints a short troubleshooting list
try:
    response = requests.get(HEALTH_URL, verify=False, timeout=10)
    is_healthy = response.status_code == 200

    print(
        "✓ Endpoint is healthy"
        if is_healthy
        else f"✗ Endpoint returned status code: {response.status_code}"
    )
except Exception as e:
    print(f"✗ Health check failed: {e}")
    for hint in (
        "Please verify:",
        "  1. InferenceService is deployed and running",
        "  2. Endpoint URL is correct",
        "  3. Network connectivity to OpenShift cluster",
    ):
        print(hint)

## 4. Get Model Metadata

In [None]:
# Fetch the model description and list its input and output tensors
try:
    response = requests.get(METADATA_URL, verify=False, timeout=10)
    response.raise_for_status()

    metadata = response.json()

    print("Model Metadata:")
    print(f"  Name: {metadata['name']}")
    print(f"  Platform: {metadata.get('platform', 'N/A')}")
    print(f"  Versions: {metadata.get('versions', [])}")

    # Inputs and outputs share the same name/datatype/shape layout
    for heading, key in (("Inputs", "inputs"), ("Outputs", "outputs")):
        print(f"\n  {heading}:")
        for tensor in metadata.get(key, []):
            print(f"    - {tensor['name']}: {tensor['datatype']} {tensor['shape']}")

except Exception as e:
    print(f"Could not retrieve model metadata: {e}")

## 5. Test Inference on Sample Image

In [None]:
# Fetch the sample image used by the following cells into the working directory
import urllib.request

sample_image_path = "sample_image.jpg"
sample_image_url = "https://ultralytics.com/images/bus.jpg"

urllib.request.urlretrieve(url=sample_image_url, filename=sample_image_path)
print(f"Downloaded sample image to: {sample_image_path}")

In [None]:
# First API call on the sample image, with request/response debugging enabled
result = infer_call(sample_image_path, conf_threshold=CONF_THRESHOLD, debug=True)

print(f"\nInference Time: {result['inference_time_ms']:.2f} ms")
print(f"Image Shape: {result['orig_shape']}")
print(f"Number of persons detected: {len(result['detections'])}")

# One line per detection: confidence and corner coordinates
for person_no, det in enumerate(result['detections'], start=1):
    x1, y1, x2, y2 = det['bbox']
    print(f"Person {person_no}: confidence={det['confidence']:.2f}, bbox=[{x1:.1f}, {y1:.1f}, {x2:.1f}, {y2:.1f}]")

In [None]:
# Visualize results
# Reload the sample image from disk as the canvas for drawing; it is
# converted with COLOR_BGR2RGB further down before being shown.
image = cv2.imread(sample_image_path)

# The visualizer is handed objects with attributes, not the dicts returned by infer_call
class Detection:
    """Attribute-style holder for a single detection.

    Attributes:
        bbox: Box coordinates, stored exactly as passed in.
        confidence: Detection score, stored exactly as passed in.
    """

    def __init__(self, bbox, confidence):
        self.bbox, self.confidence = bbox, confidence

# Wrap each detection dict, draw the boxes and show the annotated image
detections_obj = [
    Detection(bbox=det['bbox'], confidence=det['confidence'])
    for det in result['detections']
]
annotated_image = draw_detections(image, detections_obj)
# matplotlib expects RGB channel order
annotated_rgb = cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)

figure_title = f"Triton API Detection: {len(result['detections'])} persons - {result['inference_time_ms']:.2f} ms"

plt.figure(figsize=(14, 10))
plt.imshow(annotated_rgb)
plt.title(figure_title)
plt.axis('off')
plt.show()

## 6. Compare with Local Model

Compare the API inference results with those of a local YOLOv11 model.

In [None]:
from src.detection.yolo_detector import YOLODetector
from src.utils.config import Config

# Build the local detector with the same confidence threshold as the API call
local_detector = YOLODetector(
    model_path=str(Config.get_model_path('yolo11n.pt')),
    conf_threshold=CONF_THRESHOLD
)

# Time a single local inference on the same sample image (milliseconds)
start_time = time.time()
local_image, local_detections = local_detector.process_image(sample_image_path)
local_time = (time.time() - start_time) * 1000

# Side-by-side report of detection counts and timings
api_count = len(result['detections'])
api_time = result['inference_time_ms']
local_count = len(local_detections)

report = [
    "\nComparison:",
    f"  API Detections: {api_count} in {api_time:.2f} ms",
    f"  Local Detections: {local_count} in {local_time:.2f} ms",
    f"  Difference: {abs(api_count - local_count)} detections",
    f"  Time Overhead: {(api_time - local_time):.2f} ms",
]
print("\n".join(report))

## 7. Benchmark API Performance

In [None]:
# Repeat the same request several times and summarise latency and stability
num_runs = 10
times = []
detection_counts = []

print(f"Running {num_runs} API inference requests...")
for run_no in range(1, num_runs + 1):
    result_bench = infer_call(sample_image_path, conf_threshold=CONF_THRESHOLD)
    elapsed_ms = result_bench['inference_time_ms']
    found = len(result_bench['detections'])

    times.append(elapsed_ms)
    detection_counts.append(found)
    print(f"  Run {run_no}: {elapsed_ms:.2f} ms, {found} detections")

# Aggregate statistics over all runs
mean_ms = np.mean(times)

print(f"\nBenchmark Results ({num_runs} runs):")
print(f"  Average Time: {mean_ms:.2f} ms")
print(f"  Min Time: {np.min(times):.2f} ms")
print(f"  Max Time: {np.max(times):.2f} ms")
print(f"  Std Dev: {np.std(times):.2f} ms")
print(f"  Throughput: {1000/mean_ms:.2f} requests/sec")
print(f"  Detection Consistency: {np.std(detection_counts):.2f} std dev")

## 8. Test Multiple Images

In [None]:
# Images to download and run through the API one after another
test_images = [
    "https://ultralytics.com/images/bus.jpg",
    "https://ultralytics.com/images/zidane.jpg"
]

for idx, img_url in enumerate(test_images):
    image_no = idx + 1  # 1-based number used in titles and summaries

    # Download (file names stay 0-based)
    img_path = f"test_image_{idx}.jpg"
    urllib.request.urlretrieve(img_url, img_path)

    # Run inference
    result_img = infer_call(img_path, conf_threshold=CONF_THRESHOLD)

    # Draw the detections on the downloaded image
    image = cv2.imread(img_path)
    detections_obj = [
        Detection(det['bbox'], det['confidence'])
        for det in result_img['detections']
    ]
    annotated = draw_detections(image, detections_obj)
    annotated_rgb = cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB)

    # Show the annotated image
    figure_title = f"Image {image_no}: {len(result_img['detections'])} persons - {result_img['inference_time_ms']:.2f} ms"
    plt.figure(figsize=(10, 7))
    plt.imshow(annotated_rgb)
    plt.title(figure_title)
    plt.axis('off')
    plt.show()

    # Text summary produced by the visualizer module
    summary = create_detection_summary(detections_obj)
    print(f"\nImage {image_no} Summary: {summary}")

## Summary

This notebook demonstrated:
- Connecting to Triton model endpoint on OpenShift AI
- Health checking and metadata retrieval via HTTP
- Running inference via direct REST API calls (no KServe client wrapper)
- Comparing API vs local inference
- Performance benchmarking

**Key Advantages of Direct HTTP**:
- ✅ **Transparent**: See exactly what's sent/received
- ✅ **Debuggable**: Easy to troubleshoot issues
- ✅ **Flexible**: Adjust to Triton's batching configuration
- ✅ **Simple**: No complex client abstraction layer

The API adds network overhead but provides:
- Centralized model serving
- Auto-scaling capabilities
- Version management
- Multi-client access