# Inference Time Profiling

## Purpose

This notebook measures inference performance:

1. **GPU Inference Time**: Time per image and FPS on GPU
2. **CPU Inference Time**: Time per image and FPS on CPU
3. **mAP Computation Time**: Full evaluation loop time

## Methodology

- Batch size = 1 (single image inference)
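- Warm-up runs before timing, so one-time costs (CUDA context creation, memory allocation, kernel selection, cache warm-up) are excluded from the measurements
- GPU synchronization before starting and after stopping the timer, because CUDA kernels are launched asynchronously and the host would otherwise stop the clock before the GPU has finished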
- Multiple runs (N=100) for averaging
- Separate timing for inference vs full evaluation

In [None]:
import os
import sys
import time
from pathlib import Path

import torch
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Subset

# Add project root to path.
# The project root is taken to be the parent of the current working directory
# (assumes the notebook is launched from a sub-folder of the repo — TODO confirm).
project_root = Path(".").resolve().parent
# Inserted at position 0 so the project packages below are found first.
sys.path.insert(0, str(project_root))

# Project-local imports: these must come after the sys.path tweak above.
from models.maskrcnn_model import get_custom_maskrcnn
from datasets.isaid_dataset import iSAIDDataset
from training.trainer import collate_fn, create_datasets
from training.transforms import get_transforms

# Report the environment the timings will be collected in.
print(f"Project root: {project_root}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Only device index 0 is reported.
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

## 1. Configuration & W&B Setup

In [None]:
import wandb

# Authenticate with W&B.
# On Kaggle the API key is read from the notebook's secrets store under the
# name "wandb_key"; everywhere else we fall back to the credentials cached by
# a previous `wandb login`.
try:
    # Kaggle-only module: the import itself fails on other platforms.
    from kaggle_secrets import UserSecretsClient
    user_secrets = UserSecretsClient()
    wandb_key = user_secrets.get_secret("wandb_key")
    wandb.login(key=wandb_key)
except Exception as exc:
    # Intentionally broad: a missing module, a missing secret, or a rejected
    # key all take the same best-effort fallback. Unlike a bare `except:`,
    # this lets KeyboardInterrupt / SystemExit propagate, and the reason for
    # the fallback is reported instead of being silently discarded.
    print(f"Kaggle secrets login not used ({type(exc).__name__}: {exc}); "
          "falling back to local W&B credentials")
    wandb.login()

print("✓ Logged into W&B")

In [None]:
# ---- Model -----------------------------------------------------------------
NUM_CLASSES = 16  # 15 iSAID categories plus the background class

# ---- W&B artifact that stores the trained weights --------------------------
WANDB_ENTITY = "marek-olnk-put-pozna-"
WANDB_PROJECT = "isaid-custom-segmentation"
ARTIFACT_NAME = "isaid-maskrcnn-final:v0"

# ---- Dataset location ------------------------------------------------------
# Prefer the Kaggle input mount; otherwise use the copy inside the project tree.
_kaggle_data_root = Path("/kaggle/input/isaid-patches")
DATA_ROOT = (
    _kaggle_data_root
    if _kaggle_data_root.exists()
    else project_root / "data" / "iSAID_patches"
)

# ---- Timing ----------------------------------------------------------------
WARMUP_RUNS = 10        # untimed passes executed before measuring
TIMING_RUNS = 100       # timed passes that are averaged
EVAL_SUBSET_SIZE = 200  # number of validation images used for the mAP timing

# Echo the effective settings so they are recorded in the notebook output.
_config_lines = [
    f"\nConfiguration:",
    f"  Number of classes: {NUM_CLASSES}",
    f"  W&B Artifact: {WANDB_ENTITY}/{WANDB_PROJECT}/{ARTIFACT_NAME}",
    f"  Data root: {DATA_ROOT}",
    f"  Warmup runs: {WARMUP_RUNS}",
    f"  Timing runs: {TIMING_RUNS}",
    f"  Eval subset size: {EVAL_SUBSET_SIZE}",
]
for _line in _config_lines:
    print(_line)

## 2. Download Model from W&B

In [None]:
# Fetch the trained weights from W&B.
# A run of job type "timing" is opened so the artifact usage is attributed to it.
print("Downloading model from W&B...")
run = wandb.init(project=WANDB_PROJECT, entity=WANDB_ENTITY, job_type="timing")

artifact_ref = f'{WANDB_ENTITY}/{WANDB_PROJECT}/{ARTIFACT_NAME}'
artifact = run.use_artifact(artifact_ref, type='model')
artifact_dir = artifact.download()

# The checkpoint file is expected directly inside the downloaded artifact directory.
CHECKPOINT_PATH = Path(artifact_dir) / "best_map_model.pth"

print(f"✓ Model downloaded to: {artifact_dir}")
print(f"  Checkpoint: {CHECKPOINT_PATH}")

## 3. Load Model

In [None]:
# Build the architecture without pretrained backbone weights; every parameter
# is overwritten by the checkpoint loaded below.
print("Creating model architecture...")
model = get_custom_maskrcnn(
    num_classes=NUM_CLASSES,
    pretrained_backbone=False,
)

# Fail early when the checkpoint file is missing.
if not CHECKPOINT_PATH.exists():
    raise FileNotFoundError(f"Checkpoint not found at {CHECKPOINT_PATH}")

print(f"Loading checkpoint from {CHECKPOINT_PATH}...")
# NOTE(review): torch.load unpickles the file; only load checkpoints that come
# from a trusted source.
checkpoint = torch.load(CHECKPOINT_PATH, map_location="cpu")

# Two layouts are accepted: a training checkpoint that wraps the weights under
# 'model_state_dict', or a bare state dict.
is_wrapped = 'model_state_dict' in checkpoint
if is_wrapped:
    model_weights = checkpoint['model_state_dict']
    epoch = checkpoint.get('epoch', 'unknown')
    print(f"  Checkpoint from epoch: {epoch}")
else:
    model_weights = checkpoint

model.load_state_dict(model_weights)
print("✓ Checkpoint loaded successfully")

# Switch to inference mode before any timing is done.
model.eval()
print("\n✓ Model ready for timing")

## 4. Load Test Data

In [None]:
# Validation split, built with the non-training transform pipeline.
print("Loading validation dataset...")
eval_transforms = get_transforms(train=False)
val_dataset = iSAIDDataset(root=DATA_ROOT, split='val', transforms=eval_transforms)

# The first sample's image is reused for every single-image timing pass;
# its target is not needed.
single_image, _ = val_dataset[0]
print(f"Single image shape: {single_image.shape}")

# The mAP timing uses the first EVAL_SUBSET_SIZE images (or the whole split if
# it is smaller), in dataset order.
num_eval_images = min(EVAL_SUBSET_SIZE, len(val_dataset))
subset_indices = list(range(num_eval_images))
val_subset = Subset(val_dataset, subset_indices)

loader_options = dict(
    batch_size=1,
    shuffle=False,
    num_workers=2,
    collate_fn=collate_fn,
)
val_loader = DataLoader(val_subset, **loader_options)

print(f"✓ Validation dataset loaded: {len(val_dataset)} images")
print(f"✓ Subset for mAP timing: {len(val_subset)} images")

## 5. GPU Inference Timing

In [None]:
# Single-image GPU latency.
# Produces `gpu_results` (dict with 'time_ms', 'fps', 'total_time'), or None
# when no CUDA device is available.
if torch.cuda.is_available():
    print("=" * 60)
    print("GPU INFERENCE TIMING")
    print("=" * 60)

    # Move model and image to GPU
    device = torch.device("cuda")
    model = model.to(device)
    image = single_image.to(device)

    # One no_grad context around everything keeps the context-manager
    # enter/exit cost out of the timed loop.
    with torch.no_grad():
        # Warm-up: untimed passes so one-time costs are not measured.
        print(f"\nWarming up GPU ({WARMUP_RUNS} runs)...")
        for _ in range(WARMUP_RUNS):
            _ = model([image])

        print(f"Running {TIMING_RUNS} inference passes...")

        # CUDA work is asynchronous: drain the queue before starting the clock
        # so warm-up work is not attributed to the timed passes.
        torch.cuda.synchronize()
        # perf_counter is monotonic and high resolution; time.time() is a wall
        # clock that can jump and is not meant for benchmarking.
        start_time = time.perf_counter()

        for _ in range(TIMING_RUNS):
            _ = model([image])

        # Wait for all queued kernels to finish before stopping the clock.
        torch.cuda.synchronize()
        elapsed_time = time.perf_counter() - start_time

    # Calculate metrics
    time_per_image_gpu = elapsed_time / TIMING_RUNS
    fps_gpu = 1 / time_per_image_gpu

    print("\n✓ GPU Timing Complete")
    print(f"  Total time: {elapsed_time:.2f} seconds")
    print(f"  Time per image: {time_per_image_gpu*1000:.2f} ms")
    print(f"  FPS: {fps_gpu:.2f}")

    gpu_results = {
        'time_ms': time_per_image_gpu * 1000,
        'fps': fps_gpu,
        'total_time': elapsed_time
    }
else:
    print("CUDA not available - skipping GPU timing")
    gpu_results = None

## 6. CPU Inference Timing

In [None]:
# Single-image CPU latency.
# Produces `cpu_results` (dict with 'time_ms', 'fps', 'total_time').
print("\n" + "=" * 60)
print("CPU INFERENCE TIMING")
print("=" * 60)
print("\n⚠️  Warning: CPU inference will be slow!\n")

# Move model and image to CPU
device = torch.device("cpu")
model = model.to(device)
image = single_image.to(device)

# One no_grad context around everything keeps the context-manager enter/exit
# cost out of the timed loop.
with torch.no_grad():
    # Warm-up: untimed passes so one-time costs are not measured.
    print(f"Warming up CPU ({WARMUP_RUNS} runs)...")
    for _ in range(WARMUP_RUNS):
        _ = model([image])

    # Timing runs (CPU execution is synchronous, so no device sync is needed).
    print(f"Running {TIMING_RUNS} inference passes...")
    # perf_counter is monotonic and high resolution; time.time() is a wall
    # clock that can jump and is not meant for benchmarking.
    start_time = time.perf_counter()

    for _ in range(TIMING_RUNS):
        _ = model([image])

    elapsed_time = time.perf_counter() - start_time

# Calculate metrics
time_per_image_cpu = elapsed_time / TIMING_RUNS
fps_cpu = 1 / time_per_image_cpu

print("\n✓ CPU Timing Complete")
print(f"  Total time: {elapsed_time:.2f} seconds")
print(f"  Time per image: {time_per_image_cpu*1000:.2f} ms")
print(f"  FPS: {fps_cpu:.2f}")

cpu_results = {
    'time_ms': time_per_image_cpu * 1000,
    'fps': fps_cpu,
    'total_time': elapsed_time
}

## 7. mAP Computation Time (Subset)

This measures the full evaluation loop including:
- Inference
- Matching predictions to ground truth
- IoU computation
- mAP calculation

**Note:** This runs on the full subset (not just one image like inference timing above).

In [None]:
from training.trainer import Trainer

# Wall-clock cost of the full evaluation loop on the validation subset.
# Produces `subset_eval_results` with timing figures plus the resulting metrics.
print("\n" + "=" * 60)
print("mAP COMPUTATION TIME (Subset)")
print("=" * 60)
num_eval_images = len(val_subset)
print(f"\nEvaluating on {num_eval_images} images...\n")

# Move model to best available device
eval_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(eval_device)

# Create a temporary trainer just for evaluation
# We don't need optimizer or training setup, just the compute_map method
temp_trainer = Trainer(
    train_dataset=val_subset,  # Dummy, not used
    val_dataset=val_subset,
    model=model,
    batch_size=1,
    num_workers=2,
    device=eval_device
)

# Time the full evaluation.
# On CUDA, synchronize on both sides of the timed region (same methodology as
# the GPU inference timing) so pending asynchronous work is neither leaked
# into nor left out of the measurement.
# Note: the measured time also includes iterating val_loader (data loading).
if eval_device.type == "cuda":
    torch.cuda.synchronize()
# perf_counter is monotonic and high resolution, unlike time.time().
start_time = time.perf_counter()
map_score, mean_iou = temp_trainer.compute_map(
    val_loader,
    iou_threshold=0.5,
    score_threshold=0.5,
    max_samples=num_eval_images  # Evaluate all subset images
)
if eval_device.type == "cuda":
    torch.cuda.synchronize()
eval_time_subset = time.perf_counter() - start_time

# compute_map's return types are not visible here; coerce to plain floats so
# the results stay JSON-serializable even if tensor / NumPy scalars come back.
map_score = float(map_score)
mean_iou = float(mean_iou)

print("\n✓ Evaluation Complete")
print(f"  Total time: {eval_time_subset:.2f} seconds ({eval_time_subset/60:.2f} minutes)")
print(f"  Time per image: {eval_time_subset/num_eval_images*1000:.2f} ms")
print(f"  Images per second: {num_eval_images/eval_time_subset:.2f}")
print(f"\n  mAP@0.5: {map_score:.4f}")
print(f"  Mean IoU: {mean_iou:.4f}")

subset_eval_results = {
    'num_images': num_eval_images,
    'total_time_sec': eval_time_subset,
    'time_per_image_ms': eval_time_subset / num_eval_images * 1000,
    'images_per_sec': num_eval_images / eval_time_subset,
    'map_50': map_score,
    'mean_iou': mean_iou
}

## 8. Summary Tables

In [None]:
# Print the collected measurements as Markdown tables.
print("\n" + "=" * 70)
print("INFERENCE TIMING SUMMARY")
print("=" * 70)

# Inference timing table
print("\n### Inference Time\n")
print("| Device | Time per Image (ms) | FPS   | Speedup |")
print("| ------ | ------------------- | ----- | ------- |")

if gpu_results:
    # Speedup is expressed relative to the CPU baseline: the CPU row is 1.00x
    # and the GPU row shows how many times faster the GPU is. (Previously the
    # GPU row showed 1.00x and the CPU row the cpu/gpu ratio, which read as if
    # the CPU were the faster device.)
    speedup = cpu_results['time_ms'] / gpu_results['time_ms']
    print(f"| GPU    | {gpu_results['time_ms']:>19.2f} | {gpu_results['fps']:>5.2f} | {speedup:>6.2f}x |")
    print(f"| CPU    | {cpu_results['time_ms']:>19.2f} | {cpu_results['fps']:>5.2f} | {1.0:>6.2f}x |")
else:
    # No GPU measurement, so there is nothing to compare against.
    print(f"| CPU    | {cpu_results['time_ms']:>19.2f} | {cpu_results['fps']:>5.2f} | N/A     |")

# mAP computation time table
print("\n### mAP Computation Time\n")
print("| Metric                    | Value          |")
print("| ------------------------- | -------------- |")
print(f"| Evaluation subset size    | {subset_eval_results['num_images']} images     |")
print(f"| Total evaluation time     | {subset_eval_results['total_time_sec']:.2f} seconds   |")
print(f"| Time per image            | {subset_eval_results['time_per_image_ms']:.2f} ms        |")
print(f"| Images per second         | {subset_eval_results['images_per_sec']:.2f}           |")
print(f"| mAP@0.5                   | {subset_eval_results['map_50']:.4f}         |")
print(f"| Mean IoU                  | {subset_eval_results['mean_iou']:.4f}         |")

# Notes
print("\n### Notes\n")
# NOTE(review): the "no NMS overhead" wording assumes the model's eval-mode
# forward pass performs no NMS; detection models often run NMS inside
# forward(), so confirm against get_custom_maskrcnn before publishing.
print("- **Inference time**: Pure model forward pass (batch=1, no NMS overhead)")
print("- **mAP computation time**: Full evaluation including matching, IoU, and metric calculation")
print(f"- **Subset evaluation**: Measured on {subset_eval_results['num_images']} images for practical timing")
if gpu_results:
    print(f"- **GPU speedup**: CPU is {speedup:.1f}x slower than GPU")

print("\n" + "=" * 70)

## 9. Export Results

In [None]:
import json

# All exported files go under <project_root>/analysis_results (created on demand).
output_dir = project_root / "analysis_results"
output_dir.mkdir(parents=True, exist_ok=True)

# ---- JSON: full record of settings and measurements -------------------------
timing_config = {
    'warmup_runs': WARMUP_RUNS,
    'timing_runs': TIMING_RUNS,
    'eval_subset_size': EVAL_SUBSET_SIZE,
    'image_shape': list(single_image.shape),
}
inference_timing = {
    # A placeholder string is stored when GPU timing was skipped.
    'gpu': gpu_results or 'Not available',
    'cpu': cpu_results,
}
export_data = {
    'timing_config': timing_config,
    'inference_timing': inference_timing,
    'map_computation': subset_eval_results,
}

output_file = output_dir / "inference_timing.json"
with output_file.open('w') as f:
    json.dump(export_data, f, indent=2)

print(f"\n✓ Timing results saved to: {output_file}")

# ---- CSV: one row per measured device (GPU row only if it was timed) --------
measured_devices = ([('GPU', gpu_results)] if gpu_results else []) + [('CPU', cpu_results)]
timing_data = [
    {
        'Device': device_label,
        'Time_per_Image_ms': f"{device_results['time_ms']:.2f}",
        'FPS': f"{device_results['fps']:.2f}",
    }
    for device_label, device_results in measured_devices
]

csv_path = output_dir / "inference_timing.csv"
timing_df = pd.DataFrame(timing_data)
timing_df.to_csv(csv_path, index=False)
print(f"✓ Timing summary saved to: {csv_path}")