# 06 – Benchmark Report

**Goal:** Aggregate all metrics from notebooks 01–05 and produce a final comparison report.

**Outputs:**
- Combined leaderboard table (Dice, IoU, HD95, volume accuracy, inference time)
- Radar chart (multi-metric model comparison)
- Longitudinal volume accuracy plot
- Final recommendations for production pipeline

In [None]:
import sys, os
from pathlib import Path
import numpy as np
import nibabel as nib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Directory layout: everything is derived from the current working directory.
NOTEBOOK_DIR = Path.cwd()
REPO_ROOT    = NOTEBOOK_DIR.parent.parent
DATA_ROOT    = REPO_ROOT.joinpath('P01')
BRATS_DIR    = DATA_ROOT.joinpath('BraTS')
MASK_DIR     = DATA_ROOT.joinpath('tumor segmentation')

# Artefacts of this notebook are written to outputs/06_benchmark, which is
# created here if it does not exist yet.
OUTPUTS_DIR  = NOTEBOOK_DIR.parent.joinpath('outputs')
REPORT_DIR   = OUTPUTS_DIR.joinpath('06_benchmark')
REPORT_DIR.mkdir(parents=True, exist_ok=True)

# The project helpers live in ../utils; put that directory first on sys.path
# so the imports below resolve.
sys.path.insert(0, str(NOTEBOOK_DIR.parent.joinpath('utils')))
from dicom_utils import get_p01_brats_paths, get_p01_mask_paths, load_nifti
from metrics import BenchmarkTracker, compute_volume_from_nifti, dice_coefficient, iou_score, Timer
from visualisation import plot_benchmark_bar, plot_benchmark_radar, plot_longitudinal_volume

# Per-timepoint lookups used by the cells below (both are read with
# .keys() / .get() / .items() later on).
brats_paths = get_p01_brats_paths(BRATS_DIR)
mask_paths  = get_p01_mask_paths(MASK_DIR)

print('Loading metrics CSVs from previous notebooks...')

In [None]:
# ── Load all CSV outputs from notebooks 02–05 ────────────────────────────────
# Every '*_metrics.csv' found anywhere under OUTPUTS_DIR is read, tagged with
# the name of the file it came from, and collected. A file that cannot be read
# is reported and skipped so one bad CSV does not abort the report.
all_dfs = []

metrics_csvs = sorted(OUTPUTS_DIR.rglob('*_metrics.csv'))
for csv_path in metrics_csvs:
    try:
        df = pd.read_csv(csv_path)
        df['source_file'] = csv_path.name
        all_dfs.append(df)
        print(f'  Loaded: {csv_path.relative_to(OUTPUTS_DIR)} ({len(df)} rows)')
    except Exception as e:
        print(f'  Could not load {csv_path}: {e}')

if not all_dfs:
    # Nothing on disk: fall back to an empty frame that still carries the
    # metric columns listed below.
    print('No metrics CSV files found. Run notebooks 02-05 first.')
    combined = pd.DataFrame(columns=['model','timepoint','dice','iou','volume_pred_cm3','inference_s','vram_gb','hd95_mm'])
else:
    combined = pd.concat(all_dfs, ignore_index=True)
    print(f'\nTotal records: {len(combined)}')

In [None]:
# ── Re-compute all metrics from saved prediction NIfTIs ──────────────────────
# For every (model, timepoint) pair with a ground-truth mask, look for the
# saved prediction file and feed prediction + ground truth to a fresh
# BenchmarkTracker. Pairs without a prediction file are registered through
# tracker.add_mock() instead.
fresh_tracker = BenchmarkTracker()

# Directory in which each model / ensemble strategy stores its predictions.
MODEL_DIRS = {
    'nnunet':        OUTPUTS_DIR / '02_nnunet' / 'predictions',
    'medgemma':      OUTPUTS_DIR / '03_medgemma',
    'llava_med':     OUTPUTS_DIR / '03_medgemma',
    'sam3__box':     OUTPUTS_DIR / '04_sam',
    'sam2__box':     OUTPUTS_DIR / '04_sam',
    'sam__box_stub': OUTPUTS_DIR / '04_sam',
    'majority_vote': OUTPUTS_DIR / '05_ensemble',
    'staple':        OUTPUTS_DIR / '05_ensemble',
    'union':         OUTPUTS_DIR / '05_ensemble',
    'intersection':  OUTPUTS_DIR / '05_ensemble',
}

# File-name builder per model; must have the same keys as MODEL_DIRS.
PRED_PATTERNS = {
    'nnunet':        lambda tp: f'P01_{tp}_pred.nii.gz',
    'medgemma':      lambda tp: f'medgemma_{tp}_pred.nii.gz',
    'llava_med':     lambda tp: f'llava_med_{tp}_pred.nii.gz',
    'sam3__box':     lambda tp: f'sam3_{tp}_box_pred.nii.gz',
    'sam2__box':     lambda tp: f'sam2_{tp}_box_pred.nii.gz',
    'sam__box_stub': lambda tp: f'sam_{tp}_box_pred.nii.gz',
    'majority_vote': lambda tp: f'majority_vote_{tp}.nii.gz',
    'staple':        lambda tp: f'staple_{tp}.nii.gz',
    'union':         lambda tp: f'union_{tp}.nii.gz',
    'intersection':  lambda tp: f'intersection_{tp}.nii.gz',
}

timepoints = list(brats_paths.keys())

# The ground truth for a timepoint is the same for every model, so read it
# (array + voxel spacing) from disk at most once and reuse it. It is still
# loaded lazily, i.e. only when some prediction for that timepoint exists.
gt_cache = {}

for model_name, pred_dir in MODEL_DIRS.items():
    for tp in timepoints:
        gt_path = mask_paths.get(tp)
        if not gt_path:
            # No ground-truth mask for this timepoint: nothing to score.
            continue
        pred_filename = PRED_PATTERNS[model_name](tp)
        pred_path = pred_dir / pred_filename

        if not pred_path.exists():
            fresh_tracker.add_mock(model=model_name, timepoint=tp)
            continue

        if tp not in gt_cache:
            cached_gt, _, _ = load_nifti(gt_path)
            # Spacing is taken from the first three zooms of the GT header.
            cached_spacing = tuple(
                float(s) for s in nib.load(gt_path).header.get_zooms()[:3]
            )
            gt_cache[tp] = (cached_gt, cached_spacing)
        gt_arr, spacing = gt_cache[tp]

        pred_arr, _, _ = load_nifti(str(pred_path))

        fresh_tracker.add(model=model_name, timepoint=tp,
                          pred=pred_arr, gt=gt_arr, spacing=spacing)

fresh_df = fresh_tracker.to_dataframe()
print(f'Fresh computation: {len(fresh_df)} records')

In [None]:
# ── Merge and deduplicate ────────────────────────────────────────────────────
# Stack the CSV records and the freshly computed ones; fresh rows come last so
# that they win when the same (model, timepoint) appears in both.
all_data = pd.concat([combined, fresh_df], ignore_index=True)

# Drop rows without a Dice score BEFORE deduplicating. Deduplicating first
# (keep='last') would let a later row with a missing Dice evict an earlier
# valid row for the same (model, timepoint), and the pair would then vanish
# entirely once the NaN rows are removed.
# NOTE(review): presumably add_mock() is what produces such NaN rows — confirm
# in metrics.BenchmarkTracker.
all_data = all_data[all_data['dice'].notna()]
all_data = all_data.drop_duplicates(subset=['model', 'timepoint'], keep='last').copy()

print(f'Total valid records: {len(all_data)}')
# reindex() instead of [] so a column that is absent from every input shows up
# as NaN in the preview rather than raising KeyError.
preview_cols = ['model','timepoint','dice','iou','volume_pred_cm3','inference_s']
print(all_data.reindex(columns=preview_cols).to_string(index=False))

In [None]:
# ── Leaderboard ──────────────────────────────────────────────────────────────
# One row per model: mean of each metric over that model's records, best Dice
# first.
leaderboard = (
    all_data.groupby('model')[['dice','iou','hd95_mm','volume_pred_cm3','inference_s']]
    .mean().round(4)
    .sort_values('dice', ascending=False)
)

# Ground-truth volume per timepoint (also reused by the longitudinal plot).
gt_volumes   = {tp: compute_volume_from_nifti(p) for tp, p in mask_paths.items()}
# Guard the empty case: np.mean([]) emits a RuntimeWarning and returns NaN.
mean_gt_vol  = np.mean(list(gt_volumes.values())) if gt_volumes else float('nan')

# Volume error is computed per record against the ground truth of the SAME
# timepoint and only then averaged per model. Comparing a model's mean
# predicted volume with the mean GT volume over all timepoints lets over- and
# under-estimates cancel out, and uses the wrong reference for models that
# lack some timepoints.
gt_per_record   = pd.to_numeric(all_data['timepoint'].map(gt_volumes), errors='coerce')
pred_per_record = pd.to_numeric(all_data['volume_pred_cm3'], errors='coerce')
abs_vol_err     = (pred_per_record - gt_per_record).abs()

vol_stats = pd.DataFrame({
    'model':   all_data['model'],
    'abs_err': abs_vol_err,
    # Reference volume only for records that actually contributed an error,
    # so the percentage is relative to the matching timepoints.
    'gt_ref':  gt_per_record.where(abs_vol_err.notna()),
}).groupby('model')[['abs_err', 'gt_ref']].mean()

# Column assignment aligns on the 'model' index shared with the leaderboard.
leaderboard['vol_error_cm3'] = vol_stats['abs_err'].round(4)
leaderboard['vol_error_pct'] = (
    vol_stats['abs_err'] / (vol_stats['gt_ref'] + 1e-6) * 100  # epsilon avoids /0
).round(2)

print('=== FINAL LEADERBOARD ===')
print(leaderboard.to_string())
leaderboard.to_csv(REPORT_DIR / 'final_leaderboard.csv')
all_data.to_csv(REPORT_DIR / 'all_metrics_raw.csv', index=False)
print('\nSaved: outputs/06_benchmark/final_leaderboard.csv')
print('Saved: outputs/06_benchmark/all_metrics_raw.csv')

In [None]:
# ── Dice bar chart ───────────────────────────────────────────────────────────
# `lb` is the leaderboard with 'model' turned back into a regular column; the
# radar-chart cell reuses it.
lb = leaderboard.reset_index()
# Title uses a real en dash; the previous literal contained mis-decoded bytes.
fig = plot_benchmark_bar(lb, metric='dice', title='Segmentation Dice Score – All Models & Strategies')
plt.savefig(REPORT_DIR / 'dice_leaderboard.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# ── Radar chart ──────────────────────────────────────────────────────────────
# A metric is plotted only if the leaderboard has that column and at least one
# model has a non-null value for it; the chart is skipped below three metrics.
def _has_radar_data(metric):
    return metric in lb.columns and lb[metric].notna().any()

radar_metrics = list(filter(_has_radar_data, ['dice','iou','inference_s','hd95_mm']))

if len(radar_metrics) < 3:
    print('Not enough metrics for radar chart.')
else:
    fig = plot_benchmark_radar(lb, metrics=radar_metrics)
    plt.savefig(REPORT_DIR / 'radar_chart.png', dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# ── Longitudinal volume accuracy ─────────────────────────────────────────────
# Build {series name: {timepoint: volume}} with the ground truth first, then
# one entry per model that has at least one usable predicted volume.
long_volumes = {'Ground Truth': gt_volumes}
for model_name in all_data['model'].unique():
    sub = all_data[all_data['model'] == model_name]
    tp_vol = dict(zip(sub['timepoint'], sub['volume_pred_cm3']))
    # pandas stores missing values as NaN, which `is not None` never rejects;
    # pd.notna() treats both None and NaN as missing.
    if any(pd.notna(v) for v in tp_vol.values()):
        long_volumes[model_name] = tp_vol

tp_order = list(brats_paths.keys())
fig = plot_longitudinal_volume(long_volumes, timepoint_labels=tp_order)
plt.savefig(REPORT_DIR / 'longitudinal_volumes.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# ── Preprocessing summary ────────────────────────────────────────────────────
# If notebook 01 left a summary CSV behind, print it and copy it into the
# report directory; otherwise just say so.
preproc_csv = OUTPUTS_DIR.joinpath('01_preprocessing', 'preprocessing_summary.csv')
if not preproc_csv.exists():
    print('Preprocessing summary not available - run notebook 01 first.')
else:
    preproc_df = pd.read_csv(preproc_csv)
    print('=== Preprocessing Comparison ===')
    print(preproc_df.to_string(index=False))
    preproc_df.to_csv(REPORT_DIR / 'preprocessing_summary.csv', index=False)

In [None]:
# ── Final recommendations ────────────────────────────────────────────────────
print('=' * 60)
print('FINAL RECOMMENDATIONS – OncoFlow Phase 4')
print('=' * 60)

if len(leaderboard) > 0:
    ensemble_models = ['majority_vote','staple','union','intersection']

    # The leaderboard is sorted by Dice (descending), so the first row of each
    # subset is that subset's best entry.
    single_rows   = leaderboard[~leaderboard.index.isin(ensemble_models + ['weighted_oracle'])]
    ensemble_rows = leaderboard[leaderboard.index.isin(ensemble_models)]

    # Guard on the FILTERED subset: a leaderboard that only contains ensemble
    # rows is non-empty, yet has no single model to pick from.
    best_single   = single_rows.index[0] if len(single_rows) > 0 else 'N/A'
    best_ensemble = ensemble_rows.index[0] if len(ensemble_rows) > 0 else 'majority_vote'

    print(f'\nBest single model : {best_single}')
    print(f'Best ensemble     : {best_ensemble}')

# Static recommendations; printed regardless of whether metrics were found.
recs = {
    'Processing pipeline':  'SimpleITK (A1) for speed; dcm2niix (A2) for clinical compatibility',
    'nnU-Net role':         'Primary volumetric segmenter — train on BraTS 2024',
    'MedGemma role':        'RAG / report text generation (NOT segmentation)',
    'SAM3 role':            'Interactive refinement in UI + box-prompted ensemble member',
    'Ensemble strategy':    'Majority vote (default); STAPLE with calibration data',
    'Agreement threshold':  '>= 0.90 auto-report | 0.75-0.89 flag | < 0.75 manual',
}
print('\n--- Recommendations ---')
for k, v in recs.items():
    print(f'  {k:25s}: {v}')

In [None]:
# ── Save HTML report ─────────────────────────────────────────────────────────
# Gathers every PNG produced under OUTPUTS_DIR into the report directory and
# writes a single self-contained HTML file (leaderboard table + inlined images).
import base64
import html
import shutil

# Consolidate all PNGs into report dir.
# The file list is materialised (and sorted) before copying: REPORT_DIR lives
# inside OUTPUTS_DIR, so copying while rglob() is still walking the tree would
# modify the directory being iterated. Sorting also makes it deterministic
# which file wins when two sources share a name.
for src in sorted(OUTPUTS_DIR.rglob('*.png')):
    if src.parent == REPORT_DIR:
        # Already in place (source and destination would be the same file).
        continue
    shutil.copy(src, REPORT_DIR / src.name)

# <meta charset> plus the explicit utf-8 encoding on write keep the non-ASCII
# dashes intact regardless of the platform's default encoding.
html_sections = ['<html><head><meta charset="utf-8"><title>OncoFlow Phase 4 Report</title><style>body{font-family:sans-serif;max-width:1200px;margin:auto;padding:20px} table{border-collapse:collapse;width:100%} th,td{border:1px solid #ddd;padding:8px;text-align:center} th{background:#4a7fc1;color:white}</style></head><body>']
html_sections.append('<h1>OncoFlow Phase 4 – ML Exploration Report</h1>')
html_sections.append('<p><b>Patient:</b> P01 | <b>Timepoints:</b> baseline, fu1, fu2, fu3, fu4</p>')
html_sections.append('<h2>Leaderboard</h2>')
html_sections.append(leaderboard.to_html(float_format='%.4f') if len(leaderboard) > 0 else '<p>No data — run notebooks 02-05 first</p>')

for img_path in sorted(REPORT_DIR.glob('*.png')):
    # Images are embedded as base64 data URIs so the report is one portable file.
    b64 = base64.b64encode(img_path.read_bytes()).decode()
    # File names end up in markup; escape them so '&' or '<' cannot break it.
    heading = html.escape(img_path.stem.replace("_", " ").title())
    html_sections.append(f'<h2>{heading}</h2>')
    html_sections.append(f'<img src="data:image/png;base64,{b64}" width="900"/>')

html_sections.append('</body></html>')
report_path = REPORT_DIR / 'exploration_report.html'
report_path.write_text('\n'.join(html_sections), encoding='utf-8')
print(f'Report saved: {report_path}')

## 📋 Summary

| Dimension | Decision |
|-----------|----------|
| **Processing pipeline** | SimpleITK (A1) for production speed; A3 (N4+zscore) for training data |
| **Segmentation** | nnU-Net v2 (primary) + SAM3 box-prompted (secondary) |
| **VLM** | MedGemma-1.5 for clinical summary generation, NOT segmentation |
| **Ensemble** | Majority vote; upgrade to STAPLE with labelled data |
| **Agreement flagging** | ≥0.90 auto / 0.75–0.89 review / <0.75 manual |

**Next steps (Phase 5–7):**
1. Fine-tune nnU-Net on internal data
2. Integrate MedGemma into RAG pipeline
3. Wire SAM3 interactive mode into frontend