# Benchmarking MongoDB

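This notebook benchmarks MongoDB: it converts ZTF alerts into documents, measures how long bulk inserts take, times a cone search query, and writes the measurements to `results/mongo.json`.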

In [None]:
# Run from the top of the repo: the docker-compose file, data/ and results/
# paths used by later cells are all relative to the working directory.
# NOTE: this cell is not idempotent; re-running it moves up one more level.
%cd ..

In [None]:
# Benchmark measurements, keyed by metric name. Later cells add entries and the
# final cell writes the dictionary to results/mongo.json.
results = {}

In [None]:
# Ensure MongoDB is running: start the services defined in
# docker-compose.mongo.yaml in detached mode (-d) so the cell returns
# immediately.
! docker compose -f docker-compose.mongo.yaml up -d

In [None]:
# Ensure the database is empty
import pymongo

# Only the port is specified; presumably 27018 is the port published by
# docker-compose.mongo.yaml (TODO confirm).
client = pymongo.MongoClient(port=27018)
# Drop any database left over from a previous run so the benchmark starts
# from a clean state.
client.drop_database("boom-benchmarking")

In [None]:
# Get handles to the benchmarking database and its two collections.
# Nothing is created on the server here: MongoDB creates a collection lazily
# on the first write or index creation.
database = client["boom-benchmarking"]
ztf_collection = database["ztf_alerts"]
ned_collection = database["ned_alerts"]

In [None]:
# Load the calkit IPython extension; presumably this is what provides the
# %%stage cell magic used in the next cell (TODO confirm).
%load_ext calkit.magics

In [None]:
%%stage --name ztf-avro-to-mongo-docs \
    --dep-path "data/ztf_public_20250614" \
    --environment py \
    --verbose \
    --out-storage none \
    --out documents

# Load ZTF alerts into the database
import glob
import fastavro
from tqdm.auto import tqdm


def ra_to_longitude(ra):
    """Convert RA (0-360) to longitude (-180 to 180).

    Values greater than 180 are shifted down by 360; every other value is
    returned unchanged, so an RA of exactly 180 stays at 180.
    """
    return ra - 360 if ra > 180 else ra


# glob returns paths in arbitrary (filesystem-dependent) order; sorting them
# keeps the order of `documents`, and therefore the benchmark input, the same
# from run to run and machine to machine.
ztf_avro_fpaths = sorted(glob.glob("data/ztf_public_20250614/*.avro"))

print(f"Found {len(ztf_avro_fpaths)} ZTF alerts")

print("Converting to documents")
# One MongoDB document per record read from the Avro files.
documents = []
for alert_avro_fpath in tqdm(ztf_avro_fpaths):
    with open(alert_avro_fpath, "rb") as f:
        for alert in fastavro.reader(f):
            candidate = alert["candidate"]
            documents.append(
                {
                    "object_id": alert["objectId"],
                    "cand_id": alert["candid"],
                    "candidate": candidate,
                    # Coordinates are a GeoJSON object: a Point whose
                    # position is [longitude, latitude], so RA is converted
                    # to a longitude and dec is used as the latitude.
                    "coordinates": {
                        "type": "Point",
                        "coordinates": [
                            ra_to_longitude(candidate["ra"]),
                            candidate["dec"],
                        ],
                    },
                }
            )
# This line intentionally left blank

In [None]:
# Sanity check: number of documents produced by the conversion stage.
len(documents)

In [None]:
# Create geospatial index on coordinates
from pymongo.errors import OperationFailure

# Drop any existing index of the same name first so that re-running this cell
# always rebuilds it from scratch.
try:
    ztf_collection.drop_index("coordinates_geospatial_index")
except OperationFailure:
    # The index does not exist yet (e.g. on a freshly dropped database).
    # Other failures, such as the server being unreachable, are no longer
    # swallowed and surface here instead of being hidden.
    pass
# 2dsphere index on the GeoJSON "coordinates" field; it is what geospatial
# queries on this collection use.
ztf_collection.create_index(
    [("coordinates", pymongo.GEOSPHERE)],
    name="coordinates_geospatial_index",
    unique=False,
)

## Insert ZTF alerts

In [None]:
import time
from copy import deepcopy

from tqdm.auto import tqdm

# Insert all the alerts into the database
print("Inserting ZTF alerts into the database")

n_iterations = 5  # TODO: This should be a project parameter?
n_inserted = []
insert_times = []

for _ in tqdm(range(n_iterations)):
    # insert_many adds an "_id" to every document that lacks one, so each
    # iteration needs fresh copies to avoid duplicate-key errors. The copy is
    # made *before* the timer starts so that only the insert is measured.
    batch = deepcopy(documents)
    # perf_counter is monotonic and high-resolution, unlike the wall clock
    # returned by time.time(), which can jump if the system clock is adjusted.
    t0 = time.perf_counter()
    ztf_collection.insert_many(batch)
    t1 = time.perf_counter()
    n_inserted.append(len(batch))
    insert_times.append(t1 - t0)

# One entry per iteration. Every iteration inserts into the same collection,
# so later iterations run against a larger collection.
results["n_ztf_alerts_inserted"] = n_inserted
results["ztf_alerts_insert_time_s"] = insert_times

In [None]:
# Show the measurements collected so far.
results

In [None]:
# TODO: Load NED alerts into the database (not implemented yet)

In [None]:
# TODO: Run cross-matching (not implemented yet)

## Run a cone search query

In [None]:
import math
import time

# Cone definition: centre in degrees (RA/dec) and radius in arcminutes.
ra_center = 180.0
dec_center = 0.0
radius_arcmin = 60.0
# $centerSphere expects the radius in radians: arcmin -> degrees -> radians.
radius_radians = math.radians(radius_arcmin / 60.0)

# The stored points use a longitude in [-180, 180] (RA above 180 is shifted
# down by 360), so the query centre must follow the same convention. Passing
# a raw RA above 180 would not match the stored coordinates.
lon_center = ra_center - 360.0 if ra_center > 180.0 else ra_center

# Time only the query itself, including fetching the (at most 20) matching
# documents from the cursor.
t0 = time.perf_counter()

cursor = ztf_collection.find(
    {
        "coordinates": {
            "$geoWithin": {
                "$centerSphere": [[lon_center, dec_center], radius_radians]
            }
        }
    }
).limit(20)

res = list(cursor)
t1 = time.perf_counter()
results["cone_search_query_time_s"] = t1 - t0
print(len(res), t1 - t0, "seconds")
# Show the first match, if any; an empty cone must not fail the notebook.
res[0]["object_id"] if res else None

In [None]:
# Shut down Docker containers. All measurements are already held in the
# in-memory `results` dict, so the database is no longer needed.
! docker compose -f docker-compose.mongo.yaml down

In [None]:
# Persist the collected measurements as pretty-printed JSON.
import json
import os

# Make sure the output directory exists before writing into it.
os.makedirs("results", exist_ok=True)

results_json = json.dumps(results, indent=4)
with open("results/mongo.json", "w") as results_file:
    results_file.write(results_json)