# Collect Build Times

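Collect index build times for each dataset and algorithm. When `RUN_E2E` is enabled, the search, export, and plot steps are run as well and search times are also reported.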

How to run

```bash
papermill --log-output collect_build_times.ipynb collect_build_times.run1.ipynb

# Dry run
papermill --log-output collect_build_times.ipynb collect_build_times.run1.ipynb -p DRY_RUN 1

# To override parameters with yaml:
papermill --log-output collect_build_times.ipynb collect_build_times.run1.ipynb -y '
DATASETS:
    - "sift-128-euclidean"
'
```

In [None]:
import pathlib
import subprocess
import time
from typing import Callable

import common

In [None]:
# Notebook parameters.
# These names are meant to be overridden when the notebook is executed through
# papermill (`-p NAME VALUE` or `-y "<yaml>"`); the values below are defaults.
DRY_RUN = False  # When truthy, no command is executed; steps only print what they would do
RUN_E2E = False  # Run end-to-end benchmark (search, export and plot in addition to download and build)

# Datasets to process.
DATASETS = [common.SIFT_128_EUCLIDEAN]
# Algorithms to build (and to search when RUN_E2E is set) for every dataset.
ALGORITHMS = [
    common.FAISS_CPU_FLAT,
    common.FAISS_CPU_IVF_FLAT,
    common.FAISS_CPU_IVF_PQ,
    common.FAISS_GPU_FLAT,
    common.FAISS_GPU_IVF_FLAT,
    common.FAISS_GPU_IVF_PQ,
    common.HNSWLIB,
    common.RAFT_BRUTE_FORCE,
    common.RAFT_CAGRA,
    common.RAFT_IVF_FLAT,
    common.RAFT_IVF_PQ,
    common.RAFT_CAGRA_HNSWLIB,
]
# Host directory for the datasets; resolved to an absolute path in the next cell.
DATASET_PATH = common.DATASET_PATH
# Passed as `--configuration <dir>` for the algorithms that need explicit configs.
ALGO_CONFIG_DIR = common.ALGO_CONFIG_DIR
# Command templates, filled in with `safe_substitute` by the steps below.
# The Docker templates wrap a container command (CONTAINER_CMD) for a given
# DATASET_PATH and DOCKER_IMAGE.
GPU_DOCKER_CMD_TEMPLATE = common.GPU_DOCKER_CMD_TEMPLATE
CPU_DOCKER_CMD_TEMPLATE = common.CPU_DOCKER_CMD_TEMPLATE
DOWNLOAD_CMD_TEMPLATE = common.DOWNLOAD_CMD_TEMPLATE
BUILD_CMD_TEMPLATE = common.BUILD_CMD_TEMPLATE
SEARCH_CMD_TEMPLATE = common.SEARCH_CMD_TEMPLATE
EXPORT_DATA_CMD_TEMPLATE = common.EXPORT_DATA_CMD_TEMPLATE
PLOT_CMD_TEMPLATE = common.PLOT_CMD_TEMPLATE

In [None]:
# Normalize the configured dataset location to an absolute path and make sure
# the directory (including missing parents) exists before any step uses it.
DATASET_PATH = pathlib.Path(DATASET_PATH).resolve()
DATASET_PATH.mkdir(parents=True, exist_ok=True)

In [None]:
def get_docker_image(algo: str = ""):
    """Pick the Docker image to use for *algo*.

    Algorithms whose name starts with ``"faiss"`` get the CUDA 11 image
    (FAISS requires CUDA 11); every other name, including the default empty
    string, gets the CUDA 12 image.
    """
    uses_faiss = algo.startswith("faiss")
    return common.CUDA11_DOCKER_IMAGE if uses_faiss else common.CUDA12_DOCKER_IMAGE

In [None]:
def init_collector(datasets: list[str], algorithms: list[str], default_fn: Callable):
    """Build a dict with one entry per (dataset, algorithm) pair.

    Keys are ``(dataset, algorithm)`` tuples, inserted dataset by dataset in
    the order given. Each value is the result of its own ``default_fn()``
    call.
    """
    return {
        (dataset, algorithm): default_fn()
        for dataset in datasets
        for algorithm in algorithms
    }


# One timing record per (dataset, algorithm) pair; "tick"/"tock" stay None
# until the matching step stores its start/end timestamps.
build_time_collector = init_collector(
    DATASETS, ALGORITHMS, lambda: {"tick": None, "tock": None}
)
search_time_collector = init_collector(
    DATASETS, ALGORITHMS, lambda: {"tick": None, "tock": None}
)
# Per-dataset list of the steps that raised an error.
error_collector = {ds: [] for ds in DATASETS}

## Download data

* Use the CPU image for download.

In [None]:
def download_all():
    """Download every dataset in ``DATASETS`` into ``DATASET_PATH / "datasets"``.

    With ``DRY_RUN`` set, only prints what would be downloaded. The wiki-all
    datasets are fetched through the dedicated ``common.download_wiki_*``
    helpers; every other dataset is fetched by running the download command
    wrapped in ``CPU_DOCKER_CMD_TEMPLATE``.

    Failures are handled best-effort: the error is printed, ``"download"`` is
    appended to ``error_collector`` for that dataset, and the loop moves on to
    the next dataset.
    """
    dataset_dir = DATASET_PATH / "datasets"  # identical for every dataset
    for ds in DATASETS:
        if DRY_RUN:
            print(f"Would download {ds} to {dataset_dir}")
            continue
        try:
            if ds == common.WIKI_ALL_1M:
                common.download_wiki_1M(dataset_dir)
                continue
            if ds == common.WIKI_ALL_10M:
                common.download_wiki_10M(dataset_dir)
                continue
            if ds == common.WIKI_ALL_88M:
                common.download_wiki_88M(dataset_dir)
                continue

            need_normalize = ds in (common.GLOVE_100_INNER, common.DEEP_10M_INNER)
            # The "inner" variants are downloaded under their "angular" name
            # with --normalize. Keep that name in its own variable: `ds` must
            # stay untouched because it is the key used in error_collector
            # (rebinding it made the except handler below raise KeyError).
            download_name = ds.replace("inner", "angular") if need_normalize else ds

            download_cmd = DOWNLOAD_CMD_TEMPLATE.safe_substitute(
                DATASET=download_name,
                NORMALIZE="--normalize" if need_normalize else "",
            )
            cmd = CPU_DOCKER_CMD_TEMPLATE.safe_substitute(
                DATASET_PATH=DATASET_PATH,
                DOCKER_IMAGE=get_docker_image(),
                CONTAINER_CMD=download_cmd,
            )
            print(cmd)
            subprocess.run(
                cmd, shell=True, executable="/bin/bash", check=True, text=True
            )
        except Exception as e:
            # Deliberately broad: one failing download must not stop the others.
            print(f"Error downloading {ds}: {e}")
            error_collector[ds].append("download")
            continue


download_all()

## Build


In [None]:
def build_all():
    """Build an index for every (dataset, algorithm) pair.

    The full Docker command is always printed. With ``DRY_RUN`` set nothing is
    executed. Otherwise the command is run and its start/end timestamps are
    stored under "tick"/"tock" in ``build_time_collector``; "tock" is only
    written when the command succeeds. Errors are printed and recorded in
    ``error_collector`` as ``"<algo> build"`` without stopping the loop.
    """
    algos_needing_config = (common.FAISS_CPU_IVF_FLAT, common.FAISS_CPU_IVF_PQ)
    algos_needing_force = (common.FAISS_GPU_FLAT, common.RAFT_BRUTE_FORCE)

    for ds in DATASETS:
        for algo in ALGORITHMS:
            flags = []
            # FIXME: the docker image is either missing configurations, or the default doesn't work
            if algo in algos_needing_config:
                flags.append(f"--configuration {ALGO_CONFIG_DIR}")
            # FIXME: some algos need --force to write results properly
            if algo in algos_needing_force:
                flags.append("--force")

            container_cmd = BUILD_CMD_TEMPLATE.safe_substitute(
                DATASET=ds, ALGORITHMS=algo, EXTRA_ARGS=" ".join(flags)
            )
            docker_cmd = GPU_DOCKER_CMD_TEMPLATE.safe_substitute(
                DATASET_PATH=DATASET_PATH,
                DOCKER_IMAGE=get_docker_image(algo),
                CONTAINER_CMD=container_cmd,
            )
            print(docker_cmd)
            if DRY_RUN:
                print(f"Would build {ds} with {algo}")
                continue

            try:
                timing = build_time_collector[(ds, algo)]
                timing["tick"] = time.time()
                subprocess.run(
                    docker_cmd,
                    shell=True,
                    executable="/bin/bash",
                    check=True,
                    text=True,
                )
                timing["tock"] = time.time()
            except Exception as exc:
                print(f"Error building {ds} with {algo}: {exc}")
                error_collector[ds].append(f"{algo} build")


build_all()

## Search

* Use the `latency` search mode, and set `--search-threads=1` as we only need this for correlating the build time statistics to recall.

In [None]:
def search_all():
    """Run the search benchmark for every (dataset, algorithm) pair.

    Uses the "latency" search mode with batch size 1, count 10 and
    ``--search-threads 1``. The full Docker command is always printed. With
    ``DRY_RUN`` set nothing is executed. Otherwise start/end timestamps are
    stored under "tick"/"tock" in ``search_time_collector`` ("tock" only on
    success). Errors are printed and recorded in ``error_collector`` as
    ``"<algo> search"`` without stopping the loop.
    """
    algos_needing_config = (common.FAISS_CPU_IVF_FLAT, common.FAISS_CPU_IVF_PQ)

    for ds in DATASETS:
        for algo in ALGORITHMS:
            # FIXME: the docker image is missing configurations for FAISS_CPU_IVF_FLAT and FAISS_CPU_IVF_PQ
            flags = (
                [f"--configuration {ALGO_CONFIG_DIR}"]
                if algo in algos_needing_config
                else []
            )
            flags.append("--search-threads 1")

            container_cmd = SEARCH_CMD_TEMPLATE.safe_substitute(
                SEARCH_MODE="latency",
                DATASET=ds,
                ALGORITHMS=algo,
                BATCH_SIZE=1,
                COUNT=10,
                EXTRA_ARGS=" ".join(flags),
            )
            docker_cmd = GPU_DOCKER_CMD_TEMPLATE.safe_substitute(
                DATASET_PATH=DATASET_PATH,
                DOCKER_IMAGE=get_docker_image(algo),
                CONTAINER_CMD=container_cmd,
            )
            print(docker_cmd)
            if DRY_RUN:
                print(f"Would search {ds} with {algo}")
                continue

            try:
                timing = search_time_collector[(ds, algo)]
                timing["tick"] = time.time()
                subprocess.run(
                    docker_cmd,
                    shell=True,
                    executable="/bin/bash",
                    check=True,
                    text=True,
                )
                timing["tock"] = time.time()
            except Exception as exc:
                print(f"Error searching {ds} with {algo}: {exc}")
                error_collector[ds].append(f"{algo} search")


if RUN_E2E:
    search_all()

## Export data

In [None]:
def export_all():
    """Run the data-export command once per dataset.

    The full Docker command is always printed. With ``DRY_RUN`` set nothing is
    executed. Errors are printed and recorded in ``error_collector`` as
    ``"export"`` without stopping the loop.
    """
    for ds in DATASETS:
        container_cmd = EXPORT_DATA_CMD_TEMPLATE.safe_substitute(DATASET=ds)
        docker_cmd = GPU_DOCKER_CMD_TEMPLATE.safe_substitute(
            DATASET_PATH=DATASET_PATH,
            DOCKER_IMAGE=get_docker_image(),
            CONTAINER_CMD=container_cmd,
        )
        print(docker_cmd)
        if DRY_RUN:
            print(f"Would export {ds} from {DATASET_PATH}")
            continue

        try:
            subprocess.run(
                docker_cmd,
                shell=True,
                executable="/bin/bash",
                check=True,
                text=True,
            )
        except Exception as exc:
            print(f"Error exporting {ds}: {exc}")
            error_collector[ds].append("export")


if RUN_E2E:
    export_all()

## Plot

In [None]:
def plot_all():
    """Run the plot command once per dataset.

    The full Docker command is always printed. With ``DRY_RUN`` set nothing is
    executed. Errors are printed and recorded in ``error_collector`` as
    ``"plot"`` without stopping the loop.
    """
    for ds in DATASETS:
        plot_cmd = PLOT_CMD_TEMPLATE.safe_substitute(
            SEARCH_MODE="latency",
            DATASET=ds,
            BATCH_SIZE=1,  # This should be the same as in search_all
            COUNT=10,  # This should be the same as in search_all
            EXTRA_ARGS="",
        )
        docker_cmd = GPU_DOCKER_CMD_TEMPLATE.safe_substitute(
            DATASET_PATH=DATASET_PATH,
            DOCKER_IMAGE=get_docker_image(),
            CONTAINER_CMD=plot_cmd,
        )
        print(docker_cmd)
        if DRY_RUN:
            print(f"Would plot {ds} from {DATASET_PATH}")
            continue

        try:
            subprocess.run(
                docker_cmd,
                shell=True,
                executable="/bin/bash",
                check=True,
                text=True,
            )
        except Exception as exc:
            print(f"Error plotting {ds}: {exc}")
            error_collector[ds].append("plot")


if RUN_E2E:
    plot_all()

In [None]:
# Show the recorded failures: a dict mapping each dataset to the list of steps
# that raised an error (an empty list means nothing was recorded).
print(error_collector)

In [None]:
def _describe_elapsed(timing):
    """Return the elapsed seconds for one collector entry, or a status string.

    ``timing`` is a dict with "tick" and "tock" keys. Returns "not run" when
    the step never started (e.g. a dry run), "failed" when it started but did
    not finish, and otherwise the duration rounded to two decimals.
    """
    tick, tock = timing["tick"], timing["tock"]
    if tick is None:
        return "not run"
    if tock is None:
        return "failed"
    return round(tock - tick, 2)


for k, v in build_time_collector.items():
    print(f"Build time for {k}: {_describe_elapsed(v)}")


if RUN_E2E:
    for k, v in search_time_collector.items():
        print(f"Search time for {k}: {_describe_elapsed(v)}")