In [None]:
# mamba create -n blockingpy-gpu python=3.10 -y
# conda activate blockingpy-gpu
# conda config --env --set channel_priority flexible

# mamba install -y \
#   -c pytorch/label/nightly -c rapidsai -c conda-forge \
#   "faiss-gpu-cuvs=1.11.0" "libcuvs=25.4.*"

# python -m pip install blockingpy-gpu==0.2.8

In [None]:
import time
import pandas as pd
from datetime import datetime
from blockingpy import Blocker
import faiss, os
from datetime import datetime
# Let FAISS use every CPU thread. os.cpu_count() may return None when the
# count cannot be determined, so fall back to a single thread instead of
# handing None to FAISS. (The original author's note records 12 here.)
faiss.omp_set_num_threads(os.cpu_count() or 1)

In [None]:
# Read the three input CSV files; the first CSV column becomes the index.
# `dfs` keeps the frames in the order they are listed here.
_dataset_paths = ("data_1000.csv", "data_10000.csv", "data_100000.csv")
dfs = [pd.read_csv(_path, index_col=0) for _path in _dataset_paths]
df_1k, df_10k, df_100k = dfs


# Replace missing values with empty strings, in place, in every loaded frame.
for df in dfs:
  # Stringify 'gmina' but blank out only the cells that were actually missing.
  # A plain .str.replace("nan", "") would also strip the letters "nan" from
  # inside legitimate values.
  df['gmina'] = df['gmina'].astype(str).where(df['gmina'].notna(), "")
  df.fillna("", inplace=True)
  # Cells holding the literal text "<NA>" are treated as missing as well.
  df.replace("<NA>", "", inplace=True)

In [None]:
# Accumulator for the per-run result rows produced by the benchmark loop.
results = list()

# `control_ann` settings for the "gpu_faiss" backend with the "flat" index type.
control_ann_gpuflat = {
    "gpu_faiss": dict(index_type="flat", k_search=5, distance="cosine"),
}

# `control_ann` settings for the "gpu_faiss" backend with the "ivf" index type;
# the two ivf_* entries are the IVF-specific knobs.
control_ann_ivf = {
    "gpu_faiss": dict(
        index_type="ivf",
        k_search=5,
        distance="cosine",
        ivf_nlist=100,
        ivf_nprobe=10,
    ),
}

# `control_ann` settings for the "gpu_faiss" backend with the "cagra" index type.
# The nested "cagra" mapping holds the CAGRA-specific parameters; its "seed"
# is only a default, because bench_once supplies its own seed for each run.
control_ann_cagra = {
    "gpu_faiss": dict(
        index_type="cagra",
        k_search=5,
        distance="cosine",
        cagra=dict(
            graph_degree=16,
            intermediate_graph_degree=32,
            build_algo="ivf_pq",
            nn_descent_niter=20,
            itopk_size=128,
            max_queries=0,
            algo="auto",
            team_size=0,
            search_width=2,
            min_iterations=0,
            max_iterations=0,
            thread_block_size=0,
            hashmap_mode="auto",
            hashmap_min_bitlen=0,
            hashmap_max_fill_rate=0.5,
            num_random_samplings=1,
            seed=0x128394,
        ),
    ),
}

# Frames the benchmark loop iterates over. This rebinds `dfs` to the same three
# frames, in the same order, as the list built in the data-loading cell.
dfs = [df_1k, df_10k, df_100k] 

In [None]:
def bench_once(data: pd.DataFrame, seed: int) -> list[dict]:
    """Benchmark the three GPU FAISS blocking configurations on one dataset.

    For each of the flat, IVF and CAGRA configurations (in that order) the
    function times a single ``Blocker.block`` call on a concatenated text
    column, evaluates the result against the ``true_id`` ground truth and
    records one result row.

    Args:
        data: Frame providing the columns ``imie``, ``imie2``, ``nazwisko``,
            ``data_ur``, ``gmina``, ``kraj`` and ``true_id``. It is copied, so
            the caller's frame is not modified.
        seed: Passed as ``random_seed`` to every ``block`` call, used as the
            CAGRA seed, and stored in the ``"run"`` field of every row.

    Returns:
        Three dicts, one per configuration, with the keys ``run``,
        ``algorithm``, ``dataset_size``, ``pairs``, ``reduction_ratio``,
        ``recall`` and ``time_sec``.
    """
    df = data.copy()
    # Only 'kraj' is cast to str; the other columns are assumed to already
    # hold strings (missing values blanked beforehand) -- TODO confirm.
    df["txt_raw_kraj"] = (
        df["imie"] + " " + df["imie2"] + " " + df["nazwisko"] + " " +
        df["data_ur"] + " " + df["gmina"] + " " + df["kraj"].astype(str)
    )
    df["x"] = range(len(df))
    df["block"] = df["true_id"]
    tb = df[["x", "block"]]

    n_records = len(df)
    # n * (n - 1) is always even, so integer division is exact.
    total_pairs = n_records * (n_records - 1) // 2

    # Build a per-call CAGRA config carrying this run's seed instead of
    # writing the seed into the shared module-level dict.
    cagra_base = control_ann_cagra["gpu_faiss"]
    cagra_control = {
        "gpu_faiss": {**cagra_base, "cagra": {**cagra_base["cagra"], "seed": seed}},
    }

    configs = [
        ("BlockingPy (gpu_faiss flat)", control_ann_gpuflat),
        ("BlockingPy (gpu_faiss ivf)", control_ann_ivf),
        ("BlockingPy (gpu_faiss cagra)", cagra_control),
    ]

    blocker = Blocker()
    out = []
    for name, control in configs:
        # Only the block() call is timed; evaluation is excluded.
        t0 = time.perf_counter()
        blocked = blocker.block(x=df["txt_raw_kraj"], ann="gpu_faiss", verbose=0,
                                random_seed=seed, control_ann=control)
        t1 = time.perf_counter()
        res = blocker.eval(blocked, tb)

        out.append({
            "run": seed, "algorithm": name, "dataset_size": n_records,
            # Round rather than truncate: the float product can land just
            # below the true integer pair count (e.g. 0.9999999999999998).
            # Presumably reduction_ratio = 1 - pairs / total_pairs -- verify
            # against the BlockingPy documentation.
            "pairs": int(round(float(total_pairs * (1 - res.reduction_ratio)))),
            "reduction_ratio": res.reduction_ratio,
            "recall": res.metrics["recall"],
            "time_sec": round(t1 - t0, 3),
        })

    return out

In [None]:
N_RUNS = 10

# Start from an empty list so that re-running this cell does not append a
# second copy of every row, which would skew the means and standard
# deviations computed from `results`.
results = []

# Each repetition uses its own seed (42, 43, ...) and benchmarks every frame in `dfs`.
for run in range(N_RUNS):
    seed = 42 + run
    for _df in dfs:
        results.extend(bench_once(_df, seed))
    print(f"[{run+1}/{N_RUNS}] {datetime.now().isoformat(timespec='seconds')} seed={seed} — done", flush=True)


# Raw per-run rows, restricted to a fixed column order.
cols = ["run", "algorithm", "dataset_size", "time_sec", "recall", "reduction_ratio", "pairs"]
results_df = pd.DataFrame(results).loc[:, cols]

# Named aggregations: the number of distinct runs, then mean and sample
# standard deviation of every metric, per (algorithm, dataset_size) group.
_agg_spec = {"n_runs": ("run", "nunique")}
for _prefix, _column in (
    ("time_sec", "time_sec"),
    ("recall", "recall"),
    ("rr", "reduction_ratio"),
    ("pairs", "pairs"),
):
    _agg_spec[f"{_prefix}_mean"] = (_column, "mean")
    _agg_spec[f"{_prefix}_sd"] = (_column, "std")

_grouped = results_df.groupby(["algorithm", "dataset_size"], as_index=False)
summary = _grouped.agg(**_agg_spec)

# Persist both tables without the index, then show the summary and the last raw rows.
results_df.to_csv("all_runs_raw_gpu.csv", index=False)
summary.to_csv("all_runs_summary_gpu.csv", index=False)
summary, results_df.tail(10)