In [None]:
# %pip install blocklib
# # %pip install blockingpy
# %pip install bitarray

In [None]:
from blocklib import generate_candidate_blocks
import itertools, time, pandas as pd
from blockingpy import Blocker
from datetime import datetime

In [2]:
# Benchmark inputs: three data_sim_* CSV files; the first CSV column becomes
# the frame index.
df_1k = pd.read_csv("data_sim_1000.csv", index_col=0)
df_10k = pd.read_csv("data_sim_10000.csv", index_col=0)
df_100k = pd.read_csv("data_sim_100000.csv", index_col=0)
dfs = [df_1k, df_10k, df_100k]

# Lookup built from the country list: its index values map to the 'kraj' column.
kraje = pd.read_csv("wykaz_krajow_obywatelstwa.csv", index_col=0, delimiter=";")
lookup_dict = dict(zip(kraje.index, kraje['kraj']))

for df in dfs:
    # Translate the 'kraj' values through the lookup; values that are not in
    # the lookup become missing and are blanked by the fillna below.
    df['kraj'] = df['kraj'].map(lookup_dict)

    # Stringify 'gmina' and blank out only the values that were actually
    # missing. A plain .str.replace("nan", "") would also strip the substring
    # "nan" out of legitimate values.
    gmina = df['gmina']
    df['gmina'] = gmina.astype(str).where(gmina.notna(), "")

    # Remaining missing values (and stringified "<NA>" markers) become "".
    df.fillna("", inplace=True)
    df.replace("<NA>", "", inplace=True)

In [None]:
def block_metrics(cand_blocks, true_ids, total_true_pairs):
    """Summarise the quality of a set of candidate blocks.

    Parameters
    ----------
    cand_blocks : mapping
        Block key -> collection of record positions. Only ``.values()`` is
        used; every position is used to index ``true_ids``.
    true_ids : sequence
        Ground-truth label of each record; two records are a true match when
        their labels compare equal.
    total_true_pairs : int
        Number of true matching pairs in the whole dataset (the recall
        denominator).

    Returns
    -------
    tuple
        ``(cand_pairs, rr, recall)`` where

        * ``cand_pairs`` is the sum of ``k * (k - 1) // 2`` over all blocks
          with at least two members. A pair that appears in several blocks is
          counted once per block.
        * ``rr`` is ``1 - cand_pairs / cartesian`` with ``cartesian`` the
          number of unordered record pairs; ``1.0`` when there are fewer than
          two records (no comparison is possible).
        * ``recall`` is the number of distinct true pairs found in any block
          divided by ``total_true_pairs``; ``1.0`` when ``total_true_pairs``
          is zero.
    """
    n_records = len(true_ids)
    cartesian = n_records * (n_records - 1) // 2

    cand_pairs = 0
    dedup_pairs = set()

    for block in cand_blocks.values():
        k = len(block)
        if k < 2:
            continue
        cand_pairs += k * (k - 1) // 2

        # Group the block's members by label so that only matching records
        # are paired, instead of testing every pair of the block.
        members_by_label = {}
        for rec in block:
            members_by_label.setdefault(true_ids[rec], []).append(rec)

        for label, members in members_by_label.items():
            # A label that is not equal to itself (NaN) never matches anything.
            if len(members) < 2 or label != label:
                continue
            for i, j in itertools.combinations(members, 2):
                # Store pairs in a canonical order so the same pair found in
                # several blocks is counted once.
                dedup_pairs.add((i, j) if i < j else (j, i))

    # Guard the degenerate dataset (0 or 1 record) against division by zero.
    rr = 1 - cand_pairs / cartesian if cartesian else 1.0
    recall = len(dedup_pairs) / total_true_pairs if total_true_pairs else 1.0
    return cand_pairs, rr, recall

# Feature names referenced by the P-Sig schema below.
blocking_features = ["imie", "nazwisko", "by", "bm", "bd", "gmina"]


def _chars_at(feature, *positions):
    """Return a ``characters-at`` signature component for ``feature``."""
    return {
        "type": "characters-at",
        "feature": feature,
        "config": {"pos": list(positions)},
    }


# Three signature strategies; each inner list is one strategy made of components.
_psig_signature_specs = [
    [
        _chars_at("imie", 0),
        _chars_at("nazwisko", 0),
        _chars_at("by", -1),
        _chars_at("gmina", 0),
    ],
    [
        {"type": "metaphone", "feature": "nazwisko"},
    ],
    [
        _chars_at("nazwisko", -2, -1),
        _chars_at("bm", 0),
    ],
]

# blocklib P-Sig blocking schema.
psig_schema = {
    "type": "p-sig",
    "version": 1,
    "config": {
        "blocking-features": blocking_features,
        "filter": {"type": "ratio", "max": 0.02, "min": 0.0},
        "blocking-filter": {
            "type": "bloom filter",
            "number-hash-functions": 20,
            "bf-len": 4096,
        },
        "signatureSpecs": _psig_signature_specs,
    },
}

# blocklib lambda-fold LSH blocking schema.
lsh_schema = {
    "type": "lambda-fold",
    "version": 1,
    "config": {
        # NOTE(review): presumably positions within the record header - verify
        # against the blocklib schema documentation.
        "blocking-features": [1, 2],
        "Lambda": 6,
        "K": 110,
        "bf-len": 4096,
        "num-hash-funcs": 12,
        "random_state": 0,
        "input-clks": False,
    },
}

# (label, schema) pairs; the label is what ends up in the results table.
schemata = [("P-Sig", psig_schema), ("λ-fold LSH", lsh_schema)]

# Accumulator for one result row (dict) per algorithm and dataset.
results = []


def _gpu_faiss_control(index_type, **extra):
    """Return a ``gpu_faiss`` control dict; ``extra`` keys follow the shared ones."""
    return {
        "gpu_faiss": {
            "index_type": index_type,
            "k_search": 5,
            "distance": "cosine",
            **extra,
        },
    }


# --- BlockingPy ``control_ann`` settings, one dict per benchmarked variant ---

control_ann_voy = {"voyager": {"k_search": 30}}

control_ann_faiss_lsh = {
    "faiss": {
        "k_search": 5,
        "index_type": "lsh",
        "lsh_nbits": 1,
    },
}

control_ann_faiss_hnsw = {
    "faiss": {
        "k_search": 5,
        "index_type": "hnsw",
        "hnsw_M": 12,
        "hnsw_ef_construction": 200,
        "hnsw_ef_search": 200,
    },
}

control_ann_voy_fast = {
    "voyager": {
        "k_search": 30,
        "distance": "cosine",
        "M": 5,
        "ef_construction": 60,
    },
}

control_ann_gpuflat = _gpu_faiss_control("flat")

control_ann_ivf = _gpu_faiss_control("ivf", ivf_nlist=100, ivf_nprobe=10)

control_ann_cagra = _gpu_faiss_control(
    "cagra",
    cagra={
        "graph_degree": 16,
        "intermediate_graph_degree": 32,
        "build_algo": "ivf_pq",
        "nn_descent_niter": 20,
        "itopk_size": 128,
        "max_queries": 0,
        "algo": "auto",
        "team_size": 0,
        "search_width": 2,
        "min_iterations": 0,
        "max_iterations": 0,
        "thread_block_size": 0,
        "hashmap_mode": "auto",
        "hashmap_min_bitlen": 0,
        "hashmap_max_fill_rate": 0.5,
        "num_random_samplings": 1,
        "seed": 0x128394,
    },
)

# Frames the benchmark loop iterates over, in this order.
dfs = [df_1k, df_10k, df_100k]

In [None]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every row to the results of a previous run.
results = []

# Every BlockingPy variant to benchmark: (result label, ann backend, control_ann).
# The order here is the row order within each dataset in the results table.
blockingpy_runs = [
    ("BlockingPy (voyager)",         "voyager",   control_ann_voy),
    ("BlockingPy (faiss_hnsw)",      "faiss",     control_ann_faiss_hnsw),
    ("BlockingPy (faiss_lsh)",       "faiss",     control_ann_faiss_lsh),
    ("BlockingPy (voyager) - fast",  "voyager",   control_ann_voy_fast),
    # GPU Algorithms - run with blockingpy-gpu on py 3.10 (RTX 3050 4GB VRAM, 8GB CPU RAM, WSL2)
    ("BlockingPy (gpu_faiss flat)",  "gpu_faiss", control_ann_gpuflat),
    ("BlockingPy (gpu_faiss ivf)",   "gpu_faiss", control_ann_ivf),
    ("BlockingPy (gpu_faiss cagra)", "gpu_faiss", control_ann_cagra),
]

for df in dfs:

    # ---- blocklib input: positional index, birth date split into parts ----
    bl = df.copy().reset_index(drop=True)

    dob = pd.to_datetime(bl["data_ur"],
                         errors="coerce", format="%Y/%m/%d")
    # NOTE(review): dates that fail to parse become NaT, which ends up as the
    # literal string "<NA>" in by/bm/bd after astype(str) - confirm intended.
    bl["by"] = dob.dt.year.astype("Int64").astype(str).str.zfill(4)
    bl["bm"] = dob.dt.month.astype("Int64").astype(str).str.zfill(2)
    bl["bd"] = dob.dt.day.astype("Int64").astype(str).str.zfill(2)

    bl["id"] = bl.index
    header   = ["id"] + blocking_features
    records  = bl[header].values.tolist()
    true_ids = bl["true_id"].to_numpy()

    # Number of true matching pairs: k*(k-1)/2 summed over groups of equal true_id.
    group_sizes = bl.groupby("true_id").size()
    total_true_pairs = int((group_sizes * (group_sizes - 1) // 2).sum())

    # ---- BlockingPy input: one text field per record plus ground truth ----
    df['txt_raw_kraj'] = df['imie'] + " " + df['imie2'] + " " + df['nazwisko'] + " " + df['data_ur'] + " " + df['gmina'] + " " + df['kraj'].astype(str)
    df['x'] = range(len(df))
    df['block'] = df['true_id']
    tb = df[['x', 'block']]

    # ---- blocklib schemata ----
    for label, schema in schemata:
        # Only candidate generation is timed; metric computation is excluded.
        s_tim = time.perf_counter()
        cand = generate_candidate_blocks(records, schema, header=header)
        e_tim = time.perf_counter()

        pairs, rr, recall = block_metrics(
            cand.blocks, true_ids, total_true_pairs)

        results.append({
            "algorithm"     : label,
            "dataset_size"  : len(bl),
            "pairs"         : pairs,
            "reduction_ratio": rr,
            "recall"        : recall,
            "time_sec"      : round(e_tim - s_tim, 3),
        })

    # ---- BlockingPy variants ----
    blocker = Blocker()
    n_rows = len(df)
    n_all_pairs = n_rows * (n_rows - 1) // 2

    for label, ann, control_ann in blockingpy_runs:
        # Only the blocking call is timed; evaluation is excluded.
        s_time = time.perf_counter()
        res = blocker.block(x=df['txt_raw_kraj'], ann=ann, verbose=0, random_seed=42, control_ann=control_ann)
        e_time = time.perf_counter()
        res = blocker.eval(res, tb)

        results.append({
            "algorithm"     : label,
            "dataset_size"  : n_rows,
            # The pair count is recovered from the reduction ratio; round
            # rather than truncate so float error cannot drop one pair.
            "pairs"         : int(round(n_all_pairs * (1 - res.reduction_ratio))),
            "reduction_ratio": res.reduction_ratio,
            "recall"        : res.metrics['recall'],
            "time_sec"      : round(e_time - s_time, 3),
        })

cols = ["algorithm", "dataset_size", "time_sec",
        "recall", "reduction_ratio", "pairs"]

results_df = pd.DataFrame(results)[cols]
# results_df.to_csv("blocklib_comparison.csv")
results_df