In [1]:
# %pip install blocklib
# # %pip install blockingpy
# %pip install bitarray
# %pip install unidecode

In [1]:
from blocklib import generate_candidate_blocks
import itertools, time, math, numpy as np, pandas as pd
from blockingpy import Blocker
# import unidecode
from datetime import datetime

In [2]:
df_1k = pd.read_csv("data_sim_1000.csv", index_col=0)
df_10k = pd.read_csv("data_sim_10000.csv", index_col=0)
df_100k = pd.read_csv("data_sim_100000.csv", index_col=0)
dfs = [df_1k, df_10k, df_100k]

kraje = pd.read_csv("wykaz_krajow_obywatelstwa.csv", index_col=0, delimiter=";")
lookup_dict = dict(zip(kraje.index, kraje['kraj']))

for df in dfs:
  df['kraj'] = df['kraj'].map(lookup_dict)
  df['gmina'] = df['gmina'].astype(str).str.replace("nan", "")
  df.fillna("", inplace=True)
  df.replace("<NA>", "", inplace=True)

In [4]:
def block_metrics(cand_blocks, true_ids, total_true_pairs):
    n_records   = len(true_ids)
    cartesian   = n_records * (n_records - 1) // 2

    cand_pairs  = 0
    dedup_pairs = set()

    for block in cand_blocks.values():
        k = len(block)
        if k < 2:
            continue
        cand_pairs += k * (k - 1) // 2

        for i, j in itertools.combinations(block, 2):
            if true_ids[i] == true_ids[j]:
                dedup_pairs.add((i if i < j else j,
                                 j if i < j else i))

    rr      = 1 - cand_pairs / cartesian
    recall  = len(dedup_pairs) / total_true_pairs if total_true_pairs else 1.0
    return cand_pairs, rr, recall

blocking_features = ["imie", "nazwisko", "by", "bm", "bd", "gmina"]

psig_schema = {
      "type": "p-sig",
      "version": 1,
      "config": {
          "blocking-features": blocking_features,
          "filter": {"type": "ratio", "max": 0.02, "min": 0.0},

          "blocking-filter": {
              "type": "bloom filter",
              "number-hash-functions": 20,
              "bf-len": 4096, 
          },

          "signatureSpecs": [

              [
                  {"type": "characters-at", "feature": "imie",     "config": {"pos": [0]}},
                  {"type": "characters-at", "feature": "nazwisko", "config": {"pos": [0]}},
                  {"type": "characters-at", "feature": "by",       "config": {"pos": [-1]}},
                  {"type": "characters-at", "feature": "gmina",    "config": {"pos": [0]}},
              ],

              [
                  {"type": "metaphone", "feature": "nazwisko"},
              ],

              [
                  {"type": "characters-at", "feature": "nazwisko", "config": {"pos": [-2, -1]}},
                  {"type": "characters-at", "feature": "bm",       "config": {"pos": [0]}},
              ],
          ],
      },
  } 
lsh_schema = {
    "type": "lambda-fold",
    "version": 1,
    "config": {
        "blocking-features": [1, 2],  
        "Lambda": 6,                  
        "K": 110,                      
        "bf-len": 4096,
        "num-hash-funcs": 12,
        "random_state": 0,
        "input-clks": False,
    },
} 
schemata = [("P-Sig", psig_schema), ("λ-fold LSH", lsh_schema)]

results = []        

control_ann_faiss_lsh = {
    'faiss': {
        "index_type": "lsh",
        "lsh_nbits": 1,
    }
}

control_ann_faiss_hnsw = {
    'faiss': {
        "index_type": "hnsw",
        'hnsw_M': 12,
        'hnsw_ef_construction': 200,
        'hnsw_ef_search': 200,
    }
}

control_ann_voy_fast = {
    'voyager': {
        'distance': 'cosine',
        'M': 5,               
        'ef_construction': 60, 
    }
}

dfs      = [df_1k, df_10k, df_100k] 

In [7]:
for df in dfs:

    bl = df.copy().reset_index(drop=True)

    dob = pd.to_datetime(bl["data_ur"],
                         errors="coerce", format="%Y/%m/%d")
    bl["by"] = dob.dt.year.astype("Int64").astype(str).str.zfill(4)
    bl["bm"] = dob.dt.month.astype("Int64").astype(str).str.zfill(2)
    bl["bd"] = dob.dt.day.astype("Int64").astype(str).str.zfill(2)

    bl["id"] = bl.index
    header   = ["id"] + blocking_features
    records  = bl[header].values.tolist()
    true_ids = bl["true_id"].to_numpy()
    total_true_pairs = int(
        bl.groupby("true_id").size()
          .apply(lambda k: k * (k - 1) // 2).sum()
    )

    df['txt_raw_kraj'] = df['imie'] + " " + df['imie2'] + " " + df['nazwisko'] + " " + df['data_ur'] + " " + df['gmina'] + " " + df['kraj'].astype(str)
    df['x'] = range(len(df))
    df['block'] = df['true_id']
    tb = df[['x', 'block']]

    for label, schema in schemata:
        s_tim = time.perf_counter()

        cand   = generate_candidate_blocks(records, schema, header=header)
        print(cand.blocks)

        e_tim = time.perf_counter()

        pairs, rr, recall = block_metrics(
            cand.blocks, true_ids, total_true_pairs)

        results.append({
            "algorithm"     : label,
            "dataset_size"  : len(bl),
            "pairs"         : pairs,
            "reduction_ratio": rr,
            "recall"        : recall,
            "time_sec"      : round(e_tim - s_tim, 3),
        })
    break


#     blocker = Blocker()

#     s_time = time.perf_counter()
#     res = blocker.block(x=df['txt_raw_kraj'], ann='voyager', verbose=0, random_seed=42)
#     e_time = time.perf_counter()
#     res = blocker.eval(res, tb)

#     results.append({
#             "algorithm"     : "BlockingPy (voyager)",
#             "dataset_size"  : len(df),
#             "pairs"         : int((df.shape[0] * (df.shape[0]-1))/2 * (1-res.reduction_ratio)),
#             "reduction_ratio": res.reduction_ratio,
#             "recall"        : res.metrics['recall'],
#             "time_sec"      : round(e_time - s_time, 3),
#         })

#     s_time = time.perf_counter()
#     res = blocker.block(x=df['txt_raw_kraj'], ann='faiss', control_ann=control_ann_faiss_hnsw, verbose=0, random_seed=42)
#     e_time = time.perf_counter()
#     res = blocker.eval(res, tb)

#     results.append({
#             "algorithm"     : "BlockingPy (faiss_hnsw)",
#             "dataset_size"  : len(df),
#             "pairs"         : int((df.shape[0] * (df.shape[0]-1))/2 * (1-res.reduction_ratio)),
#             "reduction_ratio": res.reduction_ratio,
#             "recall"        : res.metrics['recall'],
#             "time_sec"      : round(e_time - s_time, 3),
#         })

#     s_time = time.perf_counter()
#     res = blocker.block(x=df['txt_raw_kraj'], ann='faiss', control_ann=control_ann_faiss_lsh, verbose=0, random_seed=42)
#     e_time = time.perf_counter()
#     res = blocker.eval(res, tb)

#     results.append({
#             "algorithm"     : "BlockingPy (faiss_lsh)",
#             "dataset_size"  : len(df),
#             "pairs"         : int((df.shape[0] * (df.shape[0]-1))/2 * (1-res.reduction_ratio)),
#             "reduction_ratio": res.reduction_ratio,
#             "recall"        : res.metrics['recall'],
#             "time_sec"      : round(e_time - s_time, 3),
#     })

#     s_time = time.perf_counter()
#     res = blocker.block(x=df['txt_raw_kraj'], ann='voyager', control_ann=control_ann_voy_fast, verbose=0, random_seed=42)
#     e_time = time.perf_counter()
#     res = blocker.eval(res, tb)

#     results.append({
#             "algorithm"     : "BlockingPy (voyager) - fast",
#             "dataset_size"  : len(df),
#             "pairs"         : int((df.shape[0] * (df.shape[0]-1))/2 * (1-res.reduction_ratio)),
#             "reduction_ratio": res.reduction_ratio,
#             "recall"        : res.metrics['recall'],
#             "time_sec"      : round(e_time - s_time, 3),
#     })

# cols = ["algorithm", "dataset_size", "time_sec",
#         "recall", "reduction_ratio", "pairs"]

# results_df = pd.DataFrame(results)[cols]
# # results_df.to_csv("blocklib_comparison.csv")
# results_df

This means that not all records are part of at least one block. You can increase coverage by adjusting the filter to be less aggressive or by finding signatures that produce smaller block sizes.


{'(900, 2569, 1810, 3479, 1052, 293, 1962, 3631, 2872, 445, 2114, 1355, 3024, 597, 3934, 1507, 3176, 2417, 4086, 1659)': [6], '(1920, 3845, 1419, 3344, 918, 417, 2087, 4012, 1586, 3511, 1085, 584, 2254, 83, 1753, 3678, 1252, 751, 2421, 250)': [7], '(262, 1162, 2062, 662, 1562, 162, 1062, 1962, 562, 1462, 962, 1862, 462, 1362, 862, 1762, 362, 1262, 762, 1662)': [8], '(3330, 2441, 2321, 1432, 1312, 423, 1192, 303, 3510, 183, 3390, 2501, 3270, 2381, 2261, 1372, 1252, 363, 243, 3450)': [24], '(640, 1414, 2188, 2962, 3736, 414, 1188, 1962, 2736, 3510, 188, 962, 1736, 2510, 3284, 4058, 736, 1510, 2284, 3962)': [27], '(2438, 1671, 788, 21, 3350, 2467, 1700, 817, 50, 3263, 2496, 1613, 846, 79, 3292, 2525, 1642, 875, 4088, 3321)': [37], '(3718, 2445, 1172, 1559, 286, 3109, 1836, 563, 3386, 3773, 2500, 1227, 4050, 2777, 1504, 231, 618, 3441, 2168, 895)': [40], '(2958, 3599, 144, 785, 1426, 2067, 2708, 3349, 3990, 535, 1176, 1817, 2458, 3099, 3740, 285, 926, 1567, 2208, 2849)': [41], '(3499, 2477