# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [10]:
import os
import glob
import geometric_sampling as gs
import pandas as pd
import numpy as np
import itertools
from tqdm import tqdm
import rpy2.robjects as ro
from rpy2.robjects import numpy2ri, default_converter
from rpy2.robjects.conversion import localconverter

In [3]:
%load_ext rpy2.ipython



In [34]:
%%R

library(WaveSampling)
library(sampling)
library(BalancedSampling)


Loading required package: Matrix


# Moran and Local Balance Score

In [42]:
def score_all_samples_moran_lb(coords, probs, sample_indices_list):
    """
    Score every sample with the R functions `IB` and `sblb`.

    coords              : an (N×2)-array of spatial coordinates
    probs               : length-N array of inclusion probabilities
    sample_indices_list : iterable of S integer index sequences (0-based),
                          one per sample; numpy arrays or plain lists

    Returns a tuple `(ib_scores, sblb_scores)` of two length-S numpy arrays:
    the `IB` value and the `sblb` value of each sample, in input order.
    A score is `inf` whenever the corresponding R call raised an error.
    An empty `sample_indices_list` yields two empty arrays without calling R.

    Requires that the embedded R session already provides `wpik`, `IB` and
    `sblb` (this function does not load any R library itself).

    Side effects: (re)assigns `coords`, `probs`, `samples`, `W0`, `W` and
    `score_samples` in the R global environment.
    """
    # Materialise once so generators are accepted and emptiness can be checked.
    sample_indices_list = list(sample_indices_list)
    if not sample_indices_list:
        return np.empty(0, dtype=float), np.empty(0, dtype=float)

    # Accept array-likes (lists, tuples) as well as numpy arrays.
    coords = np.asarray(coords, dtype=float)
    probs = np.asarray(probs, dtype=float)

    # Convert the Python samples into an R list of integer vectors
    #   * add +1 because R is 1-based
    r_sample_list = ro.ListVector({
        str(i + 1): ro.IntVector(np.asarray(sample_idx, dtype=int) + 1)
        for i, sample_idx in enumerate(sample_indices_list)
    })

    with localconverter(default_converter + numpy2ri.converter):
        ro.globalenv['coords'] = coords
        ro.globalenv['probs'] = probs
        # Precompute W once: the wpik() matrix with its diagonal zeroed
        # (the second diag assignment repeats what the subtraction already did).
        ro.r("""
            W0 <- wpik(coords, probs)
            W <- W0 - diag(diag(W0))
            diag(W) <- 0
        """)
        ro.globalenv['samples'] = r_sample_list

        # Define an R function that loops over all samples.
        # IB() receives a 0/1 indicator vector of length N, sblb() receives
        # the 1-based sample indices; an R error in either becomes Inf.
        ro.r("""
            score_samples <- function(W, probs, coords, samples_list) {
              S <- length(samples_list)
              IBs   <- numeric(S)
              SBLBs <- numeric(S)

              for (i in seq_len(S)) {
                samp_idx <- samples_list[[i]]
                mask <- integer(length(probs))
                mask[samp_idx] <- 1

                IBs[i]   <- tryCatch(IB(W, mask),        error = function(e) Inf)
                SBLBs[i] <- tryCatch(sblb(probs, coords, samp_idx), error = function(e) Inf)
              }
              # return as a 2-column matrix
              cbind(IB = IBs, SBLB = SBLBs)
            }
        """)

        # Call it once; the result is an S×2 matrix [IB, SBLB].
        result = ro.r("score_samples(W, probs, coords, samples)")
        # Copy into a plain numpy array while the converter is still active.
        np_result = np.array(result)

    # Split the two columns into separate 1-D arrays.
    return np_result[:, 0], np_result[:, 1]


# Loading Population

In [26]:
DATA_DIR = "../data_samples/coords_probs"
csv_paths = glob.glob(os.path.join(DATA_DIR, "*.csv"))

# coords_dict: population name -> array holding the first two CSV columns
# probs_dict : population name -> {'equal' | 'unequal' -> last CSV column}
coords_dict = {}
probs_dict = {}

for fp in csv_paths:
    # Any path containing 'swiss' is left out of the experiment.
    if 'swiss' in fp:
        continue

    # File stem is expected to look like "<coords>_<probs>".
    name = os.path.splitext(os.path.basename(fp))[0]

    # First row is skipped (presumably a header line).
    data = np.loadtxt(fp, delimiter=",", skiprows=1)
    coords = data[:, :2]
    probs  = data[:, -1]

    coord_name, prob_name = name.split("_")

    # Expand the abbreviations used in the file names.
    if coord_name == 'clust':
        coord_name = 'cluster'
    if prob_name == 'eq':
        prob_name = 'equal'
    else:
        prob_name = 'unequal'

    coords_dict[coord_name] = coords
    probs_dict.setdefault(coord_name, {})[prob_name] = probs

# Quick sanity check of what was loaded.
print(coords_dict.keys())
print(probs_dict.keys())
print(probs_dict['random'].keys())

dict_keys(['random', 'cluster', 'meuse', 'grid'])
dict_keys(['random', 'cluster', 'meuse', 'grid'])
dict_keys(['equal', 'unequal'])


# Evaluation Function

In [53]:
def evaluate(
        coords_dict,
        probs_dict,
        n_values=[4, 8, 16],
        zone_list=[(1, 1), (2, 2), (3, 3)],
        tolerance=5,
        split_size=1e-3,
):
    """
    Build a KMeansSpatialSamplingSimple design for every combination of
    sample size, zone layout, population and probability scheme, and collect
    the expected value and variance of three scores for each design.

    coords_dict : population name -> coordinates array
    probs_dict  : population name -> {probability scheme name -> probabilities}
    n_values    : sample sizes to try
    zone_list   : zone layouts to try, each a 2-tuple passed as `n_zones`
    tolerance   : forwarded unchanged to the sampler
    split_size  : forwarded unchanged to the sampler

    Returns a DataFrame with one row per (n, zones, coord, prob) and the
    columns n, zones, coord, prob, exp_density, exp_moran, exp_lb,
    var_density, var_moran, var_lb (all scores rounded to 4 decimals).
    """

    def _round4(value):
        # Every reported statistic is rounded to four decimals.
        return np.round(value, 4)

    rows = []

    # Full grid of (sample size, zone layout) pairs; the progress bar
    # advances once per pair.
    grid = list(itertools.product(n_values, zone_list))

    for n, zones in tqdm(grid, desc="Total combos", unit="combo"):
        for coord_name, prob_map in probs_dict.items():
            for prob_name, probs in prob_map.items():
                coords = coords_dict[coord_name]

                sampler = gs.sampling.KMeansSpatialSamplingSimple(
                    coords, probs,
                    n=n,
                    n_zones=zones,
                    tolerance=tolerance,
                    split_size=split_size
                )

                # Sampler's own default score (called without arguments).
                density_mean = _round4(sampler.expected_score())
                density_var = _round4(sampler.var_score())

                # Externally computed per-sample scores for all samples.
                moran_scores, lb_scores = score_all_samples_moran_lb(
                    coords, probs, sampler.all_samples
                )

                moran_mean = _round4(sampler.expected_score(moran_scores))
                moran_var = _round4(sampler.var_score(moran_scores))

                lb_mean = _round4(sampler.expected_score(lb_scores))
                lb_var = _round4(sampler.var_score(lb_scores))

                row = {
                    'n': n,
                    'zones': f"{zones[0]}×{zones[1]}",
                    'coord': coord_name,
                    'prob': prob_name,
                    'exp_density': density_mean,
                    'exp_moran': moran_mean,
                    'exp_lb': lb_mean,
                    'var_density': density_var,
                    'var_moran': moran_var,
                    'var_lb': lb_var,
                }
                rows.append(row)

    return pd.DataFrame.from_records(rows)


In [54]:
# Evaluate sample size n = 4 only, over the 1×1, 2×2 and 3×3 zone layouts.
eval_kwargs = dict(
    n_values=[4],
    zone_list=[(1, 1), (2, 2), (3, 3)],
    tolerance=5,
    split_size=1e-3,
)
df = evaluate(coords_dict, probs_dict, **eval_kwargs)

Total combos: 100%|██████████| 3/3 [00:40<00:00, 13.47s/combo]


In [56]:
# Reshape the long result table to wide form: one row per (coord, prob),
# one column per (metric, n, zones). 'first' simply picks the stored value;
# this assumes each (coord, prob, n, zones) combination occurs once in df.
metric_columns = ['exp_density', 'exp_moran', 'exp_lb', 'var_density', 'var_moran', 'var_lb']
summary = pd.pivot_table(
    df,
    values=metric_columns,
    index=['coord', 'prob'],
    columns=['n', 'zones'],
    aggfunc='first',
)

summary

Unnamed: 0_level_0,Unnamed: 1_level_0,exp_density,exp_density,exp_density,exp_lb,exp_lb,exp_lb,exp_moran,exp_moran,exp_moran,var_density,var_density,var_density,var_lb,var_lb,var_lb,var_moran,var_moran,var_moran
Unnamed: 0_level_1,n,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
Unnamed: 0_level_2,zones,1×1,2×2,3×3,1×1,2×2,3×3,1×1,2×2,3×3,1×1,2×2,3×3,1×1,2×2,3×3,1×1,2×2,3×3
coord,prob,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3
cluster,equal,-0.0809,-0.0743,-0.0765,0.3191,0.3206,0.3221,-0.3714,-0.3173,-0.3791,0.0566,0.0543,0.0512,0.0107,0.0108,0.0105,0.0627,0.0529,0.0705
cluster,unequal,-0.3429,-0.3378,-0.3305,0.6694,0.6721,0.6662,-0.1382,-0.1381,-0.1392,0.1521,0.1468,0.1466,0.9885,0.988,0.9893,0.0045,0.0054,0.0062
grid,equal,-0.0,-0.0,-0.0,0.4213,0.4213,0.4213,-0.2041,-0.2041,-0.2041,0.0,0.0,0.0,0.0244,0.0244,0.0244,0.0049,0.0049,0.0049
grid,unequal,-0.243,-0.2541,-0.2517,0.7098,0.7323,0.7322,-0.1336,-0.1411,-0.1391,0.0658,0.0323,0.025,2.5212,2.4793,2.4797,0.004,0.0038,0.0042
meuse,equal,0.0637,0.0388,0.0304,0.4511,0.4688,0.47,-0.2656,-0.2863,-0.2882,0.0203,0.0128,0.0107,0.0374,0.0397,0.0422,0.0082,0.017,0.012
meuse,unequal,0.0776,0.054,0.0468,0.5955,0.5957,0.5955,-0.2261,-0.2278,-0.2453,0.022,0.0121,0.012,0.0593,0.0649,0.0724,0.0042,0.0079,0.0072
random,equal,-0.0403,-0.0061,-0.0093,0.3787,0.4031,0.4006,-0.1721,-0.2041,-0.203,0.057,0.0176,0.0158,0.0146,0.0137,0.0173,0.0058,0.0039,0.0035
random,unequal,-0.2017,-0.1964,-0.1916,0.6683,0.6937,0.7138,-0.1387,-0.1482,-0.1462,0.0731,0.0363,0.0229,2.255,2.236,2.2211,0.0035,0.0029,0.0039
