In [None]:
import numpy as np
from numpy.random import multivariate_normal
import scipy.stats as stats
from sklearn.model_selection import train_test_split

def generate_random_samples(n, scale=1.0, size=1000) -> tuple:
    """
    Generate random samples from a multivariate normal distribution whose
    mean vector and covariance matrix are themselves drawn at random.

    Parameters:
    n : int - dimension of the distribution
    scale : float - factor applied to the random matrix A that the covariance
        is built from (cov = A @ A.T, so variances grow with scale**2)
    size : int - number of samples to draw

    Returns:
    tuple (samples, mu, cov)
        samples : numpy.ndarray of shape (size, n) - samples drawn from the distribution
        mu : list of n floats - the mean vector that was used
        cov : numpy.ndarray of shape (n, n) - the covariance matrix that was used
    """
    # Random mean, one standard-normal draw per dimension.
    # Open question from the original author: does this need to be zero mean?
    mu = [np.random.randn() for _ in range(n)]
    # Generate a random matrix
    A = np.random.randn(n, n) * scale
    # A @ A.T is symmetric positive semi-definite, so it is a valid covariance
    cov = A @ A.T
    # Call through np.random explicitly: the bare name `multivariate_normal` is
    # also imported from scipy.stats elsewhere in this notebook, and whichever
    # import cell ran last would otherwise decide which function is used here.
    samples = np.random.multivariate_normal(mu, cov, size=size)
    return samples, mu, cov

# Example usage: draw 10,000 samples from a randomly parameterised 6-dimensional Gaussian
n = 6
samples, mu, cov = generate_random_samples(n=n, scale=2.0, size=10000)

In [56]:
#nonparametric discretization

def discretize_to_percentiles(samples, n_bins=10):
    """
    Discretize each variable in samples into n bins based on percentiles.

    Bin edges are the empirical percentiles of each column, so bins hold
    roughly equal numbers of samples when the column has few repeated values.
    A value equal to an interior edge is placed in the upper of the two bins.

    Parameters:
    samples : numpy.ndarray - array of shape (n_samples, n_variables)
    n_bins : int - number of bins (percentiles) to use, at least 1

    Returns:
    numpy.ndarray - integer array with the same shape as samples and values
        from 1 to n_bins

    Raises:
    ValueError - if n_bins is smaller than 1
    """
    if n_bins < 1:
        raise ValueError("n_bins must be at least 1")

    samples = np.asarray(samples)
    n_vars = samples.shape[1]
    discretized = np.zeros(samples.shape, dtype=int)

    # The percentile levels are the same for every variable, so build them once.
    percentiles = np.linspace(0, 100, n_bins + 1)
    # Row k holds the k-th percentile edge of every column.
    bin_edges = np.percentile(samples, percentiles, axis=0)

    for i in range(n_vars):
        # Only the n_bins - 1 interior edges are used, so digitize returns
        # 0 .. n_bins - 1 and the column minimum/maximum fall in the first/last
        # bin; adding 1 gives labels 1 .. n_bins with no clipping needed.
        discretized[:, i] = np.digitize(samples[:, i], bin_edges[1:-1, i]) + 1

    return discretized

# Example usage: discretize the samples into 5000 percentile bins and preview the result
n_bins = 5000
discretized_samples = discretize_to_percentiles(samples, n_bins=n_bins)

# Heading and preview are emitted on separate lines, first five rows each
print("Original samples (first 5):", samples[:5], sep="\n")
print("\nDiscretized samples (first 5):", discretized_samples[:5], sep="\n")

# Smallest and largest bin label across all variables
print(f"\nValue range: {discretized_samples.min()} to {discretized_samples.max()}")

Original samples (first 5):
[[ -2.62090456  14.78740943   0.48413253   4.89910089  -2.43304291
    2.33138888]
 [  2.47418582   0.45865875  -0.92419915   3.97267749   1.65871741
   -1.25402435]
 [ -0.59404646  -3.14665703   2.85125504  -1.52828653  -1.37209772
   -0.39330742]
 [ -3.03238588   7.42815195  -9.82997429  -0.17118683   8.01226121
   12.9472815 ]
 [ -2.02764326  -4.16599263 -11.52986561  -2.62243323   6.73135044
    6.40364022]]

Discretized samples (first 5):
[[1364 4942 3190 4693  999 3523]
 [4050 2677 2689 4458 2599 2041]
 [2490 1617 3932 1536 1358 2418]
 [1166 4353  303 2358 4624 4988]
 [1675 1331  155  992 4385 4607]]

Value range: 1 to 5000


In [77]:
def identification_experiment(samples_train, samples_test, bins):
    """
    Run a re-identification experiment on discretized data and print the results.

    For every category of the first variable (column 0) a Gaussian is fitted to
    the remaining variables of the training rows in that category. For a random
    subset of test rows, all test rows are then ranked by their log-density
    under the Gaussian of the target's category (rank 1 = highest density), and
    the rank of the target's own row is recorded.

    NOTE(review): the score of a candidate depends only on the target's
    category, so every target in the same category shares one ranking. The
    ranking therefore measures how typical a row is for the category rather
    than how well it matches an individual target - confirm this is intended.

    Parameters:
    samples_train : numpy.ndarray - (n_train, d) array, column 0 holds the category label
    samples_test : numpy.ndarray - (n_test, d) array with the same layout, n_test >= 2
    bins : int - number of categories; labels 1..bins are fitted

    Returns:
    tuple (top1_rate, avg_rank) - fraction of trials in which the target ranked
        first, and the mean rank of the target over all trials

    Raises:
    ValueError - if samples_test has fewer than 2 rows (no trial could be run)
    """
    d = samples_train.shape[1]
    n_test = samples_test.shape[0]
    # Small value added to the diagonal for numerical stability
    jitter = np.eye(d - 1) * 1e-6

    train_cats = samples_train[:, 0]
    train_rest = samples_train[:, 1:]

    # Overall statistics, used for categories with too few training rows and
    # for test categories that were never fitted.
    overall_stats = (train_rest.mean(axis=0), np.cov(train_rest.T) + jitter)

    # For each category of the first variable, learn the mean and covariance
    # of the remaining variables.
    cond_stats = {}
    for cat in range(1, bins + 1):
        rows = train_rest[train_cats == cat]
        if rows.shape[0] < 2:  # Need at least 2 rows for a covariance estimate
            cond_stats[cat] = overall_stats
        else:
            cond_stats[cat] = (rows.mean(axis=0), np.cov(rows.T) + jitter)

    # trials = 80% of test samples or max 2000
    trials = min(int(n_test * 0.8), 2000)
    if trials == 0:
        raise ValueError("samples_test needs at least 2 rows to run any trial")
    ids = np.random.choice(n_test, size=trials, replace=False)

    candidates = samples_test[:, 1:]
    trial_cats = samples_test[ids, 0]
    top1_success, rank_total = 0, 0

    # The ranking only depends on the category, so compute it once per
    # category and look up the rank of every target that falls in it.
    for cat in np.unique(trial_cats):
        mean, cov = cond_stats.get(cat.item(), overall_stats)
        logps = stats.multivariate_normal.logpdf(candidates, mean=mean, cov=cov)
        ranks = (-logps).argsort().argsort() + 1
        true_ranks = ranks[ids[trial_cats == cat]]
        rank_total += int(true_ranks.sum())
        top1_success += int((true_ranks == 1).sum())

    top1_rate = top1_success / trials
    avg_rank = rank_total / trials
    print(f"Top-1 Success: {top1_rate * 100:.2f}%")
    print(f"Average Rank: {avg_rank:.2f}")
    return top1_rate, avg_rank

# Hold out 20% of the discretized rows as the test set (seeded split)
dsample_train, dsample_test = train_test_split(
    discretized_samples,
    test_size=0.2,
    random_state=42,
)
# Disabled call, kept for reference:
#identification_experiment(dsample_train, dsample_test, bins=n_bins)

In [None]:
# put it all together: sweep the number of bins and run the experiment for each
n = 6
samples, mu, cov = generate_random_samples(n, scale=2.0, size=100000)
for n_bins in range(5,100,10):
    print(f"\n=== Identification Experiment with {n_bins} bins ===")

    discretized_samples = discretize_to_percentiles(samples, n_bins=n_bins)

    # Split the continuous and discretized samples in ONE call: train_test_split
    # applies the same row permutation to every array it is given, so row k of
    # continuous_* and row k of discretised_* always describe the same subject.
    continuous_train, continuous_test, discretised_train, discretised_test = train_test_split(
        samples, discretized_samples, test_size=0.4, random_state=42
    )

    identification_experiment(discretised_train, discretised_test, bins=n_bins)
    




=== Identification Experiment with 5 bins ===
Top-1 Success: 0.05%
Average Rank: 18906.61

=== Identification Experiment with 15 bins ===
Top-1 Success: 0.00%
Average Rank: 18369.84

=== Identification Experiment with 25 bins ===
Top-1 Success: 0.00%
Average Rank: 18202.49

=== Identification Experiment with 35 bins ===
Top-1 Success: 0.00%
Average Rank: 17919.06

=== Identification Experiment with 45 bins ===
Top-1 Success: 0.00%
Average Rank: 17889.94

=== Identification Experiment with 55 bins ===
Top-1 Success: 0.00%
Average Rank: 17948.10

=== Identification Experiment with 65 bins ===
Top-1 Success: 0.00%
Average Rank: 18061.37

=== Identification Experiment with 75 bins ===
Top-1 Success: 0.00%
Average Rank: 18145.35

=== Identification Experiment with 85 bins ===
Top-1 Success: 0.05%
Average Rank: 17854.94

=== Identification Experiment with 95 bins ===
Top-1 Success: 0.00%
Average Rank: 17750.05


In [36]:
# Bin the data in `x` into 10 equal-width bins and print the counts and edges.
# NOTE(review): `x` is not defined in any cell visible in this notebook, so this
# cell fails on a fresh kernel - TODO define `x` here or remove the cell.
hist, bin_edges = np.histogram(x, bins=10)
print("Histogram:", hist)
print("Bin edges:", bin_edges)

Histogram: [  1   0   6  36 145 276 320 165  47   4]
Bin edges: [-24.02273693 -19.95944988 -15.89616283 -11.83287578  -7.76958874
  -3.70630169   0.35698536   4.42027241   8.48355946  12.54684651
  16.61013356]


In [None]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal

# Seed NumPy's global generator so the draw below is repeatable
np.random.seed(42)

# Simulation parameters
N_total = 1_000_000                   # total population
N_train = 800_000                     # training cohort
N_test = N_total - N_train            # everyone not in the training cohort
D = 6                                 # number of variables
bins_list = [4, 8, 16, 36, 64, 128]   # discretization levels

# Correlated multivariate data: zero mean, unit variance on the diagonal
# and covariance 0.5 between every pair of variables
mu = np.zeros(D)
Sigma = np.full((D, D), 0.5) + 0.5 * np.eye(D)
latent = np.random.multivariate_normal(mu, Sigma, size=N_total)

# Train/test split by position: the first N_train rows train, the rest test
latent_train, latent_test = latent[:N_train], latent[N_train:]

def quantile_discretize(arr, bins):
    """Discretize each column into equal-frequency quantile bins.

    Every entry is replaced by ceil(rank * bins / n), where rank is the
    1-based rank of the entry within its column and n is the number of rows.
    The ceiling is evaluated in integer arithmetic; the floating-point form
    ceil(rank / n * bins) can overshoot by one bin when the exact value is an
    integer (e.g. 14 / 100 * 100 evaluates to 14.000000000000002).

    Equal values receive distinct ranks (the order among them is decided by
    the sort), so ties may be spread over neighbouring bins.

    Parameters:
    arr : numpy.ndarray - array of shape (n_samples, n_variables)
    bins : int - number of quantile bins

    Returns:
    numpy.ndarray - integer array shaped like arr with labels from 1 to bins
    """
    arr = np.asarray(arr)
    n = arr.shape[0]
    # Double argsort along the rows gives the 0-based rank within each column.
    ranks = arr.argsort(axis=0).argsort(axis=0)
    # Exact integer ceiling of (rank + 1) * bins / n. The largest rank maps to
    # exactly `bins`, so no clamping is required afterwards.
    return (((ranks + 1) * bins + n - 1) // n).astype(int)

def identify_accuracy(train, test, bins):
    """Estimate identifiability for given discretization.

    A single Gaussian (mean and covariance) is fitted to `train`. Every test
    row is scored by its negative Mahalanobis distance to the fitted mean, the
    highest-scoring row is taken as the guess, and the function returns the
    fraction of test subjects whose index equals that guess.

    NOTE(review): the score does not use the observed value of the test
    subject, so the guess is the same row for every subject and the result is
    always 1 / len(test). The comments in the previous version describe
    scoring candidates through the conditional distribution of the known
    variable (variable 0) given the others; that was never implemented.
    TODO: make the score depend on the subject's known variable.

    The previous version recomputed the identical score list inside a Python
    loop for every test subject (O(len(test)**2)); it is now computed once.

    Parameters:
    train : numpy.ndarray - (n_train, D) discretized training data
    test : numpy.ndarray - (n_test, D) discretized test data, n_test >= 1
    bins : int - number of bins used for discretization (currently unused,
        kept so existing callers keep working)

    Returns:
    float - fraction of test subjects identified correctly

    Raises:
    ValueError - if test has no rows
    """
    train = np.asarray(train)
    test = np.asarray(test)
    n_test = test.shape[0]
    if n_test == 0:
        raise ValueError("test must contain at least one row")

    # Estimate mean and covariance from training data
    mu_hat = train.mean(axis=0)
    Sigma_hat = np.cov(train, rowvar=False)
    # Pseudo-inverse, so a singular covariance estimate does not raise
    inv_Sigma = np.linalg.pinv(Sigma_hat)

    # Squared Mahalanobis distance of every candidate, all rows at once:
    # mdist[j] = diff[j] @ inv_Sigma @ diff[j]
    diff = test - mu_hat
    mdist = ((diff @ inv_Sigma) * diff).sum(axis=1)

    # guess = index of highest "likelihood" (smallest distance)
    guess = int(np.argmax(-mdist))

    # Count the test subjects whose own index equals the guess.
    correct = int(np.count_nonzero(np.arange(n_test) == guess))
    return correct / n_test

# Evaluate identifiability once per discretization level and tabulate the results
results = []
for bins in bins_list:
    print(f"\n=== Evaluating discretization with {bins} bins ===")
    # Discretize the whole population together, then split by position
    disc = quantile_discretize(latent, bins)
    train_disc, test_disc = disc[:N_train], disc[N_train:]
    acc = identify_accuracy(train_disc, test_disc, bins)
    print(f"Identifiability (Top-1 Accuracy): {acc*100:.2f}%")
    results += [(bins, acc)]

# One row per discretization level
df = pd.DataFrame(
    results,
    columns=["Discretization_bins", "Identifiability"],
)
print(df)


=== Evaluating discretization with 4 bins ===
