## Simulation: subspace distance performance comparison

In [1]:
# Set up the working directory, the shared imports and the master seed.
# Later cells open paths relative to the parent directory (e.g. ./results/...),
# so the working directory is moved up one level. The flag below makes sure this
# happens only once per kernel, so the cell is safe to re-run.
import os, sys
import pickle

import numpy as np

if not globals().get("_MOVED_TO_PARENT_DIR", False):
    os.chdir('..')
    _MOVED_TO_PARENT_DIR = True
# sys.path.insert(0, os.path.abspath('..'))

# Base seed shared by the simulation and evaluation cells below
MASTER_SEED = 20241225

In [2]:
# Functions for simulation experiments
from reproducibility.simulations import Y1, Y2, Y3, Y4, logistic_normal, repeat_cv

## CKDR-$m^\star$
The **oracle** case in which the central compositional subspace dimension is well-specified

In [None]:
# Oracle CKDR runs: one repeat_cv call per (sample size, covariate generator,
# response function) combination; the return values are accumulated in `dists`.
# Runtime note from the author: 24min with 100 jobs
# NOTE(review): the second positional argument (100) looks like the dimension p that
# other cells pass to true_mat / get_true -- confirm against repeat_cv's signature.
# NOTE(review): load=False presumably forces recomputation instead of reading cached
# results, and the pickles read by the reporting cell are presumably written by this
# call under ./results/simulation/subsp_convergence/ -- confirm.
dists = []
for n in [200, 500, 1000]:
    for X_func in [logistic_normal]:
        for Y_func in [Y1, Y2, Y3, Y4]:
            dists.append(repeat_cv(n, 100, Y_func, X_func, njobs=100, reps=100, seed=MASTER_SEED, foldername="subsp_convergence", load=False))

In [3]:
# Summarise the oracle runs stored on disk.
# Each pickle is converted to a 2-D array with one row per replicate. Column 0 is
# reported as the subspace distance, column 1 as the ARI, and column 2 is compared
# with the true dimension to count rank-deficient replicates.
print("Subspace distances and ARI scores")
for X_func in [logistic_normal]:
    print(X_func.__name__, "="*10)
    for Y_func in [Y1, Y2, Y3, Y4]:
        print(" ", Y_func.__name__)
        for n in [200, 500, 1000]:
            print("    n={}".format(n))
            # The file name ends in the literal "None" (presumably because the oracle
            # runs were launched without an explicit m -- confirm against repeat_cv).
            with open("./results/simulation/subsp_convergence/op_{}_{}_{}{}.pickle".format(n, Y_func.__name__, X_func.__name__, None), "rb") as f:
                results = np.array(pickle.load(f))
            # Standard errors use the number of replicates actually stored in the
            # file instead of a hard-coded 100, so they stay correct if reps changes.
            n_reps = results.shape[0]
            col_means = results.mean(0)
            col_stds = results.std(0)
            print("\tMean distance: {:.4f}".format(col_means[0]), "+/-", "SE: {:.4f}".format(col_stds[0] / np.sqrt(n_reps)))
            print("\tMean ARI: {:.4f}".format(col_means[1]), "+/-", "SE: {:.4f}".format(col_stds[1] / np.sqrt(n_reps)))
            # True dimension m^*: 2 for Y1/Y3, 3 for Y2/Y4
            m = 2 if Y_func.__name__ in ("Y1", "Y3") else 3
            print("\tRank deficiency:", np.sum(results[:, 2] < m))

Subspace distances and ARI scores
  Y1
    n=200
	Mean distance: 0.1038 +/- SE: 0.0017
	Mean ARI: 0.9954 +/- SE: 0.0016
	Rank deficiency: 0
    n=500
	Mean distance: 0.0526 +/- SE: 0.0005
	Mean ARI: 0.9942 +/- SE: 0.0058
	Rank deficiency: 0
    n=1000
	Mean distance: 0.0339 +/- SE: 0.0003
	Mean ARI: 0.9940 +/- SE: 0.0059
	Rank deficiency: 0
  Y2
    n=200
	Mean distance: 0.5592 +/- SE: 0.0049
	Mean ARI: 0.6106 +/- SE: 0.0174
	Rank deficiency: 0
    n=500
	Mean distance: 0.4402 +/- SE: 0.0064
	Mean ARI: 0.8792 +/- SE: 0.0205
	Rank deficiency: 0
    n=1000
	Mean distance: 0.3407 +/- SE: 0.0078
	Mean ARI: 0.9433 +/- SE: 0.0148
	Rank deficiency: 0
  Y3
    n=200
	Mean distance: 0.3558 +/- SE: 0.0033
	Mean ARI: 0.4551 +/- SE: 0.0066
	Rank deficiency: 0
    n=500
	Mean distance: 0.1851 +/- SE: 0.0022
	Mean ARI: 0.7396 +/- SE: 0.0090
	Rank deficiency: 0
    n=1000
	Mean distance: 0.1202 +/- SE: 0.0011
	Mean ARI: 0.9316 +/- SE: 0.0094
	Rank deficiency: 0
  Y4
    n=200
	Mean distance: 0.6435 +

## CKDR$^*$
The target dimension $m$ is selected by cross-validation over the grid $\{3, 4, 5, 6, 7\}$.

In [None]:
# Cross-validated CKDR runs: same grid of sample sizes and response functions as the
# oracle cell, but with m="cv" and a different seed (MASTER_SEED * 2).
# Runtime note from the author: 90min with 100 jobs
# NOTE(review): the call below actually passes njobs=50, not 100 -- confirm which
# setting the timing refers to.
dists = []
for n in [200, 500, 1000]:
    for X_func in [logistic_normal]:
        for Y_func in [Y1, Y2, Y3, Y4]:  
            # m="cv": presumably tells repeat_cv to choose the target dimension by
            # cross-validation (see the markdown above) -- confirm against repeat_cv
            m = "cv"
            dists.append(repeat_cv(n, 100, Y_func, X_func, m=m, njobs=50, reps=100, seed=MASTER_SEED * 2, load=False, foldername="subsp_convergence"))

In [4]:
# Summarise the cross-validated runs stored on disk.
# Each pickle is converted to a 2-D array with one row per replicate. Column 0 is
# reported as the subspace distance, column 1 as the ARI, and column 2 is compared
# with the true dimension m^* to count rank-deficient replicates.
m = "cv"  # file-name suffix of the cross-validated runs (constant for the whole cell)
print("Subspace distances and ARI scores")
for X_func in [logistic_normal]:
    print(X_func.__name__, "="*10)
    for Y_func in [Y1, Y2, Y3, Y4]:
        print(" ", Y_func.__name__)
        for n in [200, 500, 1000]:
            print("    n={}".format(n))
            with open("./results/simulation/subsp_convergence/op_{}_{}_{}{}.pickle".format(n, Y_func.__name__, X_func.__name__, m), "rb") as f:
                results = np.array(pickle.load(f))
            # Standard errors use the number of replicates actually stored in the
            # file instead of a hard-coded 100, so they stay correct if reps changes.
            n_reps = results.shape[0]
            col_means = results.mean(0)
            col_stds = results.std(0)
            print("\tMean distance: {:.4f}".format(col_means[0]), "+/-", "SE: {:.4f}".format(col_stds[0] / np.sqrt(n_reps)))
            print("\tMean ARI: {:.4f}".format(col_means[1]), "+/-", "SE: {:.4f}".format(col_stds[1] / np.sqrt(n_reps)))
            # True dimension m^*: 2 for Y1/Y3, 3 for Y2/Y4
            mstar = 2 if Y_func.__name__ in ("Y1", "Y3") else 3
            print("\tRank deficiency:", np.sum(results[:, 2] < mstar))

Subspace distances and ARI scores
  Y1
    n=200
	Mean distance: 0.1032 +/- SE: 0.0043
	Mean ARI: 0.8063 +/- SE: 0.0195
	Rank deficiency: 0
    n=500
	Mean distance: 0.0522 +/- SE: 0.0005
	Mean ARI: 0.9468 +/- SE: 0.0134
	Rank deficiency: 0
    n=1000
	Mean distance: 0.0384 +/- SE: 0.0047
	Mean ARI: 0.9807 +/- SE: 0.0077
	Rank deficiency: 0
  Y2
    n=200
	Mean distance: 0.5588 +/- SE: 0.0047
	Mean ARI: 0.5412 +/- SE: 0.0158
	Rank deficiency: 0
    n=500
	Mean distance: 0.4319 +/- SE: 0.0099
	Mean ARI: 0.8403 +/- SE: 0.0215
	Rank deficiency: 0
    n=1000
	Mean distance: 0.3172 +/- SE: 0.0066
	Mean ARI: 0.8917 +/- SE: 0.0208
	Rank deficiency: 0
  Y3
    n=200
	Mean distance: 0.3279 +/- SE: 0.0052
	Mean ARI: 0.4540 +/- SE: 0.0068
	Rank deficiency: 0
    n=500
	Mean distance: 0.3388 +/- SE: 0.0178
	Mean ARI: 0.5847 +/- SE: 0.0119
	Rank deficiency: 0
    n=1000
	Mean distance: 0.1172 +/- SE: 0.0011
	Mean ARI: 0.9006 +/- SE: 0.0126
	Rank deficiency: 0
  Y4
    n=200
	Mean distance: 0.6589 +

In [5]:
# Report the target dimension stored in each cross-validated run's parameter file,
# next to the true dimension m^* (2 for Y1/Y3, 3 for Y2/Y4).
print("Selected target dimensions m:")
for Y_func in [Y1, Y2, Y3, Y4]:
    print(" ", Y_func.__name__)
    for n in [200, 500, 1000]:
        print("    n={}".format(n))
        # Parameter files share the naming scheme of the result files, with a
        # "param_" prefix and the "cv" suffix.
        with open("./results/simulation/subsp_convergence/param_op_{}_{}_{}cv.pickle".format(n, Y_func.__name__, logistic_normal.__name__), "rb") as f:
            params = pickle.load(f)
        # Only target_dim is printed; epsilon and sigma are read but unused in this cell.
        epsilon, sigma, target_dim = params["epsilon"], params["sigma_Z"], params["target_dim"]
        print("\t Selected m:", target_dim)
        print("\t True m^*:", 2 if Y_func.__name__ in ("Y1", "Y3") else 3)

Selected target dimensions m:
  Y1
    n=200
	 Selected m: 4
	 True m^*: 2
    n=500
	 Selected m: 3
	 True m^*: 2
    n=1000
	 Selected m: 3
	 True m^*: 2
  Y2
    n=200
	 Selected m: 4
	 True m^*: 3
    n=500
	 Selected m: 7
	 True m^*: 3
    n=1000
	 Selected m: 6
	 True m^*: 3
  Y3
    n=200
	 Selected m: 4
	 True m^*: 2
    n=500
	 Selected m: 6
	 True m^*: 2
    n=1000
	 Selected m: 3
	 True m^*: 2
  Y4
    n=200
	 Selected m: 7
	 True m^*: 3
    n=500
	 Selected m: 6
	 True m^*: 3
    n=1000
	 Selected m: 5
	 True m^*: 3


## RS-ES
Check for continuous responses (Settings (I) and (II))

In [6]:
from scipy.io import loadmat

# RS-ES estimates are stored as MATLAB files, one per (sample size, response) pair.
# Conditions whose file is found are stored in rs_es_results under the key
# (n, Y_func_name); missing files are reported and skipped.
rs_es_results = {}

# Grid of conditions to look for
sample_sizes = [200, 500, 1000]
Y_func_names = ['Y1', 'Y2']

print("Loading RS-ES results...")
for n in sample_sizes:
    for Y_func_name in Y_func_names:
        filename = f'beta_rs_n{n}_{Y_func_name}.mat'
        filepath = f'./results/simulation/rs_es_results/{filename}'

        # Nothing to load for this condition: report it and move on
        if not os.path.exists(filepath):
            print(f"  Missing: {filename}")
            continue

        mat_data = loadmat(filepath)
        rs_es_results[(n, Y_func_name)] = {'beta_all': mat_data['beta_all']}

print("Available conditions:", list(rs_es_results.keys()))

Loading RS-ES results...
Available conditions: [(200, 'Y1'), (200, 'Y2'), (500, 'Y1'), (500, 'Y2'), (1000, 'Y1'), (1000, 'Y2')]


In [7]:
def convert_to_cdr(beta_vec):
    """
    Convert a beta vector to a compositional dimension reduction (CDR) matrix

    Every coefficient is rescaled linearly between the smallest and the largest
    coefficient: column j of the result is
    ((max - beta_j) / gap, (beta_j - min) / gap) with gap = max - min, so each
    column is non-negative and sums to 1.

    beta_vec: 1D array-like representing the beta coefficients of the linear relative-shift model
    Returns: 2D numpy array (2 x d) representing the corresponding CDR matrix
    Raises: ValueError if beta_vec is empty, or if all coefficients are equal
            (the gap is zero and the rescaling is undefined)
    """
    beta = np.asarray(beta_vec, dtype=float).ravel()
    if beta.size == 0:
        raise ValueError("beta_vec must contain at least one coefficient")

    max_beta = beta.max()
    min_beta = beta.min()
    gap = max_beta - min_beta
    if gap == 0:
        # Without this guard the division below would silently fill P with NaN
        raise ValueError("beta_vec has identical coefficients; the CDR matrix is undefined")

    # Row 0: distance to the maximum, row 1: distance to the minimum (both scaled by the gap)
    return np.vstack([(max_beta - beta) / gap, (beta - min_beta) / gap])

In [10]:
# Peek at the fields loaded for one RS-ES condition. Indexing rs_es_results directly
# (rather than a leftover `data` variable) keeps this cell runnable on a fresh kernel.
rs_es_results[(sample_sizes[0], Y_func_names[0])].keys()

dict_keys(['beta_all'])

In [12]:
from sklearn.metrics import adjusted_rand_score
from functions import KQuantiles
from reproducibility.simulations import subsp_dist, get_true, true_mat

# Score the RS-ES estimates: each replicate's beta vector is turned into a 2 x d CDR
# matrix, compared with the true matrix (subspace distance) and clustered into three
# groups whose agreement with the true clustering is measured by the ARI.
print("RS-ES Results Summary")
print("="*50)

for Y_func_name in Y_func_names:
    Y_func = Y1 if Y_func_name == 'Y1' else Y2
    true_matrix = true_mat(100, Y_func)
    true_cluster = get_true(100)
    
    print(f"\n{Y_func_name}:")
    for n in sample_sizes:
        print(f"  n={n}:")
        
        data = rs_es_results[(n, Y_func_name)]
        beta_all = data['beta_all']
        # Use the number of replicates actually stored in the file (one per row)
        # instead of assuming 100; this matches the export cell further down.
        reps = beta_all.shape[0]

        # Extract beta coefficients from cell array, process them, and assess distances & ARIs
        # Note: MATLAB cell arrays come as object arrays in Python
        distances = []
        aris = []
        for i in range(reps):
            beta_vec = np.array(beta_all[i, 0]).flatten()
            P = convert_to_cdr(beta_vec)

            # Calculate subspace distance
            dist = subsp_dist(P, true_matrix)
            distances.append(dist)

            # Calculate ARI after clustering (one reproducible RNG per replicate)
            RS = np.random.RandomState(MASTER_SEED + i)
            clus = KQuantiles(n_clusters=3, random_state=RS, verbose=False)
            clus.fit(P.T)
            ari = adjusted_rand_score(true_cluster, clus.clusters)
            aris.append(ari)
        
        # Store the per-replicate scores for potential further analysis
        rs_es_results[(n, Y_func_name)]['subsp_distances'] = distances
        rs_es_results[(n, Y_func_name)]['aris'] = aris

        # Subspace distances to the true matrix (reported for Y1 only; for Y2 the
        # distance is still stored above but printed as NA)
        if Y_func_name == 'Y1':
            print("\tMean distance: {:.4f}".format(np.mean(distances)), "+/-", "SE: {:.4f}".format(np.std(distances) / np.sqrt(reps)))
        else:
            print("\tMean distance: NA")
        # Adjusted Rand Index (ARI) scores
        print("\tMean ARI: {:.4f}".format(np.mean(aris)), "+/-", "SE: {:.4f}".format(np.std(aris) / np.sqrt(reps)))

RS-ES Results Summary

Y1:
  n=200:
	Mean distance: 0.1280 +/- SE: 0.0012
	Mean ARI: 0.9949 +/- SE: 0.0015
  n=500:
	Mean distance: 0.0646 +/- SE: 0.0005
	Mean ARI: 0.9763 +/- SE: 0.0116
  n=1000:
	Mean distance: 0.0429 +/- SE: 0.0003
	Mean ARI: 0.9882 +/- SE: 0.0083

Y2:
  n=200:
	Mean distance: NA
	Mean ARI: 0.5589 +/- SE: 0.0139
  n=500:
	Mean distance: NA
	Mean ARI: 0.6826 +/- SE: 0.0212
  n=1000:
	Mean distance: NA
	Mean ARI: 0.7461 +/- SE: 0.0233


The subspace distance computed for the response Y2 is not comparable to the other cases, since the row space of the resulting matrix $P$ has a smaller dimension than the central compositional subspace.  

Technically speaking, this is because only the second principal angle is taken into account (the first principal angle, induced by the vector $1_d$, is trivial and is ignored).

## Amalgam
Requires the R package `amalgam` and an `rpy2` environment.   
To favor this discrete method, we run it in the correctly specified case, with the amalgamation dimension set to $m = m^*$.  
Since the package cannot handle zero inputs, we replace zeros using the same zero-replacement rule as described in the paper (in each sample, zeros are replaced by 50\% of the minimum positive value).

In [5]:
from reproducibility.simulations import repeat_amalgam

# Sample sizes shared by the Amalgam run cell and the Amalgam reporting cell below
n_list = [200, 500, 1000]
# Amalgam is run on the Y3 and Y4 response settings only
Y_func_list = [Y3, Y4]

In [None]:
# Amalgam runs: one repeat_amalgam call per (sample size, response function) pair;
# the return values are accumulated in results_list.
# Runtime note from the author: 3.5 hours with 100 cores
# NOTE(review): the second positional argument (100) looks like the dimension p used
# by the reporting cell -- confirm against repeat_amalgam's signature.
# NOTE(review): the reporting cell reads ./results/simulation/amalgam_results/op_*.pickle,
# presumably written by repeat_amalgam -- confirm.
results_list = []
for n in n_list:
    for Y_func in Y_func_list:
        print(f"Running repeat_amalgam for n={n}, Y_func={Y_func.__name__}")
        result = repeat_amalgam(n, 100, Y_func, njobs=100, reps=100, seed=MASTER_SEED)
        results_list.append(result)

In [None]:
from sklearn.metrics import adjusted_rand_score
from functions import KQuantiles
from reproducibility.simulations import subsp_dist, get_true, true_mat

# Dimension passed to true_mat / get_true and echoed in the printed header
p = 100

# Score the stored Amalgam estimates: every matrix W in a result file is compared
# with the true matrix (subspace distance) and clustered into three groups whose
# agreement with the true clustering is measured by the ARI.
for Y_func in Y_func_list:
    # The ground truth is built from (p, Y_func) only, so compute it once per setting
    true_matrix = true_mat(p, Y_func)
    true_cluster = get_true(p)

    for n in n_list:
        print(f"Results for n={n}, Y_func={Y_func.__name__}:")
        with open("./results/simulation/amalgam_results/op_{}_{}.pickle".format(n, Y_func.__name__), "rb") as fi:
            result_arrays = np.array(pickle.load(fi))

        distances = []
        aris = []
        for i, W in enumerate(result_arrays):
            # Calculate subspace distance
            dist = subsp_dist(W, true_matrix)
            distances.append(dist)

            # Calculate ARI after clustering (one reproducible RNG per replicate)
            RS = np.random.RandomState(MASTER_SEED + i)
            n_clusters = 3
            # Use KQuantiles for clustering
            clus = KQuantiles(n_clusters=n_clusters, random_state=RS, verbose=False)
            clus.fit(W.T)
            ari = adjusted_rand_score(true_cluster, clus.clusters)
            aris.append(ari)

        # Standard errors use the number of replicates actually scored rather than a
        # hard-coded 100, matching the Amalgam branch of the export cell.
        n_reps = len(distances)
        print("    n={}, p={}, Y={}".format(n, p, Y_func.__name__))
        print("\tMean distance: {:.4f}".format(np.mean(distances)), "+/-", "SE: {:.4f}".format(np.std(distances) / np.sqrt(n_reps)))
        print("\tMean ARI: {:.4f}".format(np.mean(aris)), "+/-", "SE: {:.4f}".format(np.std(aris) / np.sqrt(n_reps)))

Results for n=200, Y_func=Y3:
    n=200, p=100, Y=Y3
	Mean distance: 0.5619 +/- SE: 0.0043
	Mean ARI: 0.2018 +/- SE: 0.0048
Results for n=500, Y_func=Y3:
    n=500, p=100, Y=Y3
	Mean distance: 0.4251 +/- SE: 0.0048
	Mean ARI: 0.3420 +/- SE: 0.0059
Results for n=1000, Y_func=Y3:
    n=1000, p=100, Y=Y3
	Mean distance: 0.3577 +/- SE: 0.0038
	Mean ARI: 0.4059 +/- SE: 0.0044
Results for n=200, Y_func=Y4:
    n=200, p=100, Y=Y4
	Mean distance: 0.7508 +/- SE: 0.0015
	Mean ARI: 0.1742 +/- SE: 0.0053
Results for n=500, Y_func=Y4:
    n=500, p=100, Y=Y4
	Mean distance: 0.7037 +/- SE: 0.0016
	Mean ARI: 0.2527 +/- SE: 0.0060
Results for n=1000, Y_func=Y4:
    n=1000, p=100, Y=Y4
	Mean distance: 0.6664 +/- SE: 0.0018
	Mean ARI: 0.3450 +/- SE: 0.0089


## Export table

In [13]:
# Self-contained table generation with all necessary imports and functions
import os
import numpy as np
import pickle
from scipy.io import loadmat
from sklearn.metrics import adjusted_rand_score

# Set master seed
MASTER_SEED = 20241225

# Import simulation functions
from reproducibility.simulations import Y1, Y2, Y3, Y4, subsp_dist, get_true, true_mat
from functions import KQuantiles

# Load RS-ES results
def load_rs_es_results():
    """Load the RS-ES MATLAB outputs and score every replicate.

    For each (n, Y) pair whose .mat file exists, the returned dict holds, under the
    key (n, Y_func_name), the raw 'beta_all' array, the replicate count 'reps', and
    the per-replicate lists 'subsp_distances' and 'aris'. Pairs without a file are
    left out.
    """
    loaded = {}

    for n in [200, 500, 1000]:
        for Y_func_name in ['Y1', 'Y2']:
            filename = f'beta_rs_n{n}_{Y_func_name}.mat'
            filepath = f'./results/simulation/rs_es_results/{filename}'

            # No file for this condition: skip it silently
            if not os.path.exists(filepath):
                continue

            beta_all = loadmat(filepath)['beta_all']
            reps = beta_all.shape[0]
            entry = {'beta_all': beta_all, 'reps': reps}
            loaded[(n, Y_func_name)] = entry

            # Ground truth for this response setting
            Y_func = Y1 if Y_func_name == 'Y1' else Y2
            true_matrix = true_mat(100, Y_func)
            true_cluster = get_true(100)

            distances, aris = [], []
            for i in range(reps):
                beta_vec = np.array(beta_all[i, 0]).flatten()

                # CDR matrix: coefficients rescaled between their minimum and maximum
                hi = np.max(beta_vec)
                lo = np.min(beta_vec)
                gap = hi - lo
                P = np.vstack([(hi - beta_vec) / gap, (beta_vec - lo) / gap]).astype(np.float64)

                # Subspace distance to the truth
                distances.append(subsp_dist(P, true_matrix))

                # ARI of a 3-cluster KQuantiles fit, seeded per replicate
                RS = np.random.RandomState(MASTER_SEED + i)
                clus = KQuantiles(n_clusters=3, random_state=RS, verbose=False)
                clus.fit(P.T)
                aris.append(adjusted_rand_score(true_cluster, clus.clusters))

            entry['subsp_distances'] = distances
            entry['aris'] = aris

    return loaded

# Load and score the RS-ES replicates once; collect_results() below reads this
# module-level dict when it fills in the RS-ES rows.
rs_es_results = load_rs_es_results()

# Collect all results for the table
def collect_results():
    """Gather the summary statistics of every method for the final table.

    Returns a dict keyed by (method, n, Y name), where method is one of
    'CKDR-$m^*$', 'CKDR', 'RS-ES' or 'Amalgam'. Each value is a dict with the keys
    'dist_mean', 'dist_se', 'ari_mean' and 'ari_se'. Numeric entries are means and
    standard errors multiplied by 100; the string '--' marks a combination that is
    not applicable or whose results are unavailable.

    CKDR results are read from the pickles under
    ./results/simulation/subsp_convergence/, Amalgam results from
    ./results/simulation/amalgam_results/, and RS-ES results from the module-level
    rs_es_results dict.
    """
    sample_sizes = [200, 500, 1000]
    results = {}

    def missing():
        # Fresh placeholder per entry so that no two keys share one dict object
        return {'dist_mean': '--', 'dist_se': '--', 'ari_mean': '--', 'ari_se': '--'}

    def mean_and_se(values):
        # Mean and standard error of a list of per-replicate scores, both x 100
        return np.mean(values) * 100, np.std(values) / np.sqrt(len(values)) * 100

    def ckdr_summary(n, y_name, suffix):
        # Summarise one CKDR result file (column 0: distance, column 1: ARI)
        path = "./results/simulation/subsp_convergence/op_{}_{}_{}{}.pickle".format(n, y_name, "logistic_normal", suffix)
        with open(path, "rb") as f:
            res = np.array(pickle.load(f))
        # Number of replicates stored in the file, instead of a hard-coded 100,
        # so the SE matches how the RS-ES and Amalgam entries are computed.
        n_reps = res.shape[0]
        col_mean = res.mean(0)
        col_se = res.std(0) / np.sqrt(n_reps)
        return {
            'dist_mean': col_mean[0] * 100, 'dist_se': col_se[0] * 100,
            'ari_mean': col_mean[1] * 100, 'ari_se': col_se[1] * 100
        }

    def amalgam_summary(n, Y_func):
        # Score every stored Amalgam matrix against the truth and summarise
        with open("./results/simulation/amalgam_results/op_{}_{}.pickle".format(n, Y_func.__name__), "rb") as fi:
            result_arrays = np.array(pickle.load(fi))
        true_matrix = true_mat(100, Y_func)
        true_cluster = get_true(100)

        distances = []
        aris = []
        for i, W in enumerate(result_arrays):
            distances.append(subsp_dist(W, true_matrix))

            # ARI of a 3-cluster KQuantiles fit, seeded per replicate
            RS = np.random.RandomState(MASTER_SEED + i)
            clus = KQuantiles(n_clusters=3, random_state=RS, verbose=False)
            clus.fit(W.T)
            aris.append(adjusted_rand_score(true_cluster, clus.clusters))

        mean_dist, se_dist = mean_and_se(distances)
        mean_ari, se_ari = mean_and_se(aris)
        return {
            'dist_mean': mean_dist, 'dist_se': se_dist,
            'ari_mean': mean_ari, 'ari_se': se_ari
        }

    # CKDR-m* (Oracle): files carry the literal suffix "None"
    for Y_func in [Y1, Y2, Y3, Y4]:
        for n in sample_sizes:
            results[('CKDR-$m^*$', n, Y_func.__name__)] = ckdr_summary(n, Y_func.__name__, None)

    # CKDR (Cross-validated): files carry the suffix "cv"
    for Y_func in [Y1, Y2, Y3, Y4]:
        for n in sample_sizes:
            results[('CKDR', n, Y_func.__name__)] = ckdr_summary(n, Y_func.__name__, "cv")

    # RS-ES (subspace distance only for Y1, ARI for both Y1 and Y2)
    for Y_func_name in ['Y1', 'Y2']:
        for n in sample_sizes:
            if (n, Y_func_name) not in rs_es_results:
                # Fill with -- for missing results
                results[('RS-ES', n, Y_func_name)] = missing()
                continue

            scored = rs_es_results[(n, Y_func_name)]
            mean_ari, se_ari = mean_and_se(scored['aris'])
            if Y_func_name == 'Y1':
                mean_dist, se_dist = mean_and_se(scored['subsp_distances'])
            else:  # Y2: the distance is not reported
                mean_dist, se_dist = '--', '--'

            results[('RS-ES', n, Y_func_name)] = {
                'dist_mean': mean_dist, 'dist_se': se_dist,
                'ari_mean': mean_ari, 'ari_se': se_ari
            }

    # For Y3 and Y4, RS-ES is not applicable
    for Y_func_name in ['Y3', 'Y4']:
        for n in sample_sizes:
            results[('RS-ES', n, Y_func_name)] = missing()

    # Amalgam (only Y3 and Y4)
    for Y_func in [Y3, Y4]:
        for n in sample_sizes:
            try:
                results[('Amalgam', n, Y_func.__name__)] = amalgam_summary(n, Y_func)
            except Exception as e:
                # Deliberate best effort: report the problem and leave the cells as
                # '--' so the rest of the table can still be produced.
                print(f"Error loading Amalgam results for {Y_func.__name__}, n={n}: {e}")
                results[('Amalgam', n, Y_func.__name__)] = missing()

    # For Y1 and Y2, Amalgam is not applicable
    for Y_func_name in ['Y1', 'Y2']:
        for n in sample_sizes:
            results[('Amalgam', n, Y_func_name)] = missing()

    return results

# Collect results once into a module-level dict keyed by (method, n, Y name);
# the table-generation cell below reads it.
all_results = collect_results()

print("Results collected successfully!")

Results collected successfully!


In [14]:
# Display the collected summary dict as a visual sanity check before formatting the table
all_results

{('CKDR-$m^*$', 200, 'Y1'): {'dist_mean': 10.377841592809256,
  'dist_se': 0.1711307198065112,
  'ari_mean': 99.5435175653203,
  'ari_se': 0.15511198125069928},
 ('CKDR-$m^*$', 500, 'Y1'): {'dist_mean': 5.262336106620131,
  'dist_se': 0.05393145015046639,
  'ari_mean': 99.41981844802342,
  'ari_se': 0.5772733554577129},
 ('CKDR-$m^*$', 1000, 'Y1'): {'dist_mean': 3.394943860435841,
  'dist_se': 0.03458452402674914,
  'ari_mean': 99.40349414787325,
  'ari_se': 0.5935158290266943},
 ('CKDR-$m^*$', 200, 'Y2'): {'dist_mean': 55.92333593521543,
  'dist_se': 0.49315113881599054,
  'ari_mean': 61.057394483063874,
  'ari_se': 1.7372540442829827},
 ('CKDR-$m^*$', 500, 'Y2'): {'dist_mean': 44.01609017604641,
  'dist_se': 0.6424024838752485,
  'ari_mean': 87.92453851921508,
  'ari_se': 2.050654083522289},
 ('CKDR-$m^*$', 1000, 'Y2'): {'dist_mean': 34.065999068683915,
  'dist_se': 0.7755821802377287,
  'ari_mean': 94.32776296528078,
  'ari_se': 1.4808127915154656},
 ('CKDR-$m^*$', 200, 'Y3'): {'dis

In [15]:
print("\n" + "="*80)
print("FINAL CUSTOM TABLE DESIGN")
print("="*80)

def format_result(mean, se, is_best=False):
    """Render one table cell as "mean (se)" with one decimal each.

    The placeholder '--' is passed through unchanged; when is_best is true the
    cell is wrapped in \\textbf{...} for LaTeX.
    """
    # Placeholder cells carry no numbers to format
    if mean == '--':
        return '--'

    cell = "{:.1f} ({:.1f})".format(mean, se)
    return "\\textbf{" + cell + "}" if is_best else cell

def generate_final_custom_table():
    """Build the LaTeX source of the final results table.

    The table has one group of rows per setting (I)-(IV) and one row per method
    shown for that setting. Each row lists the subspace distance and the ARI
    (both already multiplied by 100 in all_results) for n = 200, 500, 1000.
    Within each (setting, n) pair the smallest distance and the largest ARI among
    the methods shown are set in bold. Entries stored as '--' in all_results are
    printed as '--' and never count as best.

    Reads the module-level all_results dict and returns the table as one string.
    """
    sample_sizes = [200, 500, 1000]

    # Setting specifications: (setting, label, methods shown for that setting).
    # The same lists drive both the rows and the search for the best values, so
    # every method with a number in the table takes part in the comparison.
    setting_specs = [
        ('Y1', '(I)', ['CKDR-$m^*$', 'CKDR', 'RS-ES']),
        ('Y2', '(II)', ['CKDR-$m^*$', 'CKDR', 'RS-ES']),
        ('Y3', '(III)', ['CKDR-$m^*$', 'CKDR', 'Amalgam']),
        ('Y4', '(IV)', ['CKDR-$m^*$', 'CKDR', 'Amalgam'])
    ]

    def has_value(method, n, y_setting, field):
        # True when a numeric entry (not the '--' placeholder) is available
        key = (method, n, y_setting)
        return key in all_results and all_results[key][field] != '--'

    def find_best(field, pick):
        # Map (setting, n) -> method whose `field` is selected by `pick` (min or max)
        best = {}
        for y_setting, _, methods in setting_specs:
            for n in sample_sizes:
                candidates = [
                    (all_results[(method, n, y_setting)][field], method)
                    for method in methods
                    if has_value(method, n, y_setting, field)
                ]
                if candidates:
                    best[(y_setting, n)] = pick(candidates, key=lambda pair: pair[0])[1]
        return best

    def format_cells(method, y_setting, mean_field, se_field, best):
        # One formatted cell per sample size for the given metric
        cells = []
        for n in sample_sizes:
            if has_value(method, n, y_setting, mean_field):
                res = all_results[(method, n, y_setting)]
                is_best = best.get((y_setting, n)) == method
                cells.append(format_result(res[mean_field], res[se_field], is_best))
            else:
                cells.append("--")
        return cells

    best_dist = find_best('dist_mean', min)   # smaller distance is better
    best_ari = find_best('ari_mean', max)     # larger ARI is better

    latex_lines = []

    # Table header
    latex_lines.append("\\begin{table}[htbp]")
    latex_lines.append("\\centering")
    latex_lines.append("\\caption{Simulation Results: Subspace Distance and ARI}")
    latex_lines.append("\\label{tab:simulation_final}")
    # 2 label columns + 3 distance columns + 3 ARI columns = 8 columns
    latex_lines.append("\\begin{tabular}{llcccccc}")
    latex_lines.append("\\toprule")

    # Column headers (Setting first, then Method)
    header1 = "Setting & Method & \\multicolumn{3}{c}{Subspace Distance $\\times 100$} & \\multicolumn{3}{c}{ARI $\\times 100$} \\\\"
    header2 = "& & $n=200$ & $n=500$ & $n=1000$ & $n=200$ & $n=500$ & $n=1000$ \\\\"
    latex_lines.append(header1)
    latex_lines.append("\\cmidrule(lr){3-5} \\cmidrule(lr){6-8}")
    latex_lines.append(header2)
    latex_lines.append("\\midrule")

    # Generate table body
    last_index = len(setting_specs) - 1
    for spec_index, (y_setting, setting_label, methods) in enumerate(setting_specs):
        for i, method_name in enumerate(methods):
            # Setting label only on the first row, spanning all rows of the setting
            if i == 0:
                setting_cell = f"\\multirow{{{len(methods)}}}{{*}}{{{setting_label}}}"
            else:
                setting_cell = ""

            row_data = [setting_cell, method_name]
            row_data += format_cells(method_name, y_setting, 'dist_mean', 'dist_se', best_dist)
            row_data += format_cells(method_name, y_setting, 'ari_mean', 'ari_se', best_ari)

            latex_lines.append(" & ".join(row_data) + " \\\\")

        # Add midrule after each setting except the last
        if spec_index != last_index:
            latex_lines.append("\\midrule")

    # Table footer
    latex_lines.append("\\bottomrule")
    latex_lines.append("\\end{tabular}")
    latex_lines.append("\\end{table}")

    return "\n".join(latex_lines)

# Generate the final custom table
final_table = generate_final_custom_table()

print("FINAL CUSTOM TABLE:")
print("-" * 50)
print(final_table)

# Save the final table; create the output directory first so the write also
# succeeds on a fresh checkout where results/tables does not exist yet.
os.makedirs("results/tables", exist_ok=True)
with open("results/tables/simulation_final_table.tex", "w") as f:
    f.write(final_table)

# Real newlines here: the previous "\\n" printed a literal backslash-n.
print("\n\nFinal custom table saved to 'simulation_final_table.tex'!")
print("\nThis design shows:")
print("- CKDR methods: All 4 settings (I-IV) with multirow")
print("- RS-ES: Only settings (I-II) for continuous responses") 
print("- Amalgam: Only settings (III-IV) for discrete responses")
print("- Optimal values are boldfaced within each setting")


FINAL CUSTOM TABLE DESIGN
FINAL CUSTOM TABLE:
--------------------------------------------------
\begin{table}[htbp]
\centering
\caption{Simulation Results: Subspace Distance and ARI}
\label{tab:simulation_final}
\begin{tabular}{llccccccc}
\toprule
Setting & Method & \multicolumn{3}{c}{Subspace Distance $\times 100$} & \multicolumn{3}{c}{ARI $\times 100$} \\
\cmidrule(lr){3-5} \cmidrule(lr){6-8}
& & $n=200$ & $n=500$ & $n=1000$ & $n=200$ & $n=500$ & $n=1000$ \\
\midrule
\multirow{3}{*}{(I)} & CKDR-$m^*$ & 10.4 (0.2) & 5.3 (0.1) & \textbf{3.4 (0.0)} & \textbf{99.5 (0.2)} & \textbf{99.4 (0.6)} & \textbf{99.4 (0.6)} \\
 & CKDR & \textbf{10.3 (0.4)} & \textbf{5.2 (0.0)} & 3.8 (0.5) & 80.6 (1.9) & 94.7 (1.3) & 98.1 (0.8) \\
 & RS-ES & 12.8 (0.1) & 6.5 (0.1) & 4.3 (0.0) & 99.5 (0.1) & 97.6 (1.2) & 98.8 (0.8) \\
\midrule
\multirow{3}{*}{(II)} & CKDR-$m^*$ & 55.9 (0.5) & 44.0 (0.6) & 34.1 (0.8) & \textbf{61.1 (1.7)} & \textbf{87.9 (2.1)} & \textbf{94.3 (1.5)} \\
 & CKDR & \textbf{55.9 (0.5)} 