# Can the spectral gap of a filtration predict whether a dataset is a circle or an infinity sign? 

In this notebook I assess whether the smallest nonzero eigenvalues obtained from a filtration of point-cloud data can be used to classify it. 
The goal is to distinguish point clouds sampled from $S^1$ from point clouds sampled from $S^1 \vee S^1$. 

1. For each of the two spaces, 50 datasets with 50 noisy points in each are generated. 
2. A filtration is generated from the data using Gudhi's implementation of Alpha complexes.
3. 20 uniformly spaced indices $I$ in the filtration are selected, and for each of the approximately 200 pairs of simplicial complexes $(i, j) \in I^2$ with $i \le j$, the smallest nonzero eigenvalue $\lambda^q_{i,j}$ of the persistent Laplacian is computed in each simplicial dimension $q$. 
4. A logistic regression model is trained using only the non-persistent eigenvalues in dimension 1: $(\lambda^1_{i,i})_{i \in I}$. This is the _non-persistent_ model.
5. A logistic regression model is trained using all persistent eigenvalues in dimension 1: $(\lambda^1_{i,j})_{(i,j) \in I^2}$. This is the _persistent_ model.
6. A paired t-test is run to assess whether the persistent model performs differently than the non-persistent model. 

## Generate data

In [1]:
import tadasets
# Labelled point clouds: label 0 for the `dsphere` samples (the circle case in
# this notebook) and label 1 for the `infty_sign` samples. Each class gets 50
# clouds of 50 points, seeded 0..49 so the data are reproducible.
dataset = [
    *((tadasets.dsphere(n=50, d=1, r=2, noise=0.6, seed=seed), 0) for seed in range(50)),
    *((tadasets.infty_sign(n=50, noise=0.6, seed=seed), 1) for seed in range(50)),
]

## Cross-validation scaffolding

In [2]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def run_cross_validation(
    dataset,
    feature_extractor,
    classifier=None,
    n_splits=5,
    random_state=42
):
    """
    Run shuffled k-fold cross-validation and return the accuracy of each fold.

    Every sample is converted to a feature vector with ``feature_extractor``.
    Vectors shorter than the longest one are right-padded with zeros so that
    they can be stacked into a single matrix. Within each fold the features are
    standardized using the mean and standard deviation of the training split
    only, so no information from the test split leaks into the scaling.

    Parameters
    ----------
    dataset : list of (data, label) tuples
    feature_extractor : function mapping data -> feature vector (1D numpy array)
    classifier : sklearn-like classifier (implements fit & predict). Defaults to
        LogisticRegression. The same object is re-fitted on every fold (it is
        not copied), so it is left fitted on the last training split.
    n_splits : int, number of folds
    random_state : int, seed for shuffling

    Returns
    -------
    accuracies : list of float accuracy scores per fold

    Raises
    ------
    ValueError
        If ``dataset`` contains no samples.
    """
    # Build feature matrix X and label vector y
    features = [feature_extractor(data) for data, _ in dataset]
    if not features:
        raise ValueError("dataset must contain at least one sample")

    # Pad every feature vector with trailing zeros up to the longest length
    max_len = max(f.shape[0] for f in features)
    features_padded = [np.pad(f, (0, max_len - f.shape[0]), mode='constant') for f in features]
    X = np.vstack(features_padded)
    y = np.array([label for _, label in dataset])

    # Default classifier
    if classifier is None:
        classifier = LogisticRegression(random_state=42)

    # Set up cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    accuracies = []

    # Fold loop
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Standardize features with statistics of the training split only
        mu = X_train.mean(axis=0)
        sigma = X_train.std(axis=0)
        # A column that is constant on the training split (for example a
        # zero-padded one) has sigma == 0; dividing by it would yield NaN/inf.
        # Use a unit scale for such columns so they are only mean-centred.
        sigma = np.where(sigma == 0, 1.0, sigma)
        X_train_std = (X_train - mu) / sigma
        X_test_std = (X_test - mu) / sigma

        # Train & evaluate
        classifier.fit(X_train_std, y_train)
        preds = classifier.predict(X_test_std)
        accuracies.append(accuracy_score(y_test, preds))

    return accuracies

## Nonpersistent smallest nonzero eigenvalue 

In [3]:
from persistent_laplacians.eigenvalues import compute_eigenvalues
def extract_nonpersistent_feature(data):
    """
    Build the non-persistent dimension-1 feature vector for one point cloud.

    Calls ``compute_eigenvalues`` with 20 filtration indices and keeps, from
    the dimension-1 results (``result[1]``), only the entries whose key has
    equal first and second components, i.e. the diagonal pairs ``(i, i)``.

    Parameters
    ----------
    data : the point cloud, passed unchanged to ``compute_eigenvalues``

    Returns
    -------
    numpy.ndarray
        One value per diagonal key, ordered by increasing index: the first
        element of that key's eigenvalue sequence, or 0 when the sequence is
        missing or empty.
    """
    result = compute_eigenvalues(
        data,
        num_indices=20,
        use_scipy=True,
        use_stepwise_schur=False,
        zero_tol=1e-6
    )
    # Filter result to nonpersistent dim 1 features, ordered by filtration index
    nonpersistent_dim1 = sorted(
        ((k[0], v) for k, v in result[1].items() if k[0] == k[1]),
        key=lambda x: x[0],
    )
    # Test emptiness with len() rather than truthiness: `if vec` raises for a
    # NumPy array holding more than one element.
    return np.array([
        vec[0] if vec is not None and len(vec) > 0 else 0
        for _, vec in nonpersistent_dim1
    ])

In [4]:
# Evaluate the non-persistent model with 5-fold cross-validation.
accuracies_nonpersistent = run_cross_validation(
    dataset,
    extract_nonpersistent_feature,
    classifier=LogisticRegression(),
    n_splits=5,
    random_state=42,
)

# Summarize the per-fold scores; np.std is called with its default settings.
mean_acc, std_acc = np.mean(accuracies_nonpersistent), np.std(accuracies_nonpersistent)
print(f"Cross-validated accuracies: {accuracies_nonpersistent}")
print(f"Mean accuracy: {mean_acc:.3f} ± {std_acc:.3f}")

  return pl.smallest_eigenvalue(


Cross-validated accuracies: [1.0, 0.95, 0.9, 0.7, 0.9]
Mean accuracy: 0.890 ± 0.102


## Persistent smallest nonzero eigenvalue

In [5]:
from persistent_laplacians.eigenvalues import compute_eigenvalues
def extract_persistent_feature(data):
    """
    Build the persistent dimension-1 feature vector for one point cloud.

    Calls ``compute_eigenvalues`` with 20 filtration indices and uses every
    entry of the dimension-1 results (``result[1]``), not only the diagonal
    ones.

    Parameters
    ----------
    data : the point cloud, passed unchanged to ``compute_eigenvalues``

    Returns
    -------
    numpy.ndarray
        One value per key of ``result[1]``, ordered by key: the first element
        of that key's eigenvalue sequence, or 0 when the sequence is missing
        or empty.
    """
    result = compute_eigenvalues(
        data,
        num_indices=20,
        use_scipy=True,
        use_stepwise_schur=False,
        zero_tol=1e-6
    )
    # Dictionary keys are unique, so ordering by key alone fixes the feature
    # layout. No preliminary sort on the values is needed; comparing the
    # eigenvalue sequences themselves can raise (e.g. for NumPy arrays or None).
    dim1_result = sorted(result[1].items(), key=lambda x: x[0])
    # Test emptiness with len() rather than truthiness: `if vec` raises for a
    # NumPy array holding more than one element.
    return np.array([
        vec[0] if vec is not None and len(vec) > 0 else 0
        for _, vec in dim1_result
    ])

In [6]:
# Evaluate the persistent model with the same folds (same n_splits and seed)
# as the non-persistent model, so the per-fold accuracies are paired.
accuracies_persistent = run_cross_validation(
    dataset,
    extract_persistent_feature,
    classifier=LogisticRegression(),
    n_splits=5,
    random_state=42,
)

# Summarize the per-fold scores; np.std is called with its default settings.
mean_acc, std_acc = np.mean(accuracies_persistent), np.std(accuracies_persistent)
print(f"Cross-validated accuracies: {accuracies_persistent}")
print(f"Mean accuracy: {mean_acc:.3f} ± {std_acc:.3f}")

  return pl.smallest_eigenvalue(


Cross-validated accuracies: [0.95, 0.95, 0.85, 0.7, 0.9]
Mean accuracy: 0.870 ± 0.093


## Paired t-test

In [7]:
from scipy.stats import ttest_rel 

# Compare the two models fold by fold: both accuracy lists come from the same
# cross-validation splits, so the samples are related rather than independent.
t_stat, p_val = ttest_rel(a=accuracies_persistent, b=accuracies_nonpersistent)
print(f"paired t-test p = {p_val:.3f}")

paired t-test p = 0.178
