In [2]:
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import normalize
from scipy.spatial.distance import pdist, squareform

def compute_contrast_metrics(X, metric_name, preprocess=None, label=None):
    """Print summary statistics of all pairwise distances and return their contrast ratio.

    The contrast ratio is ``(D_max - D_min) / D_min`` over the condensed
    pairwise-distance vector produced by ``pdist``.

    Parameters
    ----------
    X : array-like
        Sample matrix handed to ``pdist`` (after optional preprocessing).
    metric_name : str or callable
        Forwarded to ``pdist`` as ``metric``. A callable is accepted so that
        custom distances can go through the same reporting path.
    preprocess : callable, optional
        If truthy, applied to ``X`` before distances are computed.
    label : str, optional
        Name shown in the printed header. Defaults to ``metric_name`` when it
        is a string, otherwise to the callable's ``__name__`` (or ``"custom"``).

    Returns
    -------
    float
        The contrast ratio, or ``np.nan`` when the smallest distance is not
        strictly positive or when there are no pairwise distances at all.
    """
    if preprocess:
        X_proc = preprocess(X)
    else:
        X_proc = X
    D = pdist(X_proc, metric=metric_name)

    # A callable metric has no .upper(); derive a printable header instead.
    if label is None:
        if isinstance(metric_name, str):
            label = metric_name
        else:
            label = getattr(metric_name, "__name__", "custom")
    print(f"=== {str(label).upper()} ===")

    # No pairs means min()/max() have nothing to reduce over; report NaN,
    # the same sentinel already used for an undefined ratio.
    if D.size == 0:
        print("No pairwise distances available (need at least two samples).\n")
        return np.nan

    D_min = D.min()
    D_max = D.max()
    # Duplicate points give D_min == 0 and make the ratio undefined.
    contrast_ratio = (D_max - D_min) / D_min if D_min > 0 else np.nan
    print(f"Mean: {D.mean():.4f}, Std: {D.std():.4f}")
    print(f"Min: {D_min:.4f}, Max: {D_max:.4f}")
    print(f"Contrast Ratio: {(contrast_ratio):.4f}\n")
    return contrast_ratio

# Fetch MNIST and keep only its first 1000 rows
print("Loading MNIST...")
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
# Divide by 255 so the values lie in [0, 1]
X = mnist['data'][:1000] / 255.0


def _unit_rows(x):
    """Rescale every row of ``x`` to unit L2 norm."""
    return normalize(x, norm='l2')


# Metric name -> optional preprocessing applied before computing distances.
# ``None`` means the data is used as-is.
metrics = {
    "euclidean": None,
    "cityblock": None,
    "cosine": _unit_rows,
    "correlation": None,
    "angular": _unit_rows,
}

# Custom metric: angular distance
def angular_distance(u, v):
    """Return the angle, in radians, between vectors ``u`` and ``v``.

    The cosine similarity is clipped to ``[-1, 1]`` before ``arccos`` so that
    floating-point round-off cannot push it outside the valid domain.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    cosine = np.dot(u, v) / norm_product
    return np.arccos(np.clip(cosine, -1, 1))

# Run contrast ratio analysis
for name, preprocess in metrics.items():
    if name != "angular":
        # Every other metric is a pdist built-in and goes through the shared helper.
        compute_contrast_metrics(X, name, preprocess)
        continue

    # Angular distance is supplied to pdist as a callable. Rows are scaled
    # to unit L2 norm first, so the dot product alone is the cosine.
    X_unit = normalize(X, norm='l2')
    D = pdist(
        X_unit,
        metric=lambda u, v: np.arccos(np.clip(np.dot(u, v), -1, 1)),
    )
    D_min, D_max = D.min(), D.max()
    if D_min > 0:
        contrast_ratio = (D_max - D_min) / D_min
    else:
        # A zero minimum distance leaves the ratio undefined.
        contrast_ratio = np.nan
    print(f"=== ANGULAR ===")
    print(f"Mean: {D.mean():.4f}, Std: {D.std():.4f}")
    print(f"Min: {D_min:.4f}, Max: {D_max:.4f}")
    print(f"Contrast Ratio: {contrast_ratio:.4f}\n")


Loading MNIST...
=== EUCLIDEAN ===
Mean: 10.0637, Std: 1.4156
Min: 1.1742, Max: 14.8922
Contrast Ratio: 11.6832

=== CITYBLOCK ===
Mean: 128.5313, Std: 30.0895
Min: 7.1686, Max: 250.8039
Contrast Ratio: 33.9863

=== COSINE ===
Mean: 0.6005, Std: 0.1313
Min: 0.0181, Max: 0.9657
Contrast Ratio: 52.4222

=== CORRELATION ===
Mean: 0.7021, Std: 0.1475
Min: 0.0200, Max: 1.1356
Contrast Ratio: 55.8315

=== ANGULAR ===
Mean: 1.1545, Std: 0.1495
Min: 0.1904, Max: 1.5365
Contrast Ratio: 7.0686



In [3]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from scipy.spatial.distance import pdist
import seaborn as sns

def compute_robust_contrast(distances, low_pct=1, high_pct=99):
    """Return a percentile-based contrast ratio of the strictly positive distances.

    The ratio is ``(high - low) / low`` where ``low`` and ``high`` are the
    ``low_pct``-th and ``high_pct``-th percentiles of the entries of
    ``distances`` that are greater than zero. Using percentiles instead of the
    raw min/max keeps a handful of extreme pairs from dominating the result.

    Parameters
    ----------
    distances : array-like
        Distance values; lists and tuples are accepted as well as arrays.
    low_pct, high_pct : float, optional
        Percentiles (0-100) used as the robust minimum and maximum.
        Defaults are 1 and 99.

    Returns
    -------
    float
        The robust contrast ratio, or ``np.nan`` when no entry is positive.

    Raises
    ------
    ValueError
        If the percentiles do not satisfy ``0 <= low_pct <= high_pct <= 100``.
    """
    if not 0 <= low_pct <= high_pct <= 100:
        raise ValueError("percentiles must satisfy 0 <= low_pct <= high_pct <= 100")

    # Coerce first so plain Python sequences support the boolean mask below.
    values = np.asarray(distances, dtype=float)
    # The ``> 0`` comparison is False for zeros and for NaN, so both are dropped.
    positive = values[values > 0]
    if positive.size == 0:
        return np.nan

    low, high = np.percentile(positive, [low_pct, high_pct])
    return (high - low) / low if low > 0 else np.nan

def contrast_by_metric_and_pca(data, metrics, pca_components_list):
    """Compute the robust contrast for every (PCA size, metric) combination.

    For each entry of ``pca_components_list`` the data is projected with PCA
    when the requested size is smaller than ``data.shape[1]``; otherwise the
    data is used unchanged. Note that the requested size is still what gets
    recorded under ``'pca_components'`` in that case.

    For the ``'cosine'`` metric the (possibly reduced) rows are L2-normalized
    before ``pdist`` is called; every other metric sees the rows as they are.

    Returns a list of dicts with keys ``'pca_components'``, ``'metric'`` and
    ``'robust_contrast'``, ordered by PCA size and then by metric.
    """
    records = []
    for n_components in pca_components_list:
        # Only reduce when the request is smaller than the feature count.
        reduced = (
            PCA(n_components=n_components).fit_transform(data)
            if n_components < data.shape[1]
            else data
        )

        for metric in metrics:
            points = normalize(reduced, norm='l2') if metric == 'cosine' else reduced
            pairwise = pdist(points, metric=metric)
            records.append({
                'pca_components': n_components,
                'metric': metric,
                'robust_contrast': compute_robust_contrast(pairwise),
            })

    return records

def plot_contrast(results):
    """Draw one line per metric showing robust contrast against PCA size.

    ``results`` is a list of records with the keys ``'pca_components'``,
    ``'metric'`` and ``'robust_contrast'``. The figure is displayed with
    ``plt.show()``; nothing is returned.
    """
    import pandas as pd

    frame = pd.DataFrame(results)

    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(8, 6))

    # Metrics are drawn in order of first appearance in the results.
    for metric_name in frame['metric'].unique():
        rows = frame.loc[frame['metric'] == metric_name]
        ax.plot(
            rows['pca_components'],
            rows['robust_contrast'],
            marker='o',
            label=metric_name,
        )

    ax.set_title("Robust Contrast Ratio vs PCA Dimensionality")
    ax.set_xlabel("PCA Components")
    ax.set_ylabel("Robust Contrast Ratio")
    ax.legend(title="Distance Metric")
    fig.tight_layout()
    plt.show()

# === MAIN ===
if __name__ == "__main__":
    # Load your SCOTUS embeddings (path is relative to the working directory)
    embs = np.load('scotus_embeddings.npy')

    # Sample for memory efficiency. The request is capped at the number of
    # available rows: drawing without replacement cannot return more rows
    # than exist, so a smaller file would otherwise fail here.
    np.random.seed(42)
    sample_size = min(1000, embs.shape[0])
    sample = embs[np.random.choice(embs.shape[0], size=sample_size, replace=False)]

    # Set up config
    metrics = ['euclidean', 'cosine', 'correlation']
    # NOTE(review): PCA sizes larger than the number of sampled rows may be
    # rejected by PCA itself - confirm against the embedding file in use.
    pca_components_list = [250, 500, 1000]

    # Run analysis
    results = contrast_by_metric_and_pca(sample, metrics, pca_components_list)

    # Plot
    plot_contrast(results)


FileNotFoundError: [Errno 2] No such file or directory: 'scotus_embeddings.npy'