In [1]:
!pip install -q openml tabpfn

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.9/128.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m70.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m

### V2: Detailed analysis of TabPFN embeddings (`evaluate_embeddings`)

In [3]:
import numpy as np
import openml
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.manifold import TSNE
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from tabpfn import TabPFNClassifier
import seaborn as sns

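# NOTE: load_dataset() and process_in_chunks() are NOT defined in this cell.
# evaluate_embeddings() calls both; they are defined in the "Revised Version of V1"
# cell, which must be executed before evaluate_embeddings() is called.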

def _standard_classifiers(k):
    """Return fresh (name, estimator) pairs for the four classifiers compared below."""
    return [
        ("KNN", KNeighborsClassifier(n_neighbors=k)),
        ("SVM", SVC(kernel='rbf')),
        ("RF", RandomForestClassifier(n_estimators=100, random_state=42)),
        ("MLP", MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)),
    ]


def _fit_and_score(model, train_X, train_y, val_X, val_y, test_X, test_y):
    """Fit `model` on the training split and return (validation accuracy, test accuracy)."""
    model.fit(train_X, train_y)
    val_acc = accuracy_score(val_y, model.predict(val_X))
    test_acc = accuracy_score(test_y, model.predict(test_X))
    return val_acc, test_acc


def evaluate_embeddings(dataset_name, use_all_ensemble_members=True):
    """
    Analyze TabPFN embeddings in detail for a dataset.

    The function loads the dataset, splits it 70/15/15, fits TabPFN on at most
    3000 training rows, reports TabPFN accuracy on the full validation/test
    splits, and then trains several scikit-learn classifiers on the extracted
    embeddings (raw, standardized, PCA-reduced, and t-SNE-projected) using
    sub-samples of at most 1000 validation/test rows. All findings are printed.

    Relies on `load_dataset` and `process_in_chunks` being defined in the
    notebook namespace before it is called.

    Args:
        dataset_name: Name of the dataset to analyze (passed to `load_dataset`)
        use_all_ensemble_members: Whether to average the embeddings of all
            ensemble members or just use the first one

    Returns:
        dict with keys "dataset", "tabpfn_val_acc", "tabpfn_test_acc" and
        "embeddings_shape" (shape of the processed training embeddings).
    """
    # Imported locally so the function does not depend on another cell (or the
    # __main__ guard) having put `torch` into the notebook namespace first.
    import torch

    print(f"\n{'='*50}\nDetailed Analysis for Dataset: {dataset_name}\n{'='*50}")

    # Load and split dataset: 70% train, 15% validation, 15% test
    X, y, _categorical_indicator, _attribute_names, _full_name = load_dataset(dataset_name)
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    # Sample if needed (TabPFN limitation)
    max_samples = 3000
    if len(X_train) > max_samples:
        rng = np.random.RandomState(42)
        train_indices = rng.choice(len(X_train), max_samples, replace=False)
        X_train_sample = X_train[train_indices]
        y_train_sample = y_train[train_indices]
    else:
        X_train_sample = X_train
        y_train_sample = y_train

    # Sub-sample validation and test sets (independently seeded) for the embedding experiments
    val_sample_size = min(1000, len(X_val))
    test_sample_size = min(1000, len(X_test))

    val_indices = np.random.RandomState(42).choice(len(X_val), val_sample_size, replace=False)
    test_indices = np.random.RandomState(43).choice(len(X_test), test_sample_size, replace=False)

    X_val_sample = X_val[val_indices]
    y_val_sample = y_val[val_indices]
    X_test_sample = X_test[test_indices]
    y_test_sample = y_test[test_indices]

    # Train TabPFN
    N_ensemble = 8
    tabpfn = TabPFNClassifier(device='cuda' if torch.cuda.is_available() else 'cpu',
                              n_estimators=N_ensemble,
                              ignore_pretraining_limits=True)

    print(f"Fitting TabPFN on {len(X_train_sample)} samples")
    tabpfn.fit(X_train_sample, y_train_sample)

    # Get TabPFN accuracy on the FULL validation/test splits (not the sub-samples)
    val_preds = process_in_chunks(tabpfn, X_val, chunk_size=max_samples, method='predict')
    test_preds = process_in_chunks(tabpfn, X_test, chunk_size=max_samples, method='predict')

    # process_in_chunks may return only the leading chunks if a later chunk
    # failed, so score against the matching prefix of the labels.
    val_accuracy = accuracy_score(y_val[:len(val_preds)], val_preds)
    test_accuracy = accuracy_score(y_test[:len(test_preds)], test_preds)

    print(f"TabPFN accuracy - Validation: {val_accuracy:.4f}, Test: {test_accuracy:.4f}")

    # Extract embeddings
    print("\nExtracting TabPFN embeddings...")
    train_emb_raw = tabpfn.get_embeddings(X_train_sample)
    val_emb_raw = tabpfn.get_embeddings(X_val_sample)
    test_emb_raw = tabpfn.get_embeddings(X_test_sample)

    print(f"Raw embedding shapes - Train: {train_emb_raw.shape}, Val: {val_emb_raw.shape}, Test: {test_emb_raw.shape}")

    # Process embeddings - either use first batch or combine all ensemble members
    if use_all_ensemble_members and len(train_emb_raw.shape) == 3 and train_emb_raw.shape[0] > 1:
        # Average across ensemble members (leading axis)
        print("Averaging embeddings across all ensemble members")
        train_embeddings = np.mean(train_emb_raw, axis=0)
        val_embeddings = np.mean(val_emb_raw, axis=0)
        test_embeddings = np.mean(test_emb_raw, axis=0)
    else:
        # Use first batch/member
        print("Using first ensemble member for embeddings")
        if len(train_emb_raw.shape) == 3:
            train_embeddings = train_emb_raw[0]
            val_embeddings = val_emb_raw[0]
            test_embeddings = test_emb_raw[0]
        else:
            train_embeddings = train_emb_raw
            val_embeddings = val_emb_raw
            test_embeddings = test_emb_raw

    print(f"Processed embedding shapes - Train: {train_embeddings.shape}, Val: {val_embeddings.shape}, Test: {test_embeddings.shape}")

    # Per-dimension statistics of the training embeddings
    print("\nEmbedding Statistics:")
    emb_mean = np.mean(train_embeddings, axis=0)
    emb_std = np.std(train_embeddings, axis=0)
    print(f"  Mean range: [{np.min(emb_mean):.4f}, {np.max(emb_mean):.4f}]")
    print(f"  Std range: [{np.min(emb_std):.4f}, {np.max(emb_std):.4f}]")
    print(f"  Overall min: {np.min(train_embeddings):.4f}, max: {np.max(train_embeddings):.4f}")

    # Try multiple classifiers on the embeddings
    print("\nTesting different classifiers on TabPFN embeddings:")

    # k follows the sqrt(n) rule of thumb, with a floor of 5
    k = max(5, int(np.sqrt(len(train_embeddings))))

    # 1-4. KNN with different metrics, then SVM, Random Forest and a small MLP
    unscaled_models = [
        (f"KNN ({metric})", KNeighborsClassifier(n_neighbors=k, metric=metric))
        for metric in ['euclidean', 'manhattan', 'cosine']
    ] + [
        ("SVM (RBF)", SVC(kernel='rbf')),
        ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
        ("Neural Network", MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)),
    ]
    for label, model in unscaled_models:
        val_acc, test_acc = _fit_and_score(model, train_embeddings, y_train_sample,
                                           val_embeddings, y_val_sample,
                                           test_embeddings, y_test_sample)
        print(f"  {label} - Val: {val_acc:.4f}, Test: {test_acc:.4f}")

    # 5. Try with standardized embeddings
    print("\nTesting with standardized embeddings:")
    scaler = StandardScaler()
    train_emb_scaled = scaler.fit_transform(train_embeddings)
    val_emb_scaled = scaler.transform(val_embeddings)
    test_emb_scaled = scaler.transform(test_embeddings)

    # Test accuracies are kept so the raw-vs-embedding comparison below can
    # reuse them instead of refitting identical, deterministically seeded models.
    scaled_emb_test_acc = {}
    for model_name, model in _standard_classifiers(k):
        val_acc, test_acc = _fit_and_score(model, train_emb_scaled, y_train_sample,
                                           val_emb_scaled, y_val_sample,
                                           test_emb_scaled, y_test_sample)
        scaled_emb_test_acc[model_name] = test_acc
        print(f"  {model_name} (standardized) - Val: {val_acc:.4f}, Test: {test_acc:.4f}")

    # 6. Try with PCA-reduced embeddings
    print("\nTesting PCA-reduced embeddings with varying components:")

    n_components_list = [2, 5, 10, 20, 50, 100]

    for n_components in n_components_list:
        # Skip component counts the data cannot support
        if n_components >= min(train_embeddings.shape):
            continue

        # Seeded so a randomized SVD solver (if selected) gives repeatable results
        pca = PCA(n_components=n_components, random_state=42)
        train_emb_pca = pca.fit_transform(train_embeddings)
        val_emb_pca = pca.transform(val_embeddings)
        test_emb_pca = pca.transform(test_embeddings)

        # Fraction of variance retained by the kept components
        var_explained = np.sum(pca.explained_variance_ratio_)

        for model_name, model in _standard_classifiers(k):
            val_acc, test_acc = _fit_and_score(model, train_emb_pca, y_train_sample,
                                               val_emb_pca, y_val_sample,
                                               test_emb_pca, y_test_sample)
            print(f"  {model_name} (PCA-{n_components}, {var_explained:.4f} var) - Val: {val_acc:.4f}, Test: {test_acc:.4f}")

    # 7. Project a subset with t-SNE (no figure is drawn; only KNN separability is reported)
    print("\nGenerating t-SNE visualization...")

    # Limit size for speed. The actual counts are tracked so the joint
    # projection is split back correctly even when fewer than 500 rows exist.
    n_tsne_train = min(500, len(train_embeddings))
    n_tsne_val = min(500, len(val_embeddings))
    combined_emb = np.vstack([train_embeddings[:n_tsne_train], val_embeddings[:n_tsne_val]])

    # Apply t-SNE to train and validation rows jointly
    tsne = TSNE(n_components=2, random_state=42)
    embedded = tsne.fit_transform(combined_emb)

    # Split back
    train_embedded = embedded[:n_tsne_train]
    val_embedded = embedded[n_tsne_train:]

    # Analyze class separation in t-SNE space
    print("Class separation analysis in t-SNE space:")
    knn_tsne = KNeighborsClassifier(n_neighbors=k)
    knn_tsne.fit(train_embedded, y_train_sample[:n_tsne_train])
    val_acc_tsne = accuracy_score(y_val_sample[:n_tsne_val], knn_tsne.predict(val_embedded))
    print(f"  KNN accuracy on t-SNE: {val_acc_tsne:.4f}")

    # Analyze raw vs. TabPFN embeddings
    print("\nComparing raw features vs TabPFN embeddings:")

    # Normalize raw features
    scaler_raw = StandardScaler()
    X_train_scaled = scaler_raw.fit_transform(X_train_sample)
    X_val_scaled = scaler_raw.transform(X_val_sample)
    X_test_scaled = scaler_raw.transform(X_test_sample)

    # Compare the same model families on standardized raw features vs standardized embeddings
    for model_name, model_raw in _standard_classifiers(k):
        _, raw_test_acc = _fit_and_score(model_raw, X_train_scaled, y_train_sample,
                                         X_val_scaled, y_val_sample,
                                         X_test_scaled, y_test_sample)
        emb_test_acc = scaled_emb_test_acc[model_name]

        print(f"  {model_name} - Raw: {raw_test_acc:.4f}, Embeddings: {emb_test_acc:.4f}, Diff: {emb_test_acc-raw_test_acc:.4f}")

    # Analyze confusion matrices
    print("\nConfusion matrix analysis:")

    # Get predictions from TabPFN and KNN on the validation sub-sample
    tabpfn_val_preds = tabpfn.predict(X_val_sample)

    knn_raw = KNeighborsClassifier(n_neighbors=k)
    knn_raw.fit(X_train_scaled, y_train_sample)
    knn_raw_val_preds = knn_raw.predict(X_val_scaled)

    knn_emb = KNeighborsClassifier(n_neighbors=k)
    knn_emb.fit(train_emb_scaled, y_train_sample)
    knn_emb_val_preds = knn_emb.predict(val_emb_scaled)

    # Calculate confusion matrices
    cm_tabpfn = confusion_matrix(y_val_sample, tabpfn_val_preds)
    cm_knn_raw = confusion_matrix(y_val_sample, knn_raw_val_preds)
    cm_knn_emb = confusion_matrix(y_val_sample, knn_emb_val_preds)

    # Print diagonal elements (correct classifications per class)
    print("Correct classifications per class (diagonal of confusion matrix):")
    print("  TabPFN:", np.diag(cm_tabpfn))
    print("  KNN-Raw:", np.diag(cm_knn_raw))
    print("  KNN-Emb:", np.diag(cm_knn_emb))

    return {
        "dataset": dataset_name,
        "tabpfn_val_acc": val_accuracy,
        "tabpfn_test_acc": test_accuracy,
        "embeddings_shape": train_embeddings.shape,
        # Add other metrics as needed
    }

# Main function
if __name__ == "__main__":
    import torch

    # Seed PyTorch and NumPy's global generators so repeated runs are comparable.
    torch.manual_seed(42)
    np.random.seed(42)

    # Run the detailed embedding analysis for one dataset.
    dataset_name = 'har'  # any name accepted by load_dataset
    results = evaluate_embeddings(dataset_name)

    print("\nAnalysis complete!")


Detailed Analysis for Dataset: har


  X, y, categorical_indicator, attribute_names = dataset.get_data(


Fitting TabPFN on 3000 samples


  X, y, feature_names_in, n_features_in = validate_Xy_fit(


TabPFN accuracy - Validation: 0.9877, Test: 0.9903

Extracting TabPFN embeddings...
Raw embedding shapes - Train: (8, 3000, 192), Val: (8, 1000, 192), Test: (8, 1000, 192)
Averaging embeddings across all ensemble members
Processed embedding shapes - Train: (3000, 192), Val: (1000, 192), Test: (1000, 192)

Embedding Statistics:
  Mean range: [-6.7500, 5.8633]
  Std range: [0.0782, 1.5527]
  Overall min: -8.5547, max: 8.5703

Testing different classifiers on TabPFN embeddings:
  KNN (euclidean) - Val: 0.9900, Test: 0.9860
  KNN (manhattan) - Val: 0.9890, Test: 0.9870
  KNN (cosine) - Val: 0.9900, Test: 0.9860
  SVM (RBF) - Val: 0.9890, Test: 0.9880
  Random Forest - Val: 0.9890, Test: 0.9860
  Neural Network - Val: 0.9910, Test: 0.9880

Testing with standardized embeddings:
  KNN (standardized) - Val: 0.9900, Test: 0.9860
  SVM (standardized) - Val: 0.9910, Test: 0.9880
  RF (standardized) - Val: 0.9890, Test: 0.9860
  MLP (standardized) - Val: 0.9920, Test: 0.9880

Testing PCA-reduced e

### Revised Version of V1: KNN on raw features vs. TabPFN embeddings

In [10]:
import numpy as np
import openml
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from tabpfn import TabPFNClassifier

# Load datasets from OpenML (keeping same function)
def load_dataset(dataset_name):
    """
    Fetch one of the supported OpenML datasets by short name.

    Args:
        dataset_name: One of 'airlines', 'albert', 'volkert', 'higgs', 'har'.

    Returns:
        Tuple (X, y, categorical_indicator, attribute_names, dataset.name) where
        the first four items are exactly what `dataset.get_data` returns.

    Raises:
        ValueError: If `dataset_name` is not one of the supported names.
    """
    # Short name -> OpenML dataset id
    openml_ids = (
        ('airlines', 1169),
        ('albert', 189356),
        ('volkert', 41166),
        ('higgs', 44129),
        ('har', 1478),
    )
    matching_ids = [data_id for known_name, data_id in openml_ids if dataset_name == known_name]
    if not matching_ids:
        raise ValueError(f"Unknown dataset: {dataset_name}")

    dataset = openml.datasets.get_dataset(matching_ids[0])

    # NOTE(review): the recorded cell output shows a warning originating from this
    # get_data call - presumably about dataset_format="array"; confirm against the
    # installed openml version.
    features, target, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="array", target=dataset.default_target_attribute
    )

    return features, target, categorical_indicator, attribute_names, dataset.name

# Process data in chunks (improved with consistent error handling)
def process_in_chunks(model, X, chunk_size=3000, method='predict'):
    """
    Process large datasets in chunks of maximum size 3000 (TabPFN limitation).

    Rows of `X` are sliced into consecutive chunks of at most `chunk_size`, each
    chunk is passed to `model.predict` or `model.get_embeddings`, and the
    per-chunk outputs are joined along their sample axis.

    Args:
        model: Object exposing the requested method (`predict` / `get_embeddings`).
        X: Sliceable collection of samples (supports `len` and `X[a:b]`).
        chunk_size: Maximum number of rows per call; must be positive.
        method: 'predict' or 'get_embeddings'.

    Returns:
        numpy array with the combined results. If a chunk after the first one
        fails, the error is printed and only the results of the preceding
        chunks are returned, so the output may be shorter than `X`.

    Raises:
        ValueError: Unknown `method`, non-positive `chunk_size`, or no results
            to combine (e.g. empty `X`).
        Exception: Whatever the model raised, if the very first chunk fails.
    """
    if method not in ('predict', 'get_embeddings'):
        raise ValueError(f"Unknown method: {method}")
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    run_chunk = getattr(model, method)
    n_samples = len(X)
    all_results = []

    for i, start_idx in enumerate(range(0, n_samples, chunk_size)):
        X_chunk = X[start_idx:start_idx + chunk_size]
        try:
            all_results.append(np.asarray(run_chunk(X_chunk)))
        except Exception as e:
            print(f"Error processing chunk {i}: {e}")
            # Best effort: return partial results if we have some
            if all_results:
                break
            raise

    if not all_results:
        raise ValueError("Unable to concatenate results")

    # 3-D outputs are treated as (n_estimators, n_samples, dim) - the layout
    # TabPFN's get_embeddings printed in this notebook - so samples live on
    # axis 1. Everything else (1-D predictions, 2-D features) joins on axis 0.
    sample_axis = 1 if all_results[0].ndim == 3 else 0

    try:
        return np.concatenate(all_results, axis=sample_axis)
    except ValueError as e:
        print(f"Concatenation error: {e}")

        def non_sample_dims(arr):
            # Shape with the sample axis removed; must agree across chunks
            return arr.shape[:sample_axis] + arr.shape[sample_axis + 1:]

        # Keep only chunks whose layout matches the first chunk
        reference = all_results[0]
        compatible = [
            r for r in all_results
            if r.ndim == reference.ndim and non_sample_dims(r) == non_sample_dims(reference)
        ]
        return np.concatenate(compatible, axis=sample_axis)

# Improved KNN classifier evaluation
def evaluate_knn_classifier(X_train, y_train, X_val, y_val, X_test, y_test,
                            train_embeddings=None, val_embeddings=None, test_embeddings=None):
    """
    Score a KNN classifier on raw features and, optionally, on embeddings.

    Raw features are always evaluated after standardization. When all three
    embedding arrays are given, KNN is additionally evaluated on standardized
    embeddings and on PCA reductions of them; the PCA setting with the highest
    validation accuracy is recorded. Embedding labels are taken as the leading
    `len(embeddings)` entries of the corresponding label array.

    Returns:
        dict of accuracies: 'knn_raw_*' always; 'knn_emb_*' when embeddings are
        supplied; 'knn_pca_*' and 'pca_variance_explained' when at least one
        PCA configuration scored above zero validation accuracy.
    """
    results = {}

    def standardize(train_feats, val_feats, test_feats):
        # Fit the scaler on the training split only, then apply it to all splits.
        feature_scaler = StandardScaler()
        return (feature_scaler.fit_transform(train_feats),
                feature_scaler.transform(val_feats),
                feature_scaler.transform(test_feats))

    def knn_accuracies(train_feats, train_labels, val_feats, val_labels, test_feats, test_labels):
        # Fit KNN with the shared k and score validation first, then test.
        classifier = KNeighborsClassifier(n_neighbors=k)
        classifier.fit(train_feats, train_labels)
        val_score = accuracy_score(val_labels, classifier.predict(val_feats))
        test_score = accuracy_score(test_labels, classifier.predict(test_feats))
        return val_score, test_score

    # --- 1. KNN on standardized raw features ---
    print("\nEvaluating KNN on raw features:")

    # k = sqrt(n) with a floor of 5
    k = max(5, int(np.sqrt(len(X_train))))
    print(f"Using k={k} for KNN classification")

    X_train_scaled, X_val_scaled, X_test_scaled = standardize(X_train, X_val, X_test)
    raw_val_acc, raw_test_acc = knn_accuracies(X_train_scaled, y_train,
                                               X_val_scaled, y_val,
                                               X_test_scaled, y_test)

    print(f"  Validation accuracy (raw features): {raw_val_acc:.4f}")
    print(f"  Test accuracy (raw features): {raw_test_acc:.4f}")

    results['knn_raw_val_accuracy'] = raw_val_acc
    results['knn_raw_test_accuracy'] = raw_test_acc

    # Nothing more to do unless all three embedding arrays were supplied
    if train_embeddings is None or val_embeddings is None or test_embeddings is None:
        return results

    # --- 2. KNN on standardized embeddings ---
    print("\nEvaluating KNN on TabPFN embeddings:")

    # There must be a label for every embedding row
    assert len(train_embeddings) <= len(y_train), "Too many training embeddings"
    assert len(val_embeddings) <= len(y_val), "Too many validation embeddings"
    assert len(test_embeddings) <= len(y_test), "Too many test embeddings"

    # Labels for the embeddings are the leading entries of each label array
    y_train_emb = y_train[:len(train_embeddings)]
    y_val_emb = y_val[:len(val_embeddings)]
    y_test_emb = y_test[:len(test_embeddings)]

    print(f"  Using {len(y_train_emb)} training labels, {len(y_val_emb)} validation labels, {len(y_test_emb)} test labels")
    print(f"  Label distribution in train: {np.unique(y_train_emb, return_counts=True)}")

    train_embeddings_std, val_embeddings_std, test_embeddings_std = standardize(
        train_embeddings, val_embeddings, test_embeddings
    )
    emb_val_acc, emb_test_acc = knn_accuracies(train_embeddings_std, y_train_emb,
                                               val_embeddings_std, y_val_emb,
                                               test_embeddings_std, y_test_emb)

    print(f"  Validation accuracy (standardized embeddings): {emb_val_acc:.4f}")
    print(f"  Test accuracy (standardized embeddings): {emb_test_acc:.4f}")

    results['knn_emb_val_accuracy'] = emb_val_acc
    results['knn_emb_test_accuracy'] = emb_test_acc

    # --- 3. KNN on PCA-reduced standardized embeddings ---
    print("\nEvaluating KNN on PCA-reduced TabPFN embeddings:")

    # Fixed component counts, restricted to those below the embedding width
    embedding_width = train_embeddings.shape[1]
    pca_components = [count for count in (2, 5, 10, 20, 50, 100) if count < embedding_width]

    # (n_components, val accuracy, test accuracy, variance explained) of the best run so far
    best_run = None

    print(f"  Testing PCA components: {pca_components}")

    for n_components in pca_components:
        try:
            reducer = PCA(n_components=n_components, random_state=42)
            train_reduced = reducer.fit_transform(train_embeddings_std)
            val_reduced = reducer.transform(val_embeddings_std)
            test_reduced = reducer.transform(test_embeddings_std)

            pca_val_acc, pca_test_acc = knn_accuracies(train_reduced, y_train_emb,
                                                       val_reduced, y_val_emb,
                                                       test_reduced, y_test_emb)

            var_explained = np.sum(reducer.explained_variance_ratio_)

            print(f"  PCA components={n_components}: Val accuracy={pca_val_acc:.4f}, "
                  f"Test accuracy={pca_test_acc:.4f}, Variance explained={var_explained:.4f}")

            # Model selection is by validation accuracy; a run must beat 0 to count
            accuracy_to_beat = 0 if best_run is None else best_run[1]
            if pca_val_acc > accuracy_to_beat:
                best_run = (n_components, pca_val_acc, pca_test_acc, var_explained)

        except Exception as e:
            print(f"  Error with n_components={n_components}: {e}")
            continue

    # Record best PCA results
    if best_run is not None:
        best_n_components, best_val_accuracy, best_test_accuracy, best_variance = best_run
        print(f"  Best PCA results: components={best_n_components}, "
              f"val_accuracy={best_val_accuracy:.4f}, test_accuracy={best_test_accuracy:.4f}")
        results['knn_pca_best_components'] = best_n_components
        results['knn_pca_val_accuracy'] = best_val_accuracy
        results['knn_pca_test_accuracy'] = best_test_accuracy
        results['pca_variance_explained'] = best_variance

    return results

# Main analysis function with consistent random seed handling
def analyze_dataset(dataset_name):
    """
    Run the TabPFN / KNN comparison for one dataset and return a flat summary.

    Steps: load the dataset, split 70/15/15, fit TabPFN on at most 3000
    training rows, score TabPFN on the full validation and test splits (with a
    chunked fallback), extract embeddings for the training sample and for
    sub-samples of at most 1000 validation/test rows, and hand everything to
    `evaluate_knn_classifier`.

    Returns:
        dict with the dataset name, split sizes, TabPFN accuracies
        ('full_val_accuracy', 'full_test_accuracy'; NaN if prediction failed)
        and the KNN metrics (NaN where `evaluate_knn_classifier` did not
        report a value).
    """
    print(f"\n{'='*50}\nAnalyzing dataset: {dataset_name}\n{'='*50}")

    # One seed drives the global NumPy state, the splits and the sub-sampling
    random_seed = 42
    np.random.seed(random_seed)

    X, y, _, _, full_name = load_dataset(dataset_name)

    # 70/15/15 split: carve off 30%, then halve it into validation and test
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=random_seed)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=random_seed)

    print(f"Dataset shapes:")
    for split_label, split_X, split_y in (("Train:", X_train, y_train),
                                          ("Val:  ", X_val, y_val),
                                          ("Test: ", X_test, y_test)):
        print(f"  {split_label} {split_X.shape}, {np.unique(split_y, return_counts=True)}")

    # TabPFN can only handle 3000 samples at a time
    max_samples = 3000

    # Dedicated generator: used for the train sub-sample first, then val, then test
    rng = np.random.RandomState(random_seed)

    if len(X_train) > max_samples:
        chosen_rows = rng.choice(len(X_train), max_samples, replace=False)
        X_train_sample, y_train_sample = X_train[chosen_rows], y_train[chosen_rows]
        print(f"Randomly selected {max_samples} samples from {len(X_train)} training samples")
    else:
        X_train_sample, y_train_sample = X_train, y_train
        print(f"Using all {len(X_train)} training samples")

    # Build TabPFN; pick the device via torch when torch can be imported
    N_ensemble = 8  # Number of models in ensemble
    shared_kwargs = dict(n_estimators=N_ensemble, ignore_pretraining_limits=True)
    try:
        import torch
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        tabpfn = TabPFNClassifier(device=device, **shared_kwargs)
    except ImportError:
        # Fallback if no torch
        tabpfn = TabPFNClassifier(**shared_kwargs)

    print(f"Fitting TabPFN on {len(X_train_sample)} samples")
    tabpfn.fit(X_train_sample, y_train_sample)

    def accuracy_with_fallback(features, labels, split_name):
        # Try a single predict call; on failure retry in chunks; NaN if both fail.
        try:
            return accuracy_score(labels, tabpfn.predict(features))
        except Exception as e:
            print(f"Error during {split_name} prediction: {e}")
            try:
                chunked_preds = process_in_chunks(tabpfn, features, chunk_size=max_samples, method='predict')
                # Chunked prediction may return only a prefix of the rows
                return accuracy_score(labels[:len(chunked_preds)], chunked_preds)
            except Exception as e2:
                print(f"Chunked prediction also failed: {e2}")
                return np.nan

    val_accuracy = accuracy_with_fallback(X_val, y_val, "validation")
    test_accuracy = accuracy_with_fallback(X_test, y_test, "test")

    print(f"TabPFN on initial training set:")
    print(f"  Validation accuracy: {val_accuracy:.4f}")
    print(f"  Test accuracy: {test_accuracy:.4f}")

    print("Extracting embeddings from TabPFN")

    def draw_subset(features, labels):
        # Sample up to 1000 rows without replacement from the shared generator
        subset_size = min(1000, len(features))
        subset_rows = rng.choice(len(features), subset_size, replace=False)
        return features[subset_rows], labels[subset_rows]

    X_val_sample, y_val_sample = draw_subset(X_val, y_val)
    X_test_sample, y_test_sample = draw_subset(X_test, y_test)

    # Embeddings for train, validation and test, in that order
    raw_embeddings = [tabpfn.get_embeddings(part)
                      for part in (X_train_sample, X_val_sample, X_test_sample)]
    train_embeddings_raw, val_embeddings_raw, test_embeddings_raw = raw_embeddings

    print(f"Embeddings shapes - Train: {train_embeddings_raw.shape}, "
          f"Val: {val_embeddings_raw.shape}, Test: {test_embeddings_raw.shape}")

    # Collapse the leading ensemble axis, if there is one
    has_ensemble_axis = len(train_embeddings_raw.shape) == 3
    if has_ensemble_axis and train_embeddings_raw.shape[0] > 1:
        print("Averaging embeddings across ensemble members")
        train_embeddings, val_embeddings, test_embeddings = (
            np.mean(emb, axis=0) for emb in raw_embeddings
        )
    elif has_ensemble_axis:
        # Single member: take it directly
        train_embeddings, val_embeddings, test_embeddings = (emb[0] for emb in raw_embeddings)
    else:
        # Already 2-D: use as returned
        train_embeddings, val_embeddings, test_embeddings = raw_embeddings

    print(f"Processed embedding shapes - Train: {train_embeddings.shape}, "
          f"Val: {val_embeddings.shape}, Test: {test_embeddings.shape}")

    # Warn (but continue) if the embedding widths disagree between splits
    widths_match = train_embeddings.shape[1] == val_embeddings.shape[1] == test_embeddings.shape[1]
    if not widths_match:
        print(f"WARNING: Feature dimensions don't match! Train: {train_embeddings.shape[1]}, " +
              f"Val: {val_embeddings.shape[1]}, Test: {test_embeddings.shape[1]}")

    knn_results = evaluate_knn_classifier(
        X_train_sample, y_train_sample,
        X_val_sample, y_val_sample,
        X_test_sample, y_test_sample,
        train_embeddings, val_embeddings, test_embeddings
    )

    summary = {
        'dataset': full_name,
        'full_train_size': len(X_train),
        'sample_train_size': len(X_train_sample),
        'full_val_accuracy': val_accuracy,
        'full_test_accuracy': test_accuracy,
    }
    # Copy the KNN metrics, defaulting to NaN for anything not reported
    for metric_key in ('knn_raw_val_accuracy', 'knn_raw_test_accuracy',
                       'knn_emb_val_accuracy', 'knn_emb_test_accuracy',
                       'knn_pca_best_components', 'knn_pca_val_accuracy',
                       'knn_pca_test_accuracy', 'pca_variance_explained'):
        summary[metric_key] = knn_results.get(metric_key, np.nan)
    return summary

# Main execution function
def main(dataset_names=None,
         output_path='/content/drive/MyDrive/tabpfn-knn-results.csv',
         random_seed=42):
    """
    Run the TabPFN / KNN embedding benchmark over several datasets.

    Each dataset name is passed to ``analyze_dataset``. A one-row summary is
    printed as soon as a dataset finishes, and all collected results are
    written to a CSV file at the end. A dataset whose analysis raises is
    reported (message plus traceback) and skipped so the remaining datasets
    still run.

    Args:
        dataset_names: Iterable of dataset names to process. Defaults to
            ['har', 'volkert', 'higgs', 'airlines', 'albert'].
        output_path: Destination of the summary CSV. If it cannot be written
            (for example because its directory does not exist), the CSV is
            saved under the same file name in the current working directory
            instead, so a long run is not lost.
        random_seed: Seed applied to NumPy and PyTorch before processing.

    Returns:
        pandas.DataFrame built from the results of the datasets that were
        processed without raising.
    """
    import os
    import traceback

    import torch
    import pandas as pd

    # Fix random seed for reproducibility
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)

    if dataset_names is None:
        dataset_names = ['har', 'volkert', 'higgs', 'airlines', 'albert']

    # Columns shown in the per-dataset summary printed during the run
    cols_to_show = ['dataset', 'full_test_accuracy',
                    'knn_raw_test_accuracy', 'knn_emb_test_accuracy',
                    'knn_pca_best_components', 'knn_pca_test_accuracy', 'pca_variance_explained']

    # Store results
    results = []

    # Process each dataset
    for dataset_name in tqdm(dataset_names, desc="Processing datasets"):
        # Only the analysis itself is guarded, so a display problem is never
        # misreported as a dataset failure.
        try:
            result = analyze_dataset(dataset_name)
        except Exception as e:
            print(f"Error processing dataset {dataset_name}: {e}")
            traceback.print_exc()
            continue

        results.append(result)
        print("\n\n=== SUMMARY OF RESULTS ===")
        # reindex (rather than [] selection) shows NaN for a missing column
        # instead of raising KeyError.
        print(pd.DataFrame([result]).reindex(columns=cols_to_show))

    # Create summary dataframe
    results_df = pd.DataFrame(results)
    try:
        results_df.to_csv(output_path, index=False)
    except OSError as e:
        # Typical cause: the target directory (e.g. an unmounted Drive folder)
        # does not exist. Keep the results by writing next to the notebook.
        fallback_path = os.path.basename(os.fspath(output_path))
        if not fallback_path or fallback_path == output_path:
            raise
        print(f"Could not write results to {output_path}: {e}")
        print(f"Saving results to {fallback_path} instead")
        results_df.to_csv(fallback_path, index=False)

    return results_df

# Entry point: call main() only when this code executes as the top-level
# module (i.e. not when it is imported from elsewhere).
if __name__ == "__main__":
    main()

  X, y, categorical_indicator, attribute_names = dataset.get_data(



Analyzing dataset: har
Dataset shapes:
  Train: (7209, 561), (array([0, 1, 2, 3, 4, 5]), array([1183, 1120,  986, 1214, 1359, 1347]))
  Val:   (1545, 561), (array([0, 1, 2, 3, 4, 5]), array([255, 198, 217, 294, 280, 301]))
  Test:  (1545, 561), (array([0, 1, 2, 3, 4, 5]), array([284, 226, 203, 269, 267, 296]))
Randomly selected 3000 samples from 7209 training samples
Fitting TabPFN on 3000 samples


  X, y, feature_names_in, n_features_in = validate_Xy_fit(


TabPFN on initial training set:
  Validation accuracy: 0.9877
  Test accuracy: 0.9903
Extracting embeddings from TabPFN
Embeddings shapes - Train: (8, 3000, 192), Val: (8, 1000, 192), Test: (8, 1000, 192)
Averaging embeddings across ensemble members
Processed embedding shapes - Train: (3000, 192), Val: (1000, 192), Test: (1000, 192)

Evaluating KNN on raw features:
Using k=54 for KNN classification
  Validation accuracy (raw features): 0.8920
  Test accuracy (raw features): 0.9100

Evaluating KNN on TabPFN embeddings:
  Using 3000 training labels, 1000 validation labels, 1000 test labels
  Label distribution in train: (array([0, 1, 2, 3, 4, 5]), array([513, 446, 403, 511, 569, 558]))
  Validation accuracy (standardized embeddings): 0.9900
  Test accuracy (standardized embeddings): 0.9870

Evaluating KNN on PCA-reduced TabPFN embeddings:
  Testing PCA components: [2, 5, 10, 20, 50, 100]
  PCA components=2: Val accuracy=0.9200, Test accuracy=0.9290, Variance explained=0.5309
  PCA compon

Processing datasets:  20%|██        | 1/5 [10:39<42:38, 639.52s/it]

  PCA components=100: Val accuracy=0.9900, Test accuracy=0.9870, Variance explained=0.9996
  Best PCA results: components=10, val_accuracy=0.9900, test_accuracy=0.9860


=== SUMMARY OF RESULTS ===
  dataset  full_test_accuracy  knn_raw_test_accuracy  knn_emb_test_accuracy  \
0     har            0.990291                   0.91                  0.987   

   knn_pca_best_components  knn_pca_test_accuracy  pca_variance_explained  
0                       10                  0.986                0.931394  

Analyzing dataset: volkert


  X, y, categorical_indicator, attribute_names = dataset.get_data(


Dataset shapes:
  Train: (40817, 180), (array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([5111, 1228, 8040, 2043, 2464, 6644, 1050, 8948,  919, 4370]))
  Val:   (8746, 180), (array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([1127,  257, 1755,  437,  518, 1370,  223, 1947,  214,  898]))
  Test:  (8747, 180), (array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), array([1140,  280, 1724,  450,  517, 1400,  207, 1911,  228,  890]))
Randomly selected 3000 samples from 40817 training samples
Fitting TabPFN on 3000 samples
TabPFN on initial training set:
  Validation accuracy: 0.6062
  Test accuracy: 0.5941
Extracting embeddings from TabPFN
Embeddings shapes - Train: (8, 3000, 192), Val: (8, 1000, 192), Test: (8, 1000, 192)
Averaging embeddings across ensemble members
Processed embedding shapes - Train: (3000, 192), Val: (1000, 192), Test: (1000, 192)

Evaluating KNN on raw features:
Using k=54 for KNN classification
  Validation accuracy (raw features): 0.5000
  Test accuracy (raw features): 0.5150

Evaluating KN

Processing datasets:  40%|████      | 2/5 [14:52<20:35, 411.89s/it]

  PCA components=100: Val accuracy=0.6150, Test accuracy=0.5880, Variance explained=0.9996
  Best PCA results: components=100, val_accuracy=0.6150, test_accuracy=0.5880


=== SUMMARY OF RESULTS ===
   dataset  full_test_accuracy  knn_raw_test_accuracy  knn_emb_test_accuracy  \
0  volkert            0.594147                  0.515                  0.588   

   knn_pca_best_components  knn_pca_test_accuracy  pca_variance_explained  
0                      100                  0.588                0.999576  

Analyzing dataset: higgs


  X, y, categorical_indicator, attribute_names = dataset.get_data(


Dataset shapes:
  Train: (658112, 24), (array([0, 1]), array([329118, 328994]))
  Val:   (141024, 24), (array([0, 1]), array([70435, 70589]))
  Test:  (141024, 24), (array([0, 1]), array([70527, 70497]))
Randomly selected 3000 samples from 658112 training samples
Fitting TabPFN on 3000 samples
Error during validation prediction: CUDA error: invalid configuration argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Error during test prediction: CUDA error: invalid configuration argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

TabPFN on initial training set:
  Validation accuracy: 0.7164
  Tes

Processing datasets:  60%|██████    | 3/5 [29:04<20:25, 612.86s/it]

  PCA components=50: Val accuracy=0.6820, Test accuracy=0.6960, Variance explained=0.9964
  PCA components=100: Val accuracy=0.6840, Test accuracy=0.6970, Variance explained=0.9995
  Best PCA results: components=100, val_accuracy=0.6840, test_accuracy=0.6970


=== SUMMARY OF RESULTS ===
  dataset  full_test_accuracy  knn_raw_test_accuracy  knn_emb_test_accuracy  \
0   Higgs            0.714453                  0.591                  0.697   

   knn_pca_best_components  knn_pca_test_accuracy  pca_variance_explained  
0                      100                  0.697                0.999477  

Analyzing dataset: airlines


  X, y, categorical_indicator, attribute_names = dataset.get_data(


Dataset shapes:
  Train: (377568, 7), (array([0, 1]), array([209214, 168354]))
  Val:   (80907, 7), (array([0, 1]), array([44802, 36105]))
  Test:  (80908, 7), (array([0, 1]), array([45103, 35805]))
Randomly selected 3000 samples from 377568 training samples
Fitting TabPFN on 3000 samples
Error during validation prediction: CUDA error: invalid configuration argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

Error during test prediction: CUDA error: invalid configuration argument
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

TabPFN on initial training set:
  Validation accuracy: 0.6353
  Test acc

Processing datasets:  80%|████████  | 4/5 [31:20<07:04, 424.67s/it]

  PCA components=100: Val accuracy=0.6360, Test accuracy=0.6250, Variance explained=0.9996
  Best PCA results: components=20, val_accuracy=0.6400, test_accuracy=0.6260


=== SUMMARY OF RESULTS ===
    dataset  full_test_accuracy  knn_raw_test_accuracy  knn_emb_test_accuracy  \
0  airlines             0.63477                  0.592                  0.625   

   knn_pca_best_components  knn_pca_test_accuracy  pca_variance_explained  
0                       20                  0.626                 0.97656  

Analyzing dataset: albert


Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/openml/datasets/functions.py", line 1168, in _get_dataset_description
    with description_file.open(encoding="utf8") as fh:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/pathlib.py", line 1044, in open
    return io.open(self, mode, buffering, encoding, errors, newline)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: '/root/.cache/openml/org/openml/www/datasets/189356/description.xml'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<ipython-input-10-e378f9beba3e>", line 416, in main
    result = analyze_dataset(dataset_name)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-10-e378f9beba3e>", line 249, in analyze_dataset
    X, y, categorical_indicator, attribute_names, full_name = load_dataset(dataset_name)
          

Error processing dataset albert: https://www.openml.org/api/v1/xml/data/189356 returned code 111: Unknown dataset





In [None]:
# Dataset names
    dataset_names = ['har', 'volkert', 'higgs', 'airlines', 'albert']

    # Store results
    results = []

    # Process each dataset
    for dataset_name in tqdm(dataset_names, desc="Processing datasets"):
        try:
            result = analyze_dataset(dataset_name)
            results.append(result)
        except Exception as e:
            print(f"Error processing dataset {dataset_name}: {e}")
            import traceback
            traceback.print_exc()

    # Create summary dataframe
    results_df = pd.DataFrame(results)