In [1]:
import numpy as np
import pandas as pd
import sklearn.neighbors
import sklearn.model_selection
import sklearn.datasets
import sklearn.manifold
import sklearn.decomposition

In [2]:
def score_embedding(data, labels, k=10, n_neighbors=5, random_state=42):
    model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
    cv = sklearn.model_selection.StratifiedKFold(n_splits=k, random_state=random_state)
    return sklearn.model_selection.cross_val_score(model, data, labels, cv=cv)

In [3]:
def compare_embeddings(embeddings, n_neighbors=5, k=10, random_state=42):
    cross_val_scores = {}
    for alg in embeddings.keys():
        cross_val_scores[alg] = score_embedding(embeddings[alg][['x', 'y']].values, 
                                                embeddings[alg]['labels'].values,
                                                n_neighbors=n_neighbors,
                                                k=k,
                                                random_state=random_state)
    return pd.DataFrame(cross_val_scores).assign(n_neighbors=n_neighbors)

In [4]:
digits = sklearn.datasets.load_digits()

pendigits = {}
pendigits['tsne'] = pd.read_csv('embedding_pendigits_tsne1.csv', index_col=0)
pendigits['umap'] = pd.read_csv('embedding_pendigits_umap1.csv', index_col=0)
pendigits['lvis'] = pd.read_csv('embedding_pendigits_largevis1.csv', index_col=0)
pendigits['fits'] = pd.read_csv('embedding_pendigits_fitsne1.csv', index_col=0)

pendigits['leig'] = pd.DataFrame(
    sklearn.manifold.SpectralEmbedding(n_components=2).fit_transform(digits.data), 
    columns=('x', 'y')).assign(labels=digits.target)

In [5]:
df = pd.concat(compare_embeddings(pendigits, n_neighbors=n) for n in (10,20,40,80,160))

In [6]:
df.groupby('n_neighbors', ).mean()

Unnamed: 0_level_0,fits,leig,lvis,tsne,umap
n_neighbors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,0.972896,0.77779,0.965704,0.976742,0.97293
20,0.973395,0.777803,0.973416,0.973383,0.976201
40,0.95892,0.777756,0.952252,0.955583,0.953891
80,0.949431,0.768924,0.945017,0.947783,0.95057
160,0.948863,0.746717,0.920763,0.948888,0.95057


In [7]:
df.groupby('n_neighbors', ).std() * 2

Unnamed: 0_level_0,fits,leig,lvis,tsne,umap
n_neighbors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,0.041183,0.112792,0.053206,0.033481,0.043622
20,0.036275,0.115692,0.043507,0.033307,0.035008
40,0.057385,0.111581,0.065633,0.06436,0.059832
80,0.054461,0.111257,0.071807,0.060277,0.056849
160,0.054342,0.107641,0.084504,0.064613,0.056849
