In [1]:
import numpy as np
import pandas as pd
import sklearn.neighbors
import sklearn.model_selection
import sklearn.datasets
import sklearn.manifold
import sklearn.decomposition

In [2]:
def score_embedding(data, labels, k=10, n_neighbors=5, random_state=42):
    model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
    cv = sklearn.model_selection.StratifiedKFold(n_splits=k, random_state=random_state)
    return sklearn.model_selection.cross_val_score(model, data, labels, cv=cv)

In [3]:
def compare_embeddings(embeddings, n_neighbors=5, k=10, random_state=42):
    cross_val_scores = {}
    for alg in embeddings.keys():
        cross_val_scores[alg] = score_embedding(embeddings[alg][['x', 'y']].values, 
                                                embeddings[alg]['labels'].values,
                                                n_neighbors=n_neighbors,
                                                k=k,
                                                random_state=random_state)
    return pd.DataFrame(cross_val_scores).assign(n_neighbors=n_neighbors)

In [8]:
shuttle = {}
shuttle['tsne'] = pd.read_csv('embedding_shuttle_tsne.csv', index_col=0)
shuttle['umap'] = pd.read_csv('embedding_shuttle_umap.csv', index_col=0)
shuttle['lvis'] = pd.read_csv('embedding_shuttle_largevis1.csv', index_col=0)
shuttle['fits'] = pd.read_csv('embedding_shuttle_fitsne1.csv', index_col=0)
shuttle['leig'] = pd.read_csv('embedding_shuttle_eigenmaps.csv', index_col=0)
shuttle['pca'] = pd.read_csv('embedding_shuttle_pca.csv', index_col=0)

In [9]:
df = pd.concat(compare_embeddings(shuttle, n_neighbors=n, k=10) for n in (100,200,400,800,1600,3200))

In [10]:
df.groupby('n_neighbors').mean()

Unnamed: 0_level_0,fits,leig,lvis,pca,tsne,umap
n_neighbors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,0.993207,0.962224,0.992103,0.832621,0.994379,0.992931
200,0.989397,0.956948,0.987069,0.820828,0.992241,0.989845
400,0.981569,0.948914,0.976173,0.814672,0.990207,0.988155
800,0.961328,0.941569,0.957241,0.804172,0.968517,0.988466
1600,0.933983,0.918189,0.904293,0.792138,0.926569,0.981104
3200,0.859949,0.894535,0.85043,0.785914,0.827724,0.957241


In [11]:
df.groupby('n_neighbors').std() * 2

Unnamed: 0_level_0,fits,leig,lvis,pca,tsne,umap
n_neighbors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,0.002545,0.003998,0.003122,0.012838,0.002127,0.002496
200,0.003471,0.00602,0.003053,0.007306,0.00195,0.002292
400,0.003335,0.005765,0.002975,0.006987,0.002445,0.002274
800,0.004482,0.005708,0.003711,0.003374,0.005085,0.002143
1600,0.006326,0.006455,0.00662,0.002544,0.004735,0.002421
3200,0.002454,0.005905,0.008154,0.000504,0.004041,0.005485
