In [1]:
import numpy as np
import pandas as pd
import sklearn.neighbors
import sklearn.model_selection
import sklearn.datasets
import sklearn.manifold
import sklearn.decomposition

In [2]:
def score_embedding(data, labels, k=10, n_neighbors=5, random_state=42):
    model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
    cv = sklearn.model_selection.StratifiedKFold(n_splits=k, random_state=random_state)
    return sklearn.model_selection.cross_val_score(model, data, labels, cv=cv)

In [3]:
def compare_embeddings(embeddings, n_neighbors=5, k=10, random_state=42):
    cross_val_scores = {}
    for alg in embeddings.keys():
        cross_val_scores[alg] = score_embedding(embeddings[alg][['x', 'y']].values, 
                                                embeddings[alg]['labels'].values,
                                                n_neighbors=n_neighbors,
                                                k=k,
                                                random_state=random_state)
    return pd.DataFrame(cross_val_scores).assign(n_neighbors=n_neighbors)

In [8]:
mnist = {}
mnist['tsne'] = pd.read_csv('embedding_mnist_tsne1.csv', index_col=0)
mnist['umap'] = pd.read_csv('embedding_mnist_umap1.csv', index_col=0)
mnist['lvis'] = pd.read_csv('embedding_mnist_largevis1.csv', index_col=0)
mnist['fits'] = pd.read_csv('embedding_mnist_fitsne1.csv', index_col=0)
mnist['leig'] = pd.read_csv('embedding_mnist_eigenmaps1.csv', index_col=0)
mnist['pca'] = pd.read_csv('embedding_mnist_pca.csv', index_col=0)

In [9]:
df = pd.concat(compare_embeddings(mnist, n_neighbors=n, k=20) for n in (100,200,400,800,1600,3200))

In [10]:
df.groupby('n_neighbors').mean()

Unnamed: 0_level_0,fits,leig,lvis,pca,tsne,umap
n_neighbors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,0.96243,0.668203,0.962086,0.462462,0.966814,0.967086
200,0.96133,0.667402,0.961501,0.467119,0.965571,0.967044
400,0.961073,0.664216,0.961372,0.468376,0.963672,0.966844
800,0.95693,0.659859,0.961087,0.468161,0.962886,0.966372
1600,0.950072,0.651231,0.947202,0.466532,0.959158,0.965572
3200,0.94183,0.639146,0.92,0.45929,0.946073,0.964472


In [11]:
df.groupby('n_neighbors').std() * 2

Unnamed: 0_level_0,fits,leig,lvis,pca,tsne,umap
n_neighbors,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100,0.014832,0.015889,0.014769,0.022992,0.014501,0.013522
200,0.014796,0.015668,0.014725,0.023741,0.015189,0.013566
400,0.015148,0.015925,0.014899,0.024198,0.015481,0.013541
800,0.016067,0.016802,0.015353,0.022643,0.015933,0.014173
1600,0.017657,0.014491,0.015143,0.023009,0.016245,0.014441
3200,0.018426,0.017428,0.017172,0.022325,0.016894,0.014494
