In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Input files: both paths are read with numpy.load in the loading cell.
FEATURES = "features.npy"  # Path to feature matrix (M, N)
LABELS = "labels.npy"      # Path to labels, for visualization (M,)

In [None]:
import numpy


# Load the feature matrix and the matching labels from the configured paths.
X = numpy.load(FEATURES)
Y = numpy.load(LABELS)

# Fail fast on inconsistent inputs: the labels are attached row-by-row to the
# embedding at the very end, so a shape mismatch would otherwise only surface
# after the expensive PCA and t-SNE steps have already run.
if X.ndim != 2:
    raise ValueError(
        "FEATURES must be a 2-D array of shape (M, N), got shape %r" % (X.shape,)
    )
if Y.shape != (X.shape[0],):
    raise ValueError(
        "LABELS must have shape (%d,) to match FEATURES, got shape %r"
        % (X.shape[0], Y.shape)
    )

## PCA

In [None]:
# PCA settings, consumed by the PCA cell below.
# NOTE(review): N_COMPONENTS presumably must not exceed min(M, N) of the
# feature matrix -- confirm against the scikit-learn PCA documentation.
N_COMPONENTS = 50     # Feature dimension after PCA
SCALE = False         # Center and scale features before PCA, optional

In [None]:
import sklearn.decomposition
import sklearn.preprocessing


# PCA with the exact ("full") SVD solver, reduced to N_COMPONENTS dimensions.
pca = sklearn.decomposition.PCA(
    n_components=N_COMPONENTS,
    svd_solver="full",
)

# Optionally centre and scale the features first; otherwise use them as loaded.
X_scaled = sklearn.preprocessing.scale(X) if SCALE else X

# Fit on the (optionally scaled) features, then project those same features.
X_pca = pca.fit(X_scaled).transform(X_scaled)

## t-SNE

In [None]:
# Keyword arguments forwarded verbatim to the t-SNE estimator. Their meaning
# is described in the scikit-learn reference:
#
# https://scikit-learn.org/stable/modules/generated/sklearn.manifold.TSNE.html
#
# NOTE(review): newer scikit-learn releases appear to rename `n_iter` to
# `max_iter` -- confirm against the installed version.

OPTIONS = dict(
    angle=0.5,
    early_exaggeration=12,
    init="random",
    learning_rate=200.0,
    method="barnes_hut",
    metric="euclidean",
    min_grad_norm=1e-7,
    n_components=2,
    n_iter=1000,
    n_iter_without_progress=300,
    perplexity=30,
    # A fresh seed in [0, 256) is drawn each time this cell runs; read
    # OPTIONS["random_state"] afterwards to reproduce a particular embedding.
    random_state=numpy.random.randint(256),
    verbose=0,
)

In [None]:
from sklearn import manifold


tsne = manifold.TSNE(**OPTIONS)

%time tsne.fit_transform(X_pca)

## Visualize

In [None]:
import pandas


# Z is expected to hold the embedding, one row per sample. Name one column per
# embedding dimension (V1, V2, ...) so the table still builds when
# OPTIONS["n_components"] is changed from its default of 2.
df = pandas.DataFrame(
    data=Z,
    columns=["V%d" % (dim + 1) for dim in range(Z.shape[1])],
)

# Attach the labels as the first column, indexed like the embedding rows.
df.insert(0, "label", pandas.Series(Y, index=df.index))

In [None]:
import ggplot


# Scatter plot of the V1/V2 embedding columns, coloured by the "label" column.
# The expression is left as the cell's last value so the notebook renders it.
(
    ggplot.ggplot(df, ggplot.aes("V1", "V2", color="label"))
    + ggplot.geom_point()
)