In [7]:
import numpy as np
import time
import os
import pandas as pd

from sklearn import datasets
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, FastICA

from plotly.offline import download_plotlyjs, init_notebook_mode, plot,iplot

#Always run this the command before at the start of notebook
init_notebook_mode(connected=True)
import plotly.graph_objs as go

# Number of dimensions
ndims = 3
scatterClass = go.Scatter3d
if ndims == 2:
    scatterClass = go.Scatter
    
# Label of the plot
plot_title = 'Visualise.html'

# Dimensionality reduction
reduce_funcs = {
    'ica': FastICA,
    'pca': PCA
}

In [2]:
def get_mnist_dataset():
    digits = datasets.load_digits()
    data = digits.data
    labels = digits.target
    return data, labels


def get_preposition_dataset(kind):
    directory = "C:\\Users\\naflaki\\Desktop\\3D visualisation\\"
    if kind == 'hs':
        filename = "HS.csv"
    else:
        filename = "CBOW.csv"
    filepath = os.path.join(directory, filename)
    df = pd.read_csv(filepath, index_col=0)
    return df.values, df.index.to_numpy()


def convert_mnist_labels_to_string(labels):
    return np.array([str(label) for label in labels])


def perform_dim_reduction(kind, data, n_components, **kwargs):
    dim_reduce_kind = reduce_funcs[kind]
    func = dim_reduce_kind(n_components=n_components, **kwargs)
    data_reduced = func.fit_transform(data, y=None)
    if hasattr(func, 'explained_variance_ratio_'):
        print('Cumulative explained variation for {} principal components: {}'
              .format(n_components, np.sum(func.explained_variance_ratio_)))
    return data_reduced


def perform_tsne(data, verbose=1, perplexity=3, n_iter=4000):    
    tsne = TSNE(n_components=ndims, verbose=verbose, perplexity=perplexity, n_iter=n_iter)
    time_start = time.time()
    result = tsne.fit_transform(data)
    print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))
    return result

def perform_mds(data, n_iter=1000):
    from sklearn.manifold import MDS
    model = MDS(n_components=ndims, dissimilarity='precomputed', random_state=7, verbose=1, max_iter=n_iter)
    result = model.fit_transform(data)
    stress = model.stress_
    print('Stress value is: {}'.format(stress))
    return result

In [3]:
def visualise(data_tsne, labels):
    unique_labels = np.unique(labels)
    traces = []
    for label in unique_labels:
        ind = np.where(labels==label)
        coords = dict(
            x=data_tsne[ind, 0].ravel(),
            y=data_tsne[ind, 1].ravel()
        )
        if ndims == 3:
            coords['z'] = data_tsne[ind, 2].ravel()

        ids = labels[ind];

        trace = scatterClass(
            name=label.strip(),
            # markers+text will display the labels on top of the points
            mode='markers+text',
            textposition='top center',
            
            # markers only will not display the labels
            # mode='markers',
            marker=dict(
                size=5,
                line=dict(
                    width=0.5
                ),
                opacity=1
            ),
            text=ids,
            **coords
        )
        traces.append(trace)
    layout = go.Layout(
        hovermode='closest',
        title=plot_title,
        margin=dict(
            l=0,
            r=0,
            b=0,
            t=0
        )
    )
    
    fig = go.Figure(data=traces, layout=layout)
    return fig
#     plot = py.iplot(fig, filename=plot_title, show_link = True)
#     print('{}'.format(plot.resource))

In [4]:
# To show Kritin tomorrow:
data, labels = get_mnist_dataset()
data = data[:100, :]
labels = labels[:100]
labels = convert_mnist_labels_to_string(labels)
data = perform_tsne(data, n_iter=1000)

[t-SNE] Computing 10 nearest neighbors...
[t-SNE] Indexed 100 samples in 0.000s...
[t-SNE] Computed neighbors for 100 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 100 / 100
[t-SNE] Mean sigma: 8.144098
[t-SNE] KL divergence after 250 iterations with early exaggeration: 113.439171
[t-SNE] KL divergence after 1000 iterations: 4.465498
t-SNE done! Time elapsed: 0.7330427169799805 seconds


In [5]:
fig = visualise(data, labels)
plot = iplot(fig, filename=plot_title, show_link = True)
if plot is not None:
    print('{}'.format(plot.resource))

AttributeError: 'NoneType' object has no attribute 'resource'

In [8]:
data, labels = get_preposition_dataset('hs')
data = perform_dim_reduction('pca', data, ndims)
#data = perform_mds(data, n_iter=10000)
# fig = visualise(data, labels)
# iplot(fig, filename=plot_title, show_link = True)

fig = visualise(data, labels)
plot = iplot(fig, filename=plot_title, show_link = True)
if plot is not None:
    print('{}'.format(plot.resource))

Cumulative explained variation for 3 principal components: 0.7820382814437948


In [None]:
data, labels = get_preposition_dataset('hs')
data = perform_tsne(data, n_iter=10000)
fig = visualise(data, labels)
iplot(fig, filename=plot_title, show_link = True)