In [98]:
from time import time
import numpy as np
import json
from sklearn import (manifold, datasets, decomposition, ensemble,
                     discriminant_analysis, random_projection)
import skimage.io, skimage.transform


# Locations of the thumbnail images and of the precomputed JSON artifacts.
data_path = './extraction/data/'
tfidf_path = './tfidf.json'
meta_path = './meta_tfidf.json'

# Read the metadata through a context manager so the file handle is closed
# as soon as parsing finishes instead of lingering until garbage collection.
with open(meta_path, 'r') as meta_file:
    meta = json.load(meta_file)
# presumably maps a document id to its image file stem -- verify against the
# script that writes meta_tfidf.json
good_id_to_filename = meta['ids']
# Keyed by the *string* form of a matrix row index (see the str(i) lookups).
i_to_id = meta['itoid']
# Suffixes appended to a file stem for the regular and the minified image.
reg_ext = ".jpg"
min_ext = ".min.jpg"

# X and D are two names for the same array object (no copy is made).
with open(tfidf_path, 'r') as tfidf_file:
    X = D = np.array(json.load(tfidf_file)['D'])#[:500,:500]
# File stem for every row of D, in row order; used as hover text in the plot.
names = np.asarray([good_id_to_filename[i_to_id[str(i)]] for i in range(len(D))])
#np.savetxt("D.csv", D, delimiter=",")
#np.savetxt("labels.csv", names, fmt="%s", delimiter=",",encoding='utf-8')

In [104]:
def get_thumbnail_path(i, min=True):
    """Build the path of the image file associated with matrix row ``i``.

    The row index is converted to a string, looked up in ``i_to_id`` and the
    resulting id is looked up in ``good_id_to_filename``; the file stem found
    there is joined to ``data_path`` and an extension.

    Args:
        i: Row index; ``str(i)`` must be a key of ``i_to_id``.
        min: When exactly ``True`` the ``min_ext`` suffix is used, any other
            value (including other truthy values) selects ``reg_ext``.

    Returns:
        The concatenated path string.
    """
    # Identity comparison on purpose: only the literal True picks the
    # minified variant, matching the long-standing behaviour of this helper.
    extension = min_ext if min is True else reg_ext
    stem = good_id_to_filename[i_to_id[str(i)]]
    return data_path + stem + extension


def get_thumbnails():
    """Lazily yield one decoded image per row of ``X``, in row order.

    Each image is read with ``skimage.io.imread`` from the path returned by
    ``get_thumbnail_path`` (default arguments, i.e. the minified file).
    Nothing is read from disk until the generator is advanced.
    """
    # map() is lazy, so each path is still resolved right before its read.
    for thumb_path in map(get_thumbnail_path, range(len(X))):
        thumb = skimage.io.imread(thumb_path)
        # Optional down-scaling, currently disabled:
        #thumb = skimage.transform.resize(thumb, (16, 16))
        yield thumb

In [105]:
#----------------------------------------------------------------------
# 2-D t-SNE embedding of the matrix loaded into X
print("Computing t-SNE embedding")
tsne = manifold.TSNE(
    n_components=2,
    perplexity=10,
    early_exaggeration=40,
    learning_rate=100,
    n_iter=1000,
    # "precomputed" tells scikit-learn to treat X as a square matrix of
    # pairwise distances rather than as feature vectors.
    metric="precomputed",
    random_state=0,  # fixed seed so the layout is repeatable
)
# Start timestamp; no visible cell reads it back to report a duration.
t0 = time()
X_tsne = tsne.fit_transform(X)

Computing t-SNE embedding


In [106]:
# Create the lazy thumbnail generator (no image is read until it is advanced).
# Its only visible consumer is the commented-out matplotlib plot_embedding
# helper, which pulls images with next(thumbnail_iter).
thumbnail_iter = get_thumbnails()

In [108]:
# Sanity check: display the embedding's dimensions, (n_samples, n_components).
X_tsne.shape

(3006, 2)

In [None]:
def get_images(path=data_path):
    """Return a list of plotly layout-image dicts, one per embedded point.

    Every point of ``X_tsne`` gets a dict that centres the image returned by
    ``get_thumbnail_path(i)`` on the point's data coordinates. All images
    share one size: a tenth of the largest x coordinate wide and a tenth of
    the largest y coordinate high.

    Args:
        path: Accepted for backward compatibility but not used; image paths
            are resolved by ``get_thumbnail_path``, which reads the
            module-level ``data_path``.

    Returns:
        A new list of dicts in row order; empty when ``X_tsne`` has no rows.
    """
    x = X_tsne[:, 0]
    y = X_tsne[:, 1]

    # np.max raises on an empty array; an empty embedding has no images.
    if len(x) == 0:
        return []

    max_x = np.max(x)
    # The image height must follow the y extent (this used to read x).
    max_y = np.max(y)

    # Build a fresh local list on every call. The previous version appended
    # to an undefined name, which either raised NameError or kept growing a
    # list left over in the kernel from an earlier run.
    dlist = []
    for i in range(len(x)):
        d = dict(
            xref="x",
            yref="y",
            sizex=max_x/10,
            sizey=max_y/10,
            xanchor="center",
            yanchor="middle",
            x=x[i],
            y=y[i],
            source=get_thumbnail_path(i),
        )
        dlist.append(d)

    return dlist

# Image descriptors for the whole embedding; passed to go.Layout(images=...)
# in the plotting cell.
images = get_images()

In [114]:
import random
import numpy as np
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

# Load plotly.js from the CDN rather than embedding it in the notebook.
init_notebook_mode(connected=True)

# One marker per embedded point. The markers are fully transparent, so they
# only act as hover targets on top of the thumbnail images.
trace0 = go.Scatter(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    mode='markers',
    marker={
        'size': 14,
        'line': {'width': 1},
        'opacity': 0.0,
    },
    text=names,  # The hover text goes here...
)

# The thumbnails themselves are drawn as layout images.
layout = go.Layout(
    title='t-SNE',
    images=images,
    hovermode='closest',
    showlegend=False,
)

fig = go.Figure(data=[trace0], layout=layout)
iplot(fig)

In [None]:
##----------------------------------------------------------------------
## Scale and visualize the embedding vectors
#def plot_embedding(X, title=None):
#    x_min, x_max = np.min(X, 0), np.max(X, 0)
#    #X = (X - x_min) / (x_max - x_min)
#
#    plt.figure(figsize=[12.8, 9.6])
#    ax = plt.subplot(111)
#    for i in range(X.shape[0]):
#        plt.plot(X[i, 0], X[i, 1])
#        plt.text(X[i, 0], X[i, 1], names[i])
#
#    if hasattr(offsetbox, 'AnnotationBbox'):
#        # only print thumbnails with matplotlib > 1.0
#        shown_images = np.array([[1., 1.]])  # just something big
#        for i in range(X.shape[0]):
#            dist = np.sum((X[i] - shown_images) ** 2, 1)
#            #if np.min(dist) < 4e-3:
#            #    # don't show points that are too close
#            #    continue
#            shown_images = np.r_[shown_images, [X[i]]]
#            imagebox = offsetbox.AnnotationBbox(
#                offsetbox.OffsetImage(next(thumbnail_iter), zoom=1),
#                X[i])
#            ax.add_artist(imagebox)
#    plt.xticks([]), plt.yticks([])
#    if title is not None:
#        plt.title(title)