## Lotart Dataset

In [38]:
import numpy as np
import pandas as pd

In [116]:
# Read the vector table; header=None makes pandas label the columns 0, 1, 2, ...
vectors_path = 'vectors_top.csv'
df_top = pd.read_csv(vectors_path, header=None)
df_top.shape

(28172, 1001)

In [169]:
# to np: keep row labels 1..28100 and column labels 1..1000 (.loc slices are
# inclusive on both ends). Column 0 is left out because it holds the item names
# used later to build the preview URLs.
# NOTE(review): row 0 and the trailing rows are dropped; presumably 28100 was
# chosen as a multiple of the VAE batch size -- confirm.
# `.values` replaces DataFrame.as_matrix(), which is deprecated and was removed in pandas 1.0.
X = df_top.loc[1:28100, 1:1000].values
X.shape

(28100, 1000)

### Autoencoder: classic but mixed latent vectors ...

In [4]:
from keras.layers import Input, Dense
from keras.models import Model

# this is the size of our encoded representations
encoding_dim = 32  # 32 floats -> compression of factor 31.25, since the input is 1000 floats
# dimensionality of one input vector (X has 1000 columns)
input_dim = 1000

# this is our input placeholder
input_img = Input(shape=(input_dim,))
# "encoded" is the encoded representation of the input
encoded = Dense(encoding_dim, activation='relu')(input_img)
# "decoded" is the lossy reconstruction of the input
# NOTE(review): the sigmoid output assumes the input features lie in [0, 1] -- confirm.
decoded = Dense(input_dim, activation='sigmoid')(encoded)

# this model maps an input to its reconstruction
autoencoder = Model(input_img, decoded)

Using TensorFlow backend.


In [5]:
# Encoder half only: input vector -> encoded representation, built from the
# same tensors (and therefore the same layer weights) as `autoencoder`.
encoder = Model(inputs=input_img, outputs=encoded)

In [6]:
# Stand-alone decoder: a fresh placeholder of size encoding_dim is pushed
# through the last layer of the autoencoder.
encoded_input = Input(shape=(encoding_dim,))
decoder_layer = autoencoder.layers[-1]
decoded_from_code = decoder_layer(encoded_input)
decoder = Model(inputs=encoded_input, outputs=decoded_from_code)

In [7]:
# Per-feature binary cross-entropy as reconstruction loss, optimised with Adadelta.
autoencoder.compile(loss='binary_crossentropy', optimizer='adadelta')

In [11]:
# Train the network to reproduce its input: X is both the input and the target.
autoencoder.fit(x=X, y=X, batch_size=256, epochs=50, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x114670cf8>

In [16]:
# Encoded representation of every row of X.
Y = encoder.predict(x=X)

### Autoencoder: reparametrising the latent space using KL-divergence

In [135]:
from sklearn.decomposition import PCA
pca = PCA(n_components=128)
X = pca.fit_transform(X)

In [136]:
X.shape

(28100, 128)

In [162]:
from keras.layers import Input, Dense, Lambda, Layer
from keras.models import Model
from keras import backend as K
from keras import metrics
from keras import objectives

In [171]:
# hyperparameters of the variational autoencoder
# training schedule
batch_size, epochs = 100, 10
# layer widths: input -> hidden -> latent
original_dim, intermediate_dim, latent_dim = 1000, 128, 32
# standard deviation setting for the latent sampling noise
epsilon_std = 1.0

In [172]:

# encoder: two ReLU hidden layers followed by two linear heads,
# one for the latent mean and one for the latent log-variance
x = Input(batch_shape=(batch_size, original_dim))
h1 = Dense(units=intermediate_dim, activation='relu')(x)
h2 = Dense(units=intermediate_dim, activation='relu')(h1)
z_mean = Dense(units=latent_dim)(h2)
z_log_var = Dense(units=latent_dim)(h2)

for latent_head in (z_mean, z_log_var):
    print(latent_head)

Tensor("dense_78/BiasAdd:0", shape=(100, 32), dtype=float32)
Tensor("dense_79/BiasAdd:0", shape=(100, 32), dtype=float32)


In [173]:
def sampling(args):
    """Draw a latent sample with the reparameterisation trick.

    Args:
        args: pair ``(z_mean, z_log_var)`` of tensors; presumably both are
            shaped (batch, latent_dim) -- verify against the caller.

    Returns:
        ``z_mean + exp(z_log_var / 2) * epsilon`` where ``epsilon`` is normal
        noise with mean 0 and standard deviation ``epsilon_std``.
    """
    z_mean, z_log_var = args
    # Read the batch size from the tensor instead of the global `batch_size`
    # so the sampler also works for batches of a different size, and honour
    # the `epsilon_std` hyperparameter instead of relying on the default.
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim),
                              mean=0., stddev=epsilon_std)
    return z_mean + K.exp(z_log_var / 2) * epsilon

# Wrap the sampler in a Lambda layer. The explicit output_shape is only a hint;
# it isn't necessary with the TensorFlow backend.
sampling_layer = Lambda(sampling, output_shape=(latent_dim,))
z = sampling_layer([z_mean, z_log_var])

# latent hidden state
print(z)

Tensor("lambda_12/add:0", shape=(100, 32), dtype=float32)


In [174]:
#decoder
# we instantiate these layers separately so as to reuse them later
decoder_h1 = Dense(intermediate_dim, activation='relu')
decoder_h2 = Dense(intermediate_dim, activation='relu')
decoder_mean = Dense(original_dim, activation='sigmoid')
# Apply the hidden layers in the order their names state (h1 first, then h2),
# so that any later reuse written in name order rebuilds the same network.
h_decoded = decoder_h2(decoder_h1(z))
x_decoded_mean = decoder_mean(h_decoded)

print(x_decoded_mean)

Tensor("dense_82/Sigmoid:0", shape=(100, 1000), dtype=float32)


In [181]:
#loss
def vae_loss(x, x_decoded_mean):
    """VAE training loss: reconstruction cross-entropy plus KL divergence.

    Args:
        x: batch of target vectors.
        x_decoded_mean: batch of reconstructions produced by the decoder.

    Returns:
        ``kl_loss + xent_loss``.

    Note:
        The KL term is computed from the module-level ``z_mean`` and
        ``z_log_var`` tensors rather than from the arguments, so this loss is
        only meaningful for the model those tensors belong to.
    """
    # Scale the cross-entropy by the number of input dimensions.
    # `keras.metrics` is used here because `keras.objectives` is only a
    # deprecated alias of the loss functions.
    xent_loss = original_dim * metrics.binary_crossentropy(x, x_decoded_mean)
    # KL divergence between N(z_mean, exp(z_log_var)) and N(0, 1),
    # summed over the last (latent) axis.
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    return kl_loss + xent_loss

# End-to-end VAE (input -> reconstruction), trained with the custom loss above.
vae = Model(inputs=x, outputs=x_decoded_mean)
vae.compile(loss=vae_loss, optimizer='rmsprop')

In [182]:
# X serves as both input and reconstruction target.
vae.fit(x=X, y=X, batch_size=batch_size, epochs=epochs, shuffle=True, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x12d560ac8>

In [183]:
# Projection model: maps an input to z_mean, its position in the latent space.
# NOTE: this rebinds `encoder`, which earlier referred to the encoder of the
# plain autoencoder.
encoder = Model(inputs=x, outputs=z_mean)

In [184]:
# Latent means for every row of X; the same batch_size as the model's fixed
# input batch shape is passed to predict.
T = encoder.predict(x=X, batch_size=batch_size)
T.shape

(28100, 32)

In [185]:
from sklearn.manifold import TSNE

# Embed the first 10000 latent vectors into 2-D with t-SNE (fixed seed).
tsne_input = T[:10000]
model = TSNE(perplexity=25, n_iter=5000, n_components=2, random_state=0)
T_all = model.fit_transform(tsne_input)

In [156]:
# Shape of the t-SNE embedding.
np.shape(T_all)

(10000, 2)

In [187]:
# Plot coordinates: the two t-SNE components.
# NOTE: `x` previously named the VAE input tensor; it is rebound here.
x, y = T_all[:, 0], T_all[:, 1]

# Preview-image URL for each plotted point, built from column 0 of df_top
# (row labels 1..10000 inclusive).
preview_base = 'http://localhost:5000/data/preview/'
n = [preview_base + item_name for item_name in df_top.loc[1:10000, 0]]

In [188]:
from bokeh.layouts import row, column
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource, HoverTool, BoxSelectTool, CustomJS, Rect, Div
from bokeh.io import output_notebook

In [189]:
# Configure Bokeh to render plots inline in the notebook.
output_notebook()

In [191]:
# Shared data source: plot coordinates plus the preview-image URL of each point.
source = ColumnDataSource(dict(x=x, y=y, n=n))
# Main interactive scatter plot.
p1 = figure(tools='box_select,box_zoom,wheel_zoom,pan,reset', plot_width=400, plot_height=400)
p1.scatter(x='x',y='y', source=source)

# Hover tooltip showing the image whose URL is stored in column `n`.
# (The tool is created once; an earlier throw-away HoverTool() was removed.)
hover = HoverTool(
        tooltips="""
           <img src="@n" height="75" width="75" />
        """
    )

# Finally add/enable the tool
p1.add_tools(hover)

# JS run whenever one of p1's ranges changes. The two %s slots are filled with
# the source_zoom column that receives the range centre and the column that
# receives the range extent.
jscode="""
    var data = source.data;
    var start = cb_obj.start;
    var end = cb_obj.end;
    data['%s'] = [start + (end - start) / 2];
    data['%s'] = [end - start];
    source.trigger('change');
"""

# Holds the rectangle describing the currently visible region of p1.
source_zoom = ColumnDataSource(dict(x=[], y=[], width=[], height=[]))

x_range_callback = CustomJS(args=dict(source=source_zoom), code=jscode % ('x', 'width'))
y_range_callback = CustomJS(args=dict(source=source_zoom), code=jscode % ('y', 'height'))
p1.x_range.callback = x_range_callback
p1.y_range.callback = y_range_callback

# Overview plot: the same points without tools, plus a translucent rectangle
# driven by source_zoom.
p2 = figure(tools='', plot_width=400, plot_height=400)
p2.scatter(x='x',y='y', source=source)
rect_style = dict(fill_alpha=0.1, line_color='black', fill_color='black')
rect = Rect(x='x', y='y', width='width', height='height', **rect_style)
p2.add_glyph(source_zoom, rect)


# JS run when the selection on `source` changes: builds one <img> tag per
# selected point from its URL in column `n` and writes them into the Div.
jscode_select  = """
        var inds = cb_obj.selected['1d'].indices;
        var d = cb_obj.data;
        var args = [];
        for (var i=0; i<inds.length; i++ ) {
            args.push('<img style="display: inline; margin:3px;" src="'+ d['n'][inds[i]] +'" height="75" width="75" />');
        }
        div.text = args.join("");
    """
# Container that displays the thumbnails of the selected points.
div = Div(width=800, height=600)
select_callback = CustomJS(args=dict(div=div), code=jscode_select)
source.callback = select_callback

# Both plots side by side, with the selection gallery underneath.
plots_row = row(p1, p2)
layout = column(plots_row, div)

show(layout)
