In [1]:
from sklearn.manifold import TSNE, MDS
from keras.models import load_model
from IPython.display import SVG, Audio, display
from keras.utils.vis_utils import model_to_dot
import numpy as np
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [2]:
import sys
sys.path.append('../')
from data import LibriSpeechDataset
from utils import whiten, contrastive_loss
from config import LIBRISPEECH_SAMPLING_RATE

SyntaxError: invalid syntax (utils.py, line 73)

In [None]:
# Path of the saved siamese network that the next cell passes to `load_model`.
model_path = '../models/convnet_contrastive_loss.hdf5'
# Slice step applied along axis 1 of every input batch by the preprocessing function.
downsampling = 4

In [None]:
# `custom_objects` must be a dict mapping the name stored in the checkpoint to the
# Python callable, so that Keras can resolve the custom loss while deserializing.
siamese = load_model(model_path, custom_objects={'contrastive_loss': contrastive_loss})

In [None]:
# Render the layer graph of the full siamese model, including tensor shapes.
siamese_graph = model_to_dot(siamese, show_shapes=True)
SVG(siamese_graph.create(prog='dot', format='svg'))

In [None]:
# Render the layer graph of the third layer of the siamese model, including tensor shapes.
siamese_layer_graph = model_to_dot(siamese.layers[2], show_shapes=True)
SVG(siamese_layer_graph.create(prog='dot', format='svg'))

### Extract encoder

In [None]:
from keras.models import Model
from keras.layers import Input

In [None]:
# New input placeholder with per-sample shape (12000, 1).
# NOTE(review): 12000 is hard-coded; it has to match the length of the arrays that are
# later fed to `encoder.predict` after downsampling -- TODO confirm against the dataset settings.
inputs = Input(shape=(12000,1))

# Apply the third layer of the loaded siamese model to the new input.
encoded = siamese.layers[2](inputs)

In [None]:
# Standalone model that maps `inputs` to the output of `siamese.layers[2]`.
encoder = Model(inputs=inputs, outputs=encoded)

In [None]:
# NOTE(review): this rebinds `encoder` to the layer object itself, discarding the `Model`
# assigned to `encoder` in the previous cell -- confirm which of the two is intended.
encoder = siamese.layers[2]
# The only later use of `encoder` in this notebook is `encoder.predict`; the loss and
# optimizer given here are presumably placeholders -- TODO confirm.
encoder.compile(loss='mse',optimizer='adam')

In [None]:
# Render the layer graph of the extracted encoder, including tensor shapes.
encoder_graph = model_to_dot(encoder, show_shapes=True)
SVG(encoder_graph.create(prog='dot', format='svg'))

### Get data

In [None]:
def preprocessor(downsampling, whitening=True):
    """Build a batch transform that downsamples, and optionally whitens, a pair of inputs.

    Args:
        downsampling: slice step applied along axis 1 of both inputs.
        whitening: when truthy, each downsampled input is passed through `whiten`.

    Returns:
        A function that takes a `([input_1, input_2], labels)` batch and returns
        `([input_1, input_2], labels)` with both inputs transformed and the labels
        passed through unchanged.
    """
    every_nth = slice(None, None, downsampling)

    def preprocessor_(batch):
        pair, labels = batch
        first, second = pair
        first, second = first[:, every_nth, :], second[:, every_nth, :]
        if not whitening:
            return [first, second], labels

        return [whiten(first), whiten(second)], labels

    return preprocessor_


# Batch transform used by the cells below: subsample axis 1 by `downsampling`, then whiten.
whiten_downsample = preprocessor(downsampling, whitening=True)

In [None]:
# Names of the LibriSpeech subsets handed to the dataset wrapper.
training_set = ['train-clean-100', 'train-clean-360']
# NOTE(review): the positional `3` is presumably the fragment length in seconds -- TODO
# confirm against `LibriSpeechDataset` in data.py.
train = LibriSpeechDataset(training_set, 3, stochastic=False)

### Random samples

In [None]:
# Number of random dataset items drawn in the next cell.
n_samples = 250

In [None]:
# Fetch `n_samples` dataset items at random indices (indices may repeat).
Z = [train[i] for i in np.random.randint(0,len(train),size=n_samples)]
# Transpose the list of items once. `list(...)` keeps the result indexable on Python 3,
# where `zip` returns an iterator, and behaves the same on Python 2.
item_fields = list(zip(*Z))
# Field 0 of every item becomes a row of X (with a trailing axis added); field 1 becomes y.
X = np.stack(item_fields[0])[:, :, np.newaxis]
y = np.stack(item_fields[1])[:, np.newaxis]
X.shape, y.shape

In [None]:
# `whiten_downsample` expects a `([input_1, input_2], labels)` batch, so X is passed twice
# with empty labels and only the first transformed input is kept.
# NOTE(review): this overwrites X, so re-running the cell downsamples it a second time.
[X, _], _ = whiten_downsample(([X, X], []))
X.shape

### Random samples from subset of speakers

In [None]:
# Number of speakers to draw and number of samples to take from each of them.
n_speakers = 20
m_samples = 10
# `train.df` holds several rows per speaker (the next cell samples `m_samples` ids from
# each one), so sampling the raw column could return the same speaker more than once and
# would favour speakers with more rows. De-duplicate first so that exactly `n_speakers`
# distinct speakers are drawn, each with equal probability.
n_random_speakers = train.df['speaker_id'].drop_duplicates().sample(n_speakers).values

In [None]:
# Get m samples from each speaker
X, y = [], []
for speaker_id in n_random_speakers:
    # Ids of `m_samples` distinct rows that belong to this speaker.
    ids = train.df[train.df['speaker_id']==speaker_id]['id'].sample(m_samples).values
    # A separate comprehension variable avoids clobbering the loop variable on Python 2,
    # where comprehension variables leak into the enclosing scope.
    Z = [train[sample_id] for sample_id in ids]
    # `list(...)` keeps the transposed items indexable on Python 3, where `zip` is lazy.
    X_ = np.stack(list(zip(*Z))[0])[:, :, np.newaxis]
    # The preprocessor expects a pair of inputs; pass X_ twice and keep the first result.
    [X_, _], _ = whiten_downsample(([X_, X_], []))

    X.append(X_)
    # The per-sample ids (not the dataset labels) are kept, because later cells look up
    # rows of `train.df` by id.
    y.append(ids)

X = np.concatenate(X)
y = np.concatenate(y)

# Embed

In [None]:
# Compute the encoder outputs for the preprocessed samples in X.
embeddings = encoder.predict(X)
embeddings.shape

# Dimensionality Reduction

In [None]:
def compare_samples(a, b):
    """Show audio players for two dataset items so they can be compared by ear.

    Args:
        a: index passed to `train[...]` to fetch the first item.
        b: index passed to `train[...]` to fetch the second item.

    The first element of each fetched item is played back at
    `LIBRISPEECH_SAMPLING_RATE`.
    """
    sample_a = train[a]
    sample_b = train[b]

    # The single-argument call form prints the same text on Python 2 and Python 3,
    # whereas the bare print statement is a SyntaxError on Python 3.
    print('Sample A ({}):'.format(a))
    display(Audio(data=sample_a[0],rate=LIBRISPEECH_SAMPLING_RATE))
    print('Sample B ({}):'.format(b))
    display(Audio(data=sample_b[0],rate=LIBRISPEECH_SAMPLING_RATE))

In [None]:
# Project the embeddings with multidimensional scaling using scikit-learn's defaults.
# NOTE(review): no `random_state` is passed, so the layout may differ between runs.
mds = MDS()

mds_embeddings = mds.fit_transform(embeddings)
mds_embeddings.shape

In [None]:
# Project the embeddings with t-SNE using the given perplexity and learning rate.
# NOTE(review): no `random_state` is passed, so the layout may differ between runs.
tsne = TSNE(perplexity=30,learning_rate=500)

tsne_embeddings = tsne.fit_transform(embeddings)
tsne_embeddings.shape

In [None]:
# Select which projection the cells below use; swap the comment to use MDS instead.
two_d_embeddings = tsne_embeddings
# two_d_embeddings = mds_embeddings

In [None]:
# Look up the `sex` recorded in `train.df` for every sample id in `y`, then encode it
# as 0 for 'M' and 1 for any other value.
sample_sexes = [train.df[train.df['id'] == sample_id]['sex'].values[0] for sample_id in y]
gender_markers = np.array([0 if sex == 'M' else 1 for sex in sample_sexes])

In [None]:
# Column layout of Z: the projected coordinates first, then the sample id (column 2),
# then the gender marker (column 3).
Z = np.hstack([two_d_embeddings, y[:, np.newaxis], gender_markers[:, np.newaxis]])
# Rows whose gender marker is 0 (sex == 'M').
m = Z[Z[:, 3] == 0]
# Rows whose gender marker is 1 (any other recorded sex).
f = Z[Z[:, 3] == 1]
m.shape, f.shape

In [None]:
plt.figure(figsize=(12,12))

# Circles are rows with gender marker 0, crosses are rows with gender marker 1.
# Points are coloured by column 2 of each array (the sample id).
# NOTE(review): each scatter call is given its own `c` array and no shared vmin/vmax, so
# the colour scales of the two groups are presumably not comparable -- TODO confirm.
plt.scatter(m[:, 0], m[:, 1], c=m[:, 2], marker='o')
plt.scatter(f[:, 0], f[:, 1], c=f[:, 2], marker='x')

# Optional: annotate every point with its sample id.
# for x_, y_, idx in zip(two_d_embeddings[:, 0], two_d_embeddings[:, 1], y):
#     plt.text(x_, y_, idx)
    
plt.grid()
plt.show()

In [None]:
# Play two specific dataset items back to back; the indices are hard-coded.
compare_samples(76518, 20765)