# Timbre metric training

This notebook was used to train the timbre distance metric. We use the triplet network implementation from the [ISMIR 2020 metric learning tutorial](https://github.com/bmcfee/ismir2020-metric-learning/).

Copyright 2020 InterDigital R&D and Télécom Paris.  
Author: Ondřej Cífka

In [24]:
import itertools
import os
import pickle
import tempfile

import librosa
import numpy as np
from scipy.spatial.distance import cosine as cosine_distance
import tensorflow as tf
from tqdm.auto import tqdm

from ss_vq_vae.models import triplet_network

In [2]:
# Sampling rate (Hz) passed to librosa when loading audio and computing MFCCs.
SR = 16000
# Batch size used by the training data loader.
BATCH_SIZE = 16
# Root directory of the metric-learning data; the 'triplets_train' and
# 'triplets_val' entries are resolved relative to it.
DATA_DIR = '../data/mixing_secrets/metric_train'

In [3]:
def load_audio_mfcc(path):
    """Load an audio file and return its MFCC features.

    The audio is loaded at ``SR`` Hz. 13 MFCCs are computed with a hop of
    500 samples; the first coefficient (index 0) is then dropped, leaving 12
    coefficients, and the result is transposed.

    Args:
        path: Path of the audio file, passed on to ``librosa.load``.

    Returns:
        The transposed MFCC matrix with 12 coefficients along its last axis
        (matching ``num_features=12`` used when building the model).
    """
    audio, _ = librosa.load(path, sr=SR)
    # Pass the signal by keyword: librosa >= 0.10 no longer accepts it
    # positionally, while older versions accept `y=` as well.
    mfcc = librosa.feature.mfcc(y=audio, sr=SR, n_mfcc=13, hop_length=500)
    # Discard coefficient 0 and put the coefficient axis last.
    return mfcc[1:].T

In [45]:
# 12 input features = 13 MFCCs minus the dropped first coefficient
# (see load_audio_mfcc).
model, backbone = triplet_network.build_model(num_features=12)

Model: "backbone"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, None, 12)]        0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, None, 64)          3136      
_________________________________________________________________
batch_normalization_12 (Batc (None, None, 64)          256       
_________________________________________________________________
activation_12 (Activation)   (None, None, 64)          0         
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, None, 64)          0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, None, 64)          16448     
_________________________________________________________________
batch_normalization_13 (Batc (None, None, 64)          256

In [5]:
# Scratch directory for the cached, pre-processed training features.
# NOTE: tempfile.mkdtemp() does not remove the directory automatically;
# it has to be deleted manually once it is no longer needed.
TMP_DIR = tempfile.mkdtemp()
# Show the directory path as the cell output.
TMP_DIR

'/tmp/tmpzxvrbtdy'

In [31]:
# Pre-process every training triplet once and cache the features as .npy files
# in the temporary directory, together with a tab-separated index file ('list')
# that holds one line of three file names (anchor, positive, negative) per triplet.
single_loader, dataset_size = triplet_network.data_loader(
    os.path.join(DATA_DIR, 'triplets_train'),
    load_fn=load_audio_mfcc,
    batch_size=1,
)
# Zero-padding width, so that all file names have equally long indices.
i_len = len(str(dataset_size - 1))
with open(os.path.join(TMP_DIR, 'list'), 'w') as f_list:
    for i, (triplet, _) in enumerate(tqdm(single_loader(), total=dataset_size)):
        paths = []
        for name in ('anchor', 'positive', 'negative'):
            # batch_size=1, so each input holds exactly one example.
            (example,) = triplet[name + '_input']
            path = '{:0{}d}_{}.npy'.format(i, i_len, name)
            np.save(os.path.join(TMP_DIR, path), example, allow_pickle=False)
            paths.append(path)
        f_list.write('\t'.join(paths) + '\n')

HBox(children=(FloatProgress(value=0.0, max=7381.0), HTML(value='')))




In [42]:
# Training batches are read from the cached .npy features (via the 'list'
# index file in TMP_DIR) with np.load, shuffled and repeated indefinitely.
train_loader, steps_per_epoch = triplet_network.data_loader(os.path.join(TMP_DIR, 'list'), load_fn=np.load, batch_size=BATCH_SIZE, shuffle=True, repeat=True)

In [32]:
# Validation triplets are computed directly from the audio files (no caching),
# one triplet per batch.
val_loader, val_dataset_size = triplet_network.data_loader(os.path.join(DATA_DIR, 'triplets_val'), load_fn=load_audio_mfcc, batch_size=1)
# Materialize the validation set in memory so it can be iterated repeatedly.
val_data = list(val_loader())

In [46]:
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by newer Keras releases.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

model.compile(optimizer, loss=triplet_network.triplet_hinge_loss)


def report_val_accuracy():
    """Evaluate the model on the validation triplets and print a summary.

    The predictions are reshaped to two values per triplet. A triplet counts
    as correct when the first value is the larger one (presumably the
    anchor-positive score vs. the anchor-negative score -- confirm against
    triplet_network.build_model).

    Prints the accuracy and the first four prediction rows.

    Returns:
        The predictions as an array with two columns.
    """
    pred = model.predict(iter(val_data)).reshape(-1, 2)
    print('Accuracy:', np.mean(pred.argmax(axis=1) == 0))
    print(pred[:4])
    return pred


# Two training epochs, each preceded by a validation pass (the first pass
# therefore reports the accuracy of the untrained model).
for _ in range(2):
    report_val_accuracy()

    model.fit(train_loader(),
        epochs=1,
        verbose=1,
        steps_per_epoch=steps_per_epoch,
    )

# Final validation pass after training; keep the predictions for inspection.
pred = report_val_accuracy()

Accuracy: 0.97
[[0.97825205 0.92818373]
 [0.9945916  0.9573082 ]
 [0.98921    0.91431683]
 [0.8972999  0.86211026]]
Accuracy: 0.98
[[0.9114496  0.2761086 ]
 [0.9806918  0.2920639 ]
 [0.9503249  0.32286048]
 [0.78040844 0.36132288]]
Accuracy: 0.98
[[0.855681   0.319054  ]
 [0.94909924 0.53117394]
 [0.9613851  0.36606544]
 [0.8170693  0.27370113]]


In [47]:
# Store the trained weights (weights only, not the full model) under the
# 'checkpoint.ckpt' prefix in the current working directory.
model.save_weights('checkpoint.ckpt')