In [1]:
import tensorflow as tf
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import os
import itertools
import random
import taglib
import jams
import muda
import time
print(tf.__version__, librosa.__version__)
%matplotlib inline

('1.2.1', '0.5.1')


# Pipeline for the Second Hand Songs (SHS) dataset

In [2]:
# Root folder of the training data; later cells join file names onto it
# (the clique listing, the mp3 files and the "small_size_mp3s" sub-folder).
# NOTE(review): absolute, machine-specific path -- change it before running elsewhere.
TRAIN_FOLDER = "/Users/markostamenovic/code/shs/shs_train"

In [3]:
def txt_to_cliques(shs_loc, n_header_lines=14):
    '''
    Parse a Second Hand Songs clique listing into a dict of cover-song groups.

    The first `n_header_lines` lines of the file are skipped. After that, a
    line starting with '%' opens a new clique (the lower-cased line becomes
    the dict key) and every other non-blank line is a track entry: its first
    "<SEP>"-separated field (the MSD id) plus '.mp3' is appended to the
    current clique.

    Args:
        shs_loc: path to the clique text file.
        n_header_lines: number of leading lines to ignore. Defaults to 14,
            the value that was previously hard-coded.

    Returns:
        dict mapping clique key -> list of '<msd id>.mp3' file names.

    Raises:
        ValueError: if a track entry appears before any '%' clique line.
    '''
    # The context manager closes the handle instead of leaking it.
    with open(shs_loc) as shs_file:
        lines = shs_file.readlines()[n_header_lines:]

    cliques = {}
    current_key = None
    for line in lines:
        entry = line.rstrip('\r\n')
        if not entry:
            # Blank lines carry no data; indexing entry[0] on them would fail.
            continue
        if entry[0] == '%':
            current_key = entry.lower()
            cliques[current_key] = []
        elif current_key is None:
            raise ValueError(
                "track entry found before any '%' clique line: {!r}".format(entry))
        else:
            cliques[current_key].append(entry.split("<SEP>")[0] + '.mp3')
    return cliques
# Parse the training-set clique listing that lives directly under TRAIN_FOLDER.
cliques = txt_to_cliques(os.path.join(TRAIN_FOLDER, "shs_dataset_train.txt"))

In [4]:
def get_labels(cliques):
    '''
    Turn the clique dict into labelled song pairs.

    Positive examples are every 2-combination of songs within one clique and
    carry the label [0,1]. Negative examples are 2-combinations of one
    randomly chosen song per clique, carry the label [1,0], and are capped at
    the number of positive examples.

    Returns zip(pairs, labels), positives first and negatives after them.
    '''
    # every within-clique pair, flattened into one list
    cover_pairs = []
    for members in cliques.values():
        cover_pairs.extend(itertools.combinations(members, 2))
    n_cover_pairs = len(cover_pairs)

    # one random representative per clique; pairs of representatives are non-covers
    representatives = [random.choice(members) for members in cliques.values()]
    cross_clique_pairs = itertools.combinations(representatives, 2)
    non_cover_pairs = list(itertools.islice(cross_clique_pairs, n_cover_pairs))

    # a fresh label list per example, so no two examples share one object
    cover_labels = [[0, 1] for _ in range(n_cover_pairs)]
    non_cover_labels = [[1, 0] for _ in range(len(non_cover_pairs))]

    return zip(cover_pairs + non_cover_pairs, cover_labels + non_cover_labels)

In [5]:
def join_files_to_path(targets_labels):
    '''
    Prefix every file name of every target pair with TRAIN_FOLDER.

    Args:
        targets_labels: iterable of (target, label), where target is a
            sequence of file names.

    Returns:
        list of (list of joined paths, label) tuples, in input order.
    '''
    # Build concrete lists instead of map(): on Python 3 map() is lazy, and the
    # later np.array(targets_labels)[:,0][:,0] indexing needs real sequences.
    # On Python 2 the result is identical to the map() version.
    return [([os.path.join(TRAIN_FOLDER, name) for name in target], label)
            for target, label in targets_labels]

In [6]:
# Folder of mp3s to exclude (presumably undersized/corrupted downloads, going by its name).
# NOTE(review): prune_corrupted_files rebuilds this same path in a local variable,
# so this module-level value is not read by that function.
corrupted_folder = "/Users/markostamenovic/code/shs/shs_train/small_size_mp3s"

def prune_corrupted_files(targets_labels):
    '''
    Drop example pairs that reference a corrupted or missing mp3.

    A pair is kept only when neither file name is listed in
    TRAIN_FOLDER/small_size_mp3s and both file names are present directly in
    TRAIN_FOLDER.

    Args:
        targets_labels: iterable of (target, label), where target[0] and
            target[1] are bare mp3 file names.

    Returns:
        list of the (target, label) tuples that passed both checks.
    '''
    corrupted_folder = os.path.join(TRAIN_FOLDER, "small_size_mp3s")
    # Sets make each membership test O(1); the lists were scanned once per pair.
    corrupted = set(name for name in os.listdir(corrupted_folder) if name.endswith("mp3"))
    clean = set(name for name in os.listdir(TRAIN_FOLDER) if name.endswith("mp3"))

    pruned = []
    for target, label in targets_labels:
        first, second = target[0], target[1]
        if first in corrupted or second in corrupted:
            # Previously this branch was a bare `pass` that fell through to the
            # next `if`, so the corrupted-file check never excluded anything.
            continue
        if first not in clean or second not in clean:
            continue
        pruned.append((target, label))
    return pruned
    

In [8]:
def process_one_file(file_path, _duration, augmentation = True):
    '''
    Load a random excerpt of one audio file, optionally deform it, and return
    the magnitude of its constant-Q transform.

    Args:
        file_path: length-1 sequence holding the audio path; only element 0 is
            used (this is how get_slices passes it through tf.py_func).
        _duration: excerpt length in seconds.
        augmentation: when True, apply a random pitch shift and a random time
            stretch with muda before computing the CQT.

    Returns:
        float32 array with the magnitude of an 84-bin CQT (hop length 1024)
        computed on exactly _duration seconds of audio; deformed audio is
        truncated or zero-padded to that length first.
    '''
    # Author's timing note: roughly 578 ms per call.
    file_path = file_path[0]
    stretch_scale = .05

    # Pick a random start such that a full excerpt still fits inside the file.
    f_len = np.float32(taglib.File(file_path).length)
    start_bound = int(f_len - _duration)
    if start_bound > 0:
        start = float(np.random.randint(0, start_bound))
    else:
        # File is not longer than the excerpt: read from the beginning and let
        # the zero-padding below make up any shortfall.
        start = 0.0

    # The offset used to be passed as None, so the random start computed above
    # was never applied; it is now handed to the loader.
    y, sr = librosa.core.load(path = file_path,
                              offset = start,
                              duration = _duration,
                              res_type='kaiser_fast')

    # deformation pipeline
    if augmentation:
        jam = jams.JAMS()
        jam.file_metadata.duration = librosa.get_duration(y=y, sr=sr)
        jam.file_metadata.identifiers = os.path.basename(file_path)
        j_orig = muda.jam_pack(jam, _audio=dict(y=y, sr=sr))
        pitch_shift = muda.deformers.RandomPitchShift(n_samples=1, mean=0.0, sigma=1.0)
        time_stretch = muda.deformers.RandomTimeStretch(n_samples=1, location=0.0, scale=stretch_scale)
        # BackgroundNoise and DynamicRangeCompression deformers were tried by
        # the author and are currently disabled.
        pipeline = muda.Pipeline(steps=[('pitch_shift', pitch_shift),
                                        ('time_stretch', time_stretch)])
        out = list(pipeline.transform(j_orig))[0]
        # Look the audio up by key: unpacking .values() relies on dict ordering,
        # which can silently swap the signal and the sample rate.
        deformed_audio = out.sandbox.muda._audio
        y_t = deformed_audio['y']
        sr_t = deformed_audio['sr']
    else:
        y_t = y
        sr_t = sr

    # Time stretching changes the length, so truncate or zero-pad back to
    # exactly _duration seconds. Sizes are forced to int for slicing/padding.
    required_samples = int(_duration * sr_t)
    deformed_samples = y_t.shape[0]
    if deformed_samples > required_samples:
        y_t = y_t[:required_samples]
    else:
        y_t = np.pad(y_t, (0, required_samples - deformed_samples),
                     'constant', constant_values=(0, 0))

    # get cqt
    cqt = librosa.core.cqt(y_t, sr=sr_t,
                           hop_length=1024,
                           fmin=None, n_bins=84,
                           bins_per_octave=12,
                           tuning=0.0,
                           filter_scale=1,
                           norm=1,
                           sparsity=0.01,
                           window='hann',
                           scale=True)

    return np.abs(cqt).astype("float32")

In [9]:
# plt.figure(figsize=(10, 4))
# librosa.display.specshow(librosa.power_to_db(S,ref=np.max),
#                           y_axis='mel', fmax=11025,
#                           x_axis='time')
# plt.colorbar(format='%+2.0f dB')
# plt.title('Mel spectrogram')
# plt.tight_layout()

In [10]:
# Full example list: label the song pairs, prune pairs whose files are not
# usable, then prefix the remaining file names with TRAIN_FOLDER.
targets_labels = join_files_to_path(prune_corrupted_files(get_labels(cliques)))
# np.array(targets_labels)[:,0] selects the path pair of every example and the
# trailing [:,0] / [:,1] picks its first / second path; expand_dims then adds a
# trailing axis of size 1 before the cast to tf.string.
# NOTE(review): assumes np.array(targets_labels) forms a regular (n, 2, 2) array
# because both the path pair and the label have two entries -- TODO confirm.
targets_t_0 = tf.cast(tf.expand_dims(tf.convert_to_tensor(np.array(targets_labels)[:,0][:,0]),1), tf.string)
targets_t_1 = tf.cast(tf.expand_dims(tf.convert_to_tensor(np.array(targets_labels)[:,0][:,1]),1), tf.string)
# Label of every example, converted to float32.
labels_t = tf.convert_to_tensor(np.array(targets_labels)[:,1].astype("float32"))

In [11]:
# Input queue that yields one (first path, second path, label) example at a
# time, in shuffled order; num_epochs=None places no limit on the epoch count.
# NOTE(review): `labels` is not fed into the batching ops visible in this
# notebook (only the spectrogram tensor is) -- confirm that is intended.
f0, f1, labels = tf.train.slice_input_producer(
    [targets_t_0, targets_t_1, labels_t], 
    num_epochs=None, 
    shuffle=True, 
    seed=None, 
    capacity=10000)

In [12]:
def get_slices(filename, sr=22050, hop_length=1024, duration=15, file_read_dur=29):
    '''
    Build the graph ops that turn one audio file into shuffled, overlapping
    CQT windows.

    Args:
        filename: string tensor with the audio path; forwarded to
            process_one_file through tf.py_func.
        sr: sample rate used to size the windows. NOTE(review): it is not
            passed to process_one_file, so it must match the rate that
            function loads audio at.
        hop_length: CQT hop length in samples, used to size the windows
            (likewise not forwarded to process_one_file).
        duration: length of each window in seconds.
        file_read_dur: seconds of audio process_one_file reads per file.

    Returns:
        float32 tensor of shape [n_windows, 84, slice_size], holding every
        window start position exactly once, in random order.
    '''
    frames_per_sec = int(sr/hop_length)
    slice_size = duration*frames_per_sec
    # Expected number of CQT frames for file_read_dur seconds of audio. This
    # used to be the literal 625, which is what the expression yields for the
    # defaults (assumes one frame per hop plus one -- TODO confirm for other
    # hop lengths).
    ncols = 1 + int(file_read_dur*sr)//hop_length
    spec_1_t = tf.py_func(process_one_file, [filename, file_read_dur], tf.float32)
    # The static shape is declared by hand so the slicing below can be built.
    spec_1_t.set_shape([84, ncols])
    # One window per valid start column i, each slice_size columns wide.
    at_win = tf.map_fn(lambda i: spec_1_t[:,i:i+slice_size], tf.range(ncols - slice_size+1), dtype=tf.float32)
    at_shuff = tf.random_shuffle(at_win)
    return at_shuff

In [13]:
# Stack the window tensors of the two songs along a new axis 1, so each row
# pairs one window of f0 with one window of f1. get_slices shuffles each song's
# windows independently, so the pairing of window positions is random.
specs = tf.stack([get_slices(f0), get_slices(f1)],1)

In [None]:
# Alternative batching op built with shuffle_batch_join; this cell has no
# execution count, i.e. it was not run in the saved notebook.
# NOTE(review): [[specs]]*8 repeats the SAME tensor list eight times rather
# than building eight independent reader pipelines -- confirm that is intended.
batch_j = tf.train.shuffle_batch_join(
    
    [[specs]]*8,                           # this many parallel
    batch_size = 16,                       # dequeue this many
    capacity = 8192,                       # this is the queue capacity
    min_after_dequeue = 7192,              # minimum after dq
    enqueue_many = True,                   # push multiple examples to queue for each input producer
    shapes = tf.TensorShape([2, 84, 315])  # shape of the non-batch dimension

)

In [15]:
# Batching op used by the timing loop: each row of `specs` is enqueued as a
# separate example (enqueue_many=True) and batches of 16 are dequeued, giving
# the (16, 2, 84, 315) arrays printed in the output below.
batch = tf.train.shuffle_batch(
    
    [specs],                            
    batch_size = 16,                       # dequeue this many
    capacity = 8192,                       # this is the queue capacity
    min_after_dequeue = 7192,              # minimum after dq
    num_threads=8,
    enqueue_many = True,                   # push multiple examples to queue for each input producer 
    shapes = tf.TensorShape([2, 84, 315])  # shape of the non-batch dimension

)

In [17]:
# Throughput check: pull 100 batches from the input queue and report how many
# batches per second the pipeline delivers.
with tf.Session() as sess:
    # NOTE(review): `init` is created but never passed to sess.run in this cell.
    init = tf.global_variables_initializer()
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    sps = []
    try:
        for _ in range(100):
            t0 = time.time()
            a = sess.run(batch)
            print(a.shape)
            elapsed = time.time() - t0
            steps_sec = 1/elapsed
            # .format() goes inside the parentheses so the line prints the same
            # text under both the print statement and the print function.
            print("steps per sec = {}".format(steps_sec))
            sps.append(steps_sec)
        print("mean steps per sec = {}".format(np.mean(sps[4:]))) #drop the first few steps bc thats filling queue
    finally:
        # Always stop and join the queue-runner threads, even if a step fails.
        coord.request_stop()
        coord.join(threads)

(16, 2, 84, 315)
steps per sec = 0.0311478069152
(16, 2, 84, 315)
steps per sec = 0.308465083386
(16, 2, 84, 315)
steps per sec = 0.335959619844
(16, 2, 84, 315)
steps per sec = 7.3252076111
(16, 2, 84, 315)
steps per sec = 7.38224552548
(16, 2, 84, 315)
steps per sec = 204.910059114
(16, 2, 84, 315)
steps per sec = 166.137368296
(16, 2, 84, 315)
steps per sec = 128.616233786
(16, 2, 84, 315)
steps per sec = 180.765590656
(16, 2, 84, 315)
steps per sec = 295.602508986
(16, 2, 84, 315)
steps per sec = 303.583092067
(16, 2, 84, 315)
steps per sec = 176.9300599
(16, 2, 84, 315)
steps per sec = 63.4078732539
(16, 2, 84, 315)
steps per sec = 0.334113379219
(16, 2, 84, 315)
steps per sec = 116.69970229
(16, 2, 84, 315)
steps per sec = 13.8389336149
(16, 2, 84, 315)
steps per sec = 10.1574459532
(16, 2, 84, 315)
steps per sec = 157.882406083
(16, 2, 84, 315)
steps per sec = 34.1158423009
(16, 2, 84, 315)
steps per sec = 17.1279973865
(16, 2, 84, 315)
steps per sec = 173.914831861
(16, 2, 84, 