## Synthesising single samples from a trained model

In [1]:
import tensorflow as tf
import numpy as np
import json
from IPython.display import display, Audio
from tqdm import tqdm
import librosa
import soundfile
import os

#### Get the trained model and class labels

In [2]:
# Locations of the trained generator, its label dictionary, and the folder
# that receives the synthesised audio (all under the same checkpoint directory).
path_to_generator = 'checkpoints/23-04-2021_19h/generator.h5'
path_to_labels = 'checkpoints/23-04-2021_19h/label_names.json'
path_to_output = 'checkpoints/23-04-2021_19h/generated_audio'
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race;
# it raises FileExistsError if the path exists but is not a directory.
os.makedirs(path_to_output, exist_ok=True)

# Length of the noise vector fed to the generator.
# NOTE(review): presumably must match the value used at training time - confirm.
z_dim = 100
# Sample rate used for notebook playback and for the WAV files written to disk.
sample_rate = 16000

In [3]:
#restore the trained generator network from its saved .h5 checkpoint
generator = tf.keras.models.load_model(filepath=path_to_generator)



In [4]:
#load the class-index -> class-name dictionary stored next to the checkpoint
with open(path_to_labels) as labels_file:
    label_names = json.loads(labels_file.read())
label_names

{'0': 'cardboard_paper',
 '1': 'carpet_rug',
 '2': 'concrete_cement_pavement',
 '3': 'dirt_gravel',
 '4': 'grass_leaves_twings',
 '5': 'wood'}

#### Generating a single sample (with label)

In [5]:
#create noise and label
label = 0
noise = np.random.normal(0,1, (1, z_dim))
label_synth = np.array(label).reshape(-1,1)

#synthesise the audio
%time synth_audio = generator.predict([noise, label_synth])

#listen to the synthesised audio
display(Audio(np.squeeze(synth_audio[0]), rate = sample_rate))

CPU times: user 383 ms, sys: 25.9 ms, total: 409 ms
Wall time: 332 ms


#### Batch generation

In [6]:
#number of clips to synthesise for each class label in the batch run below
n_samples_label = 100

In [7]:
# Synthesise `n_samples_label` clips for every class in `label_names` and write
# each one to `path_to_output` as `<class name>_<index>.wav`.
for label_id, label_name in tqdm(label_names.items()):
    # one batch per class: fresh noise vectors, all conditioned on the same label
    noise = tf.random.normal(shape=(n_samples_label, z_dim))
    # the dictionary keys are strings (loaded from JSON), so convert the index to int
    label_synth = tf.constant(int(label_id), shape=(n_samples_label, 1))
    synth_audio = generator.predict([noise, label_synth])
    for i in range(n_samples_label):
        # subtype/endian/format/closefd are left at soundfile's defaults
        soundfile.write(file=os.path.join(path_to_output, f'{label_name}_{i}.wav'),
                        data=np.squeeze(synth_audio[i]),
                        samplerate=sample_rate)

100%|██████████| 6/6 [00:10<00:00,  1.75s/it]
