In [1]:
import tensorflow as tf
from keras import layers
from loguru import logger
import librosa
import librosa.display
from librosa import mel_frequencies
import numpy as np
import tensorflow_datasets as tfds
from tensorflow.io import FixedLenFeature, parse_single_example

from pathlib2 import Path
import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import IntProgress
%matplotlib inline



print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [15]:
DATA_DIR = Path("../data/").resolve()
DATA_DIR

WindowsPath('D:/Projects/rimworld-of-sound/data')

In [24]:
class HParams(object):
    """ Hparams was removed from tf 2.0alpha so this is a placeholder
    """
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

hparams = HParams( 
    # network
    batch_size = 32,
    # spectrogramming
    sample_rate = 16000,
    create_spectrogram = True,
    win_length = 1024,
    n_fft = 1024,
    hop_length= 400,
    ref_level_db = 50,
    min_level_db = -100,
    # mel scaling
    num_mel_bins = 128,
    mel_lower_edge_hertz = 0,
    mel_upper_edge_hertz = 8000,
    # inversion
    power = 1.5, # for spectral inversion
    griffin_lim_iters = 50,
    pad=True,
    #
)

In [44]:
batch_size = 32
audio_length = 64_000

features = {
    "id": FixedLenFeature([], dtype=tf.string),
    "pitch": FixedLenFeature([1], dtype=tf.int64),
    "velocity": FixedLenFeature([1], dtype=tf.int64),
    "audio": FixedLenFeature([64000], dtype=tf.float32),
    "instrument/source": FixedLenFeature([1], dtype=tf.int64),
    "instrument/family": FixedLenFeature([1], dtype=tf.int64),
    "instrument/label": FixedLenFeature([1], dtype=tf.int64),
}
DATA_PATH_VALID = '../data/nsynth-valid.tfrecord'
ds_train = tf.data.TFRecordDataset(DATA_PATH_VALID)\
#     .map(_parse_function) \
#     .batch(batch_size)
DATA_PATH_TEST = '../data/nsynth-test.tfrecord'
ds_test = tf.data.TFRecordDataset(DATA_PATH_TEST)\
#     .map(_parse_function) \
#     .batch(batch_size)

def _stft_tensorflow(signals, hparams):
    return tf.signal.stft(
        signals,
        hparams.win_length,
        hparams.hop_length,
        hparams.n_fft,
        pad_end=True,
        window_fn=tf.signal.hann_window,
    )

for raw_record in ds_test.take(1):
    example = tf.io.parse_single_example(raw_record, features)
    print(_stft_tensorflow(example["audio"], hparams))

InvalidArgumentError: Feature: id (data type: string) is required but could not be found. [Op:ParseExampleV2]

In [17]:
# ds_train, ds_test = tfds.load(
#     name="nsynth", split=["valid", "test"], data_dir=DATA_DIR
# )

In [18]:
def _normalize_tensorflow(S, hparams):
    return tf.clip_by_value((S - hparams.min_level_db) / -hparams.min_level_db, 0, 1)

def _tf_log10(x):
    numerator = tf.math.log(x)
    denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
    return numerator / denominator


def _amp_to_db_tensorflow(x):
    return 20 * _tf_log10(tf.clip_by_value(tf.abs(x), 1e-5, 1e100))


def _stft_tensorflow(signals, hparams):
    return tf.signal.stft(
        signals,
        hparams.win_length,
        hparams.hop_length,
        hparams.n_fft,
        pad_end=True,
        window_fn=tf.signal.hann_window,
    )


def spectrogram_tensorflow(y, hparams):
    D = _stft_tensorflow(y, hparams)
    S = _amp_to_db_tensorflow(tf.abs(D)) - hparams.ref_level_db
    return _normalize_tensorflow(S, hparams)

In [20]:
def _one_hot(tensor: tf.Tensor, size) -> tf.Tensor:
    """
    One hot encode a tensor and return it as 1D tensor
    :param tensor:
    :param size: number of unique values in tensor
    :return:
    """
    hot_tensor = tf.one_hot(tensor, size)
    shaped_tensor = tf.reshape(hot_tensor, (size,))
    return shaped_tensor

def _stft(tensor: tf.Tensor, audio_length, frame_length=255, frame_step=128) -> tf.Tensor:

    # Concatenate audio with padding so that all audio clips will be of the 
    # same length

    return tf.abs(tf.signal.stft(tensor, frame_length, frame_step))



In [21]:
@tf.autograph.experimental.do_not_convert
def _parse_function(example_proto):
    # Schema
    audio_length=64_000
    features = {
        "pitch": tf.io.FixedLenFeature([1], dtype=tf.int64),
        "audio": tf.io.FixedLenFeature([audio_length], dtype=tf.float32),
        "velocity": tf.io.FixedLenFeature([1], dtype=tf.int64),
        "instrument_family": tf.io.FixedLenFeature([1], dtype=tf.int64),
    }
    example = tf.io.parse_single_example(example_proto, features)
    data = _stft(example['audio'], audio_length)
    label_name = 'instrument_family'
    label_value_count = 11 #instrument family has 11 types 0,1,2,3,...10
    #label_value_count = 128
    # label_name = 'velocity'
    # label_value_count = 4
    label = _one_hot(example[label_name], label_value_count)
    return data, label

In [22]:
def plot_sliding_spectrum(S_abs):
    librosa.display.specshow(librosa.amplitude_to_db(S_abs, ref=np.max), y_axis='log', x_axis='time')

def run():
    DATA_PATH = '../data/nsynth-test.tfrecord'
    batch_size = 32
    audio_length = 64_000
    parsed_dataset = tf.data \
        .TFRecordDataset(DATA_PATH) \
        .map(_parse_function) \
        .batch(batch_size)
    
    
    df = tfds.as_dataframe(parsed_dataset.take(1))


        
    
#     model = tf.keras.Sequential([
#         layers.Input(shape=(audio_length,), batch_size=batch_size),
#         layers.Reshape(target_shape=(audio_length, 1)),
#         layers.Conv1D(32, 10, activation='relu'),
#         layers.MaxPooling1D(2),
#         layers.Conv1D(64, 10, activation='relu'),
#         layers.MaxPooling1D(256),
#         layers.Flatten(),
#         layers.Dense(128, activation='softmax')
#     ])
#     model.compile(
#         optimizer='adam',
#         loss='binary_crossentropy',
#         metrics=[tf.keras.metrics.Accuracy()]
#     )
#     model.summary()
#     model.fit(parsed_dataset, epochs=50)
if __name__ == "__main__":
    run()

TypeError: sequence item 0: expected str instance, int found