In [76]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""
import soundfile as sf
import resampy
import numpy as np
import minispec
import keras
import pyfftw
import time
import random
from sklearn.externals import joblib
import pickle as pk
from IPython.display import Audio

minispec.set_fftlib(pyfftw.interfaces.numpy_fft)

US8K_CLASSES = {
    0: 'air_conditioner',
    1: 'car_horn',
    2: 'children_playing',
    3: 'dog_bark',
    4: 'drilling',
    5: 'engine_idling',
    6: 'gun_shot',
    7: 'jackhammer',
    8: 'siren',
    9: 'street_music'
}

In [2]:
def pcm2float(sig, dtype='float64'):
    """Convert PCM signal to floating point with a range from -1 to 1.
    Use dtype='float32' for single precision.
    Parameters
    ----------
    sig : array_like
        Input array, must have integral type.
    dtype : data type, optional
        Desired (floating point) data type.
    Returns
    -------
    numpy.ndarray
        Normalized floating point data.
    See Also
    --------
    float2pcm, dtype
    """
    sig = np.asarray(sig)
    if sig.dtype.kind not in 'iu':
        raise TypeError("'sig' must be an array of integers")
    dtype = np.dtype(dtype)
    if dtype.kind != 'f':
        raise TypeError("'dtype' must be a floating point type")

    i = np.iinfo(sig.dtype)
    abs_max = 2 ** (i.bits - 1)
    offset = i.min + abs_max
    return (sig.astype(dtype) - offset) / abs_max

In [3]:
POOLINGS = {
    'linear': {
        6144: (8, 8),
        512: (32, 24),
    },
    'mel128': {
        6144: (4, 8),
        512: (16, 24),
    },
    'mel256': {
        6144: (8, 8),
        512: (32, 24),
    }
}

In [4]:


def construct_mlp_model(input_shape, weight_decay=1e-5, num_classes=10):
    """
    Constructs a multi-layer perceptron model
    Args:
        input_shape: Shape of input data
                     (Type: tuple[int])
        weight_decay: L2 regularization factor
                      (Type: float)
    Returns:
        model: L3 CNN model
               (Type: keras.models.Model)
        input: Model input
               (Type: list[keras.layers.Input])
        output:Model output
                (Type: keras.layers.Layer)
    """
    l2_weight_decay = keras.regularizers.l2(weight_decay)
    inp = keras.layers.Input(shape=input_shape, dtype='float32')
    y = keras.layers.Dense(512, activation='relu', kernel_regularizer=l2_weight_decay)(inp)
    y = keras.layers.Dense(128, activation='relu', kernel_regularizer=l2_weight_decay)(y)
    y = keras.layers.Dense(num_classes, activation='softmax', kernel_regularizer=l2_weight_decay)(y)
    m = keras.models.Model(inputs=inp, outputs=y)
    m.name = 'urban_sound_classifier'

    return m, inp, y


In [30]:
def amplitude_to_db(x, amin=1e-10, dynamic_range=80.0):
    magnitude = np.abs(S)
    power = np.square(magnitude, out=magnitude)

    ref_value = power.max()

    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
    log_spec -= log_spec.max()

    log_spec = np.maximum(log_spec, -dynamic_range)

    return log_spec

In [94]:
audio_dir = "/beegfs/jtc440/UrbanSound8K/audio/fold1"
audio_path = os.path.join(audio_dir, random.choice(os.listdir(audio_dir)))
model_path = "/scratch/jtc440/sonyc_el3_models/openl3_audio_mel256_music.h5"
classifier_path = "/scratch/jtc440/us8k-music-melspec2-512emb-model/model.h5"

In [65]:
ls /scratch/jtc440/us8k-music-melspec2-512emb-model/

[0m[01;32mconfig.json[0m*             [01;32mhistory_csvlog.csv[0m*  [01;32mmodel.h5[0m*     [01;32mstdizer.pkl[0m*
[01;32mhistory_checkpoint.pkl[0m*  [01;32mmin_max_scaler.pkl[0m*  [01;32mresults.pkl[0m*


In [95]:
with open('/scratch/jtc440/us8k-music-melspec2-512emb-model/stdizer.pkl', 'rb') as f:
    stdizer = joblib.load(f)

In [96]:
Audio(audio_path)

In [48]:
model = keras.models.load_model(model_path)
model_type = os.path.basename(model_path).split('_')[2]
embedding_size = 512

pool_size = POOLINGS[model_type][embedding_size]
y_a = keras.layers.MaxPooling2D(pool_size=pool_size, padding='same')(model.output)
y_a = keras.layers.Flatten()(y_a)
model = keras.models.Model(inputs=model.input, outputs=y_a)




In [49]:
m_class, inp, out = construct_mlp_model((512,))
m_class.load_weights(classifier_path)

In [97]:
start_ts = time.time()

audio_data, sr = sf.read(audio_path)
audio_data = audio_data.flatten()


if sr != 48000:
    resampy.resample(audio_data, sr, 48000)
    
frames = minispec.util.frame(audio_data, frame_length=48000, hop_length=48000).T

frame_specs = []

for frame in frames:
    # Compute spectrogram
    if model_type == 'mel256':
        S = np.abs(minispec.core.stft(frame, n_fft=2048, hop_length=242,
                                      window='hann', center=True,
                                      pad_mode='constant'))
        S = minispec.feature.melspectrogram(sr=48000, S=S,
                                                     n_mels=256, power=1.0,
                                                     htk=True)
    elif model_type == 'mel128':
        S = np.abs(minispec.core.stft(frame, n_fft=2048, hop_length=242,
                                      window='hann', center=True,
                                      pad_mode='constant'))
        S = minispec.feature.melspectrogram(sr=48000, S=S,
                                                     n_mels=128, power=1.0,
                                                     htk=True)
    else:

        S = np.abs(minispec.core.stft(frame, n_fft=512, hop_length=242,
                                               window='hann', center=True,
                                               pad_mode='constant'))
    S = amplitude_to_db(np.array(S))
    frame_specs.append(S)

# Convert amplitude to dB
spec_data = np.array(frame_specs)[:,:,:,np.newaxis]
emb_data = model.predict(spec_data)
emb_data = stdizer.transform(emb_data)
output = m_class.predict(emb_data)
label_idx = output.mean(axis=0).argmax()
label = US8K_CLASSES[label_idx]

end_ts = time.time()

print("Took {} seconds".format(end_ts - start_ts))



Took 4.278005361557007 seconds


In [99]:
(output * 100).astype(int)

array([[ 0,  0,  0, 17,  0,  0,  0,  0,  0, 81],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0, 98],
       [ 0,  0,  0,  7,  1,  0,  0,  0,  0, 91],
       [ 0,  0,  0,  2,  1,  0,  0,  0,  0, 95],
       [ 0,  0,  0,  5,  2,  0,  0,  0,  0, 91],
       [ 0,  0,  0,  2,  1,  0,  0,  0,  0, 96],
       [ 0,  0,  0,  3,  0,  0,  0,  0,  0, 96],
       [ 0,  0,  0,  2,  1,  0,  0,  0,  0, 95]])

In [98]:
label

'street_music'