In [1]:
import tensorflow as tf
import numpy as np

In [61]:
# Class names, alphabetically sorted. The position in this list is what gets
# looked up with the argmax of the model's prediction vector further down.
# NOTE(review): presumably this order matches the label encoding used at
# training time -- confirm against the training code.
labels = [
    '_background_noise_',
    'backward', 'bed', 'bird',
    'cat',
    'dog', 'down',
    'eight',
    'five', 'follow', 'forward', 'four',
    'go',
    'happy', 'house',
    'learn', 'left',
    'marvin',
    'nine', 'no',
    'off', 'on', 'one',
    'right',
    'seven', 'sheila', 'six', 'stop',
    'three', 'tree', 'two',
    'up',
    'visual',
    'wow',
    'yes',
    'zero',
]

In [3]:
def get_spectrogram(waveform, frame_length=550, frame_step=128, mel=True,
                    sample_rate=16000, lower_edge_hertz=0.0,
                    upper_edge_hertz=3000.0, num_mel_bins=120):
    """Compute a log-magnitude (optionally log-mel) spectrogram via the STFT.

    Args:
        waveform: Float tensor of audio samples; the STFT is taken over the
            last axis.
        frame_length: STFT window length in samples.
        frame_step: Hop between consecutive STFT frames in samples.
        mel: When True (default) the magnitude spectrogram is projected onto
            a mel filterbank before the log; when False the log of the linear
            magnitude spectrogram is returned instead.
        sample_rate: Sample rate in Hz used to build the mel filterbank.
        lower_edge_hertz: Lowest frequency covered by the mel filterbank.
        upper_edge_hertz: Highest frequency covered by the mel filterbank.
        num_mel_bins: Number of mel bands produced.

    Returns:
        mel=True: a ``tf.Tensor`` of ``log(mel + 1e-6)`` with the frequency
        axis replaced by ``num_mel_bins`` and a trailing axis of size 1.
        mel=False: a NumPy array holding the log of the transposed magnitude
        spectrogram (plus machine epsilon) with a trailing axis of size 1.
    """
    magnitude = tf.abs(
        tf.signal.stft(waveform, frame_length=frame_length, frame_step=frame_step)
    )

    if not mel:
        # The linear path does not need the mel filterbank, so return before
        # building it. `.numpy()` only works under eager execution, and `.T`
        # reverses *all* axes (not just the last two) for batched input.
        return np.log(magnitude.numpy().T + np.finfo(float).eps)[..., tf.newaxis]

    mel_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins=num_mel_bins,
        num_spectrogram_bins=magnitude.shape[-1],
        sample_rate=sample_rate,
        lower_edge_hertz=lower_edge_hertz,
        upper_edge_hertz=upper_edge_hertz,
    )
    # Contract the frequency axis of the spectrogram with the filterbank.
    mel_spectrogram = tf.tensordot(magnitude, mel_matrix, 1)
    # tensordot can lose static shape information; restore it explicitly.
    mel_spectrogram.set_shape(
        magnitude.shape[:-1].concatenate(mel_matrix.shape[-1:])
    )
    # Small offset keeps the log finite where the mel energy is zero.
    log_mel = tf.math.log(mel_spectrogram + 1e-6)
    return log_mel[..., tf.newaxis]

In [68]:
# Restore the trained Keras model from its saved-model directory.
model = tf.keras.models.load_model('models/CNN_Bi_3convLSTMPooling_softmax/')

# Read the raw WAV bytes, then decode them as a single channel, keeping every
# sample (desired_samples=-1).
x = tf.io.read_file('tests/three.wav')
test_audio, samplerate = tf.audio.decode_wav(
    contents=x,
    desired_channels=1,
    desired_samples=-1,
)

# Flatten to one row of samples and convert both results to plain NumPy values.
test_audio = tf.reshape(test_audio, (1, -1)).numpy()
samplerate = samplerate.numpy()

In [69]:
# Inspect the decoded clip: one row holding every sample of the file.
test_audio.shape

(1, 21153)

In [70]:
# Keep a 16000-sample window starting at sample 1000 (one second at the
# 16 kHz rate this notebook assumes elsewhere).
new_test_audio = test_audio[:, 1000:1000 + 16000]

In [71]:
# Confirm the trimmed clip is a single row of 16000 samples.
new_test_audio.shape

(1, 16000)

In [72]:
from IPython.display import Audio

# Render an inline player for the trimmed clip, played back at 16 kHz.
Audio(data=new_test_audio, rate=16000)

In [73]:
# Turn the trimmed clip into a float32 tensor and compute its spectrogram
# with the function's default settings.
test_spectrogram = get_spectrogram(
    tf.convert_to_tensor(new_test_audio, dtype=tf.float32)
)

In [74]:
# Check the shape of the tensor that will be fed to the model.
test_spectrogram.shape

TensorShape([1, 121, 120, 1])

In [75]:
# Run the model on the spectrogram and map the index of the highest score
# (taken over the flattened prediction array) back to its class name.
labels[np.argmax(model.predict(test_spectrogram), axis=None)]



'three'