In [1]:
import numpy as np
import soundfile
import librosa

import os

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Conv2D, Bidirectional, GRU, Dense, TimeDistributed
from tensorflow.keras.layers import Dropout, BatchNormalization
from tensorflow.keras.layers import Input, MaxPooling2D, Reshape, MaxPool2D, Activation
from tensorflow.keras.optimizers import Adam

In [6]:
def compute_spectrogram_feature(samples, sample_rate, stride_ms=10.0,
                                window_ms=20.0, max_freq=None, eps=1e-14):
    """Compute the log power spectrogram of a mono waveform.

    The waveform is cut into overlapping frames, each frame is weighted with
    a Hann window, and the scaled squared magnitude of its real FFT is
    log-compressed. More about spectrogram computation, please refer to:
    https://en.wikipedia.org/wiki/Short-time_Fourier_transform.

    Args:
        samples: 1-D array-like of audio samples.
        sample_rate: Sampling rate of ``samples`` in Hz.
        stride_ms: Hop between consecutive frames, in milliseconds.
        window_ms: Length of each frame, in milliseconds.
        max_freq: Highest frequency (Hz) to keep; defaults to
            ``sample_rate / 2``.
        eps: Small constant added before the logarithm to avoid ``log(0)``.

    Returns:
        2-D array of shape ``(num_frames, num_freq_bins)`` holding the
        natural log of the power spectrum.

    Raises:
        ValueError: If ``max_freq`` exceeds half the sample rate, the stride
            is larger than the window, the stride rounds down to zero
            samples, ``samples`` is not 1-D, or ``samples`` is shorter than
            one window.
    """
    if max_freq is None:
        max_freq = sample_rate / 2
    if max_freq > sample_rate / 2:
        raise ValueError("max_freq must not be greater than half of sample rate.")

    if stride_ms > window_ms:
        raise ValueError("Stride size must not be greater than window size.")

    samples = np.asarray(samples)
    if samples.ndim != 1:
        # The strided view below is built from strides[0] only; a 2-D
        # (multi-channel) array would silently yield garbage windows.
        raise ValueError("samples must be a 1-D (mono) waveform.")

    stride_size = int(0.001 * sample_rate * stride_ms)
    window_size = int(0.001 * sample_rate * window_ms)
    if stride_size < 1:
        raise ValueError("stride_ms is too small for the given sample rate.")
    if len(samples) < window_size:
        raise ValueError("samples must contain at least one full window.")

    # Extract strided windows: drop the tail that does not fill a whole hop,
    # then view the buffer as one column per frame without copying.
    truncate_size = (len(samples) - window_size) % stride_size
    samples = samples[:len(samples) - truncate_size]
    nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
    nstrides = (samples.strides[0], samples.strides[0] * stride_size)
    windows = np.lib.stride_tricks.as_strided(
        samples, shape=nshape, strides=nstrides, writeable=False)
    # Sanity check of the stride arithmetic; it needs a second frame to
    # compare against, which very short inputs do not have.
    if windows.shape[1] > 1:
        assert np.all(
            windows[:, 1] == samples[stride_size:(stride_size + window_size)])

    # Window weighting, squared Fast Fourier Transform (fft), scaling
    weighting = np.hanning(window_size)[:, None]
    fft = np.fft.rfft(windows * weighting, axis=0)
    fft = np.absolute(fft)
    fft = fft**2
    scale = np.sum(weighting**2) * sample_rate
    # Interior bins are doubled relative to the first and last bins.
    fft[1:-1, :] *= (2.0 / scale)
    fft[(0, -1), :] /= scale
    # Prepare fft frequency list
    freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])

    # Compute spectrogram feature: keep bins up to max_freq, log-compress,
    # and return with frames along the first axis.
    ind = np.where(freqs <= max_freq)[0][-1] + 1
    specgram = np.log(fft[:ind, :] + eps)
    return np.transpose(specgram, (1, 0))

In [7]:
# Sample FLAC utterance used to try out the spectrogram feature extraction.
path = '../data/toy_raw/174/168635/174-168635-0000.flac'

# soundfile.read returns the decoded samples (au) and the file's sample rate (sr).
au, sr = soundfile.read(path)

In [12]:
# Pass the sample rate reported for the file instead of a hardcoded 16000 so
# the window/stride sizes stay correct for audio recorded at another rate.
feature = compute_spectrogram_feature(au, sr, stride_ms=10.0, window_ms=20.0)
feature.shape

(452, 161)

In [2]:
def ctc_loss_lambda_func(y_true, y_pred):
    """Mean CTC loss over a batch, usable as a Keras ``loss`` callable.

    Args:
        y_true: Label tensor. Entries equal to 0 are not counted towards the
            label length, i.e. 0 is treated as padding here. Tensors with
            more than two axes are flattened to ``(batch, -1)``.
        y_pred: Per-timestep class probabilities with the batch on axis 0 and
            time on axis 1, as indexed below.

    Returns:
        Scalar tensor: the CTC cost averaged over the batch.
    """

    if len(y_true.shape) > 2:
        # Collapse the extra axes but always keep the batch axis; a bare
        # tf.squeeze would also drop the batch axis when the batch size is 1.
        y_true = tf.reshape(y_true, (tf.shape(y_true)[0], -1))

    # Every sample uses the full time axis of y_pred. Read that length from
    # the tensor shape: summing the float softmax outputs can land just
    # below the true count and be truncated by the later int cast.
    pred_shape = tf.shape(y_pred)
    input_length = tf.fill(tf.stack([pred_shape[0], 1]), pred_shape[1])
    label_length = tf.math.count_nonzero(y_true, axis=-1, keepdims=True, dtype="int64")

    loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    loss = tf.reduce_mean(loss)

    return loss


def deep_speech(input_size, units, rnn_layers, is_bi, activation = 'relu', output_dim=29, learning_rate=3e-4):
    """Build and compile a convolutional + recurrent acoustic model.

    The network is: one strided 2-D convolution with batch normalization, a
    reshape that folds the remaining feature and channel axes together, a
    stack of GRU layers (each followed by batch normalization), and a
    per-timestep dense layer with softmax. It is compiled with Adam and
    ``ctc_loss_lambda_func``, and its summary is printed.

    Args:
        input_size: Indexable whose first two items are used as the first two
            input dimensions; a trailing channel axis of size 1 is appended.
        units: Number of units in every GRU layer.
        rnn_layers: How many GRU layers to stack.
        is_bi: If truthy, wrap every GRU in ``Bidirectional``.
        activation: Activation passed to the GRU layers.
        output_dim: Size of the softmax output at each timestep.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        The compiled Keras ``Model``.
    """
    # Main acoustic input with an explicit single channel
    input_data = Input(name='the_input', shape=(input_size[0], input_size[1], 1))

    # Explicit zero padding per axis: batch, first spatial, second spatial, channel
    pad_rows, pad_cols = 20, 5
    explicit_padding = [[0, 0], [pad_rows, pad_rows], [pad_cols, pad_cols], [0, 0]]
    conv_out = Conv2D(filters=32,
                      kernel_size=(41, 11),
                      strides=(2, 2),
                      padding=explicit_padding)(input_data)
    x = BatchNormalization()(conv_out)

    # Merge the last two axes so the recurrent layers see one vector per step
    conv_shape = x.get_shape()
    x = Reshape((conv_shape[1], conv_shape[2] * conv_shape[3]))(x)

    # Recurrent stack; the GRU is created first and then optionally wrapped
    for layer_number in range(1, rnn_layers + 1):
        recurrent = GRU(units, activation=activation,
                        return_sequences=True, name='rnn_{}'.format(layer_number))
        if is_bi:
            recurrent = Bidirectional(recurrent)
        x = recurrent(x)
        x = BatchNormalization()(x)

    # Per-timestep projection to output_dim, normalised with softmax
    logits = TimeDistributed(Dense(output_dim))(x)
    y_pred = Activation('softmax', name='softmax')(logits)

    model = Model(inputs=input_data, outputs=y_pred)

    # Compile with the CTC loss and print the layer overview
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss=ctc_loss_lambda_func)
    model.summary()

    return model

In [3]:
import train
import config

# Look up the dataset details; the two keys read below set the input size.
data_detail = train.get_data_detail('toy_final')

# Recurrent-stack hyperparameters are taken from the project config module.
model = deep_speech(
    input_size=(data_detail['max_input_length'], data_detail['num_features']),
    units=config.model_architecture['units_rnn'],
    rnn_layers=config.model_architecture['rnn_layers'],
    is_bi=config.model_architecture['is_bi'],
)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, 1406, 40, 1)]     0         
_________________________________________________________________
conv2d (Conv2D)              (None, 703, 20, 32)       14464     
_________________________________________________________________
batch_normalization (BatchNo (None, 703, 20, 32)       128       
_________________________________________________________________
reshape (Reshape)            (None, 703, 640)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 703, 512)          1379328   
_________________________________________________________________
batch_normalization_1 (Batch (None, 703, 512)          2048      
_________________________________________________________________
bidirectional_1 (Bidirection (None, 703, 512)          118272