In [1]:
import numpy as np
import soundfile
import librosa

import os

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Conv2D, Bidirectional, GRU, Dense, TimeDistributed
from tensorflow.keras.layers import Dropout, BatchNormalization
from tensorflow.keras.layers import Input, MaxPooling2D, Reshape, MaxPool2D, Activation
from tensorflow.keras.optimizers import Adam

In [6]:
def ctc_loss_lambda_func(y_true, y_pred):
    """Compute the CTC loss averaged over the batch.

    Args:
        y_true: Label tensor whose first axis is the batch axis. The label
            value 0 is treated as padding when label lengths are counted.
            Tensors of rank > 2 are flattened to 2-D while keeping the
            batch axis.
        y_pred: Prediction tensor handed to ``K.ctc_batch_cost``; axis 0 is
            read as the batch axis and axis 1 as the time-step axis.

    Returns:
        Scalar tensor: the mean of the per-sample CTC costs.
    """

    if len(y_true.shape) > 2:
        # Flatten the extra axes but always keep the batch axis. A bare
        # tf.squeeze(y_true) drops *every* size-1 axis, which also removes
        # the batch axis for a batch of one sample (or the label axis for
        # labels of length 1) and breaks the (batch, 1) lengths below.
        y_true = tf.reshape(y_true, [tf.shape(y_true)[0], -1])

    # Every sample uses all time steps of y_pred, so read the count from the
    # tensor shape. Summing the predicted probabilities instead only gives
    # the step count when each row sums to exactly 1, and the float result
    # can land just below the integer and be truncated.
    pred_shape = tf.shape(y_pred)
    input_length = tf.fill([pred_shape[0], 1], pred_shape[1])

    # Number of non-zero (non-padding) labels per sample, shape (batch, 1).
    label_length = tf.math.count_nonzero(y_true, axis=-1, keepdims=True, dtype="int64")

    loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)

    # Average the per-sample costs into a single scalar.
    return tf.reduce_mean(loss)


def deep_speech(input_size, units, rnn_layers, is_bi, activation = 'relu', output_dim=29, learning_rate=3e-4):
    """Build and compile a convolutional + recurrent network for speech.

    The network is two strided Conv2D layers with explicit zero padding
    (each followed by batch normalization), a stack of GRU layers (each
    followed by batch normalization), a per-time-step Dense layer and a
    softmax. It is compiled with Adam and ``ctc_loss_lambda_func``.

    Args:
        input_size: Two-element sequence giving the first two input
            dimensions; a trailing channel axis of size 1 is appended.
        units: Number of units in each GRU layer.
        rnn_layers: Number of GRU layers to stack.
        is_bi: If truthy, wrap every GRU layer in ``Bidirectional``.
        activation: Activation passed to each GRU layer.
        output_dim: Size of the Dense/softmax output at every time step.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        The compiled Keras ``Model``. As a side effect the model summary is
        printed.
    """

    def _explicit_padding(pad_rows, pad_cols):
        # Per-axis [before, after] padding for a 4-D input: nothing on the
        # first and last axes, symmetric padding on the two middle axes.
        return [[0, 0], [pad_rows, pad_rows], [pad_cols, pad_cols], [0, 0]]

    # Main acoustic input
    input_data = Input(name='the_input', shape=(input_size[0], input_size[1], 1))
    x = input_data

    # First convolution: stride 2 on both middle axes.
    x = Conv2D(filters=32,
               kernel_size=(41, 11),
               strides=(2, 2),
               padding=_explicit_padding(20, 5))(x)
    x = BatchNormalization()(x)

    # Second convolution: stride 2 on the first middle axis only.
    x = Conv2D(filters=32,
               kernel_size=(21, 11),
               strides=(2, 1),
               padding=_explicit_padding(10, 5))(x)
    x = BatchNormalization()(x)

    # Merge the last two axes into one feature axis for the recurrent stack.
    # -1 lets Keras infer the sequence axis, so the layer does not depend on
    # a statically known input length; with a static length the resulting
    # shape is the same as passing shape[1] explicitly.
    shape = x.get_shape()
    x = Reshape((-1, shape[2] * shape[3]))(x)

    # Recurrent stack: the uni- and bidirectional variants differ only in
    # the optional Bidirectional wrapper, so one loop builds both.
    for i in range(rnn_layers):
        rnn = GRU(units, activation=activation,
                  return_sequences=True, name='rnn_{}'.format(i+1))
        if is_bi:
            rnn = Bidirectional(rnn)
        x = rnn(x)

        # Add batch normalization
        x = BatchNormalization()(x)

    # Add a TimeDistributed(Dense(output_dim)) layer
    time_dense = TimeDistributed(Dense(output_dim))(x)

    # Add softmax activation layer
    y_pred = Activation('softmax', name='softmax')(time_dense)

    # Specify the model
    model = Model(inputs=input_data, outputs=y_pred)

    # Compile the model with the CTC loss
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss=ctc_loss_lambda_func)
    model.summary()

    return model

In [5]:
import train
import config

# Fetch the details of the 'toy_final' dataset; they supply the input shape.
data_detail = train.get_data_detail('toy_final')

# Build and compile the network; the recurrent-stack settings come from config.
model = deep_speech(
    input_size=(
        data_detail['max_input_length'],
        data_detail['num_features'],
    ),
    units=config.model_architecture['units_rnn'],
    rnn_layers=config.model_architecture['rnn_layers'],
    is_bi=config.model_architecture['is_bi'],
)

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       [(None, 3263, 161, 1)]    0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 1632, 81, 32)      14464     
_________________________________________________________________
batch_normalization_5 (Batch (None, 1632, 81, 32)      128       
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 806, 81, 32)       236576    
_________________________________________________________________
batch_normalization_6 (Batch (None, 806, 81, 32)       128       
_________________________________________________________________
reshape_1 (Reshape)          (None, 806, 2592)         0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 806, 512)          4377