<a href="https://colab.research.google.com/github/mohsen-goodarzi/DeepSpeech-with-keras/blob/master/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from keras import layers
from keras import models
from keras import initializers
from keras.activations import relu
import keras.backend as K

In [0]:
def ctc_lambda_func(args):
    """Compute the CTC batch cost from a packed list of four tensors.

    ``args`` must unpack into exactly four items, in this order: the target
    labels, the network predictions, the prediction lengths and the label
    lengths. They are forwarded positionally, in that same order, to
    ``K.ctc_batch_cost`` and its result is returned unchanged.

    The single-argument signature lets this function be used as the callable
    of a ``Lambda`` layer, which hands over all of its inputs as one list.
    """
    targets, predictions, prediction_lengths, target_lengths = args
    ctc_cost = K.ctc_batch_cost(
        targets,
        predictions,
        prediction_lengths,
        target_lengths,
    )
    return ctc_cost

def clipped_relu(x):
    """Apply ReLU to ``x`` with the activation capped at 20."""
    ceiling = 20  # upper bound passed to relu as max_value
    return relu(x, max_value=ceiling)

Create DeepSpeech 1 model with dropout:

In [0]:

def DeepSpeech1(input_dim=26, output_dim=37, fc_size=1024, rnn_size=512,
                dropout=(0.1, 0.1, 0.1, 0.1, 0.1)):
    """Build the DeepSpeech 1 style acoustic model as a Keras ``Sequential``.

    Layer stack, in order:
      1-3. three time-distributed dense layers (``fc1``..``fc3``) with clipped
           ReLU, each followed by dropout;
      4.   a bidirectional LSTM (``bilstm4``) returning full sequences, with
           the forward and backward outputs summed;
      5.   one more time-distributed dense layer (``fc5``) plus dropout;
      6.   a time-distributed softmax dense layer (``out``).

    Args:
        input_dim: size of the last axis of the model input; the model is
            declared with ``input_shape=(None, input_dim)``.
        output_dim: number of units of the softmax output layer.
        fc_size: number of units of every clipped-ReLU dense layer.
        rnn_size: number of units of the LSTM wrapped by ``Bidirectional``.
        dropout: indexable sequence with at least five rates. Entries 0-2 are
            applied after fc1-fc3, entry 3 is passed as the LSTM ``dropout``
            argument and entry 4 is applied after fc5. The default is a tuple
            (not a list) so that the default value cannot be mutated and
            shared between calls.

    Returns:
        The uncompiled ``Sequential`` model.
    """
    # One initializer object is shared by the kernels and biases of all
    # dense layers.
    init = initializers.random_normal(stddev=0.046875)

    def dense(units, name, activation):
        # Every dense layer in the stack uses the same initializer setup.
        return layers.Dense(
            units,
            name=name,
            kernel_initializer=init,
            bias_initializer=init,
            activation=activation,
        )

    def add_dropout(net, rate):
        net.add(layers.TimeDistributed(layers.Dropout(rate)))

    model = models.Sequential()

    # Layers 1-3: fully connected; the first one also declares the input shape.
    model.add(layers.TimeDistributed(dense(fc_size, 'fc1', clipped_relu),
                                     input_shape=(None, input_dim)))
    add_dropout(model, dropout[0])
    model.add(layers.TimeDistributed(dense(fc_size, 'fc2', clipped_relu)))
    add_dropout(model, dropout[1])
    model.add(layers.TimeDistributed(dense(fc_size, 'fc3', clipped_relu)))
    add_dropout(model, dropout[2])

    # Layer 4: bidirectional RNN, both directions merged by summation.
    model.add(layers.Bidirectional(
        layers.LSTM(
            rnn_size,
            name='bilstm4',
            kernel_initializer=initializers.he_normal(),
            return_sequences=True,
            activation=relu,
            dropout=dropout[3],
        ),
        merge_mode='sum',
    ))

    # Layer 5: fully connected.
    model.add(layers.TimeDistributed(dense(fc_size, 'fc5', clipped_relu)))
    add_dropout(model, dropout[4])

    # Layer 6: softmax output.
    model.add(layers.TimeDistributed(dense(output_dim, 'out', 'softmax')))

    return model
  


Add CTC loss:

In [0]:
def add_ctc(model_core):
    """Wrap ``model_core`` in a model whose single output is the CTC loss.

    The first input and first output of ``model_core`` are reused. Three extra
    int32 inputs are created -- ``the_labels`` (shape ``(None,)``),
    ``input_length`` and ``label_length`` (shape ``(1,)`` each) -- and fed,
    together with the core output, to a ``Lambda`` layer named ``ctc`` that
    calls ``ctc_lambda_func``.

    Returns:
        A ``Model`` taking ``[core input, labels, input_length, label_length]``
        and producing the output of the ``ctc`` layer.
    """
    features = model_core.inputs[0]
    predictions = model_core.outputs[0]

    def int32_input(name, shape):
        # All auxiliary inputs differ only in name and shape.
        return layers.Input(name=name, shape=shape, dtype='int32')

    labels = int32_input('the_labels', (None,))
    input_length = int32_input('input_length', (1,))
    label_length = int32_input('label_length', (1,))

    ctc_layer = layers.Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')
    loss_out = ctc_layer([labels, predictions, input_length, label_length])

    return models.Model(
        inputs=[features, labels, input_length, label_length],
        outputs=loss_out,
    )



In [31]:
# Build the DeepSpeech1 network with its default hyperparameters.
core_model=DeepSpeech1()
# Wrap it with the labels/length inputs and the CTC loss output.
final_model= add_ctc(core_model)
# Print the layer-by-layer summary of the wrapped model.
final_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
time_distributed_118_input (Inp (None, None, 26)     0                                            
__________________________________________________________________________________________________
time_distributed_118 (TimeDistr (None, None, 1024)   27648       time_distributed_118_input[0][0] 
__________________________________________________________________________________________________
time_distributed_119 (TimeDistr (None, None, 1024)   0           time_distributed_118[0][0]       
__________________________________________________________________________________________________
time_distributed_120 (TimeDistr (None, None, 1024)   1049600     time_distributed_119[0][0]       
____________________________________________________________________________________________