### Overview

<b>Objective</b>: To calculate the Floating Point Operations of a given precompiled deepspeech model 

<b>Assumptions:</b> 
1. I am not considering any hardware metrics such as memory size, bandwidth, clock speed, sockets, or the use of any specific ASIC/GPU.
2. As I was not provided with a 10-second audio sample, I will be making my own assumptions in that regard.
3. The code presented has been taken from PaddlePaddle/DeepSpeech but has been modified a bit to make it presentable and executable.
4. I am only considering English characters (size = 1 byte per character).

In order to calculate floating point operations we will be counting [multiply-accumulate operations](https://en.wikipedia.org/wiki/Multiply%E2%80%93accumulate_operation).

Why multiply-accumulate? 
Many of the computations in neural networks are dot products, i.e.

y = w[0]*x[0] + w[1]*x[1] + w[2]*x[2] + ... + w[n-1]*x[n-1]

Here w (vector) holds the weights, x (vector) is the input and y (scalar) is one of the layer's outputs. Typically a layer will have multiple outputs.

A dot product over n elements needs n multiplications and n - 1 additions. Therefore, generalising the MACC count for an n-element dot product yields n + (n - 1) = 2n - 1 FLOPs.


In terms of FLOPs, a dot product of two length-$n$ vectors performs $2n - 1$ FLOPs, since there are $n$ multiplications and $n - 1$ additions.

### Evaluating the Deepspeech Network

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from itertools import groupby
import sys
import os
import numpy as np
from math import log
import multiprocessing
from collections import Iterable

In [None]:
import paddle.v2 as paddle

In [None]:
"""Contains DeepSpeech2 layers and networks."""
def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
                  padding, act, index_range_data):
    """Convolution layer with batch normalization.

    Builds an image convolution (linear activation, no bias), applies batch
    normalization with the requested activation on top of it, and finally
    scales the sub-region described by ``index_range_data`` by 0.0.

    :param input: Input layer.
    :type input: LayerOutput
    :param filter_size: The x dimension of a filter kernel. Or input a tuple for
                        two image dimension.
    :type filter_size: int|tuple|list
    :param num_channels_in: Number of input channels.
    :type num_channels_in: int
    :param num_channels_out: Number of output channels.
    :type num_channels_out: int
    :param stride: Stride forwarded unchanged to ``paddle.layer.img_conv``
                   (the callers visible in this file pass 2-tuples).
    :param padding: The x dimension of the padding. Or input a tuple for two
                    image dimension.
    :type padding: int|tuple|list
    :param act: Activation type applied by the batch norm layer.
    :type act: BaseActivation
    :param index_range_data: Index range to indicate sub region.
    :type index_range_data: LayerOutput
    :return: Batch norm layer after convolution layer, with the indicated
             sub-region reset to 0.
    :rtype: LayerOutput
    """
    # The convolution itself is linear and bias-free; the non-linearity is
    # applied by the batch norm layer below.
    conv_layer = paddle.layer.img_conv(
        input=input,
        filter_size=filter_size,
        num_channels=num_channels_in,
        num_filters=num_channels_out,
        stride=stride,
        padding=padding,
        act=paddle.activation.Linear(),
        bias_attr=False)
    batch_norm = paddle.layer.batch_norm(input=conv_layer, act=act)
    # reset padding part to 0
    scale_sub_region = paddle.layer.scale_sub_region(
        batch_norm, index_range_data, value=0.0)
    return scale_sub_region


def bidirectional_simple_rnn_bn_layer(name, input, size, act, share_weights):
    """Bidirectional simple rnn layer with sequence-wise batch normalization.

    The batch normalization is only performed on input-state weights.
    The input is projected by a bias-free linear fully-connected layer,
    batch-normalized, and then fed to one forward and one backward recurrent
    layer whose outputs are concatenated.

    :param name: Name of the layer. Not referenced in the body of this
                 function.
    :type name: string
    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells.
    :type size: int
    :param act: Activation type of the recurrent layers.
    :type act: BaseActivation
    :param share_weights: Whether to share input-hidden weights between
                          forward and backward directional RNNs.
    :type share_weights: bool
    :return: Concatenation of the forward and backward recurrent layers.
    :rtype: LayerOutput
    """
    if share_weights:
        # input-hidden weights shared between bi-directional rnn: a single
        # projection (and a single batch norm) feeds both directions.
        input_proj = paddle.layer.fc(
            input=input,
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        # batch norm is only performed on input-state projection
        input_proj_bn = paddle.layer.batch_norm(
            input=input_proj, act=paddle.activation.Linear())
        # forward and backward in time
        forward_simple_rnn = paddle.layer.recurrent(
            input=input_proj_bn, act=act, reverse=False)
        backward_simple_rnn = paddle.layer.recurrent(
            input=input_proj_bn, act=act, reverse=True)

    else:
        # separate projection and batch norm per direction
        input_proj_forward = paddle.layer.fc(
            input=input,
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        input_proj_backward = paddle.layer.fc(
            input=input,
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)
        # batch norm is only performed on input-state projection
        input_proj_bn_forward = paddle.layer.batch_norm(
            input=input_proj_forward, act=paddle.activation.Linear())
        input_proj_bn_backward = paddle.layer.batch_norm(
            input=input_proj_backward, act=paddle.activation.Linear())
        # forward and backward in time
        forward_simple_rnn = paddle.layer.recurrent(
            input=input_proj_bn_forward, act=act, reverse=False)
        backward_simple_rnn = paddle.layer.recurrent(
            input=input_proj_bn_backward, act=act, reverse=True)

    return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn])


def bidirectional_gru_bn_layer(name, input, size, act):
    """Bidirectional gru layer with sequence-wise batch normalization.

    The batch normalization is only performed on input-state weights.
    Each direction gets its own bias-free linear projection of width
    ``size * 3`` followed by batch normalization; the forward and backward
    GRU outputs are concatenated.

    :param name: Name of the layer. Not referenced in the body of this
                 function.
    :type name: string
    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells.
    :type size: int
    :param act: Activation type of the GRU layers.
    :type act: BaseActivation
    :return: Bidirectional GRU layer (forward and backward outputs
             concatenated).
    :rtype: LayerOutput
    """
    # NOTE(review): the projection is 3x the cell count, presumably one slice
    # per GRU gate/candidate -- confirm against the grumemory documentation.
    input_proj_forward = paddle.layer.fc(
        input=input,
        size=size * 3,
        act=paddle.activation.Linear(),
        bias_attr=False)
    input_proj_backward = paddle.layer.fc(
        input=input,
        size=size * 3,
        act=paddle.activation.Linear(),
        bias_attr=False)
    # batch norm is only performed on input-related projections
    input_proj_bn_forward = paddle.layer.batch_norm(
        input=input_proj_forward, act=paddle.activation.Linear())
    input_proj_bn_backward = paddle.layer.batch_norm(
        input=input_proj_backward, act=paddle.activation.Linear())
    # forward and backward in time
    forward_gru = paddle.layer.grumemory(
        input=input_proj_bn_forward, act=act, reverse=False)
    backward_gru = paddle.layer.grumemory(
        input=input_proj_bn_backward, act=act, reverse=True)
    return paddle.layer.concat(input=[forward_gru, backward_gru])


def conv_group(input, num_stacks, index_range_datas):
    """Convolution group with stacked convolution layers.

    The first layer maps 1 channel to 32 with an (11, 41) filter and stride
    (3, 2); each of the remaining ``num_stacks - 1`` layers maps 32 channels
    to 32 with an (11, 21) filter and stride (1, 2). All layers use BRelu.

    :param input: Input layer.
    :type input: LayerOutput
    :param num_stacks: Number of stacked convolution layers.
    :type num_stacks: int
    :param index_range_datas: Index ranges for each convolution layer; entry
                              ``k`` is handed to the k-th convolution layer.
    :type index_range_datas: tuple|list
    :return: A 3-tuple of the output layer of the convolution group, its
             number of channels (32) and its computed height.
    :rtype: tuple
    """
    conv = conv_bn_layer(
        input=input,
        filter_size=(11, 41),
        num_channels_in=1,
        num_channels_out=32,
        stride=(3, 2),
        padding=(5, 20),
        act=paddle.activation.BRelu(),
        index_range_data=index_range_datas[0])
    # NOTE(review): xrange exists only on Python 2.
    for i in xrange(num_stacks - 1):
        conv = conv_bn_layer(
            input=conv,
            filter_size=(11, 21),
            num_channels_in=32,
            num_channels_out=32,
            stride=(1, 2),
            padding=(5, 10),
            act=paddle.activation.BRelu(),
            index_range_data=index_range_datas[i + 1])
    output_num_channels = 32
    # NOTE(review): the constant 160 is hard-coded; presumably it derives from
    # the input feature height being halved by each layer's stride of 2 --
    # confirm against the feature extractor.
    output_height = 160 // pow(2, num_stacks) + 1
    return conv, output_num_channels, output_height


def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights):
    """RNN group with stacked bidirectional RNN layers.

    Chains ``num_stacks`` bidirectional layers, each consuming the output of
    the previous one. GRU layers use Relu, simple RNN layers use BRelu.

    :param input: Input layer.
    :type input: LayerOutput
    :param size: Number of RNN cells in each layer.
    :type size: int
    :param num_stacks: Number of stacked rnn layers.
    :type num_stacks: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward directional RNNs.
                              It is only available when use_gru=False.
    :type share_rnn_weights: bool
    :return: Output layer of the RNN group.
    :rtype: LayerOutput
    """
    output = input
    # NOTE(review): xrange exists only on Python 2.
    for i in xrange(num_stacks):
        if use_gru:
            output = bidirectional_gru_bn_layer(
                name=str(i),
                input=output,
                size=size,
                act=paddle.activation.Relu())
            # BRelu does not support hppl, need to add later. Use Relu instead.
        else:
            output = bidirectional_simple_rnn_bn_layer(
                name=str(i),
                input=output,
                size=size,
                act=paddle.activation.BRelu(),
                share_weights=share_rnn_weights)
    return output


def deep_speech_v2_network(audio_data,
                           text_data,
                           seq_offset_data,
                           seq_len_data,
                           index_range_datas,
                           dict_size,
                           num_conv_layers=2,
                           num_rnn_layers=3,
                           rnn_size=256,
                           use_gru=False,
                           share_rnn_weights=True):
    """The DeepSpeech2 network structure.

    Pipeline: convolution group -> block_expand (feature map to sequence) ->
    sub_seq (drop padding) -> RNN group -> fully-connected layer of width
    ``dict_size + 1`` -> softmax output and CTC cost.

    :param audio_data: Audio spectrogram data layer.
    :type audio_data: LayerOutput
    :param text_data: Transcription text data layer.
    :type text_data: LayerOutput
    :param seq_offset_data: Sequence offset data layer.
    :type seq_offset_data: LayerOutput
    :param seq_len_data: Valid sequence length data layer.
    :type seq_len_data: LayerOutput
    :param index_range_datas: Index ranges data layers.
    :type index_range_datas: tuple|list
    :param dict_size: Dictionary size for tokenized transcription.
    :type dict_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_size: RNN layer size (number of RNN cells).
    :type rnn_size: int
    :param use_gru: Use gru if set True. Use simple rnn if set False.
    :type use_gru: bool
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward direction RNNs.
                              It is only available when use_gru=False.
    :type share_rnn_weights: bool
    :return: A tuple of the softmax-activated output layer (named
             ``log_probs`` in the code) and a ctc cost layer computed on the
             pre-softmax fully-connected output.
    :rtype: tuple of LayerOutput
    """
    # convolution group
    conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
        input=audio_data,
        num_stacks=num_conv_layers,
        index_range_datas=index_range_datas)
    # convert data from convolution feature map to sequence of vectors
    conv2seq = paddle.layer.block_expand(
        input=conv_group_output,
        num_channels=conv_group_num_channels,
        stride_x=1,
        stride_y=1,
        block_x=1,
        block_y=conv_group_height)
    # remove padding part
    remove_padding_data = paddle.layer.sub_seq(
        input=conv2seq,
        offsets=seq_offset_data,
        sizes=seq_len_data,
        act=paddle.activation.Linear(),
        bias_attr=False)
    # rnn group
    rnn_group_output = rnn_group(
        input=remove_padding_data,
        size=rnn_size,
        num_stacks=num_rnn_layers,
        use_gru=use_gru,
        share_rnn_weights=share_rnn_weights)
    # one output per dictionary entry plus one extra slot (used as the CTC
    # blank below)
    fc = paddle.layer.fc(
        input=rnn_group_output,
        size=dict_size + 1,
        act=paddle.activation.Linear(),
        bias_attr=True)
    # probability distribution with softmax
    log_probs = paddle.layer.mixed(
        input=paddle.layer.identity_projection(input=fc),
        act=paddle.activation.Softmax())
    # ctc cost; the blank label is the last index (dict_size)
    ctc_loss = paddle.layer.warp_ctc(
        input=fc,
        label=text_data,
        size=dict_size + 1,
        blank=dict_size,
        norm_by_times=True)
    return log_probs, ctc_loss

## Flop Calculations

Let's break the network into basic units and evaluate each part of the code.

### Fully-connected layer

*paddle.layer.fc(
            input=input,
            size=size,
            act=paddle.activation.Linear(),
            bias_attr=False)*

In the fully-connected layer above, all the inputs are connected to all the outputs, and the output of the layer is given by 
y = matmul(x, W) + b

Consider __I__ to be the number of __input values__ and __J__ to be the number of __output values__; then the dimensional representation is given by 

y(J)=x(I)*W(I,J)+b(J) 

The number of multiply-accumulate operations for the above code is I×J (i.e. (2I-1)×J FLOPs), as discussed above.

The bias b doesn’t really affect the number of MACCs as the dot product has one less addition than multiplication, so adding this bias value simply gets absorbed in that final multiply-accumulate.

Note: For a batch of size B, the result becomes (2I-1)×J×B. Also, in the code above the linear activation does not perform any computation.

Let's define a function below to evaluate this.

In [None]:
def FLops_fc_layer(m, x, y):
    """Record the FLOP count of a fully-connected layer on ``m.Total_FLops``.

    Every output element is a dot product over ``m.input`` values, i.e.
    ``m.input`` multiplications and ``m.input - 1`` additions, plus one more
    addition when the layer has a bias.

    :param m: Layer descriptor. Reads ``m.input`` and ``m.bias``; the result
              is appended to the list ``m.Total_FLops``.
    :param x: Layer input (not used).
    :param y: Layer output; only ``y.numel()`` is read.
    """
    # per output element
    multiplication_operations = m.input
    addition_operations = m.input - 1
    bias_operations = 1 if m.bias is not None else 0
    number_of_elements = y.numel()
    total_operations = (multiplication_operations + addition_operations +
                        bias_operations) * number_of_elements

    # The original appended an undefined name (case mismatch), which raised
    # NameError on every call.
    m.Total_FLops.append(total_operations)

### Convolutional layer

*paddle.layer.img_conv(input=input, filter_size=filter_size, num_channels=num_channels_in, num_filters=num_channels_out, stride=stride, padding=padding, act=paddle.activation.Linear(), bias_attr=False)*

As per the [documentation](https://www.paddlepaddle.org.cn/documentation/api/en/0.10.0/v2/config/layer.html#img-conv),
the convolutional layer currently supports rectangular kernels. 

Consider a convolutional layer with three-dimensional feature maps of size H (height) × W (width) × C (channels) and a kernel of size Kx (kernel width) × Ky (kernel height).

The convolution layer also has dilation factors, strides, padding, etc. that should not be ignored, but these are already accounted for in the dimensions of the output feature map (Hout × Wout). For each output value we take the dot product of the weights and a Kx × Ky window of input values across all Cin input channels; because the layer has Cout different filters, we repeat this Cout times to create all the output channels (the bias and the linear activation function are ignored here).

For the above conv layer the number of multiply-accumulate operations is:
Cin × Kx × Ky × Hout × Wout × Cout


In [None]:
def FLops_img_conv(m, x, y):
    """Record the FLOP count of a convolution layer on ``m.Total_FLops``.

    Each output element is a dot product over a ``Kx * Ky`` window across all
    ``Cin`` input channels: ``Kx * Ky * Cin`` multiplications and one fewer
    additions, plus one addition when the layer has a bias. The total is that
    per-element cost times the number of output elements.

    :param m: Layer descriptor. Reads ``m.num_channels``, ``m.filter_size``
              (an int for a square kernel, or a 2-element tuple/list) and
              ``m.bias``; the result is appended to ``m.Total_FLops``.
    :param x: Layer input (not used).
    :param y: Layer output; only ``y.numel()`` is read.
    """
    Cin = m.num_channels
    filter_size = m.filter_size
    if isinstance(filter_size, (tuple, list)):
        Kx, Ky = filter_size
    else:
        # a single int describes a square kernel
        Kx = Ky = filter_size

    # FLOPs per output element
    filter_multiplication_operations = Kx * Ky * Cin
    filter_addition_operations = Kx * Ky * Cin - 1
    bias_operations = 1 if m.bias is not None else 0
    operations_per_element = (filter_multiplication_operations +
                              filter_addition_operations + bias_operations)

    # total FLOPs: y.numel() already spans every output position of every
    # output channel, and Cin is already part of the per-element cost, so
    # neither is multiplied in again (the original did both, and referred to
    # several undefined names).
    number_of_output_elements = y.numel()
    total_operations = operations_per_element * number_of_output_elements
    m.Total_FLops.append(total_operations)


### Batch normalization

*paddle.layer.batch_norm(input=conv_layer, act=act)*

Batch normalization takes the output of a layer and applies the following formula to every single output value:

z = gamma * (y - mean) / sqrt(variance + epsilon) + beta
y is an element in the output from the previous layer.

Since we are only concerned with MACCs, we can count this as 4 operations performed on each element of the input y.

In [None]:
def FLops_bn_layer(m, x, y):
    """Record the FLOP count of a batch-norm layer on ``m.Total_FLops``.

    Four operations are counted per input element: subtract the mean, divide
    by the standard deviation, scale by gamma and shift by beta.

    :param m: Layer descriptor; the result is appended to ``m.Total_FLops``.
    :param x: Sequence whose first item is the layer input; only its
              ``numel()`` is read.
    :param y: Layer output (not used).
    """
    operations_per_element = 4  # subtract, divide, gamma, beta
    element_count = x[0].numel()
    flops = operations_per_element * element_count
    m.Total_FLops.append(flops)

### RNN

*paddle.layer.recurrent(input=input_proj_bn, act=act, reverse=False)*

As per the documentation:

It is just a fully connected layer through both time and the neural network.

For each sequence [start, end] it performs the following computation:

$out_{i} = act(in_{i})$ for $i = start$

$out_{i} = act(in_{i} + out_{i-1} W)$ for $start < i \le end$

If reversed is true, the order is reversed:

$out_{i} = act(in_{i})$ for $i = end$

$out_{i} = act(in_{i} + out_{i+1} W)$ for $start \le i < end$

i.e. an RNN can be counted as a fully connected layer. As it is difficult to know how many time steps the network will take to process the input, we consider it to have FLOPs similar to paddle.layer.fc.

### GRU

*paddle.layer.grumemory(input=input_proj_bn_forward, act=act, reverse=False)*

As per the documentation, this function doesn't perform the multiplication operations $W_{r}x_{t}$, $W_{z}x_{t}$ and $W x_t$. Assuming it only performs $U_{z}h_{t-1}$, $U_{r}h_{t-1}$ and $r_{t} \odot h_{t-1}$ (in the order determined by whether reverse is true or not) plus one additional '+' operation, we can consider 
                         
                         'dot product' + 'addition' = n + n-1 + 1
                                                    = 2n Flops

In [None]:
def FLops_Gru_memory(m, x, y):
    """Record the FLOP count of a GRU memory layer on ``m.Total_FLops``.

    Counted per output element as a dot product over ``m.input`` values
    (``m.input`` multiplications, ``m.input - 1`` additions) plus one extra
    addition when ``m.bias`` is set.

    :param m: Layer descriptor. Reads ``m.input`` and ``m.bias``; the result
              is appended to the list ``m.Total_FLops``.
    :param x: Layer input (not used).
    :param y: Layer output; only ``y.numel()`` is read.
    """
    # per output element
    multiplication_operations = m.input
    addition_operations = m.input - 1
    bias_operations = 1 if m.bias is not None else 0
    number_of_elements = y.numel()
    total_operations = (multiplication_operations + addition_operations +
                        bias_operations) * number_of_elements

    # The original appended an undefined name (case mismatch), which raised
    # NameError on every call.
    m.Total_FLops.append(total_operations)

### Activation functions

Apart from the linear activation, the activation functions used here are ReLU and BReLU, which are generally of the form:

y = max(x, 0)

In [None]:
def FLops_relu(m, x, y):
    """Record the FLOP count of a ReLU-style activation on ``m.Total_FLops``.

    One operation is counted per input element.

    :param m: Layer descriptor; the result is appended to ``m.Total_FLops``.
    :param x: Sequence whose first item is the activation input; only its
              ``numel()`` is read.
    :param y: Activation output (not used).
    """
    x = x[0]

    number_of_input_elements = x.numel()
    total_operations = number_of_input_elements

    # The original appended an undefined name (case mismatch), which raised
    # NameError on every call.
    m.Total_FLops.append(total_operations)

### Other Layers

*paddle.layer.mixed()*

A mixed layer adds all of its inputs together and then applies the activation. Since it only adds the input layers together and doesn't perform any multiply-accumulate operations, we can count it as zero operations.

In [None]:
def Zero_FLops(m, x, y):
    """Record a FLOP count of zero on ``m.Total_FLops``.

    Used for layers that are treated as performing no counted arithmetic.

    :param m: Layer descriptor; ``0`` is appended to ``m.Total_FLops``.
    :param x: Layer input (not used).
    :param y: Layer output (not used).
    """
    no_operations = 0
    m.Total_FLops.append(no_operations)

*paddle.layer.block_expand()*

Converts data from a convolution feature map to a sequence of vectors. Therefore this performs zero MACCs.

*paddle.layer.identity_projection()*

This function increments the i-th element in a row of the output by the i-th element of the input (or of input + offset if an offset is present). Therefore, the total number of addition operations is taken to be the sum total of all inputs.

In [None]:
def FLops_identity_projection(m, x, y):
    """Record the operation count of an identity projection on ``m.Total_FLops``.

    The count is the sum over ``y.offset + x``.

    :param m: Layer descriptor; the result is appended to ``m.Total_FLops``.
    :param x: Projection input; must support addition with ``y.offset`` and
              the result must provide ``sum()``.
    :param y: Projection output; only ``y.offset`` is read.
    """
    shifted = y.offset + x
    total_operations = shifted.sum()
    # The original appended an undefined name (case mismatch), which raised
    # NameError on every call.
    m.Total_FLops.append(total_operations)

*paddle.activation.Softmax()*

softmax is calculated as follows $p_{k} = \dfrac{e^{f_{k}}}{\sum_{j} e^{f_{j}}}$.This roughly equates to 3 operations done on number of features (addition,division and exponential)

In [None]:
def Flops_softmax(m, x, y):
    """Record the FLOP count of a softmax on ``m.Total_FLops``.

    Per row of ``nfeatures`` values: ``nfeatures`` exponentials,
    ``nfeatures - 1`` additions for the normalising sum and ``nfeatures``
    divisions; multiplied by the batch size.

    :param m: Layer descriptor; the result is appended to ``m.Total_FLops``.
    :param x: Sequence whose first item is the softmax input; its ``size()``
              must unpack into ``(batch_size, nfeatures)``.
    :param y: Softmax output (not used).
    """
    x = x[0]

    batch_size, nfeatures = x.size()

    total_exponentials = nfeatures
    total_additions = nfeatures - 1
    total_divisions = nfeatures
    # The original summed three names that were never defined, which raised
    # NameError on every call.
    total_operations = batch_size * (
        total_exponentials + total_additions + total_divisions)

    m.Total_FLops.append(total_operations)

Now that we have developed a method to count the number of FLOPs for individual layers, let's define a function to calculate the FLOPs of the whole network.

Note: We are not considering the CTC loss, as it mostly involves memory operations rather than multiply-accumulate operations.

In [None]:
#Calculating FLOPS of Layer
# Maps each supported paddle layer / activation to the function that counts
# its FLOPs. Interpret_Model looks layers up in this table.
Registered_Layers = {
    # a recurrent layer is counted like a fully-connected layer
    paddle.layer.fc: FLops_fc_layer,
    paddle.layer.recurrent: FLops_fc_layer,

    paddle.layer.img_conv: FLops_img_conv,

    paddle.layer.batch_norm: FLops_bn_layer,

    paddle.layer.grumemory: FLops_Gru_memory,

    # Spelled Relu / BRelu to match how the network-building code in this
    # file references the activations (paddle.activation.Relu(),
    # paddle.activation.BRelu()); the lowercase names used before are not
    # used anywhere else in the file.
    paddle.activation.Relu: FLops_relu,
    paddle.activation.BRelu: FLops_relu,

    # layers treated as performing no multiply-accumulate work
    paddle.layer.mixed: Zero_FLops,
    paddle.layer.block_expand: Zero_FLops,
    paddle.layer.identity_projection: FLops_identity_projection,

    paddle.activation.Softmax: Flops_softmax,
}


def Interpret_Model(model, inputs, verbose=True):
    """Attach FLOP counters to the leaf layers of ``model`` and sum their counts.

    For every leaf layer (one without children) the counter registered for
    ``m.type`` in ``Registered_Layers`` is attached via ``m.add``. The counts
    that the counters appended to each layer's ``Total_FLops`` list are then
    added up, and the model is put back into its previous train/eval mode.

    NOTE(review): ``inputs`` is never used and the model is never run inside
    this function, so unless the counters are triggered by ``m.add`` itself
    nothing is ever recorded. Nothing here creates the ``Total_FLops`` list
    that the counters append to either. Both need to be settled against the
    real model API.

    :param model: Model exposing ``training``, ``eval()``, ``apply()``,
                  ``modules()``, ``train()`` and ``Params()``.
    :param inputs: Model inputs (currently unused, see note above).
    :param verbose: Print which layers did / did not get a counter.
    :type verbose: bool
    :return: Sum of all recorded per-layer FLOP counts.
    """
    collected_operations = []

    def add_layers(m):
        # only leaf layers carry a counter
        if len(list(m.children())) > 0:
            return

        # .get() so that an unknown layer type reaches the message below
        # instead of raising KeyError
        fn = Registered_Layers.get(m.type)

        if fn is None:
            if verbose:
                print("Method has not implemented counting method for ", m)
            # nothing to attach for an unsupported layer
            return

        if verbose:
            print("Registered FLOP counter for %s" % str(m))
        operations = m.add(fn)  # write a function to build the model.
        collected_operations.append(operations)

    training = model.training

    model.eval()
    model.apply(add_layers)

    Total_Flops = 0
    for m in model.modules():
        if len(list(m.children())) > 0:  # skip for non-leaf module
            continue
        # The counters append to a list named Total_FLops; layers without a
        # counter have no such list and contribute nothing.
        Total_Flops += sum(getattr(m, "Total_FLops", []))

    # Plain Python numbers have no .item(); only unwrap array/tensor scalars.
    if hasattr(Total_Flops, "item"):
        Total_Flops = Total_Flops.item()

    # reset model to original status
    model.train(training)
    for operations in collected_operations:
        operations.remove()

    # remove temporal buffers
    # NOTE(review): the counters store their results in the attribute
    # ``Total_FLops``, not in ``_buffers["Total_Flops"]``, so this loop looks
    # like a no-op -- confirm what ``model.Params()`` yields.
    for n, m in model.Params():
        if len(list(m.children())) > 0:
            continue
        if "Total_Flops" in m._buffers:
            m._buffers.pop("Total_Flops")

    return Total_Flops

# function to print summary
def print_summary(nums, format="%.2f"):
    """Format one number, or several, with a T/G/M/K magnitude suffix.

    A value strictly greater than 1e12, 1e9, 1e6 or 1e3 is divided by that
    scale and suffixed with "T", "G", "M" or "K" respectively; anything else
    is formatted as-is with the suffix "B".

    :param nums: A single number or an iterable of numbers.
    :param format: %-style format applied to each scaled number.
    :type format: str
    :return: The formatted string when exactly one value was given, otherwise
             a tuple of formatted strings.
    :rtype: str|tuple
    """
    # Iterable lives in collections.abc on Python 3 (and is gone from
    # collections since 3.10); fall back for Python 2.
    try:
        from collections.abc import Iterable
    except ImportError:
        from collections import Iterable

    if not isinstance(nums, Iterable):
        nums = [nums]

    scales = ((1e12, "T"), (1e9, "G"), (1e6, "M"), (1e3, "K"))
    summary = []
    for num in nums:
        for scale, suffix in scales:
            if num > scale:
                summary.append(format % (num / scale) + suffix)
                break
        else:
            summary.append(format % num + "B")

    # tuple(...) instead of (*summary, ): the starred form is a syntax error
    # on Python 2, which the rest of this file targets.
    return summary[0] if len(summary) == 1 else tuple(summary)

## CTC for Deepspeech

In [None]:
def ctc_greedy_decoder(probs_seq, vocabulary):
    """CTC greedy (best path) decoder.

    Path consisting of the most probable tokens are further post-processed to
    remove consecutive repetitions and all blanks. The blank token is the
    index ``len(vocabulary)``, i.e. the last entry of every probability row.

    :param probs_seq: 2-D list of probabilities over the vocabulary for each
                      character. Each element is a list of float probabilities
                      for one character.
    :type probs_seq: list
    :param vocabulary: Vocabulary list.
    :type vocabulary: list
    :return: Decoding result string; empty when ``probs_seq`` is empty.
    :rtype: str
    :raises ValueError: If a row does not have ``len(vocabulary) + 1`` entries.
    """
    # dimension verification
    for probs in probs_seq:
        if not len(probs) == len(vocabulary) + 1:
            raise ValueError("probs_seq dimension mismatchedd with vocabulary")
    # An empty sequence becomes a 1-D array, for which argmax(axis=1) raises;
    # there is nothing to decode in that case.
    if len(probs_seq) == 0:
        return ''
    # argmax to get the best index for each time step
    max_index_list = list(np.array(probs_seq).argmax(axis=1))
    # remove consecutive duplicate indexes
    index_list = [index_group[0] for index_group in groupby(max_index_list)]
    # remove blank indexes
    blank_index = len(vocabulary)
    index_list = [index for index in index_list if index != blank_index]
    # convert index list to string
    return ''.join([vocabulary[index] for index in index_list])

In [None]:
def ctc_beam_search_decoder(probs_seq,
                            beam_size,
                            vocabulary,
                            cutoff_prob=1.0,
                            cutoff_top_n=40,
                            ext_scoring_func=None,
                            nproc=False):
    """CTC Beam search decoder.

    It utilizes beam search to approximately select top best decoding
    labels and returning results in the descending order.
    The implementation is based on Prefix Beam Search
    (https://arxiv.org/abs/1408.2873), and the unclear part is
    redesigned. Two important modifications: 1) in the iterative computation
    of probabilities, the assignment operation is changed to accumulation for
    one prefix may comes from different paths; 2) the if condition "if l^+ not
    in A_prev then" after probabilities' computation is deprecated for it is
    hard to understand and seems unnecessary.

    Every prefix starts with a tab character as a start marker; it is
    stripped from the returned sentences.

    :param probs_seq: 2-D list of probability distributions over each time
                      step, with each element being a list of normalized
                      probabilities over vocabulary and blank.
    :type probs_seq: 2-D list
    :param beam_size: Width for beam search.
    :type beam_size: int
    :param vocabulary: Vocabulary list.
    :type vocabulary: list
    :param cutoff_prob: Cutoff probability in pruning,
                        default 1.0, no pruning.
    :type cutoff_prob: float
    :param cutoff_top_n: Upper bound on the number of most probable tokens
                         kept per time step when pruning.
    :type cutoff_top_n: int
    :param ext_scoring_func: External scoring function for
                            partially decoded sentence, e.g. word count
                            or language model.
    :type ext_scoring_func: callable
    :param nproc: Whether the decoder used in multiprocesses. When True the
                  global ``ext_nproc_scorer`` replaces ``ext_scoring_func``.
    :type nproc: bool
    :return: List of tuples of log probability and sentence as decoding
             results, in descending order of the probability.
    :rtype: list
    :raises ValueError: If a row of ``probs_seq`` does not have
                        ``len(vocabulary) + 1`` entries.
    """
    # dimension check
    for prob_list in probs_seq:
        if not len(prob_list) == len(vocabulary) + 1:
            raise ValueError("The shape of prob_seq does not match with the "
                             "shape of the vocabulary.")

    # blank_id assign
    blank_id = len(vocabulary)

    # If the decoder called in the multiprocesses, then use the global scorer
    # instantiated in ctc_beam_search_decoder_batch().
    if nproc is True:
        global ext_nproc_scorer
        ext_scoring_func = ext_nproc_scorer

    ## initialize
    # prefix_set_prev: the set containing selected prefixes
    # probs_b_prev: prefixes' probability ending with blank in previous step
    # probs_nb_prev: prefixes' probability ending with non-blank in previous step
    prefix_set_prev = {'\t': 1.0}
    probs_b_prev, probs_nb_prev = {'\t': 1.0}, {'\t': 0.0}

    ## extend prefix in loop
    # range / "in" / items() replace xrange / has_key / iteritems, which do
    # not exist on Python 3; they behave the same on Python 2.
    for time_step in range(len(probs_seq)):
        # prefix_set_next: the set containing candidate prefixes
        # probs_b_cur: prefixes' probability ending with blank in current step
        # probs_nb_cur: prefixes' probability ending with non-blank in current step
        prefix_set_next, probs_b_cur, probs_nb_cur = {}, {}, {}

        prob_idx = list(enumerate(probs_seq[time_step]))
        cutoff_len = len(prob_idx)
        # If pruning is enabled
        if cutoff_prob < 1.0 or cutoff_top_n < cutoff_len:
            prob_idx = sorted(prob_idx, key=lambda asd: asd[1], reverse=True)
            cutoff_len, cum_prob = 0, 0.0
            # keep the most probable tokens until their mass reaches
            # cutoff_prob
            for i in range(len(prob_idx)):
                cum_prob += prob_idx[i][1]
                cutoff_len += 1
                if cum_prob >= cutoff_prob:
                    break
            cutoff_len = min(cutoff_len, cutoff_top_n)
            prob_idx = prob_idx[0:cutoff_len]

        for l in prefix_set_prev:
            # a prefix already produced as an extension in this step keeps
            # the probability accumulated so far
            if l not in prefix_set_next:
                probs_b_cur[l], probs_nb_cur[l] = 0.0, 0.0

            # extend prefix by traversing prob_idx
            for index in range(cutoff_len):
                c, prob_c = prob_idx[index][0], prob_idx[index][1]

                if c == blank_id:
                    probs_b_cur[l] += prob_c * (
                        probs_b_prev[l] + probs_nb_prev[l])
                else:
                    last_char = l[-1]
                    new_char = vocabulary[c]
                    l_plus = l + new_char
                    if l_plus not in prefix_set_next:
                        probs_b_cur[l_plus], probs_nb_cur[l_plus] = 0.0, 0.0

                    if new_char == last_char:
                        # a repeated character only extends the prefix after
                        # a blank; otherwise it collapses into l itself
                        probs_nb_cur[l_plus] += prob_c * probs_b_prev[l]
                        probs_nb_cur[l] += prob_c * probs_nb_prev[l]
                    elif new_char == ' ':
                        # a space ends a word: apply the external score
                        if (ext_scoring_func is None) or (len(l) == 1):
                            score = 1.0
                        else:
                            prefix = l[1:]
                            score = ext_scoring_func(prefix)
                        probs_nb_cur[l_plus] += score * prob_c * (
                            probs_b_prev[l] + probs_nb_prev[l])
                    else:
                        probs_nb_cur[l_plus] += prob_c * (
                            probs_b_prev[l] + probs_nb_prev[l])
                    # add l_plus into prefix_set_next
                    prefix_set_next[l_plus] = probs_nb_cur[
                        l_plus] + probs_b_cur[l_plus]
            # add l into prefix_set_next
            prefix_set_next[l] = probs_b_cur[l] + probs_nb_cur[l]
        # update probs
        probs_b_prev, probs_nb_prev = probs_b_cur, probs_nb_cur

        ## store top beam_size prefixes
        prefix_set_prev = sorted(
            prefix_set_next.items(), key=lambda asd: asd[1], reverse=True)
        if beam_size < len(prefix_set_prev):
            prefix_set_prev = prefix_set_prev[:beam_size]
        prefix_set_prev = dict(prefix_set_prev)

    beam_result = []
    for seq, prob in prefix_set_prev.items():
        if prob > 0.0 and len(seq) > 1:
            # drop the leading tab start marker
            result = seq[1:]
            # score last word by external scorer
            if (ext_scoring_func is not None) and (result[-1] != ' '):
                prob = prob * ext_scoring_func(result)
            log_prob = log(prob)
            beam_result.append((log_prob, result))
        else:
            beam_result.append((float('-inf'), ''))

    ## output top beam_size decoding results
    beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True)
    return beam_result

In [None]:
def ctc_beam_search_decoder_batch(probs_split,
                                  beam_size,
                                  vocabulary,
                                  num_processes,
                                  cutoff_prob=1.0,
                                  cutoff_top_n=40,
                                  ext_scoring_func=None):
    """Run ctc_beam_search_decoder() over a batch in a process pool.

    The external scorer is published through the module-level global
    ``ext_nproc_scorer`` before the pool is created; each task is submitted
    with ``ext_scoring_func=None`` and ``nproc=True`` so that the decoder
    picks the scorer up from that global.

    :param probs_split: 3-D list with each element as an instance of 2-D list
                        of probabilities used by ctc_beam_search_decoder().
    :type probs_split: 3-D list
    :param beam_size: Width for beam search.
    :type beam_size: int
    :param vocabulary: Vocabulary list.
    :type vocabulary: list
    :param num_processes: Number of parallel processes.
    :type num_processes: int
    :param cutoff_prob: Cutoff probability in pruning,
                        default 1.0, no pruning.
    :type cutoff_prob: float
    :param cutoff_top_n: Forwarded unchanged to ctc_beam_search_decoder().
    :type cutoff_top_n: int
    :param ext_scoring_func: External scoring function for
                            partially decoded sentence, e.g. word count
                            or language model.
    :type ext_scoring_func: callable
    :return: One decoding result per batch element, in input order; each is
             the list returned by ctc_beam_search_decoder().
    :rtype: list
    :raises ValueError: If ``num_processes`` is not positive.
    """
    if not num_processes > 0:
        raise ValueError("Number of processes must be positive!")

    # use global variable to pass the external scorer to beam search decoder
    global ext_nproc_scorer
    ext_nproc_scorer = ext_scoring_func
    use_global_scorer = True

    worker_pool = multiprocessing.Pool(processes=num_processes)
    # submit one asynchronous decoding task per batch element
    pending = [
        worker_pool.apply_async(
            ctc_beam_search_decoder,
            (probs_list, beam_size, vocabulary, cutoff_prob, cutoff_top_n,
             None, use_global_scorer))
        for probs_list in probs_split
    ]

    # no further tasks; wait for all workers to finish
    worker_pool.close()
    worker_pool.join()
    return [job.get() for job in pending]

In [None]:
import kenlm

In [None]:
"""External Scorer for Beam Search Decoder."""
class Scorer(object):
    """Score a prefix or a whole sentence during beam search decoding.

    The score combines an n-gram language model probability with a word
    count term.
    :param alpha: Weight applied to the language model score. Don't use
                  language model when alpha = 0.
    :type alpha: float
    :param beta: Weight applied to the word count. Don't use word
                 count when beta = 0.
    :type beta: float
    :param model_path: Path of the language model file to load.
    :type model_path: basestring
    :raises IOError: If model_path does not point to an existing file.
    """

    def __init__(self, alpha, beta, model_path):
        self._alpha = alpha
        self._beta = beta
        model_exists = os.path.isfile(model_path)
        if not model_exists:
            raise IOError("Invaid language model path: %s" % model_path)
        self._language_model = kenlm.LanguageModel(model_path)

    def _language_model_score(self, sentence):
        """Return the language model probability of the sentence's last word.

        The language model reports log10 scores, so the value is converted
        back to a plain probability with a power of ten.
        """
        per_word_scores = list(
            self._language_model.full_scores(sentence, eos=False))
        last_log10_prob = per_word_scores[-1][0]
        return np.power(10, last_log10_prob)

    def _word_count(self, sentence):
        """Return the word insertion term: number of space-separated tokens
        in the stripped sentence."""
        return len(sentence.strip().split(' '))

    def reset_params(self, alpha, beta):
        """Replace the language model weight and the word count weight."""
        self._alpha, self._beta = alpha, beta

    def __call__(self, sentence, log=False):
        """Evaluate a sentence by combining all the individual scores.

        :param sentence: The input sentence for evaluation.
        :type sentence: basestring
        :param log: Whether to return the score in log representation.
        :type log: bool
        :return: Evaluation score, in the decimal or log.
        :rtype: float
        """
        lm_prob = self._language_model_score(sentence)
        num_words = self._word_count(sentence)
        if log == False:
            # decimal form: lm^alpha * word_count^beta
            return (np.power(lm_prob, self._alpha) *
                    np.power(num_words, self._beta))
        # log form: alpha * log(lm) + beta * log(word_count)
        return (self._alpha * np.log(lm_prob) +
                self._beta * np.log(num_words))

## Deepspeech Model

In [None]:
import time
import logging
import gzip
import copy
import inspect
from distutils.dir_util import mkpath

In [None]:
"""Contains DeepSpeech2 model."""
# Configure the root logger's record format: level name, timestamp,
# source file name and line number, followed by the message.
logging.basicConfig(
    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s')


class DeepSpeech2Model(object):
    """DeepSpeech2Model class.
    :param vocab_size: Decoding vocabulary size.
    :type vocab_size: int
    :param num_conv_layers: Number of stacking convolution layers.
    :type num_conv_layers: int
    :param num_rnn_layers: Number of stacking RNN layers.
    :type num_rnn_layers: int
    :param rnn_layer_size: RNN layer size (number of RNN cells).
    :type rnn_layer_size: int
    :param use_gru: Forwarded unchanged to deep_speech_v2_network as
                    `use_gru`.
    :type use_gru: bool
    :param pretrained_model_path: Pretrained model path. If None, will train
                                  from scratch.
    :type pretrained_model_path: basestring|None
    :param share_rnn_weights: Whether to share input-hidden weights between
                              forward and backward directional RNNs. Notice
                              that for GRU, weight sharing is not supported.
    :type share_rnn_weights: bool
    """

    def __init__(self, vocab_size, num_conv_layers, num_rnn_layers,
                 rnn_layer_size, use_gru, pretrained_model_path,
                 share_rnn_weights):
        # The network must exist before parameters can be created for it.
        self._create_network(vocab_size, num_conv_layers, num_rnn_layers,
                             rnn_layer_size, use_gru, share_rnn_weights)
        self._create_parameters(pretrained_model_path)
        # Inference objects and the external scorer are built lazily.
        self._inferer = None
        self._loss_inferer = None
        self._ext_scorer = None
        self._num_conv_layers = num_conv_layers
        self.logger = logging.getLogger("")
        self.logger.setLevel(level=logging.INFO)

    def train(self,
              train_batch_reader,
              dev_batch_reader,
              feeding_dict,
              learning_rate,
              gradient_clipping,
              num_passes,
              output_model_dir,
              is_local=True,
              num_iterations_print=100,
              test_off=False):
        """Train the model.
        :param train_batch_reader: Train data reader.
        :type train_batch_reader: callable
        :param dev_batch_reader: Validation data reader.
        :type dev_batch_reader: callable
        :param feeding_dict: Feeding is a map of field name and tuple index
                             of the data that reader returns.
        :type feeding_dict: dict|list
        :param learning_rate: Learning rate for ADAM optimizer.
        :type learning_rate: float
        :param gradient_clipping: Gradient clipping threshold.
        :type gradient_clipping: float
        :param num_passes: Number of training epochs.
        :type num_passes: int
        :param num_iterations_print: Number of training iterations for printing
                                     a training loss.
        :type num_iterations_print: int
        :param is_local: Set to False if running with pserver with multi-nodes.
        :type is_local: bool
        :param output_model_dir: Directory for saving the model (every pass).
        :type output_model_dir: basestring
        :param test_off: Turn off testing.
        :type test_off: bool
        """
        # prepare model output directory
        if not os.path.exists(output_model_dir):
            mkpath(output_model_dir)

        # adapt the feeding dict and reader according to the network
        adapted_feeding_dict = self._adapt_feeding_dict(feeding_dict)
        adapted_train_batch_reader = self._adapt_data(train_batch_reader)
        adapted_dev_batch_reader = self._adapt_data(dev_batch_reader)

        # prepare optimizer and trainer
        optimizer = paddle.optimizer.Adam(
            learning_rate=learning_rate,
            gradient_clipping_threshold=gradient_clipping)
        trainer = paddle.trainer.SGD(
            cost=self._loss,
            parameters=self._parameters,
            update_equation=optimizer,
            is_local=is_local)

        # create event handler
        def event_handler(event):
            # Running statistics live in module globals; they are
            # (re)initialised on every BeginPass event below.
            global start_time, cost_sum, cost_counter
            if isinstance(event, paddle.event.EndIteration):
                cost_sum += event.cost
                cost_counter += 1
                if (event.batch_id + 1) % num_iterations_print == 0:
                    # checkpoint the latest parameters and report the mean
                    # training cost since the previous report
                    output_model_path = os.path.join(output_model_dir,
                                                     "params.latest.tar.gz")
                    with gzip.open(output_model_path, 'w') as f:
                        trainer.save_parameter_to_tar(f)
                    print("\nPass: %d, Batch: %d, TrainCost: %f" %
                          (event.pass_id, event.batch_id + 1,
                           cost_sum / cost_counter))
                    cost_sum, cost_counter = 0.0, 0
                else:
                    sys.stdout.write('.')
                    sys.stdout.flush()
            if isinstance(event, paddle.event.BeginPass):
                start_time = time.time()
                cost_sum, cost_counter = 0.0, 0
            if isinstance(event, paddle.event.EndPass):
                if test_off:
                    print("\n------- Time: %d sec,  Pass: %d" %
                          (time.time() - start_time, event.pass_id))
                else:
                    result = trainer.test(
                        reader=adapted_dev_batch_reader,
                        feeding=adapted_feeding_dict)
                    print(
                        "\n------- Time: %d sec,  Pass: %d, "
                        "ValidationCost: %s" %
                        (time.time() - start_time, event.pass_id, result.cost))
                # one checkpoint per finished pass
                output_model_path = os.path.join(
                    output_model_dir, "params.pass-%d.tar.gz" % event.pass_id)
                with gzip.open(output_model_path, 'w') as f:
                    trainer.save_parameter_to_tar(f)

        # run train
        trainer.train(
            reader=adapted_train_batch_reader,
            event_handler=event_handler,
            num_passes=num_passes,
            feeding=adapted_feeding_dict)

    # TODO(@pkuyym) merge this function into infer_batch
    def infer_loss_batch(self, infer_data):
        """Model inference. Infer the ctc loss for a batch of speech
        utterances.
        :param infer_data: List of utterances to infer, with each utterance a
                           tuple of audio features and transcription text (empty
                           string).
        :type infer_data: list
        :return: List of ctc loss.
        :rtype: List of float
        """
        # define inferer (created once, on first use)
        if self._loss_inferer is None:
            self._loss_inferer = paddle.inference.Inference(
                output_layer=self._loss, parameters=self._parameters)
        # run inference
        return self._loss_inferer.infer(input=infer_data)

    def infer_batch_probs(self, infer_data, feeding_dict):
        """Infer the prob matrices for a batch of speech utterances.
        :param infer_data: List of utterances to infer, with each utterance
                           consisting of a tuple of audio features and
                           transcription text (empty string).
        :type infer_data: list
        :param feeding_dict: Feeding is a map of field name and tuple index
                             of the data that reader returns.
        :type feeding_dict: dict|list
        :return: List of 2-D probability matrix, and each consists of prob
                 vectors for one speech utterance.
        :rtype: List of matrix
        """
        # define inferer (created once, on first use)
        if self._inferer is None:
            self._inferer = paddle.inference.Inference(
                output_layer=self._log_probs, parameters=self._parameters)
        adapted_feeding_dict = self._adapt_feeding_dict(feeding_dict)
        adapted_infer_data = self._adapt_data(infer_data)
        # run inference
        infer_results = self._inferer.infer(
            input=adapted_infer_data, feeding=adapted_feeding_dict)
        # The results of all utterances come back concatenated; field 3 of
        # each adapted instance holds its valid sequence length, which gives
        # the row boundaries of every utterance.
        num_utterances = len(adapted_infer_data)
        start_pos = [0] * (num_utterances + 1)
        for i in range(num_utterances):
            start_pos[i + 1] = start_pos[i] + adapted_infer_data[i][3][0]
        probs_split = [
            infer_results[start_pos[i]:start_pos[i + 1]]
            for i in range(num_utterances)
        ]
        return probs_split

    def decode_batch_greedy(self, probs_split, vocab_list):
        """Decode by best path for a batch of probs matrix input.
        :param probs_split: List of 2-D probability matrix, and each consists
                            of prob vectors for one speech utterance.
        :type probs_split: List of matrix
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :return: List of transcription texts.
        :rtype: List of basestring
        """
        return [
            ctc_greedy_decoder(probs_seq=probs, vocabulary=vocab_list)
            for probs in probs_split
        ]

    def init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
                        vocab_list):
        """Initialize the external scorer.
        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param language_model_path: Filepath for language model. If it is
                                    empty, the external scorer will be set to
                                    None, and the decoding method will be pure
                                    beam search without scorer.
        :type language_model_path: basestring|None
        :param vocab_list: List of tokens in the vocabulary, for decoding.
                           Accepted for interface compatibility; the Scorer
                           defined in this file does not take it.
        :type vocab_list: list
        """
        # A None path is treated like the empty string: no scorer.
        if language_model_path:
            self.logger.info("begin to initialize the external scorer "
                             "for decoding")
            # The Scorer defined in this file takes (alpha, beta, model_path)
            # and exposes no language model introspection helpers, so
            # vocab_list is not forwarded and no model details are logged.
            self._ext_scorer = Scorer(beam_alpha, beam_beta,
                                      language_model_path)
            self.logger.info("end initializing scorer")
        else:
            self._ext_scorer = None
            self.logger.info("no language model provided, "
                             "decoding by pure beam search without scorer.")

    def decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
                                 beam_size, cutoff_prob, cutoff_top_n,
                                 vocab_list, num_processes):
        """Decode by beam search for a batch of probs matrix input.
        :param probs_split: List of 2-D probability matrix, and each consists
                            of prob vectors for one speech utterance.
        :type probs_split: List of matrix
        :param beam_alpha: Parameter associated with language model.
        :type beam_alpha: float
        :param beam_beta: Parameter associated with word count.
        :type beam_beta: float
        :param beam_size: Width for Beam search.
        :type beam_size: int
        :param cutoff_prob: Cutoff probability in pruning,
                            default 1.0, no pruning.
        :type cutoff_prob: float
        :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
                        characters with highest probs in vocabulary will be
                        used in beam search, default 40.
        :type cutoff_top_n: int
        :param vocab_list: List of tokens in the vocabulary, for decoding.
        :type vocab_list: list
        :param num_processes: Number of processes (CPU) for decoder.
        :type num_processes: int
        :return: List of transcription texts.
        :rtype: List of basestring
        """
        if self._ext_scorer is not None:
            self._ext_scorer.reset_params(beam_alpha, beam_beta)
        # An empty batch would clamp num_processes to 0 below, which the
        # batch decoder rejects; there is nothing to decode anyway.
        if len(probs_split) == 0:
            return []
        # beam search decode; never start more workers than utterances
        num_processes = min(num_processes, len(probs_split))
        beam_search_results = ctc_beam_search_decoder_batch(
            probs_split=probs_split,
            vocabulary=vocab_list,
            beam_size=beam_size,
            num_processes=num_processes,
            ext_scoring_func=self._ext_scorer,
            cutoff_prob=cutoff_prob,
            cutoff_top_n=cutoff_top_n)

        # keep only the sentence of the first candidate for each utterance
        results = [result[0][1] for result in beam_search_results]
        return results

    def _adapt_feeding_dict(self, feeding_dict):
        """Adapt feeding dict according to network struct.
        To remove impacts from padding part, we add scale_sub_region layer and
        sub_seq layer. For sub_seq layer, 'sequence_offset' and
        'sequence_length' fields are appended. For each scale_sub_region layer
        'convN_index_range' field is appended.
        :param feeding_dict: Feeding is a map of field name and tuple index
                             of the data that reader returns.
        :type feeding_dict: dict|list
        :return: Adapted feeding dict (a deep copy; the input is not
                 modified).
        :rtype: dict|list
        :raises ValueError: If feeding_dict is neither a dict nor a list.
        """
        adapted_feeding_dict = copy.deepcopy(feeding_dict)
        if isinstance(feeding_dict, dict):
            # each new field gets the next free tuple index, i.e. the
            # current size of the dict
            adapted_feeding_dict["sequence_offset"] = len(adapted_feeding_dict)
            adapted_feeding_dict["sequence_length"] = len(adapted_feeding_dict)
            for i in range(self._num_conv_layers):
                adapted_feeding_dict["conv%d_index_range" %i] = \
                        len(adapted_feeding_dict)
        elif isinstance(feeding_dict, list):
            adapted_feeding_dict.append("sequence_offset")
            adapted_feeding_dict.append("sequence_length")
            for i in range(self._num_conv_layers):
                adapted_feeding_dict.append("conv%d_index_range" % i)
        else:
            raise ValueError("Type of feeding_dict is %s, not supported." %
                             type(feeding_dict))

        return adapted_feeding_dict

    def _adapt_data(self, data):
        """Adapt data according to network struct.
        For each convolution layer in the conv_group, to remove impacts from
        padding data, we can multiply zero to the padding part of the outputs
        of each batch normalization layer. We add a scale_sub_region layer after
        each batch normalization layer to reset the padding data.
        For rnn layers, to remove impacts from padding data, we can truncate the
        padding part before output data feeded into the first rnn layer. We use
        sub_seq layer to achieve this.
        :param data: Data from data_provider.
        :type data: list|function
        :return: Adapted data. A list input yields a list; a generator
                 function input yields a generator function whose batches
                 are lists.
        :rtype: list|function
        :raises ValueError: If data is neither a list nor a generator
                            function, or an instance does not have 2 or 3
                            fields.
        """

        def adapt_instance(instance):
            if len(instance) < 2 or len(instance) > 3:
                raise ValueError("Size of instance should be 2 or 3.")
            padded_audio = instance[0]
            text = instance[1]
            # no padding part
            if len(instance) == 2:
                audio_len = padded_audio.shape[1]
            else:
                audio_len = instance[2]
            adapted_instance = [padded_audio, text]
            # Stride size for conv0 is (3, 2)
            # Stride size for conv1 to convN is (1, 2)
            # Same as the network, hard-coded here
            padded_conv0_h = (padded_audio.shape[0] - 1) // 2 + 1
            padded_conv0_w = (padded_audio.shape[1] - 1) // 3 + 1
            valid_w = (audio_len - 1) // 3 + 1
            adapted_instance += [
                [0],  # sequence offset, always 0
                [valid_w],  # valid sequence length
                # Index ranges for channel, height and width
                # Please refer scale_sub_region layer to see details
                [1, 32, 1, padded_conv0_h, valid_w + 1, padded_conv0_w]
            ]
            # every further conv layer halves the height again; the width
            # range stays the one computed for conv0
            pre_padded_h = padded_conv0_h
            for _ in range(self._num_conv_layers - 1):
                padded_h = (pre_padded_h - 1) // 2 + 1
                pre_padded_h = padded_h
                adapted_instance += [
                    [1, 32, 1, padded_h, valid_w + 1, padded_conv0_w]
                ]
            return adapted_instance

        # List comprehensions (rather than map) keep the results real lists
        # on every Python version; callers take len() of and index them.
        if isinstance(data, list):
            return [adapt_instance(instance) for instance in data]
        elif inspect.isgeneratorfunction(data):

            def adapted_reader():
                for batch in data():
                    yield [adapt_instance(instance) for instance in batch]

            return adapted_reader
        else:
            raise ValueError("Type of data is %s, not supported." % type(data))

    def _create_parameters(self, model_path=None):
        """Load or create model parameters.
        :param model_path: Gzipped parameter tar file to load. If None,
                           fresh parameters are created for the loss layer.
        :type model_path: basestring|None
        """
        if model_path is None:
            self._parameters = paddle.parameters.create(self._loss)
        else:
            # close the archive once the parameters have been read
            with gzip.open(model_path) as f:
                self._parameters = paddle.parameters.Parameters.from_tar(f)

    def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers,
                        rnn_layer_size, use_gru, share_rnn_weights):
        """Create data layers and model network.

        Stores the network outputs returned by deep_speech_v2_network in
        self._log_probs and self._loss.
        """
        # paddle.data_type.dense_array is used for variable batch input.
        # The size 161 * 161 is only an placeholder value and the real shape
        # of input batch data will be induced during training.
        audio_data = paddle.layer.data(
            name="audio_spectrogram",
            type=paddle.data_type.dense_array(161 * 161))
        text_data = paddle.layer.data(
            name="transcript_text",
            type=paddle.data_type.integer_value_sequence(vocab_size))
        seq_offset_data = paddle.layer.data(
            name='sequence_offset',
            type=paddle.data_type.integer_value_sequence(1))
        seq_len_data = paddle.layer.data(
            name='sequence_length',
            type=paddle.data_type.integer_value_sequence(1))
        # One index-range input per convolution layer, matching the
        # 'convN_index_range' fields produced by _adapt_feeding_dict and
        # _adapt_data (which both iterate over the conv layers).
        index_range_datas = []
        for i in range(num_conv_layers):
            index_range_datas.append(
                paddle.layer.data(
                    name='conv%d_index_range' % i,
                    type=paddle.data_type.dense_vector(6)))

        self._log_probs, self._loss = deep_speech_v2_network(
            audio_data=audio_data,
            text_data=text_data,
            seq_offset_data=seq_offset_data,
            seq_len_data=seq_len_data,
            index_range_datas=index_range_datas,
            dict_size=vocab_size,
            num_conv_layers=num_conv_layers,
            num_rnn_layers=num_rnn_layers,
            rnn_size=rnn_layer_size,
            use_gru=use_gru,
            share_rnn_weights=share_rnn_weights)

Build the pretrained model and run it on 10 seconds of speech data.

In [None]:
# NOTE(review): DeepSpeech2Model.__init__ declares vocab_size, num_conv_layers,
# num_rnn_layers, rnn_layer_size, use_gru and share_rnn_weights without
# defaults, so this call raises TypeError until those arguments are supplied.
Model = DeepSpeech2Model(pretrained_model_path='baidu/models/parameters')

In [None]:
# input = ...  # TODO: provide the features of a 10-second speech sample

In [None]:
# Presumably counts the floating point operations of the model and prints a
# summary of them.
# NOTE(review): Interpret_Model and print_summary are not defined in the
# visible part of this notebook, and `inputs` is an empty tuple here
# (looks like a placeholder for the 10-second sample) - confirm.
flops = Interpret_Model(Model,inputs=())
summary = print_summary(flops)