In [1]:
import os
import tensorflow as tf

AUTOTUNE = tf.data.experimental.AUTOTUNE

In [68]:
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Conv2D, Bidirectional, LSTM, GRU, Dense
from tensorflow.keras.layers import Dropout, BatchNormalization, LeakyReLU, PReLU
from tensorflow.keras.layers import Input, MaxPooling2D, Reshape, MaxPool2D, Lambda, AveragePooling2D
from tensorflow.keras.optimizers import Adam

import pandas as pd

ROOT = '../../data/toy_final/'
meta = pd.read_csv(os.path.join(ROOT,'metadata.csv'), index_col = 'index')

print(meta.shape)
display(meta.head())

(306, 4)


Unnamed: 0_level_0,filepath,label,label_length,spec_length
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,../../data/toy_raw/174/168635/174-168635-0000....,"[8, 5, 0, 8, 1, 4, 0, 14, 5, 22, 5, 18, 0, 2, ...",45,196
1,../../data/toy_raw/174/168635/174-168635-0001....,"[20, 8, 5, 0, 8, 5, 1, 18, 20, 0, 15, 6, 0, 20...",50,201
2,../../data/toy_raw/174/168635/174-168635-0002....,"[8, 9, 19, 0, 19, 9, 19, 20, 5, 18, 0, 1, 14, ...",224,684
3,../../data/toy_raw/174/168635/174-168635-0003....,"[8, 5, 0, 19, 21, 6, 6, 5, 18, 5, 4, 0, 1, 12,...",177,568
4,../../data/toy_raw/174/168635/174-168635-0004....,"[15, 14, 12, 25, 0, 1, 19, 0, 8, 5, 0, 23, 1, ...",169,550


In [3]:
print(meta.label_length.max())
print(meta.spec_length.max())

505
1406


In [50]:
#Each tf.train.Example record contains one or more "features", and the input pipeline typically converts these features into tensors.
def _parse_batch(record_batch):

    # Create a description of the features
    feature_description = {
        'feature': tf.io.FixedLenFeature([40,1406,1], tf.float32),
        'label': tf.io.FixedLenFeature([505], tf.int64),
    }

    # Parse the input `tf.Example` proto using the dictionary above
    example = tf.io.parse_example(record_batch, feature_description)
    
    return example['feature'], example['label']

In [51]:
def get_dataset_from_tfrecords(tfrecords_dir='tfrecords', split='train',
                               batch_size=64, n_epochs=10):
    if split not in ('train', 'test', 'validate'):
        raise ValueError("split must be either 'train', 'test' or 'validate'")

    # List all *.tfrecord files for the selected split
    pattern = os.path.join(tfrecords_dir, '{}*.tfrecord'.format(split))
    files_ds = tf.data.Dataset.list_files(pattern)

    # Disregard data order in favor of reading speed
    ignore_order = tf.data.Options()
    ignore_order.experimental_deterministic = False
    files_ds = files_ds.with_options(ignore_order)

    # Read TFRecord files in an interleaved order
    ds = tf.data.TFRecordDataset(files_ds,
                                 compression_type='ZLIB')
    # Prepare batches
    ds = ds.batch(batch_size)

    # Parse a batch into a dataset of [audio, label] pairs
    ds = ds.map(lambda x: _parse_batch(x))

    # Repeat the training data for n_epochs. Don't repeat test/validate splits.
    if split == 'train':
        ds = ds.repeat(n_epochs)

    return ds.prefetch(buffer_size=AUTOTUNE)

In [52]:
train_ds = get_dataset_from_tfrecords(tfrecords_dir='../../data/toy_final/TFrecords/', split='train', )


In [60]:
def ctc_loss_lambda_func(y_true, y_pred):
    """Function for computing the CTC loss"""

    if len(y_true.shape) > 2:
        y_true = tf.squeeze(y_true)

    input_length = tf.math.reduce_sum(y_pred, axis=-1, keepdims=False)
    input_length = tf.math.reduce_sum(input_length, axis=-1, keepdims=True)
    label_length = tf.math.count_nonzero(y_true, axis=-1, keepdims=True, dtype="int64")

    loss = K.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    loss = tf.reduce_mean(loss)

    return loss

def build_model(input_size, d_model, learning_rate=3e-4):
    """
    Convolucional Recurrent Neural Network by Puigcerver et al.
    Reference:
        Joan Puigcerver.
        Are multidimensional recurrent layers really necessary for handwritten text recognition?
        In: Document Analysis and Recognition (ICDAR), 2017 14th
        IAPR International Conference on, vol. 1, pp. 67–72. IEEE (2017)
        Carlos Mocholí Calvo and Enrique Vidal Ruiz.
        Development and experimentation of a deep learning system for convolutional and recurrent neural networks
        Escola Tècnica Superior d’Enginyeria Informàtica, Universitat Politècnica de València, 2018
    """

    input_data = Input(name="input", shape=input_size)

    conv_1 = Conv2D(64, (3,3), activation = 'relu', padding='same')(input_data)
    pool_1 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_1)
    
    conv_2 = Conv2D(128, (3,3), activation = 'relu', padding='same')(pool_1)
    pool_2 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_2)
    
    conv_3 = Conv2D(256, (3,3), activation = 'relu', padding='same')(pool_2)
    batch_norm_3 = BatchNormalization()(conv_3)
    conv_4 = Conv2D(256, (3,3), activation = 'relu', padding='same')(batch_norm_3)
    pool_4 = MaxPool2D(pool_size=(2, 2))(conv_4)
    
    conv_5 = Conv2D(512, (3,3), activation = 'relu', padding='same')(pool_4)
    batch_norm_5 = BatchNormalization()(conv_5)
    conv_6 = Conv2D(512, (3,3), activation = 'relu', padding='same')(batch_norm_5)
    batch_norm_6 = BatchNormalization()(conv_6)
    pool_6 = MaxPool2D(pool_size=(2, 2))(batch_norm_6)
    
    conv_7 = Conv2D(512, (2,2), activation = 'relu', padding='same')(pool_6)
    batch_norm_7 = BatchNormalization()(conv_7)
    pool_7 = MaxPool2D(pool_size=(1, 4))(batch_norm_7) 
    
    shape = pool_7.get_shape()
    blstm = Reshape((shape[1], shape[2] * shape[3]))(pool_7)
    
    blstm = Bidirectional(LSTM(256, return_sequences=True, dropout = 0.5))(blstm)
    blstm = Bidirectional(LSTM(256, return_sequences=True, dropout = 0.5))(blstm)
    blstm = Dropout(rate=0.5)(blstm)
    output_data = Dense(d_model, activation = 'softmax')(blstm)

#     optimizer = RMSprop(learning_rate=learning_rate)
    optimizer = Adam(learning_rate=learning_rate)
    
    model = Model(inputs=input_data, outputs=output_data)
    model.compile(optimizer=optimizer, loss=ctc_loss_lambda_func)
    model.summary()
    return model

In [71]:
def build_baseline_model(input_size, d_model, learning_rate=3e-4):

    input_data = Input(name="input", shape=input_size)

    conv_1 = Conv2D(32, (3,3), activation = 'relu', padding='same')(input_data)
    pool_1 = MaxPool2D(pool_size=(2, 20), strides=2)(conv_1)
    
    conv_2 = Conv2D(64, (3,3), activation = 'relu', padding='same')(pool_1)
    batch_norm_2 = BatchNormalization()(conv_2)
    
    conv_3 = Conv2D(64, (3,3), activation = 'relu', padding='same')(batch_norm_2)
    batch_norm_3 = BatchNormalization()(conv_3)
    pool_3 = MaxPool2D(pool_size=(2, 20))(batch_norm_3)
    
    shape = pool_3.get_shape()
    blstm = Reshape((shape[1], shape[2] * shape[3]))(pool_3)
    
    blstm = Bidirectional(LSTM(64, return_sequences=True, dropout = 0.5))(blstm)
    blstm = Dropout(rate=0.5)(blstm)
    output_data = Dense(d_model, activation = 'softmax')(blstm)

#     optimizer = RMSprop(learning_rate=learning_rate)
    optimizer = Adam(learning_rate=learning_rate)
    
    model = Model(inputs=input_data, outputs=output_data)
    model.compile(optimizer=optimizer, loss=ctc_loss_lambda_func)
    model.summary()
    return model

num_rows = 40
num_columns = 1406
num_label = 28

model = build_baseline_model(input_size = (num_rows, num_columns, 1), d_model = num_label)

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 40, 1406, 1)]     0         
_________________________________________________________________
conv2d_31 (Conv2D)           (None, 40, 1406, 32)      320       
_________________________________________________________________
max_pooling2d_22 (MaxPooling (None, 20, 694, 32)       0         
_________________________________________________________________
conv2d_32 (Conv2D)           (None, 20, 694, 64)       18496     
_________________________________________________________________
batch_normalization_14 (Batc (None, 20, 694, 64)       256       
_________________________________________________________________
conv2d_33 (Conv2D)           (None, 20, 694, 64)       36928     
_________________________________________________________________
batch_normalization_15 (Batc (None, 20, 694, 64)       256 

In [73]:
model.fit(train_ds, epochs=10)

Epoch 1/10
      1/Unknown - 2s 2s/step

InvalidArgumentError:  Not enough time for target transition sequence (required: 54, available: 10)0You can turn this error into a warning by using the flag ignore_longer_outputs_than_inputs
	 [[node loss/dense_7_loss/CTCLoss (defined at /home/long/miniconda3/envs/cs_ftmle/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:1751) ]] [Op:__inference_distributed_function_35414]

Function call stack:
distributed_function
