In [1]:
from keras import backend as K
from keras.layers import Conv2D, MaxPooling2D, Layer
from keras.layers import Input, Dense, Activation, LeakyReLU, Permute, Bidirectional, CuDNNLSTM
from keras.layers import Reshape, Lambda, BatchNormalization
from keras.layers.merge import add, concatenate
from keras.models import Model
from keras.layers.recurrent import LSTM
import tensorflow as tf
import tensorflow as tf
from primus import CTC_PriMuS
import ctc_utils
import os
import tensorflow as tf
from primus import CTC_PriMuS
import ctc_utils
import os
import cv2

2022-05-17 18:55:11.314793: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-17 18:55:11.314857: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [5]:
# Enable on-demand GPU memory allocation. Looping over the detected devices
# (instead of indexing [0]) keeps this cell runnable on CPU-only machines,
# where the device list is empty.
gpu_devices = tf.config.list_physical_devices('GPU')
for gpu_device in gpu_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [11]:
def default_model_params(img_height, vocabulary_size):
    """Build the default hyper-parameter dictionary for the CRNN.

    Args:
        img_height: Value stored under ``'img_height'``.
        vocabulary_size: Value stored under ``'vocabulary_size'``.

    Returns:
        A newly created ``dict`` holding the image geometry, the per-block
        convolution settings and the recurrent-layer settings.
    """
    n_conv_blocks = 4
    return {
        'img_height': img_height,
        'img_width': None,
        'batch_size': 128,
        'img_channels': 1,
        'conv_blocks': n_conv_blocks,
        'conv_filter_n': [32, 64, 128, 256],
        # One [rows, cols] entry per convolutional block.
        'conv_filter_size': [[3, 3] for _ in range(n_conv_blocks)],
        'conv_pooling_size': [[2, 2] for _ in range(n_conv_blocks)],
        'rnn_units': 128,
        'rnn_layers': 2,
        'vocabulary_size': vocabulary_size,
    }

In [2]:
class CTCLayer(Layer):
    """Pass-through layer that attaches the CTC loss to the model.

    The layer receives the ground-truth labels and the predictions, registers
    the CTC batch cost through ``add_loss`` and returns the predictions
    unchanged.
    """

    def __init__(self, name=None, **kwargs):
        """Create the layer.

        Args:
            name: Optional layer name.
            **kwargs: Forwarded to ``Layer.__init__`` so the standard layer
                arguments (e.g. ``dtype``, ``trainable``) are accepted.
        """
        super().__init__(name=name, **kwargs)
        self.loss_fn = K.ctc_batch_cost

    def call(self, y_true, y_pred):
        """Register the CTC loss for the batch and return ``y_pred``.

        Args:
            y_true: Label tensor; axis 0 is read as the batch size and axis 1
                as the label length.
            y_pred: Prediction tensor; axis 1 is read as the input length.

        Returns:
            ``y_pred`` unchanged.
        """
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

        # Every sample in the batch is given the same input length and the
        # same label length: the full size of axis 1 of the respective tensor.
        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

        # Compute the training-time loss value and add it to the layer.
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        # At test time, just return the computed predictions.
        return y_pred

In [3]:

def ctc_crnn(params, width_rem=128):
    """Build the convolutional-recurrent model trained with a CTC loss.

    The network stacks ``params['conv_blocks']`` Conv2D / BatchNorm /
    LeakyReLU / MaxPooling blocks, turns the resulting feature map into a
    batch-major sequence along the image width, runs two bidirectional
    recurrent stages over it and ends with a softmax over
    ``params['vocabulary_size'] + 1`` classes. The loss is attached by
    ``CTCLayer``.

    Args:
        params: Dictionary with the keys ``img_height``, ``img_width``,
            ``img_channels``, ``conv_blocks``, ``conv_filter_n``,
            ``conv_filter_size``, ``conv_pooling_size``, ``rnn_units`` and
            ``vocabulary_size``.
        width_rem: Unused. Kept only so existing calls that pass it keep
            working; the sequence length is now taken from the actual input
            width instead of this fixed value.

    Returns:
        A ``Model`` with inputs ``[the_input, label]`` whose output is the
        per-time-step softmax.
    """
    input_shape = (params['img_height'], params['img_width'], params['img_channels'])

    inputs = Input(name='the_input', shape=input_shape, dtype='float32')
    labels = Input(name="label", shape=(None,), dtype="float32")

    height_reduction = 1

    # Convolutional blocks
    inner = inputs
    for i in range(params['conv_blocks']):
        pool_size = params['conv_pooling_size'][i]
        inner = Conv2D(params['conv_filter_n'][i], params['conv_filter_size'][i], padding='same',
                       name='conv' + str(i + 1), kernel_initializer='he_normal')(inner)
        inner = BatchNormalization()(inner)
        inner = LeakyReLU(0.2)(inner)
        inner = MaxPooling2D(pool_size=pool_size, strides=pool_size, name='max' + str(i + 1))(inner)

        height_reduction = height_reduction * pool_size[0]

    # Integer division: the value is used as a static reshape dimension.
    feature_dim = params['conv_filter_n'][-1] * (params['img_height'] // height_reduction)

    # (batch, height, width, channels) -> (batch, width, channels, height)
    # -> (batch, width, channels * height). Keeping the batch on axis 0 is what
    # the recurrent layers and CTCLayer (which reads the time length from
    # axis 1) expect, and it avoids baking a batch size or width into the graph.
    features = Permute((2, 3, 1), name='permute_width_first')(inner)
    features = Reshape((-1, feature_dim), name='flatten_features')(features)

    # RNN block: two bidirectional stages built from a forward and a backward
    # layer each. Note that params['rnn_layers'] is not read here.
    lstm_1 = CuDNNLSTM(params['rnn_units'], return_sequences=True, kernel_initializer='he_normal', name='lstm1')(features)
    lstm_1b = CuDNNLSTM(params['rnn_units'], return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='lstm1_b')(features)
    # The backward layer emits its sequence reversed; flip it back along time.
    reversed_lstm_1b = Lambda(lambda inputTensor: K.reverse(inputTensor, axes=1))(lstm_1b)

    lstm1_merged = add([lstm_1, reversed_lstm_1b])  # (batch, time, rnn_units)
    lstm1_merged = BatchNormalization()(lstm1_merged)

    lstm_2 = CuDNNLSTM(params['rnn_units'], return_sequences=True, kernel_initializer='he_normal', name='lstm2')(lstm1_merged)
    lstm_2b = CuDNNLSTM(params['rnn_units'], return_sequences=True, go_backwards=True, kernel_initializer='he_normal', name='lstm2_b')(lstm1_merged)
    reversed_lstm_2b = Lambda(lambda inputTensor: K.reverse(inputTensor, axes=1))(lstm_2b)

    lstm2_merged = concatenate([lstm_2, reversed_lstm_2b])  # (batch, time, 2 * rnn_units)
    lstm2_merged = BatchNormalization()(lstm2_merged)

    # Transform the RNN output into per-class activations; the extra class is
    # the one added on top of the vocabulary for CTC.
    num_classes = params['vocabulary_size'] + 1
    inner = Dense(num_classes, kernel_initializer='he_normal', name='dense2')(lstm2_merged)
    y_pred = Activation('softmax', name='softmax')(inner)

    output = CTCLayer(name="ctc_loss")(labels, y_pred)

    return Model(inputs=[inputs, labels], outputs=output)

In [9]:
# Root folder of the PriMuS data. Set the PRIMUS_DATA_ROOT environment variable
# to run the notebook on another machine; the default is the original location.
data_root = os.environ.get(
    "PRIMUS_DATA_ROOT",
    "C:/Users/steli/DeepLearning/Project/tf-end-to-end/Data",
).rstrip("/\\")

# The trailing slash is kept so code that appends sample names directly to
# this prefix keeps working.
corpus_dirpath = f"{data_root}/primusCalvoRizoAppliedSciences2018/"
corpus_filepath = f"{data_root}/train.txt"
dictionary_path = f"{data_root}/vocabulary_semantic.txt"
primus = CTC_PriMuS(corpus_dirpath, corpus_filepath, dictionary_path, True, val_split=0.1)

Training with 9000 and validating with 1000


In [12]:
# Run configuration
img_height = 128
max_epochs = 100
dropout = 0.5
params = default_model_params(img_height, vocabulary_size=primus.vocabulary_size)

# Build the network and print its layer table
model = ctc_crnn(params)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 the_input (InputLayer)         [(None, 128, None,   0           []                               
                                1)]                                                               
                                                                                                  
 conv1 (Conv2D)                 (None, 128, None, 3  320         ['the_input[0][0]']              
                                2)                                                                
                                                                                                  
 batch_normalization (BatchNorm  (None, 128, None, 3  128        ['conv1[0][0]']                  
 alization)                     2)                                                            

In [21]:
# Get Data: module-level containers for image paths and their label texts
image_paths, image_texts = [], []

class PriMuS_Data:
    """Sample index and symbol vocabulary for a PriMuS-style corpus.

    The corpus file lists one sample name per line; each sample ``x`` is
    expected at ``<corpus_dirpath>/x/x.png`` with its label in
    ``<corpus_dirpath>/x/x.semantic``.
    """

    def __init__(self, corpus_dirpath, corpus_filepath, dictionary_path):
        """Read the sample list and build the vocabulary mappings.

        Args:
            corpus_dirpath: Directory that contains one sub-folder per sample.
            corpus_filepath: Text file with one sample name per line.
            dictionary_path: Text file with one vocabulary word per line.
        """
        self.corpus_dirpath = corpus_dirpath

        # Corpus
        with open(corpus_filepath, 'r') as corpus_file:
            self.corpus_list = corpus_file.read().splitlines()

        self.current_idx = 0

        # Dictionary: words are numbered in order of first appearance, so
        # repeated lines do not consume an index.
        self.word2int = {}
        self.int2word = {}
        with open(dictionary_path, 'r') as dict_file:
            for word in dict_file.read().splitlines():
                if word not in self.word2int:
                    word_idx = len(self.word2int)
                    self.word2int[word] = word_idx
                    self.int2word[word_idx] = word

        self.vocabulary_size = len(self.word2int)

    def get_data(self):
        """Return the list of sample names read from the corpus file."""
        return self.corpus_list

    def trim_ds(self, train_fraction=0.9):
        """Split the corpus, in file order, into training and validation parts.

        Args:
            train_fraction: Share of the samples (taken from the start of the
                list) assigned to the training part. Defaults to ``0.9``, the
                value that was previously hard-coded.

        Returns:
            ``{"train": {"images": [...], "text": [...]},
            "val": {"images": [...], "text": [...]}}`` where ``images`` are
            image paths and ``text`` holds the first line of each sample's
            ``.semantic`` file without its line terminator.
        """
        # os.path.join leaves the result unchanged when corpus_dirpath already
        # ends with a separator and inserts one when it does not.
        image_paths = [os.path.join(self.corpus_dirpath, f"{x}/{x}.png") for x in self.corpus_list]
        text_paths = [os.path.join(self.corpus_dirpath, f"{x}/{x}.semantic") for x in self.corpus_list]

        image_texts = []
        for path in text_paths:
            with open(path, "r") as file:
                # Drop only the line terminator; other trailing characters
                # (e.g. a separator after the last symbol) are preserved.
                image_texts.append(file.readline().rstrip("\r\n"))

        split_idx = int(len(image_paths) * train_fraction)

        return {
            "train": {"images": image_paths[:split_idx], "text": image_texts[:split_idx]},
            "val": {"images": image_paths[split_idx:], "text": image_texts[split_idx:]},
        }



In [22]:
# Index the corpus, then split it into training / validation path and label lists
data_obj = PriMuS_Data(
    corpus_dirpath=corpus_dirpath,
    corpus_filepath=corpus_filepath,
    dictionary_path=dictionary_path,
)
lst = data_obj.trim_ds()

In [25]:
# Scale an image to a fixed height; the width follows from the aspect ratio
def resize_no_width(image, height):
    """Resize ``image`` to ``height`` rows, keeping its aspect ratio.

    Args:
        image: Object exposing ``shape`` with rows at index 0 and columns at
            index 1, in a form accepted by ``cv2.resize``.
        height: Target number of rows.

    Returns:
        The result of ``cv2.resize`` for the size ``(width, height)``, where
        ``width`` is the scaled column count truncated to an integer.
    """
    src_rows = image.shape[0]
    src_cols = image.shape[1]
    scaled_cols = int(float(height * src_cols) / src_rows)
    return cv2.resize(image, (scaled_cols, height))


# Pre-process a single image sample (read / decode / normalize / resize)
def process_single_sample(img_path, label, max_width=0):
    """Load one PNG and resize it to the model's input height.

    Args:
        img_path: Path of the PNG file to read.
        label: Label associated with the image; returned unchanged.
        max_width: Target width in pixels. When left at ``0`` the width is
            derived from the image's own aspect ratio instead of requesting a
            zero-width output, so results then differ in width per image.

    Returns:
        ``{"image": img, "label": label}`` where ``img`` is a float32 tensor
        with one channel and ``params['img_height']`` rows.
    """
    # The target height comes from the model parameters built from the
    # notebook-level ``img_height`` and ``primus`` objects.
    params = default_model_params(img_height, primus.vocabulary_size)
    target_height = params['img_height']

    # 1. Read image
    img = tf.io.read_file(img_path)
    # 2. Decode and convert to grayscale
    img = tf.io.decode_png(img, channels=1)
    # 3. Convert to float32 in [0, 1] range
    img = tf.image.convert_image_dtype(img, tf.float32)
    # 4. Resize to the desired size
    if max_width:
        target_width = max_width
    else:
        # Keep the aspect ratio: width = height * src_width / src_height,
        # truncated to an integer and never below one pixel.
        src_shape = tf.shape(img)
        scaled_width = (
            tf.cast(target_height, tf.float32)
            * tf.cast(src_shape[1], tf.float32)
            / tf.cast(src_shape[0], tf.float32)
        )
        target_width = tf.maximum(tf.cast(scaled_width, tf.int32), 1)
    img = tf.image.resize(img, [target_height, target_width])
    return {"image": img, "label": label}


In [26]:
# Smoke-test the preprocessing on the first training sample
first_train_image = lst["train"]["images"][0]
first_train_label = lst["train"]["text"][0]
tmp = process_single_sample(first_train_image, first_train_label)

In [27]:
batch_size = 256


def _image_width(path):
    """Return the pixel width of the image stored at ``path``."""
    image = cv2.imread(path)
    if image is None:  # cv2.imread returns None when the file cannot be read
        raise FileNotFoundError(f"Could not read image: {path}")
    return image.shape[1]


def _preprocess_to_max_width(img_path, label):
    """Map function: preprocess one sample to the common ``max_image_width``."""
    return process_single_sample(img_path, label, max_width=max_image_width)


# The widest training image defines the common width, so every sample has the
# same shape and can be stacked by ``batch``. The widths are read from the path
# list because a tf.data.Dataset cannot be indexed.
image_widths = [_image_width(path) for path in lst['train']['images']]
max_image_width = max(image_widths)

train_dataset = (
    tf.data.Dataset.from_tensor_slices((lst['train']['images'], lst['train']['text']))
    .map(_preprocess_to_max_width)
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

# The validation samples go through the same preprocessing as the training
# samples; previously the raw path strings were batched as-is.
validation_dataset = (
    tf.data.Dataset.from_tensor_slices((lst['val']['images'], lst['val']['text']))
    .map(_preprocess_to_max_width)
    .batch(batch_size)
    .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
)

{'image': <tf.Tensor: shape=(32, 128, 1), dtype=float32, numpy=
 array([[[1.        ],
         [1.        ],
         [1.        ],
         ...,
         [1.        ],
         [1.        ],
         [1.        ]],
 
        [[1.        ],
         [1.        ],
         [1.        ],
         ...,
         [1.        ],
         [1.        ],
         [1.        ]],
 
        [[1.        ],
         [1.        ],
         [1.        ],
         ...,
         [1.        ],
         [1.        ],
         [1.        ]],
 
        ...,
 
        [[1.        ],
         [0.21960786],
         [0.21960786],
         ...,
         [1.        ],
         [1.        ],
         [1.        ]],
 
        [[1.        ],
         [0.21960786],
         [0.21960786],
         ...,
         [1.        ],
         [1.        ],
         [1.        ]],
 
        [[1.        ],
         [0.21960786],
         [0.26057082],
         ...,
         [1.        ],
         [1.        ],
         [1.     

https://github.com/rajesh-bhat/spark-ai-summit-2020-text-extraction/blob/master/CRNN_CTC_wandb.ipynb

In [None]:
# Compile
# No loss is passed to compile(): CTCLayer registers it through add_loss().
# ``learning_rate`` replaces the deprecated ``lr`` keyword.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, clipnorm=1.0)
model.compile(optimizer=optimizer)

history = model.fit(train_dataset,
                    epochs=max_epochs,
                    validation_data=validation_dataset,
                    verbose=1,
                    # callbacks = callbacks_list,
                    shuffle=True)