# Baseline Model for Handwriting Recognition

## Imports

In [3]:
import keras
import pandas as pd
import numpy as np
from PIL import Image
import os
import json

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Conv2D, Reshape, Dense, Input, Cropping1D, RepeatVector, Flatten
from tensorflow.keras.layers import MultiHeadAttention, Dropout, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.random import uniform
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard

In [51]:
# Read the run configuration from config.json in the working directory.
# Later cells look up keys such as "num_transformer_layers", "num_heads",
# "dropout_rate" and "VAL_SPLIT" in the resulting dict.
with open("config.json") as f:
    config = json.loads(f.read())

## Loading Data




In [4]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# image folder read in the next cell is reachable through the file system.
# This import only resolves when the notebook runs on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# The current resized images are black-and-white (BW) images produced in the preprocessing stage.

In [5]:
folder_path = 'drive/MyDrive/resized_images'

# Load every image in the folder into one array of shape (num_images, H, W).
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the image order (and therefore the pairing with the rows of labels.csv
# and the seeded train/test split) reproducible across runs and machines.
# NOTE(review): this assumes the rows of labels.csv follow the same
# filename-sorted order - confirm against the preprocessing stage.
images = []
for file_name in sorted(os.listdir(folder_path)):
  image_path = os.path.join(folder_path, file_name)
  # Image.open is lazy and keeps the file handle open; the context manager
  # closes it as soon as the pixel data has been copied into the array.
  with Image.open(image_path) as image_file:
    image = np.array(image_file)
  images.append(image)

images = np.array(images)
images.shape

(13353, 64, 512)

In [61]:
# labels.csv has no header row; the second column (position 1) is taken as
# the label Series and given the name 'labels'.
labels_frame = pd.read_csv('labels.csv', header=None)
labels = labels_frame.iloc[:, 1].rename('labels')
labels.shape

(13353,)

# Sample data
Relevant for fast training. To run on the sample instead of the full data, the following data-preparation stages need to be adjusted accordingly.

In [9]:
# sample set of 500
# !unzip 'test_bw.zip'

In [10]:
# sampled_labels = labels[:500]

# folder_path = 'test_bw'

# sampled_images = []
# for file_name in os.listdir(folder_path):
#   image_path = os.path.join(folder_path, file_name)
#   image = np.array(Image.open(image_path))
#   sampled_images.append(image)

# sampled_images = np.array(sampled_images)
# sampled_images.shape

# Splitting the data

In [62]:
# Hold out 20% of the (image, label) pairs for testing; the fixed seed keeps
# the split identical between runs as long as the input order is unchanged.
(train_images, test_images,
 train_labels, test_labels) = train_test_split(
    images,
    labels,
    test_size=0.2,
    random_state=1,
)

In [63]:
def label_preprocessing(labels_sentence, vocabulary):
    '''
    Encodes a collection of label sentences as a padded array of character indexes.

    Every character is looked up in `vocabulary`, and each encoded sentence is
    right-padded ('post' padding, pad_sequences' default pad value) up to the
    length of the longest sentence in `labels_sentence`.

    Param: labels_sentence: re-iterable collection of label strings
                            (it is traversed twice)
    Param: vocabulary: mapping from character to integer index
    Returns: tuple (padded index array, length of the longest sentence)
    Raises: ValueError if labels_sentence is empty,
            KeyError if a character is missing from vocabulary
    '''
    # Longest sentence decides the common (padded) length of every row.
    longest = max(map(len, labels_sentence))

    encoded_sentences = []
    for sentence in labels_sentence:
        encoded_sentences.append([vocabulary[symbol] for symbol in sentence])

    padded = pad_sequences(encoded_sentences, maxlen=longest, padding='post')
    return padded, longest


def get_vocabulary(train_labels):
    """
    Builds the character vocabulary used to encode labels and decode predictions.

    Collects every distinct character that occurs in the given label strings,
    sorts them, and numbers them consecutively starting at 0.

    Param: train_labels: iterable of label strings
    Returns: dict mapping each character to its integer index
    """
    all_text = ''.join(train_labels)
    ordered_chars = sorted(set(all_text))
    return dict(zip(ordered_chars, range(len(ordered_chars))))

Prepare the labels as a NumPy array of character tokens and calculate max_sequence_length.

In [64]:
# The vocabulary is built from the full label set (train and test), so every
# character that appears in any label has an index.
vocabulary = get_vocabulary(train_labels=labels)
# Only the training labels are encoded and padded here.
train_labels_preprocessed, max_sequence_length = label_preprocessing(
    labels_sentence=train_labels,
    vocabulary=vocabulary,
)

## Metric Functions

In [65]:
def _edit_distance(reference, hypothesis):
    """
    returns the Levenshtein distance between two sequences: the minimum number
    of single-item insertions, deletions and substitutions that turn
    `reference` into `hypothesis`.
    """
    # Row-by-row dynamic programming; only the previous row is kept.
    previous_row = list(range(len(hypothesis) + 1))
    for ref_pos, ref_item in enumerate(reference, start=1):
        current_row = [ref_pos]
        for hyp_pos, hyp_item in enumerate(hypothesis, start=1):
            deletion = previous_row[hyp_pos] + 1
            insertion = current_row[hyp_pos - 1] + 1
            substitution = previous_row[hyp_pos - 1] + (ref_item != hyp_item)
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row
    return previous_row[-1]


def _error_rate(reference, hypothesis):
    """
    returns edit distance divided by the reference length.
    An empty reference yields 0.0 for an empty hypothesis and 1.0 otherwise,
    so the division by zero is avoided.
    """
    if len(reference) == 0:
        return 0.0 if len(hypothesis) == 0 else 1.0
    return _edit_distance(reference, hypothesis) / len(reference)


def wer(y_true, y_pred):
    """
    returns calculation of Word Error Rate for a prediction.

    Both strings are split on whitespace and compared word by word:
    WER = word-level edit distance / number of words in y_true.
    The value can exceed 1.0 when the prediction has many extra words.

    Param: y_true: reference sentence (string)
    Param: y_pred: predicted sentence (string)
    Returns: the word error rate as a float
    """
    return _error_rate(y_true.split(), y_pred.split())


def cer(y_true, y_pred):
    """
    returns calculation of Character Error Rate for a prediction.

    CER = character-level edit distance / number of characters in y_true.
    The value can exceed 1.0 when the prediction is much longer than y_true.

    Param: y_true: reference sentence (string)
    Param: y_pred: predicted sentence (string)
    Returns: the character error rate as a float
    """
    return _error_rate(y_true, y_pred)

## Baseline Model implementation

In [73]:
# Hyperparameters:
# The network input is one image with an explicit single channel axis.
image_height, image_width = images.shape[1:3]
input_shape = (image_height, image_width, 1)

# Adam with element-wise gradient clipping to [-1, 1].
optimizer = Adam(learning_rate=1e-4, clipvalue=1)
# NOTE(review): `loss` is not referenced by the compile call in this notebook
# (CTCloss is passed there instead); kept in case other cells rely on it.
loss = SparseCategoricalCrossentropy()
# One output class per vocabulary character, plus one for the CTC blank.
num_classes = 1 + len(vocabulary)

# CTC loss function (or alternative CTC layer)

This is a CTC layer implementation that outputs max_len_sentence characters. It is currently not in use; instead, CTCloss is passed directly as the loss function at the compilation stage.

In [None]:
# class CTCLayer(layers.Layer):
#     def __init__(self, name=None):
#         super().__init__(name=name)
#         self.loss_fn = keras.backend.ctc_batch_cost

#     def call(self, y_true, y_pred):
#         batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
#         input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
#         label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")


#         input_length *= tf.ones(shape=(batch_len, 1), dtype="int64")
#         label_length *= tf.ones(shape=(batch_len, 1), dtype="int64")


#         loss = self.loss_fn(y_true, y_pred, input_length, label_length)
#         self.add_loss(loss)

#         return y_pred

In [53]:
class CTCloss(tf.keras.losses.Loss):
    """CTC loss object used to train the model.

    Wraps `tf.keras.backend.ctc_batch_cost`. `__call__` is overridden directly,
    so the value returned by the backend function is handed back as-is.
    """

    def __init__(self, name: str = "CTCloss") -> None:
        super().__init__()
        self.name = name
        self.loss_fn = tf.keras.backend.ctc_batch_cost

    def __call__(self, y_true: tf.Tensor, y_pred: tf.Tensor, sample_weight=None) -> tf.Tensor:
        """Compute the CTC loss for one training batch.

        `sample_weight` is accepted for signature compatibility but not used.
        Every sample is given the full axis-1 size of `y_pred` as its input
        length and the full axis-1 size of `y_true` as its label length.
        NOTE(review): labels look padded upstream, so the padded length
        (not the true sentence length) is used here - confirm this is intended.
        """
        true_dims = tf.shape(y_true)
        pred_dims = tf.shape(y_pred)

        batch_size = tf.cast(true_dims[0], dtype="int64")
        time_steps = tf.cast(pred_dims[1], dtype="int64")
        label_steps = tf.cast(true_dims[1], dtype="int64")

        # Broadcast the scalar lengths to one (batch, 1) column per argument.
        input_length = time_steps * tf.ones(shape=(batch_size, 1), dtype="int64")
        label_length = label_steps * tf.ones(shape=(batch_size, 1), dtype="int64")

        return self.loss_fn(y_true, y_pred, input_length, label_length)

## CNN+Transformers Model implementation

The model is based on the paper 'Optical Character Recognition with Transformers and CTC'. It currently contains 5 Conv2D layers, followed by 3 BERT-like Transformer encoder layers.

In [74]:
# Model input: one image per sample, shaped as given by input_shape.
inputs = Input(shape=input_shape, name='Input')

# Convolutional feature extractor, one (filters, kernel_size, strides) entry
# per layer. The filters, number of layers and strides might change.
conv_specs = [
    (64, (3, 3), (2, 2)),
    (128, (3, 3), (2, 2)),
    (256, (3, 3), (2, 1)),
    (512, (3, 3), (4, 1)),
    (512, (2, 2), (1, 1)),
]
x = inputs
for conv_index, (filters, kernel_size, strides) in enumerate(conv_specs, start=1):
    x = Conv2D(filters, kernel_size, strides=strides, activation='relu',
               name=f'conv{conv_index}')(x)

# Flatten the convolutional feature map into a BERT-like sequence of
# 512-dimensional vectors.
x = Reshape((-1, 512), name='Reshape')(x)

for block_index in range(config["num_transformer_layers"]):
    # Each encoder block owns two LayerNormalization/Dropout pairs, numbered
    # consecutively across blocks (1, 2 for the first block, 3, 4 for the next).
    first_id = block_index * 2 + 1
    second_id = block_index * 2 + 2

    # Multi-head self-attention sub-block.
    x = LayerNormalization(epsilon=1e-6, name=f'LayerNomalization{first_id}')(x)
    x = MultiHeadAttention(num_heads=config["num_heads"], key_dim=8,
                           name=f'MultiHeadAttention{block_index+1}')(x, x)
    x = Dropout(config["dropout_rate"], name=f'Dropout{first_id}')(x)

    # Feed-forward sub-block with a residual connection around the Dense layer.
    ffn_output = Dense(512, activation='relu', name=f'Dense{block_index+1}')(x)
    x = LayerNormalization(epsilon=1e-6, name=f'LayerNomalization{second_id}')(x + ffn_output)
    x = Dropout(config["dropout_rate"], name=f'Dropout{second_id}')(x)

# Per-time-step class probabilities.
x = Dense(units=num_classes, activation='softmax', name='Dense4')(x)

# Drop the last 29 time steps so the output length matches the maximum
# sentence length.
# NOTE(review): 29 is hard-coded - confirm it still equals
# (sequence length after Reshape) - max_sequence_length when the data or the
# convolution settings change.
output = Cropping1D(cropping=(0, 29))(x)

model = tf.keras.Model(inputs=inputs, outputs=output, name="ocr_model_CNN_Transformers_CTC")

# # Structure for alternative CTC. In that case the model input includes (Inputs, labels) .

# labels = Input(name="label", shape=(None,), dtype="float32")
# output = CTCLayer(name="ctc_loss")(labels, output)
# model = tf.keras.Model(inputs=[inputs, labels], outputs=output, name="ocr_model_CNN_Transformers_CTC")

# Compile with the CTC loss object defined earlier in this notebook.
model.compile(optimizer=optimizer, loss=CTCloss())

In [None]:
# Print the layer-by-layer overview (output shapes and parameter counts) of the compiled model.
model.summary()

# Fit the model and log the kernels and biases of the layers to TensorBoard.

In [None]:
# histogram_freq=1 writes weight (kernel/bias) histograms every epoch.
# The former write_grads=True argument was dropped: the TF2 TensorBoard
# callback ignores it, and newer Keras releases may reject it outright.
tensorboard_callback = TensorBoard(log_dir='./logs', histogram_freq=1)

# The model has a single image input and is compiled with CTCloss, so the
# images go in as `x` and the padded label indexes as the targets `y`.
# (Passing both as a tuple in `x` would feed the labels as a second model
# input and leave the loss without targets.)
model.fit(x=train_images,
          y=train_labels_preprocessed,
          validation_split=config["VAL_SPLIT"],
          epochs=40,
          batch_size=32,
          callbacks=[tensorboard_callback])

In [None]:
# IPython magics: load the TensorBoard notebook extension, then open the
# dashboard on the ./logs directory written by the training callback.
%load_ext tensorboard
%tensorboard --logdir=./logs


## Prediction

In [None]:
# The model takes images only; labels are needed just for scoring below.
preds = model.predict(test_images)
index_to_char = {index: char for char, index in vocabulary.items()}
# The CTC blank is the extra, last output class (num_classes - 1), which is
# len(vocabulary) and therefore has no entry in index_to_char.
blank_index = len(vocabulary)
cer_sum = 0
wer_sum = 0

# zip pairs predictions with labels by position. test_labels still carries the
# shuffled original index from the split, so test_labels[i] would look rows up
# by label instead of by position.
for true_text, pred in zip(test_labels, preds):
    indices = np.argmax(pred, axis=-1)

    # Greedy CTC decoding: merge consecutive repeats, then drop blanks.
    decoded_chars = []
    previous_index = None
    for idx in indices:
        idx = int(idx)
        if idx != previous_index and idx != blank_index:
            decoded_chars.append(index_to_char[idx])
        previous_index = idx
    characters = ''.join(decoded_chars)

    # Use the metric functions defined in this notebook (cer / wer).
    cer_sum += cer(true_text, characters)
    wer_sum += wer(true_text, characters)

print('CER mean: ', cer_sum / len(test_labels))
print('WER mean: ', wer_sum / len(test_labels))