In [1]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
import os
from tqdm import tqdm
import multiprocessing
import pandas as pd
from sklearn.model_selection import train_test_split

# datapath = '/dbfs/FileStore/shared_uploads/t-nbilla@expediagroup.com/IAM-Dataset'
# Root directory of the dataset. The loaders below read
# `<datapath>/ascii/words.txt` and the images under `<datapath>/words/`.
datapath = '../Data/'

# Label vocabulary: each character of the literal below, in order, becomes one
# list entry (digits, lowercase, uppercase, then punctuation).
characterList = list('0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^{|}~')

In [2]:
def preprocessImage(image):
  """Threshold a BGR image in HSV space and return one inverted channel.

  Steps, in order:
    1. Convert BGR -> HSV.
    2. Build a mask of pixels with H in [0, 179], S in [0, 255], V in [0, 209]
       and apply a morphological close with a 2x2 elliptical kernel.
    3. Set every pixel outside the closed mask to (255, 255, 255).
    4. Set every pixel whose three channels are all <= 250 to (0, 0, 0).
    5. Invert (255 - value) and return channel 0 only.

  Args:
    image: BGR image array as returned by `cv2.imread`.

  Returns:
    2-D array holding channel 0 of the inverted, thresholded HSV image.
  """
  hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

  # Mask construction: threshold, then close small gaps.
  inkMask = cv2.inRange(hsv, np.array([0,0,0]), np.array([179, 255, 209]))
  structElem = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (2,2))
  closedMask = cv2.morphologyEx(inkMask, cv2.MORPH_CLOSE, structElem, iterations=1)

  # Pixels outside the mask are whitened first; the second pass then zeroes
  # every pixel that is still <= 250 in all three channels.
  hsv[closedMask==0] = (255,255,255)
  belowThreshold = (hsv <= [250.,250.,250.]).all(axis=2)
  hsv[belowThreshold] = [0,0,0]

  inverted = 255-hsv
  return inverted[...,0]

def readImg(data):
  """Load and preprocess one word image (worker function for the pool).

  Args:
    data: sequence whose item 0 is the image path and item 1 is its label.

  Returns:
    `{'X': image, 'y': label}` where `image` is the preprocessed picture
    padded to 150x150 and expanded to three channels. If loading or
    preprocessing fails, `'X'` holds the original path instead of an array so
    the caller can filter the row out.
  """
  path, label = data[0], data[1]
  try:
    img = preprocessImage(cv2.imread(path))
    img = resizeAndPad(img, (150, 150))
    img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
  except Exception:
    # Best-effort: unreadable/corrupt images are reported back by path.
    # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit still
    # stop the worker instead of being silently swallowed.
    return {'X': path, 'y': label}
  return {'X': img, 'y': label}

def loadImagePaths(datapath):
  """Parse `<datapath>/ascii/words.txt` into `[imagePath, label]` pairs.

  Each data line is split on single spaces: the first field is the word id
  (`<a>-<b>-...`), from which the image path
  `<datapath>/words/<a>/<a>-<b>/<id>.png` is built; the last field is the label.
  Lines starting with '#' and blank lines are skipped. Lines whose id has
  fewer than two '-'-separated parts are reported and skipped.

  Args:
    datapath: dataset root directory.

  Returns:
    List of `[imgPath, label]` lists, in file order.
  """
  with open(f'{datapath}/ascii/words.txt') as f:
    lines = f.readlines()

  dataToProcess = []
  for line in tqdm(lines):
    if line.startswith('#') or not line.strip():
      continue
    fields = line.split(" ")
    wordId = fields[0]
    idParts = wordId.split('-')
    if len(idParts) < 2:
      # Report the offending line itself: no image path exists yet at this
      # point, so printing one would show an unbound or stale value.
      print(f"Failed to read: {line.strip()!r}")
      continue
    imgPath = f'{datapath}/words/{idParts[0]}/{idParts[0]}-{idParts[1]}/{wordId}.png'
    label = fields[-1].strip()
    dataToProcess.append([imgPath, label])
  return dataToProcess

def loadImages(datapath):
  """Load every image listed for `datapath` into a DataFrame.

  The path/label pairs from `loadImagePaths` are mapped through `readImg` in a
  process pool (progress shown with tqdm). Rows whose 'X' entry is not a numpy
  array are dropped from the result.

  Returns:
    DataFrame with columns 'X' (image array) and 'y' (label).
  """
  pathLabelPairs = loadImagePaths(datapath)
  with multiprocessing.Pool() as pool:
    progress = tqdm(pool.imap(readImg, pathLabelPairs), total=len(pathLabelPairs))
    records = list(progress)
  frame = pd.DataFrame(records)
  isArray = frame['X'].apply(lambda x: isinstance(x, np.ndarray))
  return frame[isArray]

def resizeAndPad(img, size, padColor=0):
    """Resize `img` to fit inside `size` keeping its aspect ratio, then pad to `size`.

    The image is scaled (Lanczos interpolation, for both shrinking and
    enlarging) so that it touches the target box on its limiting dimension,
    and the remaining space is filled with a constant border split evenly
    between both sides (the extra pixel, if any, goes to the bottom/right).

    Args:
        img: image array; height and width are taken from `img.shape[:2]`.
        size: target `(height, width)`.
        padColor: border value. A scalar is repeated three times when `img`
            has three dimensions.

    Returns:
        Array whose height and width equal `size`.

    Raises:
        ValueError: if `img` has zero height or width.
    """
    h, w = img.shape[:2]
    sh, sw = size
    if h == 0 or w == 0:
        raise ValueError(f"cannot resize an empty image of shape {img.shape}")

    interp = cv2.INTER_LANCZOS4

    # aspect ratio of image
    aspect = w/h

    # The limiting dimension depends on the image aspect RELATIVE TO THE TARGET
    # (w/h vs sw/sh, compared by cross-multiplication), not on whether the image
    # is landscape or portrait. For square targets this is the same test as
    # `aspect > 1` / `aspect < 1`.
    if w*sh > h*sw: # image relatively wider than target: width-limited
        new_w = sw
        # max(1, ...) keeps very elongated images from rounding to a 0-pixel side
        new_h = max(1, int(np.round(new_w/aspect)))
        pad_vert = sh - new_h
        pad_top = pad_vert // 2
        pad_bot = pad_vert - pad_top
        pad_left, pad_right = 0, 0
    elif w*sh < h*sw: # image relatively taller than target: height-limited
        new_h = sh
        new_w = max(1, int(np.round(new_h*aspect)))
        pad_horz = sw - new_w
        pad_left = pad_horz // 2
        pad_right = pad_horz - pad_left
        pad_top, pad_bot = 0, 0
    else: # same aspect ratio as the target: plain resize, no padding
        new_h, new_w = sh, sw
        pad_left, pad_right, pad_top, pad_bot = 0, 0, 0, 0

    # set pad color
    if len(img.shape) == 3 and not isinstance(padColor, (list, tuple, np.ndarray)): # color image but only one color provided
        padColor = [padColor]*3

    # scale and pad (cv2.resize takes the size as (width, height))
    scaled_img = cv2.resize(img, (new_w, new_h), interpolation=interp)
    scaled_img = cv2.copyMakeBorder(scaled_img, pad_top, pad_bot, pad_left, pad_right, borderType=cv2.BORDER_CONSTANT, value=padColor)

    return scaled_img

# Mapping characters to integers.
# The vocabulary is `characterList`; no mask token is requested (mask_token=None).
char_to_num = keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=characterList, mask_token=None
)

# Mapping integers back to original characters.
# Built from char_to_num.get_vocabulary() so both layers share one index table;
# invert=True selects the index -> string direction.
num_to_char = keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
)

In [3]:
# Cached dataset variants. The commented `to_pickle` lines are how the caches
# were written; the commented `read_pickle` lines load the other variants.
# Presumably "_encoded" / "_encoded_padded" mean the labels were encoded and
# then padded — the code producing them is not shown here; TODO confirm.
# df.to_pickle(f'{datapath}/IAM_words.pkl')
# df.to_pickle(f'{datapath}/IAM_words_encoded.pkl')
# df.to_pickle(f'{datapath}/IAM_words_encoded_padded.pkl')

# df = pd.read_pickle(f"{datapath}/IAM_words.pkl")
# df = pd.read_pickle(f"{datapath}/IAM_words_encoded.pkl")
# SECURITY: unpickling executes arbitrary code embedded in the file; only load
# pickles produced by this notebook / a trusted source.
# Later cells read the columns `df.X` and `df.y` from this frame.
df = pd.read_pickle(f"{datapath}/IAM_words_encoded_padded.pkl")

# Rebuild from the raw images instead of the cache:
# df = loadImages(datapath)

In [4]:
# X_train, X_test, y_train, y_test = train_test_split(df.X, df.y, test_size=0.1, random_state=42)

In [5]:
def encode_single_sample(img, label):
    """Turn one (image, label) pair into the dict fed to the model.

    The image is converted to a tensor and its first two axes are swapped
    (`perm=[1, 0, 2]`). The label is converted to a tensor as-is: no character
    lookup happens here, so labels are presumably already numeric when they
    reach this function (TODO confirm against how the pickle was built).

    Returns:
        `{"image": <transposed image tensor>, "label": <label tensor>}`.
    """
    image_tensor = tf.transpose(tf.convert_to_tensor(img), perm=[1, 0, 2])
    label_tensor = tf.convert_to_tensor(label)
    return {"image": image_tensor, "label": label_tensor}

In [6]:
# Stack the per-row entries of df.X and df.y into two dense arrays and slice them
# along the first axis into a dataset of (image, label) pairs.
# np.stack requires all entries of a column to share one shape.
# NOTE(review): this materialises every image in memory at once — confirm it fits.
full_dataset = tf.data.Dataset.from_tensor_slices((np.stack(df.X.values),np.stack(df.y.values)))
# Number of samples; later used as the shuffle buffer size and to size the splits.
DATASET_SIZE = len(full_dataset)

In [7]:
# Apply encode_single_sample to every (image, label) pair; AUTOTUNE lets tf.data
# choose the number of parallel calls.
testMapDataset = full_dataset.map(encode_single_sample, num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [8]:
# train_size = int(0.8 * DATASET_SIZE)
# val_size = int(0.10 * DATASET_SIZE)
# test_size = int(0.10 * DATASET_SIZE)

# full_dataset = full_dataset.shuffle(DATASET_SIZE)
# train_ds = full_dataset.take(train_size)
# test_ds = full_dataset.skip(train_size)
# validation_ds = test_ds.skip(val_size)
# test_ds = test_ds.take(test_size)

# Split sizes: 80% train; the remaining samples are divided between
# validation and test.
train_size = int(0.8 * DATASET_SIZE)
val_size = int(0.10 * DATASET_SIZE)
test_size = int(0.10 * DATASET_SIZE)

# The splits below are carved out of ONE shuffled order with take()/skip().
# shuffle() reshuffles on every new iteration by default, in which case each
# split (and each epoch) would be drawn from a different permutation and
# train/validation/test would overlap. A fixed seed together with
# reshuffle_each_iteration=False makes every iteration see the same order,
# so the three splits are disjoint and reproducible.
SPLIT_SEED = 42
testMapDataset = testMapDataset.shuffle(
    DATASET_SIZE, seed=SPLIT_SEED, reshuffle_each_iteration=False
)
train_ds = testMapDataset.take(train_size)
# Everything after the training samples ...
test_ds = testMapDataset.skip(train_size)
# ... is split again: the first `test_size` samples become the test set and
# whatever follows them becomes the validation set.
validation_ds = test_ds.skip(val_size)
test_ds = test_ds.take(test_size)


In [9]:
# Report the size of each split. cardinality() returns a scalar tensor, which
# the %d format converts to an integer.
print("Number of training samples: %d" % tf.data.experimental.cardinality(train_ds))
print(
    "Number of validation samples: %d" % tf.data.experimental.cardinality(validation_ds)
)
print("Number of test samples: %d" % tf.data.experimental.cardinality(test_ds))

Number of training samples: 92219
Number of validation samples: 11528
Number of test samples: 11527


In [10]:
# print("Number of training samples: %d" % len(X_train))
# print("Number of test samples: %d" % len(X_test))

In [11]:
# Samples per training step.
# NOTE(review): the recorded run of this notebook ends with a GPU
# ResourceExhaustedError on a tensor whose leading dimension is 32, i.e. this
# batch size — a smaller value may be needed for this model/GPU; confirm.
batch_size = 32

# For each split: cache the elements (no filename is given to cache()), group
# them into batches, and prefetch up to 10 batches.
# NOTE: each line rebinds its own input name, so re-running this cell stacks a
# second cache/batch/prefetch on top of the first — re-run the split cell first.
train_ds = train_ds.cache().batch(batch_size).prefetch(buffer_size=10)
validation_ds = validation_ds.cache().batch(batch_size).prefetch(buffer_size=10)
test_ds = test_ds.cache().batch(batch_size).prefetch(buffer_size=10)

# Unbatched variant:
# train_ds = train_ds.cache().prefetch(buffer_size=10)
# validation_ds = validation_ds.cache().prefetch(buffer_size=10)
# test_ds = test_ds.cache().prefetch(buffer_size=10)

In [12]:
class CTCLayer(keras.layers.Layer):
    """Pass-through layer that attaches the CTC loss to the model.

    `call(y_true, y_pred)` computes `keras.backend.ctc_batch_cost`, registers it
    with `add_loss`, and returns `y_pred` unchanged.

    Args:
        name: layer name.
        padding_value: optional. When None (default) every label is treated
            as spanning the full second dimension of `y_true`, exactly as
            before. When given, the length of each label is the number of
            entries in its row that differ from `padding_value`; this is
            meant for labels padded at the END with a value that never
            occurs as a real label (which value the padded dataset uses is
            not visible here — confirm before setting it).
        **kwargs: forwarded to `keras.layers.Layer` (e.g. `trainable`,
            `dtype`), which is what Keras passes when it rebuilds the layer
            from its config while loading a saved model.
    """

    def __init__(self, name=None, padding_value=None, **kwargs):
        super().__init__(name=name, **kwargs)
        self.loss_fn = keras.backend.ctc_batch_cost
        self.padding_value = padding_value

    def call(self, y_true, y_pred):
        # Compute the training-time loss value and add it
        # to the layer using `self.add_loss()`.
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")

        # Every sample uses all time steps of the prediction (axis 1 of y_pred).
        time_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        input_length = time_steps * tf.ones(shape=(batch_len, 1), dtype="int64")

        if self.padding_value is None:
            # Original behaviour: the padded width is used as every label's length.
            max_label_len = tf.cast(tf.shape(y_true)[1], dtype="int64")
            label_length = max_label_len * tf.ones(shape=(batch_len, 1), dtype="int64")
        else:
            # Per-sample length = number of non-padding entries, shape (batch, 1).
            pad = tf.cast(self.padding_value, y_true.dtype)
            is_real = tf.cast(tf.not_equal(y_true, pad), "int64")
            label_length = tf.reduce_sum(is_real, axis=1, keepdims=True)

        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        # At test time, just return the computed predictions
        return y_pred

    def get_config(self):
        """Return the layer config, including `padding_value`, for serialization."""
        config = super().get_config()
        config.update({"padding_value": self.padding_value})
        return config

In [13]:
# Feature extractor: EfficientNetB7 with ImageNet weights and without its
# classification head, for 150x150 3-channel inputs.
base_model = keras.applications.EfficientNetB7(
    weights='imagenet',
    include_top=False,
    input_shape=(150, 150, 3)
)

# Freeze the base_model
base_model.trainable = False

# Create new model on top.
# Two inputs: the image and the label sequence. The names 'image' and 'label'
# match the dict keys returned by encode_single_sample.
inputs = keras.Input(shape=(150, 150, 3), name='image')
labels = keras.layers.Input(name="label", shape=(None,), dtype="float32")
# The base model contains batchnorm layers. We want to keep them in inference mode
# when we unfreeze the base model for fine-tuning, so we make sure that the
# base_model is running in inference mode here.
x = base_model(inputs, training=False)

# Earlier pooled/flattened heads, kept for reference:
# x = keras.layers.GlobalAveragePooling2D()(x)
# x = keras.layers.Dropout(0.2)(x)  # Regularize with dropout
# x = keras.layers.Flatten()(x)

# Treat axis 1 of the feature map as the sequence axis and flatten the rest.
# The printed summary shows (None, 5, 5, 2560) -> (None, 5, 12800), i.e. a
# sequence of only 5 steps.
# NOTE(review): CTC needs at least as many time steps as label characters, so
# 5 steps looks too short for most words — confirm the intended sequence axis.
x = keras.layers.TimeDistributed(keras.layers.Flatten(), name='timedistrib')(x)
x = keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True, dropout=0.25))(x)
x = keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True, dropout=0.25))(x)
# x = keras.layers.Flatten()(x)

# Per-step softmax over the lookup vocabulary plus one extra class
# (presumably the CTC blank).
x = keras.layers.Dense(len(char_to_num.get_vocabulary()) + 1, activation="softmax", name="dense2")(x)
# CTCLayer adds the loss internally, which is why compile() below gets no loss.
output = CTCLayer(name="ctc_loss")(labels, x)

model = keras.Model(inputs=[inputs, labels], outputs=output, name="effecientHTR_v1")
opt = keras.optimizers.Adam()
# NOTE(review): run_eagerly=True runs the training step eagerly instead of as a
# compiled graph; it is normally a debugging aid — confirm it is still wanted.
model.compile(optimizer=opt, run_eagerly=True)


model.summary()

Model: "effecientHTR_v1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
image (InputLayer)              [(None, 150, 150, 3) 0                                            
__________________________________________________________________________________________________
efficientnetb7 (Functional)     (None, 5, 5, 2560)   64097687    image[0][0]                      
__________________________________________________________________________________________________
timedistrib (TimeDistributed)   (None, 5, 12800)     0           efficientnetb7[0][0]             
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 5, 256)       13239296    timedistrib[0][0]                
____________________________________________________________________________________

In [14]:
# Upper bound on training epochs, and how many epochs without val_loss
# improvement are tolerated before stopping.
epochs = 50
early_stopping_patience = 10
# Add early stopping: monitor val_loss and restore the best weights on stop.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=early_stopping_patience, restore_best_weights=True
)

# Earlier array-based variant (kept for reference; as written it is missing a
# comma after `shuffle=True`):
# # Train the model
# history = model.fit(
#     x=X_train, y=y_train,
#     validation_split=0.1,
#     epochs=epochs,
#     shuffle=True
#     callbacks=[early_stopping],
# )

# Train the model on the batched tf.data splits.
# NOTE(review): the recorded output of this cell is a GPU ResourceExhaustedError
# (OOM) during epoch 1, so no training result exists for this configuration.
history = model.fit(
    train_ds,
    validation_data=validation_ds,
    epochs=epochs,
    callbacks=[early_stopping],
)

Epoch 1/50


ResourceExhaustedError: OOM when allocating tensor with shape[32,38,38,288] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Mul]