In [10]:
import os
import fnmatch
import cv2
import numpy as np
import string
import time

from keras.preprocessing.sequence import pad_sequences

from keras.layers import Dense, LSTM, Reshape, BatchNormalization, Input, Conv2D, MaxPool2D, Lambda, Bidirectional
from keras.models import Model
from keras.activations import relu, sigmoid, softmax
import keras.backend as K
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import tensorflow.compat.v1 as tf
from tensorflow.python.client import device_lib

#from google.colab import drive
#drive.mount('/content/drive')

# Switch TensorFlow (imported as tensorflow.compat.v1) to v1 behavior before anything else uses it.
tf.disable_v2_behavior()
# Only emit TensorFlow log messages at ERROR level.
tf.logging.set_verbosity(tf.logging.ERROR)

# Check all available devices if GPU is available
print(device_lib.list_local_devices())
# Create a v1 session with device-placement logging enabled.
# NOTE(review): `sess` is not referenced again in the visible cells — confirm it is still needed.
sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14900191499262262266
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 14626652160
locality {
  bus_id: 1
  links {
  }
}
incarnation: 2366681422775044046
physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
xla_global_id: 416903419
]
Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5



In [11]:
# Alphabet recognised by the model: a-z, A-Z, 0-9. A character's position in
# this string is its integer class label.
char_list = string.ascii_letters+string.digits

def encode_to_labels(txt):
    """Encode a text label as a list of integer class indices.

    Args:
        txt: The label string; it is iterated character by character.

    Returns:
        A list with the index in ``char_list`` of every character of ``txt``
        that appears in ``char_list``. Characters outside the alphabet are
        printed and left out, so the result may be shorter than ``txt``.
    """
    dig_lst = []
    for char in txt:
        try:
            dig_lst.append(char_list.index(char))
        except ValueError:
            # Character is not in the alphabet: report it and skip it.
            # Only ValueError is caught so that unrelated failures (and
            # KeyboardInterrupt) are no longer silently swallowed.
            print(char)

    return dig_lst

In [12]:
#import shutil

#shutil.unpack_archive("drive/My Drive/mjsynth.tar.gz", "/content/drive/My Drive/mjsynth_dataset")

In [13]:
!pip install datasets
from datasets import load_dataset
import numpy as np
import cv2
from PIL import Image

#Load the 'train' split of the MJSynth text-recognition dataset from the Hugging Face Hub.
#Each item is read further down through item['image'] and item['label'].
dataset = load_dataset('priyank-m/MJSynth_text_recognition', split='train')

#Process images
def process_image(pil_img):
    """Turn an image into a normalised (32, 128, 1) grayscale array.

    The image is converted to 8-bit grayscale ('L'). If it is taller than 32
    rows or wider than 128 columns it is resized to exactly 32x128; otherwise
    it is padded with white (255) at the bottom and on the right up to 32x128.
    Pixel values are then divided by 255.0 and a trailing channel axis is added.

    Args:
        pil_img: Image object exposing ``convert('L')`` (a PIL image).

    Returns:
        Float array of shape (32, 128, 1).
    """
    target_rows, target_cols = 32, 128

    # NumPy image arrays are indexed (rows, cols) == (height, width).
    pixels = np.array(pil_img.convert('L'))
    rows, cols = pixels.shape

    if rows > target_rows or cols > target_cols:
        # cv2.resize takes the target size as (width, height).
        pixels = cv2.resize(pixels, (target_cols, target_rows), interpolation=cv2.INTER_AREA)
        rows, cols = pixels.shape

    # Fill whatever is missing below / to the right with white pixels.
    pixels = np.pad(
        pixels,
        ((0, target_rows - rows), (0, target_cols - cols)),
        mode='constant',
        constant_values=255,
    )

    scaled = pixels / 255.0
    return scaled[:, :, np.newaxis]

#Process/split dataset
VALID_EVERY = 10         # every 10th item (i % 10 == 0) goes to the validation set
MAX_SAMPLES = 500000     # stop after this many dataset items have been processed
# Value stored as the CTC input length of every sample.
# NOTE(review): presumably the number of time steps the network outputs for a
# 32x128 input — verify against the model summary.
CTC_INPUT_LENGTH = 31

training_img = []
training_txt = []
valid_img = []
valid_txt = []
train_input_length = []
train_label_length = []
valid_input_length = []
valid_label_length = []
orig_txt = []
valid_orig_txt = []
max_label_len = 0

for i, item in enumerate(dataset):
    #Process image
    img = process_image(item['image'])

    #Extract text
    txt = item['label']

    # encode_to_labels() drops characters that are not in char_list, so the
    # encoded sequence can be shorter than the raw text. The label length
    # handed to the CTC loss must count the encoded entries, not len(txt),
    # otherwise padding values would be treated as real labels.
    encoded = encode_to_labels(txt)
    n_labels = len(encoded)

    #Update maximum label length
    max_label_len = max(max_label_len, n_labels)

    #Split data into validation/training sets
    if i % VALID_EVERY == 0:
        valid_orig_txt.append(txt)
        valid_img.append(img)
        valid_txt.append(encoded)
        valid_label_length.append(n_labels)
        valid_input_length.append(CTC_INPUT_LENGTH)
    else:
        orig_txt.append(txt)
        training_img.append(img)
        training_txt.append(encoded)
        train_label_length.append(n_labels)
        train_input_length.append(CTC_INPUT_LENGTH)

    #Limit # processed images
    if i == MAX_SAMPLES - 1:
        break

print(f"Processed {len(training_img)} training images and {len(valid_img)} validation images.")




Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

Processed 450000 training images and 50000 validation images.


In [14]:
# Right-pad ('post') every encoded label to max_label_len, filling with
# len(char_list), the first index past the last real character class.
pad_value = len(char_list)
train_padded_txt, valid_padded_txt = (
    pad_sequences(encoded_labels, maxlen=max_label_len, padding='post', value=pad_value)
    for encoded_labels in (training_txt, valid_txt)
)

print(train_padded_txt)

[[44 15  4 ... 62 62 62]
 [ 0  2  2 ... 62 62 62]
 [28 26 43 ... 62 62 62]
 ...
 [33  0  6 ... 62 62 62]
 [18  0 13 ... 62 62 62]
 [28  0  2 ... 62 62 62]]


In [15]:
# Sanity check: (number of training samples, max_label_len).
print(train_padded_txt.shape)

(450000, 23)


In [16]:
def build_model():
    """Build the CRNN used for inference (no CTC loss attached).

    Layout: seven ReLU convolutions (64, 128, 256, 256, 512, 512, 512 filters)
    interleaved with max-pooling and two batch-norm layers, a squeeze of axis 1,
    two bidirectional LSTMs (128 units each direction, dropout 0.2) and a
    softmax Dense layer with len(char_list) + 1 outputs per time step.

    Returns:
        Tuple ``(act_model, inputs, outputs)``: the Keras model together with
        its input tensor (shape (32, 128, 1) per sample) and its output tensor.
    """
    def conv_relu(filters, tensor):
        # 3x3 'same'-padded convolution followed by ReLU.
        return Conv2D(filters, (3,3), activation='relu', padding='same')(tensor)

    inputs = Input(shape=(32,128,1))

    # Two conv + 2x2 pooling stages halve both spatial dimensions each time.
    x = conv_relu(64, inputs)
    x = MaxPool2D(pool_size=(2, 2), strides=2)(x)
    x = conv_relu(128, x)
    x = MaxPool2D(pool_size=(2, 2), strides=2)(x)

    # From here on pooling uses a (2, 1) window.
    x = conv_relu(256, x)
    x = conv_relu(256, x)
    x = MaxPool2D(pool_size=(2, 1))(x)

    x = BatchNormalization()(conv_relu(512, x))
    x = BatchNormalization()(conv_relu(512, x))
    x = MaxPool2D(pool_size=(2, 1))(x)

    # Final 2x2 convolution without 'same' padding.
    x = Conv2D(512, (2,2), activation='relu')(x)

    # Drop axis 1 so the recurrent layers receive a sequence of feature vectors.
    x = Lambda(lambda t: K.squeeze(t, 1))(x)

    for _ in range(2):
        x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.2))(x)

    # One extra class on top of the character set.
    outputs = Dense(len(char_list)+1, activation='softmax')(x)

    return Model(inputs, outputs), inputs, outputs

In [17]:
# Build the recognition network and print its layer summary. The input and
# output tensors are kept so the CTC training model can be wired onto them.
act_model, inputs, outputs = build_model()
act_model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 32, 128, 1)]      0         
                                                                 
 conv2d_7 (Conv2D)           (None, 32, 128, 64)       640       
                                                                 
 max_pooling2d_4 (MaxPoolin  (None, 16, 64, 64)        0         
 g2D)                                                            
                                                                 
 conv2d_8 (Conv2D)           (None, 16, 64, 128)       73856     
                                                                 
 max_pooling2d_5 (MaxPoolin  (None, 8, 32, 128)        0         
 g2D)                                                            
                                                                 
 conv2d_9 (Conv2D)           (None, 8, 32, 256)        2951

In [18]:
# Extra model inputs that are fed to the CTC loss layer: the padded label
# row plus one length value each for the prediction and the label.
labels = Input(shape=[max_label_len], dtype='float32', name='the_labels')
input_length, label_length = (
    Input(shape=[1], dtype='int64', name=tensor_name)
    for tensor_name in ('input_length', 'label_length')
)

for length_tensor in (input_length, label_length):
    print(length_tensor)

def ctc_lambda_func(args):
    """Compute the batch CTC cost from the tensors packed in ``args``.

    Args:
        args: Sequence ``(y_pred, labels, input_length, label_length)``.

    Returns:
        The result of ``K.ctc_batch_cost(labels, y_pred, input_length, label_length)``.
    """
    predictions, true_labels, pred_lengths, true_lengths = args
    return K.ctc_batch_cost(true_labels, predictions, pred_lengths, true_lengths)

# Wrap the CTC cost in a Lambda layer named 'ctc' so it becomes the output of the training model.
ctc_args = [outputs, labels, input_length, label_length]
loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(ctc_args)

# Training model: image + label + both length inputs in, CTC cost out.
model = Model(
    inputs=[inputs, labels, input_length, label_length],
    outputs=loss_out,
)

Tensor("input_length_1:0", shape=(?, 1), dtype=int64)
Tensor("label_length_1:0", shape=(?, 1), dtype=int64)


In [19]:
# The 'ctc' Lambda layer already outputs the CTC cost, so the compiled loss
# simply returns the model output (y_pred) and ignores y_true.
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer = 'adam')

# Write the model to `filepath` only when the monitored val_loss improves.
filepath="best_model.hdf5"
checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
callbacks_list = [checkpoint]

In [20]:

# Convert the Python lists built during preprocessing into NumPy arrays.
# np.asarray is used instead of np.array so that re-running this cell does
# not make another full copy of the (very large) image arrays: when the name
# already refers to an ndarray it is returned as-is.
training_img = np.asarray(training_img)
valid_img = np.asarray(valid_img)

train_input_length = np.asarray(train_input_length)
train_label_length = np.asarray(train_label_length)
valid_input_length = np.asarray(valid_input_length)
valid_label_length = np.asarray(valid_label_length)
print(train_input_length)
#print(training_img)
#print(valid_img)

[31 31 31 ... 31 31 31]


In [24]:
# Quick look at the length arrays, one item per line.
print(np.shape(train_input_length), valid_label_length, valid_input_length, sep='\n')

(450000,)
[ 4  6  5 ...  9  5 11]
[31 31 31 ... 31 31 31]


In [21]:
# All four training containers must hold the same number of samples.
for container in (training_img, training_txt, train_input_length, train_label_length):
    print(len(container))

450000
450000
450000
450000


In [19]:
# !pip install ray
# !pip install -U "ray[tune]"
# !pip install ray[tune] bayesian-optimization
# !pip install scikit-optimize
# import ray
# from ray import tune
# from ray import train
# from ray.tune.schedulers import HyperBandScheduler
# from ray.tune.search.bayesopt import BayesOptSearch


#Grid search hyperparameter tuning

# batch_sizes = [64, 128, 256]
# epochs = [10, 20, 30, 40, 50]

# for batch_size in batch_sizes:
#   for epoch in epochs:
#     print("BATCH SIZE: ")
#     print(batch_size)
#     print("EPOCHS: ")
#     print(epoch)
#     model = Model(inputs=[inputs, labels, input_length, label_length], outputs=loss_out)
#     model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer = 'adam')
#     model.fit(x=[training_img, train_padded_txt, train_input_length, train_label_length], y=np.zeros(len(training_img)), batch_size=batch_size, epochs = epoch, validation_data = ([valid_img, valid_padded_txt, valid_input_length, valid_label_length], [np.zeros(len(valid_img))]), verbose = 1, callbacks = callbacks_list)





In [20]:
# Shapes of the four arrays fed to the training model.
for train_array in (training_img, train_padded_txt, train_input_length, train_label_length):
    print(np.shape(train_array))
#print(train_label_length)
#print(training_img)


#OPTIMAL BATCH SIZE AND EPOCHS
batch_size = 64
epochs = 20

# The model output already is the CTC cost, so the targets are dummy zeros.
train_inputs = [training_img, train_padded_txt, train_input_length, train_label_length]
valid_inputs = [valid_img, valid_padded_txt, valid_input_length, valid_label_length]
model.fit(
    x=train_inputs,
    y=np.zeros(len(training_img)),
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(valid_inputs, [np.zeros(len(valid_img))]),
    verbose=1,
    callbacks=callbacks_list,
)


(450000, 32, 128, 1)
(450000, 23)
(450000,)
(450000,)
Train on 450000 samples, validate on 50000 samples
Epoch 1/20

  updates = self.state_updates



Epoch 1: val_loss improved from inf to 10.54279, saving model to best_model.hdf5


  saving_api.save_model(


Epoch 2/20
Epoch 2: val_loss improved from 10.54279 to 6.65361, saving model to best_model.hdf5
Epoch 3/20
Epoch 3: val_loss improved from 6.65361 to 3.69054, saving model to best_model.hdf5
Epoch 4/20
Epoch 4: val_loss improved from 3.69054 to 3.51883, saving model to best_model.hdf5
Epoch 5/20
Epoch 5: val_loss improved from 3.51883 to 3.31471, saving model to best_model.hdf5
Epoch 6/20
Epoch 6: val_loss improved from 3.31471 to 3.21003, saving model to best_model.hdf5
Epoch 7/20
Epoch 7: val_loss improved from 3.21003 to 3.14317, saving model to best_model.hdf5
Epoch 8/20
Epoch 8: val_loss improved from 3.14317 to 3.08841, saving model to best_model.hdf5
Epoch 9/20
Epoch 9: val_loss improved from 3.08841 to 2.99820, saving model to best_model.hdf5
Epoch 10/20
Epoch 10: val_loss did not improve from 2.99820
Epoch 11/20
Epoch 11: val_loss improved from 2.99820 to 2.98854, saving model to best_model.hdf5
Epoch 12/20
Epoch 12: val_loss did not improve from 2.98854
Epoch 13/20
Epoch 13: 

<keras.src.callbacks.History at 0x7b890b088a90>

In [26]:
!pip install python-Levenshtein

from Levenshtein import distance

# NOTE(review): no visible cell writes 'crnn_18.hdf5' (the checkpoint callback
# saves to 'best_model.hdf5') — confirm this file exists on a fresh run.
act_model.load_weights('crnn_18.hdf5')

#act_model.save('crnn_best.h5', save_format='h5')

# Per-sample results collected over the validation set.
distances = []            # Levenshtein distance between truth and prediction
accuracy = []             # 1 for an exact match, 0 otherwise
accuracy_adjusted = []    # 1 for an exact match, 1/distance otherwise
correct = []              # ground-truth text of every mispredicted sample
incorrect = []            # predicted text of every mispredicted sample

prediction = act_model.predict(valid_img[:50000])

# Greedy CTC decoding; every sample uses the full number of predicted time steps.
sequence_lengths = np.ones(prediction.shape[0]) * prediction.shape[1]
decoded = K.ctc_decode(prediction, input_length=sequence_lengths, greedy=True)
out = K.get_value(decoded[0][0])

for i, x in enumerate(out):
    truth = valid_orig_txt[i]
    # Entries equal to -1 are skipped; everything else indexes into char_list.
    word = ''.join(char_list[int(p)] for p in x if int(p) != -1)

    print("original_text =  ", truth)
    print("predicted text = ", end = '')
    print(word, end = '')
    print('\n')
    print("WORD: ")
    print(word)
    print()

    # Compute the edit distance once and reuse it below.
    tempDist = distance(truth, word)
    distances.append(tempDist)
    print(tempDist)

    if word == truth:
        print("IDENTICAL")
        accuracy.append(1)
        accuracy_adjusted.append(1)
    else:
        print("NOT IDENTICAL")
        accuracy.append(0)
        # The strings differ here, so tempDist is at least 1 and the division is safe.
        accuracy_adjusted.append(1/tempDist)
        incorrect.append(word)
        correct.append(truth)

print("DISTANCE: ")
print(np.shape(distances))
print(np.mean(distances))
print("ACCURACY: ")
print(np.shape(accuracy))
print(np.mean(accuracy))
print("ACCURACY ADJUSTED: ")
print(np.shape(accuracy_adjusted))
print(np.mean(accuracy_adjusted))
print("CORRECT SHAPE: ")
print(np.shape(correct))
print("INCORRECT SHAPE: ")
print(np.shape(incorrect))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m

0
IDENTICAL
original_text =   Entirety
predicted text = Entirety

WORD: 
Entirety

0
IDENTICAL
original_text =   Pummels
predicted text = Pammels

WORD: 
Pammels

1
NOT IDENTICAL
original_text =   REFINANCING
predicted text = REANANCING

WORD: 
REANANCING

2
NOT IDENTICAL
original_text =   GESTAPO
predicted text = GESTAPO

WORD: 
GESTAPO

0
IDENTICAL
original_text =   Ck
predicted text = Ck

WORD: 
Ck

0
IDENTICAL
original_text =   GAMOW
predicted text = CAMOW

WORD: 
CAMOW

1
NOT IDENTICAL
original_text =   Pervasively
predicted text = Pervasively

WORD: 
Pervasively

0
IDENTICAL
original_text =   Bozo
predicted text = Bozo

WORD: 
Bozo

0
IDENTICAL
original_text =   TURMOIL
predicted text = TURMOIL

WORD: 
TURMOIL

0
IDENTICAL
original_text =   canniest
predicted text = cannlest

WORD: 
cannlest

1
NOT IDENTICAL
original_text =   Geeky
predicted text = Geeky

WORD: 
Geeky

0
IDENTICAL
original_text =   guess
predicted 

In [45]:
# Case-insensitive edit distance for every sample that was not an exact match.
distances_incorrect = []

for truth, guess in zip(correct, incorrect):
    truth_lower = truth.lower()
    guess_lower = guess.lower()

    print(truth_lower, guess_lower, '', sep='\n')
    distances_incorrect.append(distance(truth_lower, guess_lower))

print(np.shape(distances_incorrect))
print(np.mean(distances_incorrect))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
brainier
brainer

bill
cll

mussolini
mussolin

brag
brag

falconry
faleonry

baptiste
boptisno

electrification
rectrification

volition
mniton

doming
coming

overwhelms
qverwheims

reinvigorated
reinvigoroted

daiquiri
daicur

sweetener
swebiener

pennyweights
hinnades

pettifogs
hetengs

tyrannosaurus
tyrannosaupus

rubbishing
rubblshing

recondition
reconiytion

nannys
nonnys

mcclain
meclain

heartless
hcartless

sacrifice
sacritice

riderless
hdiress

franco
mraed

somali
bomafi

modicum
modicun

pantomimed
rshithihed

quartered
quattered

earnests
parnests

ruffly
nufly

gnocchi
cs

irretrievable
trretrivabie

grieg
griec

tinged
uinged

signalized
signalized

bleakly
bleckly

flawed
flawep

segregates
segregates

kline
kine

charlatanism
charlatonism

ambassador
anbassador

languorously
lancuorously

resewing
resing

purgers
nurgers

iotas
tatoap

labyrinthine
babvrenpeiee

equivocated
equiyocated

splat
aplat

s