In [1]:
!nvidia-smi

Sat Aug 01 18:13:16 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 445.87       Driver Version: 445.87       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 1650   WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   61C    P8     7W /  N/A |    134MiB /  4096MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU                  PID   Type   Process name                  GPU Memory |
|       

In [11]:
import keras
keras.__version__

'2.3.1'

#### *IMPORTING THE DEPENDENCIES*

In [2]:
import string
import cv2
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, LSTM, Reshape, BatchNormalization, Input, Conv2D, MaxPool2D, Lambda, Bidirectional
from keras.models import Model
import keras.backend as K
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


#### *READING THE TEXT FILE*

In [3]:
def read_file(filename):
    """Return the entire contents of a text file as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even when read() raises, which
    # a separate open()/close() pair does not guarantee.
    with open(filename, 'r') as file:
        return file.read()

In [4]:
# Read parser.txt in one go, then break it into one list entry per line.
raw_text = read_file('parser.txt')
data = raw_text.split('\n')
# Preview the first ten entries.
data[:10]

['a01-000u-00-00 ok 154 408 768 27 51 AT A',
 'a01-000u-00-01 ok 154 507 766 213 48 NN MOVE',
 'a01-000u-00-02 ok 154 796 764 70 50 TO to',
 'a01-000u-00-03 ok 154 919 757 166 78 VB stop',
 'a01-000u-00-04 ok 154 1185 754 126 61 NPT Mr.',
 'a01-000u-00-05 ok 154 1438 746 382 73 NP Gaitskell',
 'a01-000u-00-06 ok 154 1896 757 173 72 IN from',
 'a01-000u-01-00 ok 156 395 932 441 100 VBG nominating',
 'a01-000u-01-01 ok 156 901 958 147 79 DTI any',
 'a01-000u-01-02 ok 156 1112 958 208 42 AP more']

#### *ENCODER FUNCTION*

In [6]:
# Alphabet used for the labels: ASCII letters, digits and punctuation.
characters = string.ascii_letters + string.digits + string.punctuation
# Character -> position in `characters`, built once so encoding a word does
# not rescan the alphabet for every character.
_char_to_index = {character: index for index, character in enumerate(characters)}


def encoder(txt):
    """Map each character of ``txt`` to its position in ``characters``.

    Args:
        txt: The word to encode.

    Returns:
        A list of integer indices, one per character of ``txt``.

    Raises:
        ValueError: If ``txt`` contains a character that is not in
            ``characters`` (for example a space).
    """
    try:
        return [_char_to_index[character] for character in txt]
    except KeyError as error:
        # Keep raising ValueError, as the str.index-based lookup did.
        raise ValueError(
            'character {!r} is not in the alphabet'.format(error.args[0])
        ) from None
print(characters,len(characters))

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 94


#### *PREPROCESSING THE IMAGE FILES*

In [6]:
def transform(image, target_height=32, target_width=128):
    """Turn a grayscale word image into a fixed-size, normalized array.

    The image is scaled to ``target_height`` rows while keeping its aspect
    ratio. If the result is narrower than ``target_width`` columns it is
    padded on the right with white (255); if it is wider it is resized down
    to ``target_width`` columns. Finally the gray levels are inverted
    (255 - value) and divided by 255.

    Args:
        image: 2-D array laid out as (rows, columns), e.g. the result of
            ``cv2.imread(path, cv2.IMREAD_GRAYSCALE)``.
        target_height: Number of rows in the result. Defaults to 32.
        target_width: Number of columns in the result. Defaults to 128.

    Returns:
        A float32 array of shape ``(target_height, target_width, 1)``.

    Raises:
        ValueError: If ``image`` is None (cv2.imread returns None when a
            file cannot be read).
    """
    if image is None:
        raise ValueError('transform() received None instead of an image')

    # NumPy image arrays are (rows, columns), i.e. (height, width).
    rows, cols = image.shape

    # Scale to the target height, preserving the aspect ratio.
    # cv2.resize takes its size argument as (columns, rows).
    scaled_cols = int(cols * (target_height / rows))
    image = cv2.resize(image, (scaled_cols, target_height))
    image = image.astype('float32')
    rows, cols = image.shape

    if cols < target_width:
        # Pad on the right with white. The pad uses the image's dtype so the
        # result stays float32 instead of being promoted to float64.
        padding = np.full((rows, target_width - cols), 255, dtype=image.dtype)
        image = np.concatenate((image, padding), axis=1)
    elif cols > target_width:
        # Too wide: squeeze horizontally down to the target width.
        image = cv2.resize(image, (target_width, target_height))

    # Invert the gray levels: 255 becomes 0 and 0 becomes 255.
    image = cv2.subtract(255, image)
    # Add a trailing channel axis.
    image = np.expand_dims(image, axis=2)
    # Normalize the image
    image = image / 255
    return image

#### *GENERATING THE TRAINING AND VALIDATION DATA*

In [7]:
# Per-sample training data collected by the loop below.
train_images = []
train_labels = []
train_input_length = []
train_label_length = []
train_original_text = []

# Per-sample validation data collected by the loop below.
valid_images = []
valid_labels = []
valid_input_length = []
valid_label_length = []
valid_original_text = []

# Length of the longest accepted word.
max_len = 0

# Only lines 0..10000 of `data` are used.
LINE_LIMIT = 10001
# Input length recorded for every sample.
# NOTE(review): presumably the number of time steps the network emits per
# image - confirm against the model's output shape.
TIME_STEPS = 31

# Ids of 'ok' entries dropped because the image or the label could not be
# processed, kept so that skips are visible instead of silent.
skipped_word_ids = []

# Slicing (rather than a `break` at the end of the body) keeps the cap in
# force even when the capped line itself is skipped.
for index, line in enumerate(data[:LINE_LIMIT]):
    splits = line.split(' ')
    # Blank or truncated lines have no status / transcription fields.
    if len(splits) < 9:
        continue
    status = splits[1]
    if status != 'ok':
        continue

    word_id = splits[0]
    word = "".join(splits[8:])
    splits_id = word_id.split('-')
    filepath = 'words/{}/{}-{}/{}.png'.format(splits_id[0], splits_id[0], splits_id[1],word_id)

    # processing on image
    img = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread returns None for a missing or unreadable file.
        skipped_word_ids.append(word_id)
        continue

    # processing on image and label; still best-effort, but limited to
    # Exception so that an interrupt is not swallowed.
    try:
        img = transform(img)
        label = encoder(word)
    except Exception:
        skipped_word_ids.append(word_id)
        continue

    # Every tenth line goes to the validation set, the rest to training.
    if index % 10 == 0:
        valid_images.append(img)
        valid_labels.append(label)
        valid_input_length.append(TIME_STEPS)
        valid_label_length.append(len(word))
        valid_original_text.append(word)
    else:
        train_images.append(img)
        train_labels.append(label)
        train_input_length.append(TIME_STEPS)
        train_label_length.append(len(word))
        train_original_text.append(word)

    max_len = max(max_len, len(word))

if skipped_word_ids:
    print('Skipped {} entries that could not be processed'.format(len(skipped_word_ids)))

#### *PADDING THE SEQUENCES*

In [8]:
# Both label sets are padded identically: at the end of each sequence, up to
# max_len entries, using len(characters) as the filler value.
pad_options = dict(maxlen=max_len, padding='post', value=len(characters))

train_padded_label = pad_sequences(train_labels, **pad_options)
valid_padded_label = pad_sequences(valid_labels, **pad_options)

In [9]:
# Display the shapes of the padded training and validation label arrays.
train_padded_label.shape, valid_padded_label.shape

((7850, 16), (876, 16))

#### *CONVERTING THE IMAGE AND LENGTH LISTS INTO NUMPY ARRAYS*

In [10]:
# Replace each Python list with the equivalent NumPy array.
train_images, train_input_length, train_label_length = (
    np.asarray(values)
    for values in (train_images, train_input_length, train_label_length)
)

valid_images, valid_input_length, valid_label_length = (
    np.asarray(values)
    for values in (valid_images, valid_input_length, valid_label_length)
)

In [11]:
# Display the shape of the stacked training image array.
train_images.shape

(7850, 32, 128, 1)

#### *BUILDING THE MODEL*

In [7]:
# Convolutional feature extractor, followed by two bidirectional LSTMs and a
# per-time-step softmax over the alphabet plus one extra class.
# Input: one 32 x 128 single-channel image.
inputs = Input(shape=(32,128,1))
conv_1 = Conv2D(64, (3,3), activation = 'relu', padding='same')(inputs)
# 2x2 pooling with stride 2 halves both spatial dimensions.
pool_1 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_1) 
conv_2 = Conv2D(128, (3,3), activation = 'relu', padding='same')(pool_1)
pool_2 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_2)
conv_3 = Conv2D(256, (3,3), activation = 'relu', padding='same')(pool_2)
conv_4 = Conv2D(256, (3,3), activation = 'relu', padding='same')(conv_3)
# pool_size (2, 1) pools only along the first spatial axis, leaving the
# second axis (the image width) at its current length.
pool_4 = MaxPool2D(pool_size=(2, 1))(conv_4) 
conv_5 = Conv2D(512, (3,3), activation = 'relu', padding='same')(pool_4)
batch_norm_5 = BatchNormalization()(conv_5)
conv_6 = Conv2D(512, (3,3), activation = 'relu', padding='same')(batch_norm_5)
batch_norm_6 = BatchNormalization()(conv_6)
pool_6 = MaxPool2D(pool_size=(2, 1))(batch_norm_6)
# 2x2 convolution without padding='same'.
# NOTE(review): with Keras defaults the expected output here is
# (None, 1, 31, 512) - confirm against model.summary().
conv_7 = Conv2D(512, (2,2), activation = 'relu')(pool_6)

# Drop axis 1 so the recurrent layers receive a (steps, features) sequence;
# K.squeeze requires that axis to have size 1.
squeezed = Lambda(lambda x: K.squeeze(x, 1))(conv_7)
 
blstm_1 = Bidirectional(LSTM(256, return_sequences=True, dropout = 0.2))(squeezed)
blstm_2 = Bidirectional(LSTM(256, return_sequences=True, dropout = 0.2))(blstm_1)
 
# One softmax per time step over len(characters) + 1 classes.
# NOTE(review): the extra class is presumably the CTC blank - confirm.
outputs = Dense(len(characters)+1, activation = 'softmax')(blstm_2)

# Model mapping an image to the per-time-step class probabilities.
model = Model(inputs, outputs)

In [8]:
# Print each layer of the model with its output shape and parameter count.
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 32, 128, 1)        0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 32, 128, 64)       640       
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 16, 64, 64)        0         
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 16, 64, 128)       73856     
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 8, 32, 128)        0         
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 8, 32, 256)        295168    
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 8, 32, 256)        5900

#### *LOSS FUNCTION*

In [14]:
# Extra inputs that are only needed to compute the CTC loss: the padded
# labels and, per sample, the prediction length and the label length.
labels = Input(shape=[max_len], dtype='float32', name='labels')
input_length = Input(shape=[1], dtype='int64', name='input_length')
label_length = Input(shape=[1], dtype='int64', name='label_length')


def ctc_lambda_func(args):
    """Unpack the four tensors in ``args`` and return their CTC batch cost.

    ``args`` is ordered as (predictions, labels, input length, label length);
    K.ctc_batch_cost takes the labels first, so the first two are swapped in
    the call.
    """
    predictions, true_labels, prediction_length, true_length = args
    ctc_cost = K.ctc_batch_cost(true_labels, predictions, prediction_length, true_length)
    return ctc_cost


# Lambda layer named 'ctc' whose output is the loss itself.
loss = Lambda(ctc_lambda_func, name='ctc', output_shape=(1,))(
    [outputs, labels, input_length, label_length]
)

#### *MODEL USED FOR TRAINING*

In [15]:
# Model to be used at training time: it takes the image, the labels and both
# length inputs, and its output is the tensor produced by the 'ctc' layer.
# NOTE(review): this rebinds `model`, so the image -> `outputs` model built
# earlier is no longer reachable under that name.
model = Model(inputs=[inputs, labels, input_length, label_length], outputs=loss)

In [16]:
# Training hyper-parameters.
batch_size, epochs = 16, 30
optimizer = 'Adam'
# Epoch count as a string.
e = str(epochs)

In [17]:
# The 'ctc' layer already outputs the loss, so the compiled loss function
# just passes y_pred through and ignores y_true.
# NOTE(review): with that pass-through loss, 'accuracy' compares the loss
# values against the dummy targets and says nothing about recognition
# quality. It is kept only so the keys in `history` do not change.
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer = optimizer, metrics=['accuracy'])
# callbacks function
filepath="models/model-{}e-{}o.h5".format(str(epochs),str(optimizer))
# Create the checkpoint directory up front; otherwise the first save would
# fail only after a full epoch has been trained.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
# Save the model whenever val_loss improves on the best value so far.
checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

In [18]:
# Inputs in the order the training model declares them: image, padded
# label, input length, label length.
train_inputs = [train_images, train_padded_label, train_input_length, train_label_length]
valid_inputs = [valid_images, valid_padded_label, valid_input_length, valid_label_length]

# The targets are all-zero placeholders, one per sample.
history = model.fit(
    x=train_inputs,
    y=np.zeros(len(train_images)),
    validation_data=(valid_inputs, [np.zeros(len(valid_images))]),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[checkpoint],
    verbose=1,
)

Train on 7850 samples, validate on 876 samples
Epoch 1/30

Epoch 00001: val_loss improved from inf to 13.75102, saving model to models/model-30e-Adamo.h5
Epoch 2/30

Epoch 00002: val_loss improved from 13.75102 to 12.35194, saving model to models/model-30e-Adamo.h5
Epoch 3/30

Epoch 00003: val_loss improved from 12.35194 to 10.85446, saving model to models/model-30e-Adamo.h5
Epoch 4/30

Epoch 00004: val_loss improved from 10.85446 to 9.86072, saving model to models/model-30e-Adamo.h5
Epoch 5/30

Epoch 00005: val_loss improved from 9.86072 to 8.47661, saving model to models/model-30e-Adamo.h5
Epoch 6/30

Epoch 00006: val_loss improved from 8.47661 to 7.72624, saving model to models/model-30e-Adamo.h5
Epoch 7/30

Epoch 00007: val_loss improved from 7.72624 to 6.77066, saving model to models/model-30e-Adamo.h5
Epoch 8/30

Epoch 00008: val_loss improved from 6.77066 to 5.82648, saving model to models/model-30e-Adamo.h5
Epoch 9/30

Epoch 00009: val_loss improved from 5.82648 to 5.21826, sav