In [None]:
!pip install --upgrade keras
!pip install easydict

In [0]:
DIC = './'

In [3]:
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [0]:
from easydict import EasyDict as edict

cfg = edict()

cfg.CHAR_VECTOR = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-'_&.!?,\""

cfg.LEARNING_RATE = 0.0001

cfg.LR_DECAY_RATE = 0.95

cfg.EMBEDDING_DIM = 256

cfg.UNITS = 1024

cfg.TRAIN_BATCH_SIZE = 64

In [0]:
class LanguageIndex():
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = cfg.CHAR_VECTOR

        self.create_index()

    def create_index(self):
        self.word2idx['<pad>'] = 0
        self.word2idx['<start>'] = 1
        self.word2idx['<end>'] = 2
        self.word2idx[''] = 3
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 4

        for word, index in self.word2idx.items():
            self.idx2word[index] = word

In [0]:
from __future__ import absolute_import, division, print_function

import tensorflow as tf

# tf.disable_v2_behavior()
tf.enable_eager_execution()


def gru(units):
    return tf.keras.layers.GRU(units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_activation='sigmoid',
                                   recurrent_initializer='glorot_uniform')


class Encoder(tf.keras.Model):
    def __init__(self, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.cnn = tf.keras.Sequential([
            tf.keras.layers.Conv2D(64, [3, 3], padding="same", activation='relu', input_shape=(None, None, 1)),
            tf.keras.layers.MaxPool2D(pool_size=[2, 2], strides=2),
            tf.keras.layers.Conv2D(128, [3, 3], padding="same", activation='relu'),
            tf.keras.layers.MaxPool2D(pool_size=[2, 2], strides=2),
            tf.keras.layers.Conv2D(256, [3, 3], padding="same", activation='relu'),
            tf.keras.layers.Conv2D(256, [3, 3], padding="same", activation='relu'),
            tf.keras.layers.MaxPool2D(pool_size=[2, 1], strides=[2, 1]),
            tf.keras.layers.Conv2D(512, [3, 3], padding="same", activation='relu'),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Conv2D(512, [3, 3], padding="same", activation='relu'),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.MaxPool2D(pool_size=[2, 1], strides=[2, 1]),
            tf.keras.layers.Conv2D(512, [2, 2], strides=[2, 1], padding="same", activation='relu'),
            tf.keras.layers.Reshape((25, 512))
        ], name='cnn')

        self.gru = gru(self.enc_units)

    def call(self, x):
        x = self.cnn(x)
        output, state = self.gru(x)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))

    def model(self):
        x = tf.keras.layers.Input(shape=(None, None, 1))
        return tf.keras.Model(inputs=x, outputs=self.call(x))


class BahdanauAttention(tf.keras.Model):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        # features(CNN_encoder output) shape == (batch_size, 25, embedding_dim)

        # hidden shape == (batch_size, hidden_size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden_size)
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        # score shape == (batch_size, 25, hidden_size)
        score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))

        # attention_weights shape == (batch_size, 25, 1)
        # we get 1 at the last axis because we are applying score to self.V
        attention_weights = tf.nn.softmax(self.V(score), axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights


class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(enc_output, hidden)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x1 = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x2 = tf.concat([tf.expand_dims(context_vector, 1), x1], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x2)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size * 1, vocab)
        x = self.fc(output)

        return x, state, attention_weights

    def model(self):
        x = tf.keras.layers.Input(shape=(1))
        hidden = tf.keras.layers.Input(shape=(1024))
        enc_output = tf.keras.layers.Input(shape=(25, 1024))
        return tf.keras.Model(inputs=[x, hidden, enc_output], outputs=self.call(x, hidden, enc_output))

In [0]:
import cv2
import numpy as np
from typing import List
import matplotlib.pyplot as plt


def process_img(img_path, width=100, height=32, center=False):
    imread = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if center:
        imread = make_center(imread)
    imread = resize_image(imread, width, height, center)
    imread = np.expand_dims(imread, axis=-1)
    imread = np.array(imread, np.float32)
    return imread

def make_center(img):
    h, w = img.shape
    minh = -1
    for i in range(h):
        if np.any(img[i,:] != 255):
            break
        minh = i
    maxh = h
    for i in range(h-1, -1, -1):
        if np.any(img[i,:] != 255):
            break
        maxh = i
    updis, downdis = minh+1, h-maxh
    starth, endh = 0, h
    if updis >= downdis:
        starth = updis - downdis
    else:
        endh = h - (downdis - updis)
    minw = -1
    for i in range(w):
        if np.any(img[:,i] != 255):
            break
        minw = i
    maxw = w
    for i in range(w-1, -1, -1):
        if np.any(img[:,i] != 255):
            break
        maxw = i
    leftdis, rightdis = minw+1, w-maxw
    startw, endw = 0, w
    if leftdis >= rightdis:
        startw = leftdis - rightdis
    else:
        endw = w - (rightdis - leftdis)
    img = img[starth:endh,startw:endw]
    return img

def resize_image(image, out_width, out_height, center):
    """
        Resize an image to the "good" input size
    """
    im_arr = image
    h, w = np.shape(im_arr)[:2]
    ratio = out_height / h

    im_arr_resized = cv2.resize(im_arr, (int(w * ratio), out_height))
    re_h, re_w = np.shape(im_arr_resized)[:2]

    if re_w >= out_width:
        final_arr = cv2.resize(im_arr, (out_width, out_height))
    else:
        final_arr = np.ones((out_height, out_width), dtype=np.uint8) * 255
        if center:
            start = (out_width - re_w) // 2
            final_arr[:, start:start+re_w] = im_arr_resized
        else:
            final_arr[:, 0:np.shape(im_arr_resized)[1]] = im_arr_resized
    return final_arr


def preprocess_label(label):
    label = label.rstrip().strip()
    w = '<start> '
    for i in label:
        w += i + ' '
    w += ' <end>'
    return w


def process_result(result, label_lang):
    result_label = ""
    for i in result:
        if label_lang.idx2word[i] != '<end>':
            result_label += label_lang.idx2word[i]
        else:
            return result_label
    return result_label


def compute_accuracy(ground_truth: List[str], predictions: List[str]) -> np.float32:
    accuracy = []
    for index, label in enumerate(ground_truth):
        prediction = predictions[index]
        total_count = len(label)
        correct_count = 0
        try:
            for i, tmp in enumerate(label):
                if tmp == prediction[i]:
                    correct_count += 1
        except IndexError:
            continue
        finally:
            try:
                accuracy.append(correct_count / total_count)
            except ZeroDivisionError:
                if len(prediction) == 0:
                    accuracy.append(1)
                else:
                    accuracy.append(0)

    accuracy = np.mean(np.array(accuracy).astype(np.float32), axis=0)
    return accuracy

In [0]:
def evaluate(encoder, decoder, img_path, label_lang):
    img = process_img(img_path)

    enc_output, enc_hidden = encoder(np.expand_dims(img, axis=0))

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([label_lang.word2idx['<start>']] * BATCH_SIZE, 1)

    results = np.zeros((BATCH_SIZE, 25), np.int32)

    for t in range(1, 25):
        # passing enc_output to the decoder
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)

        predicted_id = tf.argmax(predictions, axis=-1).numpy()

        results[:, t - 1] = predicted_id

        dec_input = tf.expand_dims(predicted_id, 1)

    preds = [process_result(result, label_lang) for result in results]

    return preds[0]

In [0]:
img_rows,img_cols,img_channels = [40,40,1]
nb_classes = 62
nb_conv=3
nb_pool=2

def create_model():
    """
        Creates a conv-net model
    """
    print("Creating model...")
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.BatchNormalization(input_shape=(img_rows, img_cols, img_channels)))

    model.add(tf.keras.layers.Convolution2D(32, (nb_conv, nb_conv), padding='same', strides=(2, 2)))
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.BatchNormalization())

    model.add(tf.keras.layers.Convolution2D(32, (nb_conv, nb_conv), padding='same'))
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.BatchNormalization())

    model.add(tf.keras.layers.MaxPooling2D(pool_size=(nb_pool, nb_pool), strides=(1, 1)))

    model.add(tf.keras.layers.Convolution2D(48, (nb_conv, nb_conv), padding='same', strides=(2, 2)))
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.BatchNormalization())

    model.add(tf.keras.layers.Convolution2D(48, (nb_conv, nb_conv), padding='same'))
    model.add(tf.keras.layers.Activation('relu'))
    model.add(tf.keras.layers.BatchNormalization())

    model.add(tf.keras.layers.MaxPooling2D(pool_size=(nb_pool, nb_pool), strides=(1, 1)))

    model.add(tf.keras.layers.Flatten())

    model.add(tf.keras.layers.Dense(256, kernel_initializer='he_normal'))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Activation('relu'))

    model.add(tf.keras.layers.Dropout(0.5))

    model.add(tf.keras.layers.Dense(nb_classes))
    model.add(tf.keras.layers.Activation('softmax'))

    sgd = tf.keras.optimizers.SGD(lr=0.01, momentum=0.9, decay=1e-7, nesterov=True)

    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
    
    # model.summary()

    print("Done.")
    return model

In [10]:
import os

label_lang = LanguageIndex()
vocab_size = len(label_lang.word2idx)

BATCH_SIZE = 1
embedding_dim = cfg.EMBEDDING_DIM
units = cfg.UNITS

encoder = Encoder(units, BATCH_SIZE)
decoder = Decoder(vocab_size, embedding_dim, units, BATCH_SIZE)

checkpoint_dir = f'{DIC}checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(encoder=encoder, decoder=decoder)

checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

single = create_model()

single_weights = f"{DIC}sgd.h5"
single.load_weights(single_weights)

Creating model...
Done.


In [13]:
WDIC = f"{DIC}words2"

SINGLE_CHAR_VECTOR = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

result = []

for dirname, _, filenames in os.walk(WDIC):
    for filename in filenames:
        pred = evaluate(encoder=encoder, decoder=decoder, img_path=os.path.join(dirname, filename), label_lang=label_lang)
        if len(pred) == 1:
            newimg = process_img(os.path.join(dirname, filename), width=40, height=40, center=True)
            pred = np.argmax(single.predict(np.expand_dims(newimg, axis=0)), axis=-1)
            pred = SINGLE_CHAR_VECTOR[pred[0]]
        elif pred.isupper():
            pred = ""
        print(f'image: {filename}  pred: {pred}')
        result.append({'file':filename, 'pred':pred})

image: 30.png  pred: things
image: 22.png  pred: of
image: 16.png  pred: endegalimations
image: 8.png  pred: A
image: 11.png  pred: l
image: 23.png  pred: us
image: 17.png  pred: Conition
image: 10.png  pred: Officct
image: 34.png  pred: lzz
image: 0.png  pred: Is
image: 25.png  pred: is
image: 35.png  pred: eficiencybillable
image: 2.png  pred: Possible
image: 33.png  pred: and
image: 3.png  pred: Ta
image: 28.png  pred: of
image: 32.png  pred: convenience
image: 13.png  pred: 
image: 6.png  pred: Parerless
image: 5.png  pred: Completely
image: 18.png  pred: Conted
image: 14.png  pred: unplabonatenenatioumenen<pad>
image: 19.png  pred: For
image: 26.png  pred: a
image: 1.png  pred: It
image: 29.png  pred: two
image: 7.png  pred: In
image: 31.png  pred: Quj
image: 9.png  pred: Law
image: 12.png  pred: Above
image: 15.png  pred: 1
image: 21.png  pred: rest
image: 27.png  pred: matter
image: 20.png  pred: the
image: 4.png  pred: Go
image: 24.png  pred: it
image: 88.png  pred: talking
ima

In [0]:
import pandas as pd

result = pd.DataFrame(result)

result.to_csv(f'{DIC}words2_pred.csv')