In [None]:
# If you run this script in colab
# !git clone https://github.com/pai-sj/ocr-text-recognition.git
# import sys
# sys.path.append("./ocr-text-recognition/")
# !pip install -r ./ocr-text-recognition/requirements.txt

In [2]:
%matplotlib inline
import numpy as np
import cv2
    
import matplotlib.pyplot as plt

import sys
sys.path.append("../")

import tensorflow as tf

# This notebook uses TF 1.x-only APIs (tf.logging, tf.placeholder), so fail fast
# on any other major version. Parse the whole major-version component rather
# than only the first character of the version string.
tf_major_version = int(tf.__version__.split(".")[0])
assert tf_major_version < 2, "해당 코드는 1.x에서만 동작합니다."
tf.logging.set_verbosity(tf.logging.ERROR)

# \[ MNIST 데이터셋 \]

MNIST 데이터셋을 통해 정상적으로 동작하는지를 확인해보도록 함

## 1. 데이터 가져오기 

In [2]:
from utils.dataset import SerializationDataset

# All three splits share pad_range=(3, 10). Train/validation use digit=5 while
# the test split uses digit=(3, 8) -- presumably a range of sequence lengths;
# confirm against SerializationDataset.
train_set = SerializationDataset(
    'mnist', 'train',
    digit=5, pad_range=(3, 10),
)
validation_set = SerializationDataset(
    'mnist', 'validation',
    digit=5, pad_range=(3, 10),
)
test_set = SerializationDataset(
    'mnist', 'test',
    digit=(3, 8), pad_range=(3, 10),
)

## 2. 데이터 Generator 가져오기

In [3]:
from models.generator import Seq2SeqGenerator

# One batch generator per split; only the training generator keeps the
# generator's default shuffle setting.
train_gen = Seq2SeqGenerator(train_set, batch_size=32)
valid_gen = Seq2SeqGenerator(validation_set, batch_size=100, shuffle=False)
test_gen = Seq2SeqGenerator(test_set, batch_size=500, shuffle=False)

# Bound method used below to render label sequences in plot titles.
conv2text = test_gen.convert2text

In [None]:
# Preview the first training batch: three images, each titled with its
# decoder input sequence and the expected output sequence.
X, Y = train_gen[0]
for i in range(3):
    image = X['images'][i, :, :, 0]  # drop the single channel axis for imshow
    dec_input = conv2text(X['decoder_inputs'][i])
    output = conv2text(Y['output_seqs'][i])

    plt.title(f"Decoder Input : {dec_input} \n Model Output : {output}")
    plt.imshow(image)
    plt.show()

> CRNN과 다른 데이터셋이 필요합니다. CRNN과 달리, Images, Decoder Input, Model Output 이렇게 총 3개의 데이터가 필요합니다. Decoder Input과 Model Output은 1번의 Time Step 만큼 차이가 납니다. 

# \[ 2. SRN(Sequence Recognition Network) \]

MNIST 데이터셋을 통해 `SRN(Sequence Recognition Network)`을 학습시켜보도록 하겠습니다. SRN은 CRNN의 구조와 Seq2Seq, 그리고 Attention Network를 합친 모델입니다.

![Imgur](https://i.imgur.com/M11craN.png)

## 1. CRNN과 동일한 부분들 구성하기
----


### (1) Source Features 계산하기

![Imgur](https://i.imgur.com/nDZbuC2.png)

In [None]:
from models.layers import ConvFeatureExtractor, Map2Sequence

from tensorflow.keras.layers import Input
from tensorflow.keras import backend as K

In [None]:
# Hyper-parameters for the convolutional front end
height = 28
num_classes = 10
n_conv = 16  # coefficient F that determines the number of convolution filters

K.clear_session()

# Grayscale images (1 channel) with a dynamic width
inputs = Input(shape=(height, None, 1), name='images')

# Convolutional feature maps, then reshaped into a feature sequence
feature_extractor = ConvFeatureExtractor(n_conv, name='feature_extractor')
conv_maps = feature_extractor(inputs)

map_to_sequence = Map2Sequence(name='map_to_sequence')
feature_seqs = map_to_sequence(conv_maps)

#### C.F) `Map2Sequence`의 역할

Conv_maps의 Shape을 변경하여, Bidirectional LSTM Layer의 Input Shape 형태로 변경

In [None]:
# Compare the tensor shapes before and after Map2Sequence.
print(f"conv_maps의 shape : {conv_maps.shape}")
print(f"feature_seqs의 shape : {feature_seqs.shape}")

### (2) Encoder State Vector($S_{encoder}$) 계산하기

![Imgur](https://i.imgur.com/kgZLw3N.png)

$$
states_{encoder} = [H_{forward} ; H_{backward}]
$$

In [None]:
from tensorflow.keras.layers import Layer
from tensorflow.keras.layers import Bidirectional, LSTM

class BLSTMEncoder(Layer):
    """
    Recurrent part of CRNN: two stacked bidirectional LSTM layers.

    Takes the image feature sequence produced by the convolution layers and
    encodes it into the text feature sequence used downstream. Both layers
    return full sequences.

    | Layer Name | #Hidden Units |
    | ----       | ------ |
    | Bi-LSTM1   | 256    |
    | Bi-LSTM2   | 256    |

    """
    def __init__(self, n_units=256, **kwargs):
        self.n_units = n_units
        super().__init__(**kwargs)
        # Each direction has n_units hidden units.
        self.lstm1 = Bidirectional(
            LSTM(n_units, return_sequences=True))
        self.lstm2 = Bidirectional(
            LSTM(n_units, return_sequences=True))

    def call(self, inputs, **kwargs):
        # Feed the sequence through both Bi-LSTM layers in order.
        first_pass = self.lstm1(inputs)
        return self.lstm2(first_pass)

    def get_config(self):
        # Base Layer config extended with this layer's own constructor argument.
        return {**super().get_config(), "n_units": self.n_units}

In [None]:
n_lstm = 256

# Build the encoder layer first, then apply it to the feature sequence.
blstm_encoder = BLSTMEncoder(n_units=n_lstm)
states_encoder = blstm_encoder(feature_seqs)

In [None]:
# Show the shape of the encoder output tensor.
print(f'states_encoder의 shape : {states_encoder.shape}')

> Bidirectional 이므로, Forward 방향에서의 state, Backward 방향에서의 state가 모였기 때문에 n_lstm의 2배만큼 됩니다.

## 2. Attention 구성하기
----

![Imgur](https://i.imgur.com/clo5uEw.png)

Attention은 우리가 필요한 정보만을 취사선택할 수 있도록 만든 모듈입니다. 글자 영상 추출기를 통해 만들어진 정보 중 Decoder에서 필요한 정보만을 취사선택할 수 있도록 만듭니다. Attention은 하나의 방법론으로, 다양한 형태로 구성할 수 있습니다. 이번에 쓰는 방법은 Luong Attention입니다.

### (1) Score 계산하기
![Imgur](https://i.imgur.com/vNkwyPs.png)

Encoder의 정보 중 어떤 정보가 더 중요한지를 판단하기 위한 지표로, Score를 아래와 같이 계산합니다. 내적 연산의 중요한 특징은 두 벡터가 유사할수록 그 값이 커진다는 점입니다. 디코더의 벡터($S$)와 인코더의 벡터($V$)를 내적함으로써, 디코더의 현재 상태와 가까운 인코더 정보에 더 큰 가중치를 주게 됩니다.

### (2) Score를 Normalize 하기

![Imgur](https://i.imgur.com/hWCD9fK.png)

각 Time Step 별로 점수가 따로 매겨지게 됩니다. 이를 합산할 때, 그 크기가 지나치게 커지지 않도록, 전체 score의 합이 1이 되도록 표준화합니다. 

### (3) Context Vector 계산하기

![Imgur](https://i.imgur.com/OOvZyzv.png)

각 Encoder 정보에 Normalize된 Score(Attention 가중치)를 곱한 뒤 모두 더한 값이 바로 Context Vector가 됩니다. 이 정보는 Encoder의 정보 중 필요한 정보만을 추출한 정보가 됩니다. 이 정보를 바탕으로 분류기에 넣으면 우리가 원하는 철자 정보를 얻을 수 있게 됩니다.

In [None]:
from tensorflow.keras.layers import Softmax
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Layer

In [None]:
class DotAttention(Layer):
    """ General Dot-Product Attention Network (Luong, 2015)

    * n_state :
       if n_state is None, Dot-Product Attention(s_t * h_i)
       if n_state is number, general Dot-Product Attention(s_t * W_a * h_i)

    Call the layer on ``[states_encoder, states_decoder]``; it returns the
    tuple ``(context, attention)``.
    """
    def __init__(self, n_state=None, **kwargs):
        super().__init__(**kwargs)
        self.n_state = n_state
        if isinstance(self.n_state, int):
            # Imported locally: this cell is defined before the notebook's
            # `from tensorflow.keras.layers import Dense` cell has been run.
            from tensorflow.keras.layers import Dense
            self.key_dense = Dense(self.n_state)

    def call(self, inputs, **kwargs):
        states_encoder = inputs[0]
        states_decoder = inputs[1]

        # (0) adjust the size of encoder state to the size of decoder state
        if isinstance(self.n_state, int):
            key_vector = self.key_dense(states_encoder)
        else:
            key_vector = states_encoder

        # (1) Calculate Score
        expanded_states_encoder = key_vector[:, None, ...]
        # >>> (batch size, 1, length of encoder sequence, num hidden)
        expanded_states_decoder = states_decoder[..., None, :]
        # >>> (batch size, length of decoder sequence, 1, num hidden)
        score = K.sum(expanded_states_encoder * expanded_states_decoder,
                      axis=-1)
        # >>> (batch size, length of decoder input, length of encoder input)
        # (2) Normalize score
        attention = Softmax(axis=-1, name='attention')(score)

        # (3) Calculate Context Vector
        # The value vector is the raw encoder state, deliberately kept
        # separate from the (possibly projected) key vector.
        value_vector = states_encoder[:, None, ...]
        context = K.sum(value_vector * attention[..., None], axis=2)
        # >>> (batch size, length of decoder input, num hidden)

        return context, attention

    def get_config(self):
        # Expose n_state so the layer can be re-created from its config,
        # matching the get_config pattern used by BLSTMEncoder.
        return {**super().get_config(), "n_state": self.n_state}

### (4) Decoder 구성하기

어텐션에 Query를 던질 Decoder를 구현해보도록 하겠습니다. 글자를 임베딩하는 Embedding Layer와 GRU Layer를 이용하도록 하겠습니다.

![Imgur](https://i.imgur.com/f0jLCf5.png)

In [None]:
from tensorflow.keras.layers import Embedding

In [None]:
from tensorflow.keras.layers import GRU

n_embed = 3
# Token ids fed to the decoder; the sequence length is dynamic.
decoder_inputs = Input(shape=(None,),name='decoder_inputs')

# num_classes + 1 embedding rows: one per class plus one extra id.
embedding_layer = Embedding(num_classes+1, n_embed)
embeded_decoder_inputs = embedding_layer(decoder_inputs)

# Decoder GRU. DotAttention multiplies decoder and encoder states element-wise
# over the hidden axis, so the decoder width must equal the encoder state width
# (forward + backward LSTM = 2 * n_lstm).
n_state = 2 * n_lstm
# The initial state is a separate model input so the inference loop can feed
# the previous step's state back in.
decoder_state_inputs = Input(shape=(n_state,), name='decoder_state')
gru_layer = GRU(n_state,
                name='decoder_gru',
                return_sequences=True)
states_decoder = gru_layer(embeded_decoder_inputs,
                           initial_state=decoder_state_inputs)

![Imgur](https://i.imgur.com/yArrBKh.png)

우리는 초기 state를 넣는 인자를 따로 만들었습니다.<br>
이후에 inference Logic을 짤 때, Decoder Logic에서 필요하므로 추가하였습니다.

### (5) Attention Layer 구성하기

In [None]:
from tensorflow.keras.layers import Softmax
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Layer

In [None]:
class DotAttention(Layer):
    """Plain dot-product attention with no trainable weights.

    Call the layer on ``[states_encoder, states_decoder]``; it returns the
    tuple ``(context, attention)``.
    """

    def call(self, inputs, **kwargs):
        states_encoder, states_decoder = inputs[0], inputs[1]

        # Insert broadcast axes so every decoder step meets every encoder step.
        encoder_b = states_encoder[:, None, ...]
        # >>> (batch size, 1, length of encoder sequence, num hidden)
        decoder_b = states_decoder[..., None, :]
        # >>> (batch size, length of decoder sequence, 1, num hidden)

        # (1) Score: dot product over the hidden axis
        score = K.sum(encoder_b * decoder_b, axis=-1)
        # >>> (batch size, length of decoder input, length of encoder input)

        # (2) Normalize the score over the encoder axis
        attention = Softmax(axis=-1, name='attention')(score)

        # (3) Context vector: attention-weighted sum of encoder states
        weighted_states = encoder_b * attention[..., None]
        context = K.sum(weighted_states, axis=2)
        # >>> (batch size, length of decoder input, num hidden)

        return context, attention

In [None]:
# Instantiate the attention layer and apply it to the encoder/decoder states.
# Requires `states_encoder` and `states_decoder` to already exist in the kernel.
dotattend = DotAttention()

context, attention = dotattend([states_encoder, states_decoder])

### (6) prediction with Softmax

![Imgur](https://i.imgur.com/ihz1Hpq.png)

In [None]:
from tensorflow.keras.layers import Dense, concatenate

In [None]:
n_clf = 256

# Two-layer classifier head: ReLU hidden layer, then a softmax over
# num_classes + 1 outputs.
clf1_layer = Dense(units=n_clf, activation='relu')
clf2_layer = Dense(units=num_classes + 1,
                   activation='softmax',
                   name='output_seqs')

# The classifier sees the attention context together with the decoder state.
concat_output = concatenate([context, states_decoder], name='concat_output')
fc_outputs = clf1_layer(concat_output)
predictions = clf2_layer(fc_outputs)

## 3. 전체 모델 구성하고 학습시키기
----


In [None]:
# GRU is not imported by any earlier cell, so bring it into scope here.
from tensorflow.keras.layers import GRU

K.clear_session()

height = 28
num_classes = 10
n_conv = 16 # the number of Convolution filter
n_state = 128 # width shared by the encoder Dense projection and the decoder GRU
n_embed = 10 # the size of embedding vector
n_clf = 128 # the number of units in classifier Dense layer

# Image Encoder
inputs = Input(shape=(height, None, 1),name='images')
conv_maps = ConvFeatureExtractor(n_conv,
                                 name='feature_extractor')(inputs)
feature_seqs = Map2Sequence(name='map_to_sequence')(conv_maps)
# Project the feature sequence to n_state so it matches the decoder state width
# (DotAttention multiplies the two element-wise over the hidden axis).
states_encoder = Dense(n_state, activation='tanh')(feature_seqs)
# Alternative encoder kept for reference (not used in this model):
#states_encoder = BLSTMEncoder(n_units=n_lstm)(feature_seqs)

# Embedding Layer
decoder_inputs = Input(shape=(None,), name='decoder_inputs')
embedding_layer = Embedding(num_classes+1, n_embed)
embedding_target = embedding_layer(decoder_inputs)

# Text Decoder
# The initial state is a model input so inference can feed back the last state.
decoder_state_inputs = Input(shape=(n_state,), name='decoder_state')
gru_layer = GRU(n_state,
                name='decoder_gru',
                return_sequences=True)
states_decoder = gru_layer(embedding_target,
                           initial_state=decoder_state_inputs)

# Attention Layer
dotattend = DotAttention()
context, attention = dotattend([states_encoder, states_decoder])

# Classifier Layer
clf1_layer = Dense(n_clf, activation='relu')
clf2_layer = Dense(num_classes+1, activation='softmax',name='output_seqs')

concat_output = concatenate([context, states_decoder], name='concat_output')
fc_outputs = clf1_layer(concat_output)
predictions = clf2_layer(fc_outputs)

## (8) 추론 모델과 학습 모델 구성하기

In [None]:
from tensorflow.keras.models import Model

In [None]:
# For training
trainer = Model([inputs, 
                 decoder_inputs,
                 decoder_state_inputs], 
                predictions, name='trainer')

# For Inference
# - (1) Encoder
encoder = Model(inputs, states_encoder, 
                name='encoder')

# - (2) Decoder
states_encoder_input = Input((None,n_state), 
                             name='states_encoder_input')

context, attention = dotattend([states_encoder_input, states_decoder])
concat_output = concatenate([context, states_decoder], axis=-1, 
                            name='concat_output')
fc_outputs = clf1_layer(concat_output)
predictions = clf2_layer(fc_outputs)

decoder = Model([states_encoder_input, decoder_inputs, decoder_state_inputs], 
                [states_decoder, predictions], name='decoder')

## (9) 학습 모델 Compile하기

학습할 모델에 대한 Loss Function과 Optimizer를 결정합니다.

In [5]:
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

In [4]:
def masking_sparse_categorical_crossentropy(mask_value):
    """
    Build a sparse categorical cross-entropy loss that ignores masked labels.

    :param mask_value: label value marking positions that must not contribute
        to the loss (and therefore receive no gradient)
    :return: function ``loss(y_true, y_pred)`` giving the mean cross-entropy
        over the unmasked positions; 0 when every position is masked
    """
    def loss(y_true, y_pred):
        y_true = K.cast(y_true, K.floatx())
        # 1.0 where the label is real, 0.0 where it equals mask_value
        keep = 1 - K.cast(K.equal(y_true, mask_value), K.floatx())
        # Replace masked labels with class 0 so they are valid class indices;
        # their loss terms are zeroed out by `keep` below.
        y_true = y_true * keep

        per_step = K.sparse_categorical_crossentropy(y_true, y_pred) * keep
        # Guard the denominator: a fully masked batch would otherwise give
        # 0 / 0 = NaN and poison the weights.
        n_valid = K.maximum(K.sum(keep), K.epsilon())
        return K.sum(per_step) / n_valid

    return loss

In [None]:
# Integer label placeholder with two dynamic dimensions, passed to Keras as
# the target tensor.
y_true = tf.placeholder(dtype=tf.int32, shape=(None, None))

# Label positions equal to -1 are excluded from the loss.
masked_loss = masking_sparse_categorical_crossentropy(-1)
trainer.compile(
    Adam(lr=1e-3),
    loss={"output_seqs": masked_loss},
    target_tensors=[y_true],
)

## (10) 모델 학습하기

In [None]:
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [None]:
epochs = 50

# Halve the learning rate after 5 epochs without improvement, down to 1e-6.
rlrop = ReduceLROnPlateau(factor=0.5,
                          patience=5,
                          min_lr=1e-6,
                          verbose=1)
callbacks = [rlrop]

In [None]:
# Rebuild the generators with state_size=n_state -- presumably so each batch
# also carries the 'decoder_state' input the trainer expects; confirm in
# Seq2SeqGenerator.
train_gen = Seq2SeqGenerator(train_set, batch_size=32, state_size=n_state)
valid_gen = Seq2SeqGenerator(
    validation_set, batch_size=100, shuffle=False, state_size=n_state)
test_gen = Seq2SeqGenerator(
    test_set, batch_size=500, shuffle=False, state_size=n_state)

In [None]:
# Train on the training generator, validating on valid_gen after each epoch;
# `hist` keeps the object returned by fit_generator.
hist = trainer.fit_generator(train_gen,
                             epochs=epochs,
                             validation_data=valid_gen,
                             callbacks=callbacks)

## (11) 모델 평가하기

In [None]:
# Token id used both to start decoding and to detect the end of a sequence.
# It is the extra id beyond the ten classes (the model has num_classes + 1 outputs).
EOS_TOKEN = 10

In [None]:
# Take the inputs of the first test batch (targets are not needed here).
X,_ = test_gen[0]

# Target images: the first 10 images of the batch
target_images = X['images'][:10]

# Compute the encoder output for those images
states_encoder_ = encoder.predict(target_images)

In [None]:
# Decoder
batch_size = target_images.shape[0]

prev_inputs = np.ones((batch_size,1)) * EOS_TOKEN
prev_states = np.zeros((batch_size, 512))

In [None]:
# Upper bound on decoding steps so the loop always terminates, even if the
# model never predicts EOS for every sample. Assumed to be well above the
# longest label in this dataset -- adjust if needed.
MAX_DECODE_STEPS = 50

# Work on loop-local state so re-running this cell always restarts from the
# initial `prev_inputs` / `prev_states` instead of the last decoding step.
step_inputs = prev_inputs
step_states = prev_states

result = prev_inputs.copy()
for step in range(MAX_DECODE_STEPS):
    states_decoder_, predictions_ = decoder.predict({
        "states_encoder_input" : states_encoder_,
        "decoder_inputs": step_inputs,
        "decoder_state": step_states
    })
    # Feed the last decoder state and the greedy (argmax) token into the next step.
    step_states = states_decoder_[:,-1,:]
    step_inputs = np.argmax(predictions_,axis=-1)

    # Stop once every sample in the batch has produced EOS at this step.
    if np.all(step_inputs == EOS_TOKEN):
        break
    result = np.concatenate([result,step_inputs],axis=-1)
# Drop the leading start token column.
result = result[:,1:].squeeze()

In [None]:
# Show each target image titled with its decoded token sequence.
for sample_idx in range(min(len(target_images), len(result))):
    image, seq = target_images[sample_idx], result[sample_idx]
    plt.title(seq)
    plt.imshow(image[:, :, 0])
    plt.show()