In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import cv2

try:
    # Font로 자동으로 이미지 만들기
    import cairocffi as cairo
except:
    !pip install cairocffi
    
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib
from scipy import ndimage
from functools import partial

import re
import sys
sys.path.append("../")
from utils.dataset import OCRDataset
from models.generator import OCRGenerator, KOR2IDX, KOR_CHARS

In [2]:
import tensorflow as tf
# Compare the full major-version component rather than only the first
# character of the version string.
assert int(tf.__version__.split(".")[0]) < 2, "해당 코드는 1.x에서만 동작합니다."

# \[ MNIST 데이터셋 - CRNN \]

MNIST 데이터셋을 통해 정상적으로 동작하는지를 확인해보도록 함

## 1. 데이터 가져오기 

In [3]:
from utils.dataset import SerializationDataset

# Serialized-MNIST splits. All three share pad_range=(3, 10); train/validation
# are built with digit=5 while the test split is built with digit=(3, 8).
train_set = SerializationDataset(
    'mnist', 'train', digit=5, pad_range=(3, 10))
validation_set = SerializationDataset(
    'mnist', 'validation', digit=5, pad_range=(3, 10))
test_set = SerializationDataset(
    'mnist', 'test', digit=(3, 8), pad_range=(3, 10))

## 2. 데이터 Generator 가져오기

In [None]:
from models.generator import DataGenerator

# One batch generator per split. Validation and test disable shuffling
# explicitly; the training generator keeps DataGenerator's default.
train_gen = DataGenerator(train_set, batch_size=32)
valid_gen = DataGenerator(validation_set, batch_size=100, shuffle=False)
test_gen = DataGenerator(test_set, batch_size=500, shuffle=False)

## 3. 모델 구성하기

In [None]:
from models.layers import ConvFeatureExtractor, Map2Sequence 
from models.layers import BLSTMEncoder, CTCDecoder
from models.losses import ctc_loss

import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense

In [None]:
# Start from an empty Keras graph so re-running this cell does not pile up layers.
K.clear_session()

# Hyper-parameters
height = 28
num_classes = 10
n_hidden = 16
n_lstm = 256

# Grayscale image input whose width is left dynamic
inputs = Input(name='image', shape=(height, None, 1))

# Swap the height and width axes:
# (batch size, height, width, channels) -> (batch size, width, height, channels)
transposed = K.permute_dimensions(inputs, pattern=(0, 2, 1, 3))

# CRNN: convolutional features -> feature sequence -> bidirectional LSTM
conv_maps = ConvFeatureExtractor(
    n_hidden=n_hidden, name='feature_extractor')(transposed)
feature_seqs = Map2Sequence(name='map_to_sequence')(conv_maps)
lstm_seqs = BLSTMEncoder(n_units=n_lstm)(feature_seqs)

# The output has one extra unit for the CTC blank label: #classes + 1
output_seqs = Dense(units=num_classes + 1,
                    activation='softmax',
                    name='output_seqs')(lstm_seqs)

In [None]:
from models.optimizer import AdamW

# Build the models
# (1) Training model: labels are fed through a placeholder used as the target tensor
y_true = tf.placeholder(dtype=tf.int32, shape=(None, None))
trainer = Model(inputs=inputs, outputs=output_seqs, name='trainer')
trainer.compile(optimizer='adam',
                loss={"output_seqs": ctc_loss},
                target_tensors=[y_true])

### caution

`K.ctc_batch_cost`에 이용되는 Input Tensor의 Interface는 아래와 같습니다.

* y_true: tensor `(samples, max_string_length)` containing the truth labels.
* y_pred: tensor `(samples, time_steps, num_categories)` containing the prediction, or output of the softmax.

In [None]:
# (2) Prediction model: decode the softmax sequence with a CTC beam search
#     and expose the first element of the decoder's output.
predictions = CTCDecoder(beam_width=100)(output_seqs)
predictor = Model(inputs=inputs, outputs=predictions[0], name='predictor')

## 4. 모델  학습시키기


In [None]:
# Train for 10 epochs, evaluating on the validation generator after each one.
trainer.fit_generator(generator=train_gen,
                      validation_data=valid_gen,
                      epochs=10)

## 5. 모델 평가하기

In [None]:
# Show every image of the first test batch, titled with the decoded prediction.
for image, true_label in zip(*test_gen[0]):
    # The predictor expects a batch axis, so add one in front of the single image.
    result = predictor.predict(np.expand_dims(image, axis=0))
    predict_seq = "".join(map(str, result.ravel()))
    plt.imshow(image[:, :, 0])
    plt.title(f'label : {predict_seq}')
    plt.show()

# \[ 2. MNIST 데이터셋 - SRN(Sequence Recognition Network) \]

MNIST 데이터셋을 통해 `SRN(Sequence Recognition Network)`가 정상적으로 동작하는지를 확인해보도록 함

## 1. 데이터 가져오기 

In [4]:
from utils.dataset import SerializationDataset

# Serialized-MNIST splits for the SRN experiment (same arguments as the CRNN
# section): digit=5 for train/validation, digit=(3, 8) for test.
train_set = SerializationDataset(
    'mnist', 'train', digit=5, pad_range=(3, 10))
validation_set = SerializationDataset(
    'mnist', 'validation', digit=5, pad_range=(3, 10))
test_set = SerializationDataset(
    'mnist', 'test', digit=(3, 8), pad_range=(3, 10))


## 2. 데이터 Generator 가져오기

In [5]:
from models.generator import Seq2SeqGenerator

# Seq2Seq batch generators; these rebind train_gen / valid_gen / test_gen.
# Validation and test disable shuffling explicitly.
train_gen = Seq2SeqGenerator(train_set, batch_size=32)
valid_gen = Seq2SeqGenerator(validation_set, batch_size=100, shuffle=False)
test_gen = Seq2SeqGenerator(test_set, batch_size=500, shuffle=False)

## 3. 모델 구성하기

![Imgur](https://i.imgur.com/1KKH413.png)

In [None]:
from models.layers import ConvFeatureExtractor, Map2Sequence 
from models.layers import BLSTMEncoder, CTCDecoder
from models.losses import ctc_loss

import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import TimeDistributed

### (1) Source Features 계산하기

![Imgur](https://i.imgur.com/Ewzjpfa.png)

In [None]:
# Start from an empty Keras graph.
K.clear_session()

# Hyper-parameters
height = 28
num_classes = 10
n_hidden = 16
n_lstm = 256

# Grayscale image input whose width is left dynamic
inputs = Input(name='images', shape=(height, None, 1))

# Swap the height and width axes:
# (batch size, height, width, channels) -> (batch size, width, height, channels)
transposed = K.permute_dimensions(inputs, pattern=(0, 2, 1, 3))

# Convolutional features, then flattened into a feature sequence
conv_maps = ConvFeatureExtractor(
    n_hidden=n_hidden, name='feature_extractor')(transposed)
feature_seqs = Map2Sequence(name='map_to_sequence')(conv_maps)

### (2) Encoder State Vector($S_{encoder}$) 계산하기

![Imgur](https://i.imgur.com/826jsU3.png)

$
S_{encoder} = [h_1;h_2;h_3;\cdots;h_k]
$

In [None]:
# Encode the feature sequence with the project's BLSTMEncoder (n_lstm units);
# the result is used as the encoder state sequence for attention.
states_encoder = BLSTMEncoder(n_units=n_lstm)(feature_seqs)

### (3) Decoder Embedding 계산하기

In [None]:
# Variable-length sequence of decoder input indices
decoder_inputs = Input(name='decoder_inputs', shape=(None,))

# Map each index (num_classes + 1 possible values) to a 256-dimensional vector
embedding_layer = Embedding(input_dim=num_classes + 1, output_dim=256)
embeded_decoder_inputs = embedding_layer(decoder_inputs)

### (4) Decoder State Vector($S_{decoder}$) 계산하기

In [None]:
# The decoder GRU uses n_lstm*2 units and returns the state at every step
# (return_sequences=True).
gru_layer = GRU(units=n_lstm * 2,
                return_sequences=True,
                name='decoder_gru')

states_decoder = gru_layer(embeded_decoder_inputs)

### (5) Context Vector($C$) 계산하기


Code Reference : [Attention based Seq2Seq in Keras](https://wanasit.github.io/attention-based-sequence-to-sequence-in-keras.html)

In [None]:
from tensorflow.keras.layers import Softmax
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Layer

In [None]:
class DotAttention(Layer):
    """Dot-product attention between an encoder and a decoder state sequence.

    Call it with ``[states_encoder, states_decoder]``. It returns the pair
    ``(context, attention)``: the softmax-normalised scores and the
    attention-weighted sum of the encoder states.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Create the normalisation layer once, rather than instantiating a
        # new `Softmax` layer every time `call` runs.
        self.softmax = Softmax(axis=-1, name='attention')

    def call(self, inputs, **kwargs):
        states_encoder = inputs[0]
        states_decoder = inputs[1]

        # (1) Calculate score
        # Insert a broadcast axis at position 1 of the encoder states ...
        expanded_states_encoder = states_encoder[:, None, ...]
        # >>> (batch size, 1, length of encoder sequence, num hidden)
        # ... and one before the last axis of the decoder states.
        expanded_states_decoder = states_decoder[..., None, :]
        # >>> (batch size, length of decoder sequence, 1, num hidden)
        # NOTE(review): the shapes above assume rank-3 inputs whose last
        # dimensions match - confirm against the callers.
        score = K.sum(expanded_states_encoder * expanded_states_decoder,
                      axis=-1)
        # >>> (batch size, length of decoder input, length of encoder input)

        # (2) Normalize the scores over the last axis
        attention = self.softmax(score)

        # (3) Calculate the context vector: weight the encoder states by the
        # attention and sum over axis 2.
        context = K.sum(expanded_states_encoder * attention[..., None], axis=2)

        return context, attention
        

In [None]:
# Instantiate the attention layer defined above.
dotattend = DotAttention()

# Attend over the encoder states with the decoder states; the layer returns
# the context vectors together with the attention weights.
context, attention = dotattend([states_encoder, states_decoder])

### (6) prediction with Softmax

In [None]:
n_fc_hidden = 256

# Two dense layers applied at every decoder step: tanh hidden layer, then a
# softmax over num_classes + 1 outputs.
fc1_layer = Dense(units=n_fc_hidden, activation='tanh')
fc2_layer = Dense(units=num_classes + 1, activation='softmax')

# Join context and decoder state along the feature axis before classifying.
concat_output = concatenate([context, states_decoder],
                            axis=-1,
                            name='concat_output')
fc_outputs = TimeDistributed(fc1_layer)(concat_output)
predictions = TimeDistributed(fc2_layer, name='output_seqs')(fc_outputs)

### (7) 모델 전체 구성하기

In [None]:
# Rebuild the whole SRN graph in one place, starting from an empty session.
K.clear_session()

height = 28
num_classes = 10
n_conv = 16   # the number of convolution filters
n_lstm = 256  # the number of BLSTM units
n_fc = 256    # the number of units in the hidden Dense layer

# Grayscale image input whose width is left dynamic
inputs = Input(name='images', shape=(height, None, 1))

# Swap the height and width axes:
# (batch size, height, width, channels) -> (batch size, width, height, channels)
transposed = K.permute_dimensions(inputs, pattern=(0, 2, 1, 3))

# Convolutional features, then flattened into a feature sequence
conv_maps = ConvFeatureExtractor(n_conv, name='feature_extractor')(transposed)
feature_seqs = Map2Sequence(name='map_to_sequence')(conv_maps)

# BLSTM encoder
states_encoder = BLSTMEncoder(n_units=n_lstm)(feature_seqs)

# Target inputs (used for teacher forcing) and their embedding
target_inputs = Input(name='target_inputs', shape=(None,))
embedding_layer = Embedding(input_dim=num_classes + 1, output_dim=256)
embedding_target = embedding_layer(target_inputs)

# GRU decoder: returns the per-step states and, separately, the last state
gru_layer = GRU(units=n_lstm * 2,
                return_sequences=True,
                return_state=True,
                name='decoder_gru')

states_decoder, state_last = gru_layer(embedding_target)

# Attention layer
dotattend = DotAttention()

context, attention = dotattend([states_encoder, states_decoder])

# Classifier layers, applied at every decoder step
fc1_layer = Dense(units=n_fc, activation='tanh')
fc2_layer = Dense(units=num_classes + 1, activation='softmax')

concat_output = concatenate([context, states_decoder],
                            axis=-1,
                            name='concat_output')
fc_outputs = TimeDistributed(fc1_layer)(concat_output)
predictions = TimeDistributed(fc2_layer, name='output_seqs')(fc_outputs)

----

### (7) Loss Function 구하기

In [None]:
from models.optimizer import AdamW
from models.losses import masking_sparse_categorical_crossentropy

In [None]:
# Labels are fed through a placeholder used as the model's target tensor.
y_true = tf.placeholder(dtype=tf.int32, shape=(None, None))

# The training model takes the image and the target inputs and emits the
# per-step class distribution.
trainer = Model(inputs=[inputs, target_inputs],
                outputs=predictions,
                name='trainer')
trainer.compile(optimizer=AdamW(lr=1e-3),
                loss={"output_seqs": masking_sparse_categorical_crossentropy(-1)},
                target_tensors=[y_true])

### (8) 모델 학습하기

In [None]:
# Train for 7 epochs on the training generator; no validation data is passed here.
trainer.fit_generator(train_gen, epochs=7)

### (9) 추론 모델(Inference Model) 구성하기

In [None]:
# Build the encoder model: image -> encoder state sequence
image_encoder = Model(inputs, states_encoder, name='encoder')
# Embed the target inputs with the `embedding_layer` that was part of the
# trained model. Creating a fresh Embedding here would start from newly
# initialised weights and drop what training learned.
embedding_target = embedding_layer(target_inputs)

In [None]:
# Build the decoder model
# A GRU state is a single vector of n_lstm*2 values per sample, so the state
# input has no time axis.
prev_state = Input(shape=(n_lstm*2,),
                   name='prev_state')
states_decoder, state_last = gru_layer(embedding_target,
                                       initial_state=prev_state)
context, attention = dotattend([states_encoder, states_decoder])

# Classifier layers: reuse `fc1_layer` / `fc2_layer` from the training graph so
# inference runs with the trained weights rather than freshly initialised ones.
concat_output = concatenate([context, states_decoder], axis=-1,
                            name='concat_output')
fc_outputs = TimeDistributed(fc1_layer)(concat_output)
predictions = TimeDistributed(fc2_layer, name='output_seqs')(fc_outputs)

In [None]:
# Inference decoder assembled from the tensors defined in the cells above.
# `states_encoder` is computed from `inputs`, so the image is one of the model
# inputs alongside the target tokens and the previous GRU state.
# NOTE(review): this replaces a placeholder call that had no outputs - confirm
# that these are the intended inputs and outputs.
decoder = Model([inputs, target_inputs, prev_state],
                [predictions, state_last],
                name='decoder')

# \[ Synthetic 데이터셋 \]

`cairo` 라이브러리로 인위적으로 만든 한글 이미지로 잘 학습되는지를 확인

## 한글 Matplotlib 출력 세팅

In [None]:
# 1. 나눔 폰트를 다운받기
!apt-get update -qq
!apt-get install fonts-nanum* -qq

import matplotlib.font_manager as fm
# 2. 나눔 폰트의 위치 가져오기 
system_font = fm.findSystemFonts() # 현재 시스템에 설치된 폰트
nanum_fonts = [font for font in system_font if "NanumBarunGothic.ttf" in font]
font_path = nanum_fonts[0] # 설정할 폰트의 경로

# 3. 나눔 폰트로 설정하기
font_name = fm.FontProperties(fname=font_path, size=10).get_name()
plt.rc("font",family=font_name)

# 4. 폰트 재설정하기
fm._rebuild()

# 5. (optional) minus 기호 깨짐 방지
import matplotlib as mpl
mpl.rcParams['axes.unicode_minus'] = False

## 1. 데이터 가져오기 

In [None]:
# Minimum / maximum word length (in characters)
min_words = 4
max_words = 8

df = pd.read_csv("../datasets/wordslist.txt", names=['word'])
df = df.drop_duplicates()
# Keep only words made up entirely of Hangul syllables ...
df = df[df.word.str.match(r'^[가-힣]+$')]
# ... whose length lies within [min_words, max_words] (both ends included).
df = df[df.word.str.len().between(min_words, max_words)]
words = df.word.values
# Shuffle in place; no seed is set, so the order differs between runs.
np.random.shuffle(words)

In [None]:
# OCRDataset settings: bind the rendering/noise options once so later cells
# only pass the word list. This rebinds the name `OCRDataset` to the partial.
OCRDataset = partial(
    OCRDataset,
    font_size=24,
    bg_noise=0.2,
    affine_noise=(0.0, 0.01),
    color_noise=(0.1, 0.3),
    gray_scale=True,
)

# Preview the first three samples, each titled with its label.
dataset = OCRDataset(words)
images, labels = dataset[0:3]

for image, label in zip(images, labels):
    plt.imshow(image[:, :, 0], cmap='gray')
    plt.title(label)
    plt.show()

## 2. 데이터 Generator 구현하기

In [None]:
# Split on the word list: the first 5% of the words go to validation and the
# rest to training.
n_valid = len(words) * 5 // 100
valid_words, train_words = words[:n_valid], words[n_valid:]

valid_set = OCRDataset(valid_words)
train_set = OCRDataset(train_words)

train_gen = OCRGenerator(train_set, batch_size=64)
valid_gen = OCRGenerator(valid_set, batch_size=64)

## 3. 모델 구성하기

In [None]:
from models.layers import ConvFeatureExtractor, Map2Sequence 
from models.layers import BLSTMEncoder, CTCDecoder
from models.losses import ctc_loss

import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.layers import Input, Dense

In [None]:
# Start from an empty Keras graph.
K.clear_session()

# Hyper-parameters; the class count comes from the Korean character table.
height = 36
num_classes = len(KOR_CHARS)
n_hidden = 16
n_lstm = 256

# Grayscale image input whose width is left dynamic
inputs = Input(shape=(height, None, 1))

# Swap the height and width axes:
# (batch size, height, width, channels) -> (batch size, width, height, channels)
transposed = K.permute_dimensions(inputs, pattern=(0, 2, 1, 3))

# CRNN: convolutional features -> feature sequence -> bidirectional LSTM
conv_maps = ConvFeatureExtractor(
    n_hidden=n_hidden, name='feature_extractor')(transposed)
feature_seqs = Map2Sequence(name='map_to_sequence')(conv_maps)
lstm_seqs = BLSTMEncoder(n_units=n_lstm)(feature_seqs)

# The output has one extra unit for the CTC blank label: #classes + 1
output_seqs = Dense(units=num_classes + 1,
                    activation='softmax',
                    name='output_seqs')(lstm_seqs)

In [None]:
from models.optimizer import AdamW

# Build the models
# (1) Training model: labels are fed through a placeholder used as the target tensor
y_true = tf.placeholder(dtype=tf.int32, shape=(None, None))
trainer = Model(inputs=inputs, outputs=output_seqs, name='trainer')
trainer.compile(optimizer='adam',
                loss={"output_seqs": ctc_loss},
                target_tensors=[y_true])

### caution

`K.ctc_batch_cost`에 이용되는 Input Tensor의 Interface는 아래와 같습니다.

* y_true: tensor `(samples, max_string_length)` containing the truth labels.
* y_pred: tensor `(samples, time_steps, num_categories)` containing the prediction, or output of the softmax.

In [None]:
# (2) Prediction model: decode the softmax sequence with a CTC beam search
#     and expose the first element of the decoder's output.
predictions = CTCDecoder(beam_width=100)(output_seqs)
predictor = Model(inputs=inputs, outputs=predictions[0], name='predictor')

## 4. 모델  학습시키기


In [None]:
# Train for 10 epochs, evaluating on the validation generator after each one.
trainer.fit_generator(generator=train_gen,
                      validation_data=valid_gen,
                      epochs=10)

## 5. 모델 평가하기

In [None]:
# Inspect predictions on the first batch of the synthetic validation set.
# This section never builds a `test_gen`; the only generator with that name
# comes from the MNIST sections and holds MNIST data, so the held-out synthetic
# words are used instead.
# NOTE(review): assumes `valid_gen[0]` yields an (images, labels) pair, as the
# trainer's use of it suggests - confirm against OCRGenerator.
for image, true_label in zip(*valid_gen[0]):
    # The predictor expects a batch axis, so add one in front of the single image.
    result = predictor.predict(image[np.newaxis])
    # The title shows the raw decoded class indices.
    predict_seq = "".join([str(char) for char in result.ravel()])
    plt.title(f'label : {predict_seq}')
    plt.imshow(image[:,:,0])
    plt.show()

---

# [TODO] Attention GRU Modeling

### Reference : 

1. [Neural Machine Translation By Jointly Learning to Align and Translate](https://arxiv.org/pdf/1409.0473.pdf)
2. [Bahdanau Attention 개념 정리](https://hcnoh.github.io/2018-12-11-bahdanau-attention)

### GRU의 기본 공식
<br>
$
\hat y_t = softmax(W_y \cdot s_t + b_y) \\
s_t = z_t \odot s_{t-1} + (1-z_t) \odot \bar s_t \\
z_t = \sigma(W_z y_{t-1} + U_z s_{t-1} + b_z) \\
r_t = \sigma(W_r y_{t-1} + U_r s_{t-1} + b_r) \\
\bar s_t = tanh(W_s y_{t-1} + U_s(r_t \odot s_{t-1}) + b_s) \\
$

Attention 메커니즘을 활용하여 위의 연산들을 재정의하면 아래와 같이 정리할 수 있다. <br>
<br>
$
\hat y_t = softmax(W_y \cdot s_t + b_y) \\
s_t = z_t \odot s_{t-1} + (1-z_t) \odot \bar s_t \\
z_t = \sigma(W_z y_{t-1} + U_z s_{t-1} + C_z c_t + b_z) \\
r_t = \sigma(W_r y_{t-1} + U_r s_{t-1} + C_r c_t + b_r) \\
\bar s_t = tanh(W_s y_{t-1} + U_s(r_t \odot s_{t-1}) + C_s c_t + b_s) \\
$

GRU 모델 및 기본 RNN 모델에서의 Context Vector의 활용을 살펴보면 다음의 특징을 파악할 수 있습니다. Context Vector $c_t$는 RNN의 입력으로 사용되는 $y_{t-1}$과 함께 등장하며, 함께 임베딩 공간에 투영된 뒤 더해지는 방식으로 활용됩니다. 즉, 간단하게 정리하자면 $W y_{t-1}$ 대신 $W y_{t-1} + C c_t$가 된다는 것입니다. 이는 RNN 입력으로 $y_{t-1}$을 단독으로 사용하는 것이 아니라 Context Vector $c_t$와 Concatenation하여 사용하는 것과 같은 의미입니다. 이를 수식으로 정리하면 다음과 같습니다.

$
W y_{t-1} + C c_t = \begin{bmatrix} W & C \end{bmatrix} \begin{bmatrix} y_{t-1} \\ c_t \end{bmatrix}
$




In [None]:
from tensorflow.keras.layers import GRU, Embedding
from tensorflow.keras.layers import Layer, GRUCell, LSTMCell, RNN

In [None]:
class AttentionCell(Layer):
    """
    Module class for the <Attend> network described in
    "Robust Scene Text Recognition with Automatic Rectification".

    Works out which parts of the BLSTM encoder sequence are needed in order to
    turn that sequence into the desired text sequence.

    Args:
        n_units: size of the GRU state; also the width of the two projection
            layers used to compute the attention score.
    """
    def __init__(self, n_units, **kwargs):
        # Initialise the base Layer first so that Keras' attribute tracking is
        # set up before any sub-layer is attached to `self`.
        super().__init__(**kwargs)
        self.state_size = n_units
        self.input_proj_layer = Dense(n_units, use_bias=False,
                                      name='input_project')
        self.state_proj_layer = Dense(n_units,
                                      name='state_project')
        self.score_layer = Dense(1, use_bias=False,
                                 name='score')
        self.gru_layer = GRUCell(n_units)

    def call(self, inputs, states):
        """Attend over `inputs` using `states[0]`, then advance the GRU cell.

        Returns whatever `GRUCell` returns for `(context, states)`.

        NOTE(review): the indexing below requires rank-3 `inputs`
        (batch, time step, hidden), whereas a cell wrapped in a Keras `RNN`
        normally receives one time step at a time - confirm how the full
        encoder sequence is meant to reach this cell.
        """
        # (batch size, time step, hidden size) -> (batch size, time step, hidden size)
        h_proj = self.input_proj_layer(inputs)
        # (batch size, hidden size) -> (batch size, 1, hidden size)
        s_proj = self.state_proj_layer(states[0])
        s_proj = s_proj[:,None,:]

        # (batch size, time step, hidden size)
        # -> (batch size, time step, 1) -> (batch size, time step)
        score = self.score_layer(K.tanh(h_proj+s_proj))
        score = score[:,:,0]

        # Normalise the scores and restore a trailing axis for broadcasting.
        alpha = K.softmax(score)[:,:,None]
        # Attention-weighted sum of the inputs over axis 1.
        context = K.sum(inputs*alpha, axis=1)

        # Give both tensors explicit names in the graph.
        alpha = tf.identity(alpha, name='alpha')
        context = tf.identity(context, name='context')
        # Invoke the GRU cell exactly once per step.
        return self.gru_layer(context, states)
    

In [None]:
K.clear_session()

# Grayscale image input with dynamic width, plus a variable-length label input.
# `height` and `num_classes` are taken from the cells above.
inputs = Input(shape=(height, None, 1))
labels = Input(shape=(None,))

# Embed the labels (num_classes + 1 possible values) as 256-dimensional vectors.
x = Embedding(input_dim=num_classes + 1, output_dim=256)
y = x(labels)

# (batch size, height, width, channels) -> (batch size, width, height, channels)
transposed = K.permute_dimensions(inputs, pattern=(0, 2, 1, 3))

# CRNN: convolutional features -> feature sequence -> bidirectional LSTM
conv_maps = ConvFeatureExtractor(name='feature_extractor')(transposed)
feature_seqs = Map2Sequence(name='map_to_sequence')(conv_maps)
blstm_seqs = BLSTMEncoder(n_units=256, name='blstm_encoder')(feature_seqs)
# The attention decoder is still disabled (work in progress):
# attend_seqs = RNN(AttentionCell(n_units=256*2),
#                   return_sequences=True,
#                   name='GRU_Attention')(blstm_seqs)



In [None]:
# NOTE(review): leftover IPython help lookup. In the visible notebook
# `Attention` is only imported in the next cell, so on a fresh kernel this
# lookup runs before the name exists.
Attention?

In [None]:
from tensorflow.keras.layers import Attention, AdditiveAttention

In [None]:
# NOTE(review): leftover IPython help lookup.
AdditiveAttention?

In [None]:
# Displays the encoder output tensor built above.
blstm_seqs

In [None]:
# NOTE(review): `attend_states` is not assigned anywhere in the visible
# notebook - presumably a leftover from an earlier experiment; confirm.
attend_states

In [None]:
# NOTE(review): `attend_seqs` appears only in the commented-out RNN call above,
# so this line has nothing to consume until that call is restored.
output_seqs = Dense(num_classes+1, activation='softmax')(attend_seqs)

In [None]:
# Displays the classifier output tensor.
output_seqs