In [1]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import cv2
import sys

sys.path.append("../")
from keras.applications.vgg16 import preprocess_input, VGG16
import tensorflow as tf
import keras

from utils.dataset import OCRDataset
from models.data_utils import normalize_shape, generate_output, restore_quandrangle
import matplotlib.pyplot as plt

Using TensorFlow backend.


Graph Visualization
---


In [2]:
from IPython.display import clear_output, Image, display, HTML
import numpy as np    

def strip_consts(graph_def, max_const_size=32):
    """Strip large constant values from graph_def."""
    strip_def = tf.GraphDef()
    for n0 in graph_def.node:
        n = strip_def.node.add() 
        n.MergeFrom(n0)
        if n.op == 'Const':
            tensor = n.attr['value'].tensor
            size = len(tensor.tensor_content)
            if size > max_const_size:
                tensor.tensor_content = "<stripped %d bytes>"%size
    return strip_def

def show_graph(graph_def, max_const_size=32):
    """Visualize TensorFlow graph."""
    if hasattr(graph_def, 'as_graph_def'):
        graph_def = graph_def.as_graph_def()
    strip_def = strip_consts(graph_def, max_const_size=max_const_size)
    code = """
        <script>
          function load() {{
            document.getElementById("{id}").pbtxt = {data};
          }}
        </script>
        <link rel="import" href="https://tensorboard.appspot.com/tf-graph-basic.build.html" onload=load()>
        <div style="height:600px">
          <tf-graph-basic id="{id}"></tf-graph-basic>
        </div>
    """.format(data=repr(str(strip_def)), id='graph'+str(np.random.rand()))

    iframe = """
        <iframe seamless style="width:1200px;height:620px;border:0" srcdoc="{}"></iframe>
    """.format(code.replace('"', '&quot;'))
    display(HTML(iframe))

In [3]:
def initialize_uninitialized(sess):
    global_vars          = tf.global_variables()
    is_not_initialized   = sess.run([tf.is_variable_initialized(var) for var in global_vars])
    not_initialized_vars = [v for (v, f) in zip(global_vars, is_not_initialized) if not f]

    if len(not_initialized_vars):
        sess.run(tf.variables_initializer(not_initialized_vars))

# EAST 모델 구현하기
---
East 모델은 크게 3가지 부분으로 구성되어 있습니다.
````
The model can be decomposed into three parts: Feature Extractor stem, feature-merging branch and output layer.
````

![Imgur](https://i.imgur.com/XZ9gbdr.png)

## 1. Feature Extractor(stem) 구성하기
---

Backend로는 일단 기본적으로 쓰이는 VGG16 모델을 이용하도록 하겠습니다.<br>
EAST 논문에서는 Backend 부분을 stem이라 부릅니다.<br>

| feature map | size | tensor name |
| ---- | ---- | ---- |
| $f1$ | $\frac{1}{32}$ | `block5_pool` |
| $f2$ | $\frac{1}{16}$ | `block4_pool` |
| $f3$ | $\frac{1}{8}$  | `block3_pool` |
| $f4$ | $\frac{1}{4}$  | `block2_pool` |

In [4]:
keras.backend.clear_session()

with tf.variable_scope('stem'):
    vgg16 = VGG16(include_top=False)

sess = keras.backend.get_session()
graph = sess.graph
with graph.as_default():
    inputs = vgg16.input
    
    feature_maps = []
    for i in range(5,1,-1):
        feature_map = vgg16.get_layer('block{}_pool'.format(i))
        feature_tensor = tf.identity(feature_map.output,
                                  "f{}".format(6-i))
        feature_maps.append(feature_tensor)

Instructions for updating:
Colocations handled automatically by placer.


In [5]:
show_graph(graph)

## 2. Feature Merging (branch) 구성하기
---

$$
g_i = \begin{cases}
unpool(h_i), & \mbox{if  } i \le 3 \\
conv_\mbox{3x3}(h_i), & \mbox{if  } i = 4
\end{cases}\\
h_i = \begin{cases}
f_i, & \mbox{if  } i = 1 \\
conv_\mbox{3x3}(conv_\mbox{1x1}(g_{i-1}:f_i)), & \mbox{otherwise  }
\end{cases}
$$

In [6]:
def unpool(tensor):
    with tf.variable_scope('unpool'):
        shape = tf.shape(tensor)
        return tf.image.resize_bilinear(tensor,
                                        size=[shape[1]*2,shape[2]*2])    

In [7]:
num_layers = [None, 128, 64, 32, 32]

with graph.as_default():
    f = None
    h = None
    g = None
    with tf.variable_scope('branch'):
        for i, f in enumerate(feature_maps,1):
            num_layer = num_layers[i]
            with tf.variable_scope('block{}'.format(i)):            
                if i == 1:
                    h = f
                else:
                    concat = tf.concat([g,f], axis=-1)
                    squeeze = tf.layers.Conv2D(num_layer,(1,1),
                                     padding='same',
                                     activation=tf.nn.relu,
                                     name='conv_1x1')(concat)
                    h = tf.layers.Conv2D(num_layer,(3,3),
                                         padding='same',
                                         activation=tf.nn.relu,
                                         name='conv_3x3')(squeeze)
                if i <= 3:
                    g = unpool(h)
                else:
                    g = tf.layers.Conv2D(num_layer,(3,3),
                                         padding='same',
                                         activation=tf.nn.relu)(h)
    g = tf.identity(g,name='final_feature_map')

In [9]:
show_graph(sess.graph)

## 3. output layer 구성하기
---

일단 우리는 RBOX을 기준으로 구현하도록 하겠습니다.<br>
RBox가 QUAD보다 성능이 더 뛰어나게 나타났기 때문입니다.<br>

구현 시 조심해야 하는 것 중 하나가 바로, <br>
좌표값에는 최대 scale이 정해져 있다는 것입니다. (논문엔 따로 적혀있지도 않았습니다.)

In [10]:
text_scale = 512

with graph.as_default():
    with tf.variable_scope('output'):
        score_map = tf.layers.Conv2D(1, (1,1), 
                                     activation=tf.nn.sigmoid,
                                     name='score')(g)
        loc_map = tf.layers.Conv2D(4, (1,1),
                                   activation=tf.nn.sigmoid)(g)
        loc_map = tf.identity(text_scale * loc_map, name='score')
        
        with tf.variable_scope('angle'):
            # angle should be in [-45, 45]
            angle_map = tf.layers.Conv2D(1, (1,1),
                                        activation=tf.nn.sigmoid)(g)
            angle_map = (angle_map - 0.5) * np.pi/2 
            
        y_pred_cls = tf.identity(score_map,name='score_prediction')
        y_pred_geo = tf.concat([loc_map, angle_map], axis=-1,
                               name='geometry_prediction')

In [11]:
with graph.as_default():
    initialize_uninitialized(sess)
    output = sess.run(angle_map,feed_dict={g:np.random.uniform(-100,100,(10,10,10,32))})
    print("angle 최소 값 : ",output.min())
    print("angle 최대 값 : ",output.max())

angle 최소 값 :  -0.7853982
angle 최대 값 :  0.7853982


In [12]:
show_graph(graph)

## 4. Loss Function 구성하기
---

$
L = L_{s} + \lambda_{g}L_{g},\\
L_{s} \mbox{: losses for score map} \\
L_{g} \mbox{: losses for geometry} \\
$

### 4.1 Score Loss 계산하기
----
Score Loss는 아래와 같습니다.<br>
$
Loss = \mbox{balanced-xent}(\hat Y,Y^*)\\
     = -\beta Y^* log \hat Y - (1-\beta)(1-Y^*)log(1-\hat Y)
$<br>

$\beta$는 positive sample과 negative sample의 수를 조정하는 인자로, 아래와 같이 구할 수 있습니다.<br>
$
\beta = \frac{\mbox{positive number of samples}}{\mbox{total number of samples}}
$

In [13]:
epsilon = 1e-7

with graph.as_default():
    y_true_cls = tf.placeholder(tf.float32,
                                shape=(None,None,None,1),
                                name='y_true_cls')

    with tf.variable_scope('score'):
        with tf.variable_scope('balance_factor'):
            num_pos = tf.count_nonzero(y_true_cls,axis=[1,2,3],dtype=tf.float32)
            num_tot = tf.reduce_prod(tf.shape(y_true_cls)[1:])
            beta = num_pos / tf.cast(num_tot,tf.float32)
            beta = tf.reshape(beta,shape=(-1,1,1,1))

        with tf.variable_scope('balanced_cross_entropy'):
            bcse = -(beta*y_true_cls*tf.log(epsilon+y_pred_cls) + 
                     (1.-beta)*(1.-y_true_cls)*tf.log(epsilon+1.-y_pred_cls))

    score_loss = tf.reduce_mean(bcse, name='score_loss')

In [14]:
show_graph(graph)

### c.f) Score loss 계산이 올바르게 돌아가는지 확인하기

#### 1) 완전 랜덤인 상황

In [15]:
half_1 = np.ones(shape=(10,50,250,1))
half_2 = np.zeros(shape=(10,200,250,1))
true_y = np.concatenate([half_1,half_2],axis=1)
pred_y = np.random.uniform(0,1,true_y.shape)

with graph.as_default():
    initialize_uninitialized(sess)
    result = sess.run(score_loss,feed_dict={
        y_true_cls : true_y,
        y_pred_cls : pred_y
    })
    
    print(result)

0.6799598


#### 2) 거의 비슷하게 맞추는 지경

In [16]:
half_1 = np.ones(shape=(10,50,250,1))
half_2 = np.zeros(shape=(10,200,250,1))
true_y = np.concatenate([half_1,half_2],axis=1)
pred_y = np.clip(true_y + np.random.normal(0,0.1,size=true_y.shape),0.,1.)

with graph.as_default():
    initialize_uninitialized(sess)
    result = sess.run(score_loss,feed_dict={
        y_true_cls : true_y,
        y_pred_cls : pred_y
    })
    
    print(result)

0.028918803


##### 3) 좀 더 틀리는 상황

In [17]:
half_1 = np.ones(shape=(10,50,250,1))
half_2 = np.zeros(shape=(10,200,250,1))
true_y = np.concatenate([half_1,half_2],axis=1)
pred_y = np.clip(true_y + np.random.normal(0,0.3,size=true_y.shape),0.,1.)

with graph.as_default():
    initialize_uninitialized(sess)
    result = sess.run(score_loss,feed_dict={
        y_true_cls : true_y,
        y_pred_cls : pred_y
    })
    
    print(result)

0.110305935


#### 4) 많이 틀리는 상황

In [18]:
half_1 = np.ones(shape=(10,50,250,1))
half_2 = np.zeros(shape=(10,200,250,1))
true_y = np.concatenate([half_1,half_2],axis=1)
pred_y = np.clip(true_y + np.random.normal(0,0.5,size=true_y.shape),0.,1.)

with graph.as_default():
    initialize_uninitialized(sess)
    result = sess.run(score_loss,feed_dict={
        y_true_cls : true_y,
        y_pred_cls : pred_y
    })
    
    print(result)

0.4384753


### 4.2 Geometry Loss 계산하기
----
Geometry Loss는 아래와 같습니다.<br>
$
Loss = Loss_{IoU} + \lambda_{\theta} Loss_{\theta}
$<br>

두개의 세부 로스로 나뉘어져 있습니다.<br>

기본적으로 좌표에 대한 Loss는 IoU Loss를 기반으로 작성되었습니다.<br>
이는 물체의 서로다른 스케일에 invariant하게 로스가 생기게 하기 위함입니다. <br>
IOU에 대한 로스는 아래와 같습니다.<br>
$
Loss_{IoU} = - log IoU(\hat R,R^*) = -log\frac{|\hat R \cap R^*|}{|\hat R \cup R^*|}\\
| \hat R \cup R^* | = |\hat R| + |R^*| - | \hat R \cap R^* |
$<br>

각($\theta$)에 대한 로스는 아래와 같습니다.<br>
$
Loss_{\theta} = 1 - cos(\hat \theta - \theta^*)
$<br>

In [19]:
iou_smooth = 1e-5 # Iou Loss 계산시 문제 생기는 걸 방지
alpha_theta = 10 # Weight for theta loss
alpha_geo = 1 # Weight for geometry loss
with graph.as_default():
    y_true_geo = tf.placeholder(tf.float32,
                            shape=(None,None,None,5),
                            name='y_true_geo')

    with tf.variable_scope('geometry'):
        geo_mask = tf.identity(y_true_cls, name='geo_mask')
        num_pos = tf.count_nonzero(y_true_cls,axis=[1,2,3],dtype=tf.float32,name='num_pos')
        
        with tf.variable_scope('split_tensor'):
            top_true, right_true, bottom_true, left_true, theta_true = tf.split(y_true_geo,5,axis=3)
            top_pred, right_pred, bottom_pred, left_pred, theta_pred = tf.split(y_pred_geo,5,axis=3)
        
        with tf.variable_scope('aabb'):            
            with tf.variable_scope("area"):
                area_true = (top_true + bottom_true) * (right_true + left_true)
                area_pred = (top_pred + bottom_pred) * (right_pred + left_pred)

                w_intersect = (tf.minimum(right_true, right_pred) 
                               + tf.minimum(left_true, left_pred))
                h_intersect = (tf.minimum(top_true, top_pred) 
                               + tf.minimum(bottom_true, bottom_pred))
                area_intersect = w_intersect * h_intersect
                area_union = area_true + area_pred - area_intersect

            with tf.variable_scope('iou_loss'):
                loss_aabb = -tf.log((area_intersect+iou_smooth)
                                    /(area_union+iou_smooth))
                # geo_mask에서 1인 부분들만 학습에 들어감
                # 전체 평균이 아닌 geo_mask에서 1인 것들만 학습하므로, num_pos로 나누어주어야 함
                # 배치 별 로스의 합 / 배치 당 데이터의 수
                loss_aabb = tf.reduce_sum(loss_aabb * geo_mask,axis=[1,2,3]) / num_pos

        loss_aabb = tf.reduce_mean(loss_aabb, name='loss_aabb')

        with tf.variable_scope('theta'): 
            loss_theta = (1 - tf.cos(theta_pred - theta_true))
            # geo_mask에서 1인 부분들만 학습에 들어감
            # 전체 평균이 아닌 geo_mask에서 1인 것들만 학습하므로, num_pos로 나누어주어야 함
            # 배치 별 로스의 합 / 배치 당 데이터의 수            
            loss_theta = tf.reduce_sum(loss_theta * geo_mask,axis=[1,2,3]) / num_pos
        loss_theta = tf.reduce_mean(loss_theta,name='loss_theta')
        
        with tf.variable_scope('aabb_theta'):
            geo_loss = loss_aabb + alpha_theta * loss_theta
    geo_loss = tf.identity(geo_loss, name='geo_loss')

In [20]:
show_graph(graph)

In [21]:
with graph.as_default():
    with tf.variable_scope('total_loss'):
        loss = score_loss + alpha_geo * geo_loss
    tf.add_to_collection(tf.GraphKeys.LOSSES,loss)
    tf.add_to_collection(tf.GraphKeys.LOSSES,score_loss)
    tf.add_to_collection(tf.GraphKeys.LOSSES,geo_loss)

In [22]:
show_graph(graph)

In [23]:
dataset = OCRDataset()
index = 0
fm_scale = 4

In [24]:
image, polys = dataset[index]

image, polys = normalize_shape(image, polys)
score_map, geo_map = generate_output(image, polys,fm_scale)

new_geo_map = geo_map*10

In [25]:
score_map = np.expand_dims(score_map,axis=0)
geo_map = np.expand_dims(geo_map,axis=0)
new_geo_map = np.expand_dims(new_geo_map,axis=0)
#new_geo_map = np.random.uniform(0,512,size=geo_map.shape)

In [26]:
with graph.as_default():
    initialize_uninitialized(sess)
    result = sess.run(geo_loss,feed_dict={
        y_true_cls : np.expand_dims(score_map,axis=-1),
        y_true_geo : geo_map,
        y_pred_geo : new_geo_map
    })
    print(result)

4.636681


In [27]:
with graph.as_default():
    lr = tf.placeholder_with_default(0.001, None,
                                     name='learning_rate')
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    without_stem_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                               scope='^((?!stem).)*$')
    with tf.control_dependencies(update_ops):
        headtune_op = (tf.train
                       .AdamOptimizer(lr)
                       .minimize(loss, 
                                 var_list=without_stem_variables,
                                 name='headtune_train_op'))
        finetune_op = (tf.train
                       .AdamOptimizer(lr)
                       .minimize(loss, 
                                 name='finetune_train_op'))

Instructions for updating:
Use tf.cast instead.


In [28]:
show_graph(graph)