# End-to-end Trainable Sequential Optical Character Recognition
(This notebook can be run inside Colab)

The problem is inspired by [How to train a Keras model to recognize text with variable length](https://www.dlology.com/blog/how-to-train-a-keras-model-to-recognize-variable-length-text/) and redefined as:

* Given video footage that scans across an image from left to right, recover the text sentence printed on the image. To simplify the problem, we assume the sentence is randomly drawn from the [GRID corpus](http://staffwww.dcs.shef.ac.uk/people/J.Barker/assets/cooke-2006-jasa-ecbf8f7ef7cb429e9621317bfc64a67002a4c465be3c1a3f6144eeed058ee634.pdf).
* The sentence can appear anywhere on the image and hence any frame in the video.
* The video frame width is smaller than any character width so that no frame captures a full character.
* The video frame height is the same as the image height.
* Sample from the video footage a sequence of image frames as input to the neural net. To make the problem difficult, we use a low sample rate such that no two neighbouring frames in the sampled sequence share common pixels and some pixels from the original image are never captured in the samples. 

![Problem definition by picture](https://raw.githubusercontent.com/liyinnbw/ML/master/SequentialOCR/problem.png)

## Random Seed

In [0]:
import numpy as np
import random
# Seed both NumPy's and Python's global random generators with the same
# fixed value; later cells draw from `np.random` and `random`.
np.random.seed(17)
random.seed(17)

## Image Generator
For generating images containing random text sentences and noise.

In [2]:
!pip install cairocffi
import cairocffi as cairo
import numpy as np
from scipy import ndimage
import re
# this creates larger "blotches" of noise which look
# more realistic than just adding gaussian noise
# assumes greyscale with pixels ranging from 0 to 1

# Pattern matched by is_valid_str(): the whole string must consist of one or
# more lowercase a-z letters or spaces.
regex = r'^[a-z ]+$'

def speckle(img):
    """Return a new array: *img* plus blurred Gaussian noise, limited to [0, 1].

    The noise amplitude is drawn uniformly from [0, 0.6) on every call, the
    white noise is smoothed with a Gaussian filter (sigma 1) so it forms
    blotches rather than single-pixel grain, and the sum is clamped to the
    0..1 range. The input array is not modified.
    """
    severity = np.random.uniform(0, 0.6)
    white_noise = np.random.randn(*img.shape) * severity
    blotches = ndimage.gaussian_filter(white_noise, 1)
    return np.clip(img + blotches, 0, 1)


# paints the string at a random location within the bounding box
# also uses a random font, a slight random rotation,
# and a random amount of speckle noise

def paint_text(text, w, h, rotate=False, ud=False, multi_fonts=False):
    """Render *text* in black on a white ``w`` x ``h`` canvas and add speckle noise.

    Args:
        text: the string to draw.
        w: canvas width in pixels.
        h: canvas height in pixels.
        rotate: when true, apply a small random rotation to the rendered image.
        ud: when true, pick the vertical position at random; otherwise the
            text's top-left y coordinate is fixed at ``h // 2``.
        multi_fonts: when true, pick the font face and weight at random from a
            fixed list; otherwise bold Courier is used.

    Returns:
        An array of shape ``(1, h, w)`` holding the first byte of every pixel
        scaled to 0..1, after ``speckle`` noise has been applied.

    Raises:
        IOError: if the rendered text does not fit inside the canvas minus a
            4-pixel border on each side.
    """
    surface = cairo.ImageSurface(cairo.FORMAT_RGB24, w, h)
    with cairo.Context(surface) as context:
        context.set_source_rgb(1, 1, 1)  # White
        context.paint()
        # this font list works in CentOS 7
        if multi_fonts:
            fonts = ['Century Schoolbook', 'Courier', 'STIX', 'URW Chancery L', 'FreeMono']
            context.select_font_face(np.random.choice(fonts), cairo.FONT_SLANT_NORMAL,
                                     np.random.choice([cairo.FONT_WEIGHT_BOLD, cairo.FONT_WEIGHT_NORMAL]))
        else:
            context.select_font_face('Courier', cairo.FONT_SLANT_NORMAL, cairo.FONT_WEIGHT_BOLD)
        context.set_font_size(25)
        # text_extents: indices 0/1 are used below as the x/y bearing and
        # indices 2/3 as the rendered width/height.
        box = context.text_extents(text)
        border_w_h = (4, 4)
        # Compare the text width against the horizontal border (index 0) and
        # the height against the vertical border (index 1).
        if box[2] > (w - 2 * border_w_h[0]) or box[3] > (h - 2 * border_w_h[1]):
            raise IOError('Could not fit string into image. Max char count is too large for given image width.')

        # teach the RNN translational invariance by
        # fitting text box randomly on canvas, with some room to rotate
        max_shift_x = w - box[2] - border_w_h[0]
        max_shift_y = h - box[3] - border_w_h[1]
        top_left_x = np.random.randint(0, int(max_shift_x))
        if ud:
            top_left_y = np.random.randint(0, int(max_shift_y))
        else:
            top_left_y = h // 2
        # Subtract the bearings so that (top_left_x, top_left_y) is the corner
        # of the inked box rather than the text origin.
        context.move_to(top_left_x - int(box[0]), top_left_y - int(box[1]))
        context.set_source_rgb(0, 0, 0)
        context.show_text(text)

    buf = surface.get_data()
    a = np.frombuffer(buf, np.uint8)
    a.shape = (h, w, 4)  # 4 bytes per pixel in the surface buffer
    a = a[:, :, 0]  # grab single channel
    a = a.astype(np.float32) / 255
    a = np.expand_dims(a, 0)
    if rotate:
        # `image` was never imported at module level, so this branch used to
        # fail with NameError; import it where it is needed.
        from keras.preprocessing import image
        a = image.random_rotation(a, 3 * (w - top_left_x) / w + 1)
    a = speckle(a)

    return a


def shuffle_mats_or_lists(matrix_list, stop_ind=None):
    """Apply one shared random permutation to several equal-length containers.

    Only the first ``stop_ind`` positions are permuted (all of them when
    ``stop_ind`` is None); positions from ``stop_ind`` onwards keep their
    original order. Every element of ``matrix_list`` must be a numpy array or
    a list, and all must have the same length.

    Returns a list with the reordered containers, in the same order as
    ``matrix_list``. Raises TypeError for unsupported container types.
    """
    assert all([len(item) == len(matrix_list[0]) for item in matrix_list])
    total = len(matrix_list[0])
    if stop_ind is None:
        stop_ind = total
    assert stop_ind <= total

    # Shuffled head followed by the untouched tail.
    order = list(range(stop_ind))
    np.random.shuffle(order)
    order.extend(range(stop_ind, total))

    def reorder(mat):
        if isinstance(mat, np.ndarray):
            return mat[order]
        if isinstance(mat, list):
            return [mat[i] for i in order]
        raise TypeError('`shuffle_mats_or_lists` only supports '
                        'numpy.array and list objects.')

    return [reorder(mat) for mat in matrix_list]


# Translation of characters to unique integer values
def text_to_labels(text):
    """Return, for each character of *text*, its position in the global ``alphabet``.

    Uses ``alphabet.find``, so a character that is not in the alphabet maps
    to -1.
    """
    return [alphabet.find(char) for char in text]


# Reverse translation of numerical classes back to characters
def labels_to_text(labels):
    """Turn class indices back into a string using the global ``alphabet``.

    An index equal to ``len(alphabet)`` is the CTC blank and contributes
    nothing; every other index is looked up in ``alphabet``.
    """
    return "".join(
        "" if c == len(alphabet) else alphabet[c]
        for c in labels
    )


# only a-z and space... probably not too difficult
# to expand to uppercase and symbols

def is_valid_str(in_str):
    """Return True when *in_str* matches the module-level ``regex`` pattern."""
    pattern = re.compile(regex, re.UNICODE)
    return pattern.search(in_str) is not None

Collecting cairocffi
[?25l  Downloading https://files.pythonhosted.org/packages/f7/99/b3a2c6393563ccbe081ffcceb359ec27a6227792c5169604c1bd8128031a/cairocffi-1.1.0.tar.gz (68kB)
[K     |████▊                           | 10kB 24.9MB/s eta 0:00:01[K     |█████████▌                      | 20kB 3.2MB/s eta 0:00:01[K     |██████████████▎                 | 30kB 4.3MB/s eta 0:00:01[K     |███████████████████             | 40kB 3.0MB/s eta 0:00:01[K     |███████████████████████▉        | 51kB 3.4MB/s eta 0:00:01[K     |████████████████████████████▋   | 61kB 4.0MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 3.4MB/s 
Building wheels for collected packages: cairocffi
  Building wheel for cairocffi (setup.py) ... [?25l[?25hdone
  Created wheel for cairocffi: filename=cairocffi-1.1.0-cp36-none-any.whl size=88591 sha256=75aa1c55273b00f6c7fad3f7f3d4c7c9e8171000d850b9aad61e03f46d93200a
  Stored in directory: /root/.cache/pip/wheels/11/5e/47/167d9dfd5fa5850dd0cd3db80afe6

## Batch Generator (With Curriculum)
Load, preprocess and generate batches for training or testing

In [3]:
import numpy as np
from keras.preprocessing import sequence
from keras.callbacks import Callback

class OCRBatchGenerator(Callback):
    """Endless batch generator for the sequential-OCR model, with optional curriculum.

    The object plays two roles:

    * ``next()`` is a generator that renders each text with ``paint_text``,
      cuts the image into vertical slices, keeps every ``frameinterval``-th
      slice and yields ``(inputs, outputs)`` dictionaries.
    * As a Keras ``Callback`` it adjusts the maximum text length at the start
      of every epoch (curriculum learning) and reports it in the epoch logs.

    Args:
        texts: sequence of sentences to render.
        textlenmax: length every label sequence is padded to; also the text
            length used once the curriculum is exhausted (or absent).
        imageshape: ``(height, width, channels)`` of the rendered image.
        splitframes: number of frames kept per sample.
        frameinterval: keep one slice out of every ``frameinterval`` slices.
        steps: number of batches that make up one pass over ``texts``.
        shuffle: reshuffle the sample order at the start of every pass.
        curriculum: optional sequence giving the maximum text length for
            epoch 0, 1, 2, ...; ``None`` disables curriculum learning.
    """

    def __init__(self, texts, textlenmax, imageshape, splitframes, frameinterval, steps, shuffle = False, curriculum=None):
        super(OCRBatchGenerator, self).__init__()
        self.steps = steps
        self.shuffle = shuffle
        self.curriculum = curriculum
        self.imageshape = imageshape
        self.splitframes = splitframes
        self.frameinterval = frameinterval
        self.texts = texts
        self.textlenmax = textlenmax
        # Identity check: `== None` can misbehave for array-like curricula.
        self.textlen = textlenmax if self.curriculum is None else self.curriculum[0]

    def str_to_label(self, string):
        """Encode *string* as class ids: 'a'..'z' -> 0..25, space -> 26."""
        return [26 if char == ' ' else ord(char) - ord('a') for char in string]

    def next(self):
        """Yield ``(inputs, outputs)`` batches forever.

        ``inputs`` holds 'images', 'labels', 'images_len', 'labels_len' and
        'truth' (the raw strings, for evaluation only); ``outputs`` holds a
        dummy 'ctc' target of zeros, one per sample.
        """
        full_size = len(self.texts)
        batch_size = int(full_size / self.steps + 0.5)
        while True:
            idxs = np.arange(full_size)
            if self.shuffle:
                np.random.shuffle(idxs)
            for step in range(self.steps):
                idx_start = step * batch_size
                # The last batch of a pass absorbs whatever is left over.
                idx_end = idx_start + batch_size if step < self.steps - 1 else full_size
                idx_batch = idxs[idx_start:idx_end]

                total_slices = self.splitframes * self.frameinterval
                frame_width = self.imageshape[1] // self.splitframes // self.frameinterval
                images = np.zeros(
                    (len(idx_batch), self.splitframes, self.imageshape[0], frame_width, self.imageshape[2]),
                    dtype='float32')
                images_len = np.full((len(idx_batch), 1), fill_value=self.splitframes, dtype='int32')
                labels = []
                labels_len = []
                labels_truth = []
                for idx, textidx in enumerate(idx_batch):
                    # Truncate to the current curriculum length.
                    text = self.texts[textidx][:self.textlen]
                    image = paint_text(text, self.imageshape[1], self.imageshape[0], rotate=False, ud=True, multi_fonts=True).reshape(self.imageshape)
                    # Cut along the width into equal slices and keep every
                    # `frameinterval`-th one, so pixels in between are never seen.
                    slices = np.array(np.split(image, total_slices, axis=1))
                    images[idx] = slices[np.arange(0, total_slices, self.frameinterval)]
                    label = self.str_to_label(text)
                    labels.append(label)
                    labels_len.append(len(label))
                    labels_truth.append(text)
                # Pad labels with -1 up to textlenmax; returns a numpy array.
                labels = sequence.pad_sequences(labels, maxlen=self.textlenmax, dtype='int32', padding='post', truncating='post', value=-1)
                labels_len = np.asarray(labels_len, dtype='int32').reshape(-1, 1)
                # normalize image (the given image is already in 0-1 range)
                images = (images - 0.5) * 2
                inputs = {
                    'images': images,
                    'labels': labels,
                    'images_len': images_len,
                    'labels_len': labels_len,
                    'truth': labels_truth # not for training, validation/test only
                }
                outputs = {'ctc': np.zeros([images.shape[0]])} #dummy ground truth ctc value, not used but needed
                yield (inputs, outputs)

    # curriculum learning by gradually increasing text length
    def on_epoch_begin(self, epoch, logs=None):
        """Select the text length for *epoch* from the curriculum, if any."""
        if self.curriculum is not None and epoch < len(self.curriculum):
            self.textlen = self.curriculum[epoch]
        else:
            self.textlen = self.textlenmax

    def on_epoch_end(self, epoch, logs=None):
        """Record the text length used this epoch in the shared ``logs`` dict."""
        # Write into the dict Keras passed in, even when it is still empty;
        # `logs or {}` would have written to a throwaway dict in that case.
        if logs is not None:
            logs['curriculum_text_len'] = self.textlen

Using TensorFlow backend.


## NetGenerator
A generator similar to the batch generator, but one that generates models instead. Each model is given a name and can be visualized in TensorBoard.

In [4]:
import keras
from keras import regularizers
from keras import backend as K
from keras import optimizers
from keras import regularizers

# !pip install keras-tcn
# from tcn import TCN

# !pip install tensorflow-addons >= 1.13.1
import tensorflow as tf
# import tensorflow_addons as tfa
!pip install keras-self-attention
from keras_self_attention import SeqSelfAttention

class TemporalBlock(tf.layers.Layer):
    """Residual block of two causal 1-D convolutions with layer norm and dropout.

    NOTE(review): `CausalConv1D` is neither defined nor imported in the
    visible source, so instantiating this class would fail unless it is
    provided elsewhere. The class itself is not referenced by the model
    builder in the visible source.
    """
    def __init__(self, n_outputs, kernel_size, strides, dilation_rate, dropout=0.2, 
                 trainable=True, name=None, dtype=None, 
                 activity_regularizer=None, **kwargs):
        # Forward the generic layer options to the base class.
        super(TemporalBlock, self).__init__(
            trainable=trainable, dtype=dtype,
            activity_regularizer=activity_regularizer,
            name=name, **kwargs
        )        
        self.dropout = dropout
        self.n_outputs = n_outputs
        # Two convolutions with identical hyper-parameters and ReLU activation.
        self.conv1 = CausalConv1D(
            n_outputs, kernel_size, strides=strides, 
            dilation_rate=dilation_rate, activation=tf.nn.relu, 
            name="conv1")
        self.conv2 = CausalConv1D(
            n_outputs, kernel_size, strides=strides, 
            dilation_rate=dilation_rate, activation=tf.nn.relu, 
            name="conv2")
        # Set in build() only when the input channel count differs from n_outputs.
        self.down_sample = None
    
    def build(self, input_shape):
        # Channels are read from index 2 of the input shape
        # (presumably inputs are (batch, time, channels) - TODO confirm).
        channel_dim = 2
        # The second positional argument is [1, 1, n_outputs]; presumably the
        # dropout noise shape, which would share one mask across the first two
        # axes - TODO confirm against the tf.layers.Dropout signature.
        self.dropout1 = tf.layers.Dropout(self.dropout, [tf.constant(1), tf.constant(1), tf.constant(self.n_outputs)])
        self.dropout2 = tf.layers.Dropout(self.dropout, [tf.constant(1), tf.constant(1), tf.constant(self.n_outputs)])
        if input_shape[channel_dim] != self.n_outputs:
            # Project the residual input to n_outputs channels so it can be
            # added to the convolution output in call().
            # self.down_sample = tf.layers.Conv1D(
            #     self.n_outputs, kernel_size=1, 
            #     activation=None, data_format="channels_last", padding="valid")
            self.down_sample = tf.layers.Dense(self.n_outputs, activation=None)
    
    def call(self, inputs, training=True):
        # conv -> layer norm -> dropout, twice.
        x = self.conv1(inputs)
        x = tf.contrib.layers.layer_norm(x)
        x = self.dropout1(x, training=training)
        x = self.conv2(x)
        x = tf.contrib.layers.layer_norm(x)
        x = self.dropout2(x, training=training)
        if self.down_sample is not None:
            inputs = self.down_sample(inputs)
        # Residual connection followed by ReLU.
        return tf.nn.relu(x + inputs)
      
      
def ctc_lambda_func(args):
  """Compute the CTC batch cost inside a Keras Lambda layer.

  ``args`` is ``[y_pred, pred_len, y_true, true_len]``: the per-frame
  predictions, the number of prediction frames, the label ids and the
  number of labels.
  """
  probs, probs_len, targets, targets_len = args

  # Number of leading time steps to drop from the predictions before the loss
  # is computed. It is currently 0, i.e. every frame is used.
  skip = 0
  trimmed = probs[:, skip:]
  return K.ctc_batch_cost(targets, trimmed, probs_len - skip, targets_len)
  
class NetGenerator():
  """Builds, one at a time, the compiled CTC models compared in this experiment.

  Args:
    input_shape: shape of one 'images' sample (without the batch axis).
    out_categories: number of output classes per time step.
    label_len: length of the padded 'labels' input.
  """

  def __init__(self, input_shape, out_categories, label_len):
    self.input_shape = input_shape
    self.out_categories = out_categories
    self.label_len = label_len

  def next(self):
    """Generator yielding ``(name, model, test_func)`` for every architecture.

    ``model`` is compiled with Adam and outputs the CTC loss itself;
    ``test_func`` is a backend function over
    ``[images, labels, images_len, labels_len, learning_phase]`` that returns
    ``[softmax predictions, ctc loss]``.
    """
    # The position in this list selects the architecture built below; the
    # names themselves are passed through to the caller unchanged.
    modelnames = [
        'conv32-dense64-dense64-lr0.001',
        'conv32-dense64-bigru64-lr0.0005',
        'conv32-dense64-tcn64k2d32-lr0.001',
        # 'conv32-dense64-bilstm-selfattension-lr0.001'
    ]

    for idx, name in enumerate(modelnames):

        ################################################
        # input layers
        ################################################
        # each Input name must match its key in the batch dictionary
        images = keras.Input(name='images', shape=self.input_shape, dtype='float32')
        labels = keras.Input(name='labels', shape=[self.label_len], dtype='int32')
        # the two length inputs are required by the CTC loss
        images_len = keras.Input(name='images_len', shape=[1], dtype='int32')
        labels_len = keras.Input(name='labels_len', shape=[1], dtype='int32')

        ################################################
        # shared feature extractor
        ################################################
        feats = keras.layers.ZeroPadding3D(padding=(1, 2, 2))(images)
        feats = keras.layers.Conv3D(32, (3, 5, 5), strides=(1, 2, 2))(feats)
        feats = keras.layers.BatchNormalization()(feats)
        feats = keras.layers.Activation('relu')(feats)
        # feats = keras.layers.SpatialDropout3D(0.5)(feats)
        feats = keras.layers.MaxPooling3D(pool_size=(1, 2, 2), strides=(1, 2, 2))(feats)

        # flatten every frame, then shrink the per-frame feature vector
        per_frame_flatten = keras.layers.Flatten()
        feats = keras.layers.TimeDistributed(per_frame_flatten)(feats)
        feats = keras.layers.Dense(64)(feats)

        #################################################
        # architecture-specific sequence model
        # (three active variants; a fourth is kept commented out)
        #################################################
        learnrate = 0.001
        if idx == 0:
            # MLP model, feel free to add more layers
            feats = keras.layers.Dense(64)(feats)
        elif idx == 1:
            learnrate = 0.0005
            # bi-gru model, our experiment shows single bi-gru is not enough.
            recurrent = keras.layers.GRU(64, return_sequences=True)
            feats = keras.layers.Bidirectional(recurrent, merge_mode='sum')(feats)
        elif idx == 2:
            # TCN ( caveat: not exactly following the paper )
            # Six causal convolutions with kernel size 3 and dilations
            # 1..32; a residual add closes every pair. Stacked, they see
            # 1 + 2 * (1+2+4+8+16+32) = 127 past frames, which covers the
            # 74-frame input.
            residual = feats
            for stage, dilation in enumerate((1, 2, 4, 8, 16, 32)):
                feats = keras.layers.Conv1D(64, 3, dilation_rate=dilation, padding='causal')(feats)
                feats = keras.layers.Lambda(tf.contrib.layers.layer_norm)(feats)
                feats = keras.layers.Activation('relu')(feats)
                if stage % 2 == 1:
                    feats = keras.layers.add([residual, feats])
                    residual = feats
        # elif idx == 3:
        #     # bi-LSTM + selfattension
        #     feats = keras.layers.Bidirectional(keras.layers.LSTM(units=128,return_sequences=True), merge_mode='concat')(feats)
        #     feats = SeqSelfAttention(attention_activation='sigmoid', attention_width=15, name='Attention')(feats)

        #################################################
        # shared classification head
        #################################################
        feats = keras.layers.Dense(self.out_categories)(feats)
        feats = keras.layers.BatchNormalization()(feats)

        # class distribution for every time step
        image_label_pred = keras.layers.Activation('softmax', name='y_pred')(feats)

        # CTC loss computed inside the graph; the layer name must match the
        # key of the dummy target in the batch's output dictionary
        ctc_inputs = [image_label_pred, images_len, labels, labels_len]
        ctc_loss = keras.layers.Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')(ctc_inputs)

        #################################################
        # compile model
        #################################################
        model = keras.Model(inputs=[images, images_len, labels, labels_len], outputs=[ctc_loss])
        model.compile(
            # the model output already is the loss, so the "loss function"
            # just passes the prediction through
            loss={'ctc': lambda y_true, y_pred: y_pred},
            optimizer=optimizers.Adam(lr=learnrate),
        )

        #################################################
        # evaluation function used by the statistics callback
        #################################################
        test_func = K.function(
            inputs=[images, labels, images_len, labels_len, K.learning_phase()],
            outputs=[image_label_pred, ctc_loss])
        yield (name, model, test_func)


Collecting keras-self-attention
  Downloading https://files.pythonhosted.org/packages/44/3e/eb1a7c7545eede073ceda2f5d78442b6cad33b5b750d7f0742866907c34b/keras-self-attention-0.42.0.tar.gz
Building wheels for collected packages: keras-self-attention
  Building wheel for keras-self-attention (setup.py) ... [?25l[?25hdone
  Created wheel for keras-self-attention: filename=keras_self_attention-0.42.0-cp36-none-any.whl size=17296 sha256=b39079b75d440d1880da84a30fc03e622ae497c18e9ffe004690b4ef9403001f
  Stored in directory: /root/.cache/pip/wheels/7b/05/a0/99c0cf60d383f0494e10eca2b238ea98faca9a1fe03cac2894
Successfully built keras-self-attention
Installing collected packages: keras-self-attention
Successfully installed keras-self-attention-0.42.0


## Statistical Callback
Code to run validation and log metrics at the end of each epoch. It also saves some of the current epoch's input images for visual validation.

In [0]:
from keras.callbacks import Callback
from keras import backend as K 
import editdistance 
from PIL import Image

class StatCallback(Callback):
    """Keras callback that evaluates the model on validation batches each epoch.

    At the end of every epoch it pulls ``validation_steps`` batches from
    ``validation_generator``, runs ``func`` on them, CTC-decodes the
    predictions and writes the mean CTC loss ('val_loss'), character error
    rate ('cer'), word error rate ('wer') and sentence error rate ('ser')
    into the epoch ``logs``. The first image of every batch is saved under
    ``./log`` for visual inspection.

    Args:
        func: callable taking ``[images, labels, images_len, labels_len]``
            and returning ``(per-frame class probabilities, ctc loss)``.
        validation_generator: iterator yielding ``(inputs, outputs)`` tuples
            whose ``inputs`` dict holds 'images', 'labels', 'images_len',
            'labels_len' and 'truth'.
        validation_steps: number of batches evaluated per epoch.
    """

    def __init__(self, func, validation_generator, validation_steps):
        super(StatCallback, self).__init__()
        self.func = func
        self.validation_generator = validation_generator
        self.validation_steps = validation_steps
        self.offset = 0
        # Decoder ops keyed by prediction shape, so the graph does not grow
        # with every validation batch.
        self._decoders = {}

    def on_train_begin(self, logs=None):
        """Drop cached decoder ops; they belong to the previous graph, if any."""
        self._decoders = {}

    def _get_decoder(self, sample_count, frames_count, pred_categories):
        """Return (y_pred placeholder, length placeholder, decode op) for a shape."""
        key = (sample_count, frames_count, pred_categories)
        if key not in self._decoders:
            y_pred = K.placeholder(shape=[sample_count, frames_count, pred_categories])
            input_length = K.placeholder(shape=[sample_count])
            decoder = K.ctc_decode(y_pred, input_length, beam_width=3, greedy=True)
            self._decoders[key] = (y_pred, input_length, decoder)
        return self._decoders[key]

    #Calculation of WER with Levenshtein distance.
    def wer(self, r, h):
        """Return the Levenshtein distance between sequences *r* and *h*."""
        rows, cols = len(r) + 1, len(h) + 1
        # int64 instead of uint8: distances above 255 must not wrap around.
        d = np.zeros((rows, cols), dtype=np.int64)
        d[0, :] = np.arange(cols)
        d[:, 0] = np.arange(rows)
        for i in range(1, rows):
            for j in range(1, cols):
                if r[i - 1] == h[j - 1]:
                    d[i, j] = d[i - 1, j - 1]
                else:
                    # substitution, insertion, deletion
                    d[i, j] = 1 + min(d[i - 1, j - 1], d[i, j - 1], d[i - 1, j])
        return int(d[rows - 1, cols - 1])

    def labels_to_str(self, labels, showblank=False):
        """Decode class ids: 0..25 -> 'a'..'z', 26 -> space, anything else is blank.

        Blanks are dropped, or rendered as '?' when ``showblank`` is true.
        """
        chars = []
        for c in labels:
            if 0 <= c < 26:
                chars.append(chr(c + ord('a')))
            elif c == 26:
                chars.append(' ')
            elif showblank:
                chars.append('?')
        return ''.join(chars)

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate on the validation generator and store the metrics in ``logs``."""
        import os

        # The debug images below are written into ./log; make sure it exists.
        os.makedirs('./log', exist_ok=True)

        samples = 0
        mean_norm_ed = 0.0
        mean_ed = 0.0
        word_count = 0
        word_err_count = 0
        sentence_err_count = 0
        mean_loss = 0
        steps = self.validation_steps
        while steps > 0:
            batch = next(self.validation_generator)[0]
            # call test_func provided in NetGenerator
            image_label_pred, ctc_loss = self.func([batch['images'], batch['labels'], batch['images_len'], batch['labels_len']])
            # accumulate ctc loss
            mean_loss += np.sum(ctc_loss)

            sample_count = image_label_pred.shape[0]
            frames_count = image_label_pred.shape[1] - self.offset
            pred_categories = image_label_pred.shape[2]
            # using tensorflow ctc decoder to decode label
            y_pred, input_length, decoder = self._get_decoder(sample_count, frames_count, pred_categories)
            input_length_value = np.full(shape=sample_count, fill_value=frames_count)
            decoded = K.get_session().run(
                decoder,
                feed_dict={y_pred: image_label_pred[:, self.offset:], input_length: input_length_value})[0][0]

            # convert label to string
            # raw_strs: FOR DEBUG, most likely class at every time step, blanks shown
            raw_strs = [self.labels_to_str(np.argmax(image_label_pred[j], axis=1), showblank=True)
                        for j in range(sample_count)]
            # decoded_strs: collapsed string after ctc_decode, blanks removed
            decoded_strs = [self.labels_to_str(decoded[j], showblank=False)
                            for j in range(sample_count)]

            truth_strs = batch['truth']
            for j in range(sample_count):
                if j == 0:
                    # Stitch the frames of the first sample back together and
                    # undo the [-1, 1] normalisation before saving.
                    input_img = (np.concatenate(batch['images'][j], axis=1) / 2 + 0.5) * 255
                    input_img = Image.fromarray(np.uint8(input_img[:, :, 0]))
                    input_img.save("./log/{:03d}.png".format(steps), "png")
                    # print debug strings to get a sense of result
                    print('step {:03d}, truth:[{}] decoded:[{}] raw:[{}]'.format(steps, truth_strs[j], decoded_strs[j], raw_strs[j]))
                edit_dist = editdistance.eval(decoded_strs[j], truth_strs[j])

                #sentence error
                if edit_dist != 0:
                    sentence_err_count += 1
                #word error
                truth_words = truth_strs[j].split()
                decoded_words = decoded_strs[j].split()
                word_count += len(truth_words)
                word_err_count += self.wer(truth_words, decoded_words)

                #edit distance
                mean_ed += float(edit_dist)
                # Normalise by the real sentence length; batch['labels'] is
                # padded, so its row length is the same for every sample.
                mean_norm_ed += float(edit_dist) / max(len(truth_strs[j]), 1)
            samples += sample_count
            steps -= 1

        if samples == 0:
            # Nothing was evaluated, so there is nothing to report.
            return

        mean_norm_ed = mean_norm_ed / samples #the same as cer
        mean_ed = mean_ed / samples
        mean_ser = sentence_err_count / samples
        mean_wer = word_err_count / word_count if word_count else 0.0
        mean_loss = mean_loss / samples

        print('VAL_LOSS=', mean_loss, 'SER=', mean_ser, 'WER=', mean_wer, 'CER=', mean_norm_ed)
        # Keep writing into the dict Keras passed in, even when it is empty,
        # so later callbacks can read these metrics.
        if logs is None:
            logs = {}
        logs['val_loss'] = mean_loss
        logs['cer'] = mean_norm_ed
        logs['wer'] = mean_wer
        logs['ser'] = mean_ser
        
        

## Generate [GRID Corpus](http://staffwww.dcs.shef.ac.uk/people/J.Barker/assets/cooke-2006-jasa-ecbf8f7ef7cb429e9621317bfc64a67002a4c465be3c1a3f6144eeed058ee634.pdf)

In [6]:
import random

# Vocabulary of the six slots of a sentence:
# command, color, preposition, letter, digit, adverb.
words_command = ['bin', 'lay', 'place', 'set']
words_color = ['blue', 'green', 'red', 'white']
words_preposition = ['at', 'by', 'in', 'which']
words_letter = list('abcdefghijklmnopqrstuvxyz')  # 25 letters, no 'w'
words_digit = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
words_adverb = ['again', 'now', 'please', 'soon']

# Every combination of one word per slot, joined by single spaces.
texts = [
    ' '.join([command, color, preposition, letter, digit, adverb])
    for command in words_command
    for color in words_color
    for preposition in words_preposition
    for letter in words_letter
    for digit in words_digit
    for adverb in words_adverb
]

random.shuffle(texts)
print('<',texts[0],'>')

character_sequence_len_max = 35

# split to train, validation, test
sample_count = len(texts)
print('sample count:',sample_count)

train_texts = texts[:9000]
train_count = len(train_texts)
print('train count:',train_count)

validation_end = train_count + 1000
validation_texts = texts[train_count:validation_end]
validation_count = len(validation_texts)
print('validation count:',validation_count)
# test_texts = texts[train_count+validation_count:]
# test_count = len(test_texts)
# print('test count:', test_count)

< place blue which k four please >
sample count: 64000
train count: 9000
validation count: 1000


## Training

In [7]:
!pip install tensorboardcolab
from tensorboardcolab import TensorBoardColab, TensorBoardColabCallback

# Create the TensorBoardColab helper (port 6007, graph path '.', 8 s startup wait).
# NOTE(review): `tbc` is not referenced again in the visible notebook; the
# training cell logs through keras' own TensorBoard callback into ./log/.
tbc=TensorBoardColab(port=6007, graph_path='.', startup_waiting_time=8)

Wait for 8 seconds...
TensorBoard link:
https://011f25b2.ngrok.io


In [9]:
from keras.callbacks import TensorBoard, ModelCheckpoint
from keras import backend as K 
import tensorflow as tf

print (keras.__version__)
print (tf.__version__)

# # limit GPU memory
# config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.6
# # config.gpu_options.allow_growth=True
# K.tensorflow_backend.set_session(tf.Session(config=config))


# Rendered image is 50 x 740 x 1. It is cut into 74 * 2 slices and every
# second slice is kept, giving 74 frames that are 5 pixels wide.
image_shape = (50,740,1)
frameinterval = 2
image_sequence_len = 74
# 26 letters + space + one extra class (presumably the CTC blank)
character_categories = 28

net_gen = NetGenerator(
    input_shape=(
        image_sequence_len,
        image_shape[0],
        image_shape[1]//image_sequence_len//frameinterval,
        image_shape[2],
    ),
    out_categories=character_categories,
    label_len=character_sequence_len_max
)

# very important or error: start from an empty graph before the first model
# is built (models are built lazily, one per iteration of the loop below)
K.clear_session()

for modelname, model, test_func in net_gen.next():

    print(modelname)
    model.summary()

    train_gen = OCRBatchGenerator(
        train_texts, character_sequence_len_max, image_shape,
        image_sequence_len, frameinterval, steps=300, shuffle=True)
    valid_gen = OCRBatchGenerator(
        validation_texts, character_sequence_len_max, image_shape,
        image_sequence_len, frameinterval, steps=33, shuffle=True)

    # NOTE(review): the callback evaluates 28 batches per epoch although the
    # validation generator is configured with 33 steps per pass - confirm
    # that this difference is intended.
    cb_stat = StatCallback(
        func = test_func,
        validation_generator = valid_gen.next(),
        validation_steps = 28
    )

    cb_tensorboard = TensorBoard(
        log_dir='./log/'+modelname,
        histogram_freq=5,
        write_graph=True,
        write_images=False,
        write_grads=False,
#         update_freq ='epoch'
    )

    # cb_stat writes 'cer' into the epoch logs, so it is listed before this
    # checkpoint callback in `callbacks` below.
    cb_checkpoint = ModelCheckpoint(
        './log/'+modelname+'_weights.h5',
        monitor='cer',
        verbose=0,
        save_best_only=True,
        save_weights_only=True,
        mode='auto',
        period=1
    )

    model.fit_generator(
      initial_epoch = 0,
      generator=train_gen.next(),
      steps_per_epoch = train_gen.steps,
      epochs = 200,
      # a single fixed batch rather than a generator: in order to save
      # histograms the validation data must not be a generator
      validation_data = next(valid_gen.next()),
      validation_steps = valid_gen.steps,
#       validation_freq = 1,
      callbacks=[train_gen, valid_gen, cb_stat, cb_tensorboard, cb_checkpoint]
    )

    # very important or error: reset the graph before the next model is built
    K.clear_session()

print('All done')


2.2.5
1.15.0
conv32-dense64-dense64-lr0.001
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
images (InputLayer)             (None, 74, 50, 5, 1) 0                                            
__________________________________________________________________________________________________
zero_padding3d_1 (ZeroPadding3D (None, 76, 54, 9, 1) 0           images[0][0]                     
__________________________________________________________________________________________________
conv3d_1 (Conv3D)               (None, 74, 25, 3, 32 2432        zero_padding3d_1[0][0]           
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 74, 25, 3, 32 128         conv3d_1[0][0]                   
________________________________________________