In [1]:
import tensorflow as tf 
import os
from os.path import join
import json
import random
import itertools
import re
import datetime
import numpy as np
from scipy import ndimage
import pylab
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
%matplotlib inline
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Input, Dense, Activation
from tensorflow.keras.layers import Reshape, Lambda
from tensorflow.keras.layers import add, concatenate
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import GRU
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import get_file
from tensorflow.keras.preprocessing import image
import tensorflow.keras.callbacks
import cv2
from PIL import Image
import numpy as np
import pandas as pd

import boto3

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# Create an explicit TensorFlow session and hand it to the Keras backend via
# K.set_session. `K` is the tensorflow.keras backend imported in the first cell.
import tensorflow as tf
sess = tf.Session()
K.set_session(sess)

In [5]:
!ls ../../data/

example.txt         meta.json           meta_json.json
meta.csv            meta_json.csv       word_level_meta.csv


In [None]:
from collections import Counter

# Keep a single row per document (the last one seen) so each document's
# tokens are counted once.
unique = meta.drop_duplicates(subset=['document'], keep='last')

# Frequency tables over the corpus: case-sensitive and lower-cased tokens,
# and case-sensitive and lower-cased individual characters.
all_tokens = Counter()
all_lower_tokens = Counter()
all_chars = Counter()
all_lower_chars = Counter()

# Each entry of the 'meta' column is iterated as a sequence of token strings.
for tokens in unique['meta']:
    for token in tokens:
        all_tokens[token] += 1
        all_lower_tokens[token.lower()] += 1
        # Counter.update on a string counts every character of it.
        all_chars.update(token)
        all_lower_chars.update(char.lower() for char in token)

# Label alphabet used by labels_to_text / text_to_labels: every distinct
# character exactly once, in sorted (deterministic) order. Appending whole
# tokens instead would leave duplicates in the string and make
# len(all_letters) grow with the corpus rather than with the alphabet.
all_letters = ''.join(sorted(all_chars))

In [None]:
from collections import defaultdict
import dill as pickle


def labels_to_text(labels):
    """Decode a sequence of label indices into a string.

    Each label is converted with ``int`` and used as a position in the
    module-level ``all_letters`` string; the selected characters are joined.
    """
    decoded = [all_letters[int(label)] for label in labels]
    return ''.join(decoded)

def text_to_labels(text):
    """Encode ``text`` as a list of label indices.

    Every character is replaced by the position of its first occurrence in
    the module-level ``all_letters`` string. ``str.index`` raises
    ``ValueError`` for a character that is not present.
    """
    return [all_letters.index(ch) for ch in text]

def is_valid_str(s):
    """Return True when every character of ``s`` occurs in ``all_letters``.

    An empty ``s`` is considered valid.
    """
    return all(ch in all_letters for ch in s)

def unpickle(filename):
    """Load and return the object pickled in ``filename``.

    The file is opened in binary mode and closed again before returning.
    """
    with open(filename, mode='rb') as handle:
        payload = pickle.load(handle)
    return payload

class TextImageGenerator:
    """Serve batches of word images together with their encoded text labels.

    ``build_data`` loads every image listed in ``word_level_df`` into memory;
    ``next_batch`` then yields ``(inputs, outputs)`` dictionaries indefinitely.
    Text is encoded with the module-level ``text_to_labels`` helper, which
    relies on the module-level ``all_letters`` string.
    """

    def __init__(self, word_level_df, data_path, img_width, img_height, batch_size, downsample_factor,
                 max_text_len=21+1, pre_pad=True, save_letters=True, use_s3=False):
        """Store the configuration and the list of (image path, token) samples.

        Args:
            word_level_df: DataFrame with ``image_path`` and ``token`` columns.
            data_path: stored on the instance; not otherwise used in this class.
            img_width: width every image is resized to.
            img_height: height every image is resized to.
            batch_size: number of samples per yielded batch.
            downsample_factor: divisor applied to ``img_width`` when computing
                the ``input_length`` values of a batch.
            max_text_len: number of label slots per sample.
            pre_pad: pad label rows at the front (True) or at the back (False).
            save_letters: accepted for compatibility; currently unused.
            use_s3: accepted for compatibility; currently unused.
        """
        self.data_path = data_path
        self.img_width = img_width
        self.img_height = img_height
        self.batch_size = batch_size
        self.downsample_factor = downsample_factor
        self.max_text_len = max_text_len
        self.pre_pad = pre_pad

        # Fill value for unused label slots. next_batch reads this attribute,
        # so it must exist; without it next_batch fails with AttributeError.
        # Note that 0 is also the label text_to_labels gives to the first
        # character of all_letters, so padding is only distinguishable from
        # real labels through label_length.
        self.pad_idx = 0

        # TODO: should I create DF as well?
        self.word_level_df = word_level_df

        # training data
        self.samples = self.word_level_df[['image_path', 'token']].values.tolist()
        self.N = len(self.samples)
        self.current_index = 0

    def build_data(self):
        """Load, grayscale, resize and scale every sample image into memory.

        Samples whose image cannot be read or processed are reported with
        ``print`` and dropped. Afterwards ``self.images`` has shape
        ``(N, img_height, img_width)``, ``self.texts`` holds the matching
        tokens and ``self.indexes`` is ``[0, ..., N-1]``.
        """
        # Size everything from the sample list rather than from self.N so that
        # calling build_data more than once gives the same result each time.
        total = len(self.samples)
        self.images = np.zeros((total, self.img_height, self.img_width))
        self.texts = []
        bad_records = []
        for i, (img_path, text) in enumerate(self.samples):
            try:
                # read image
                img = cv2.imread(img_path)
                # grayscale image
                img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                # resize image
                img = cv2.resize(img, (self.img_width, self.img_height))
                # change image type
                img = img.astype(np.float32)
                # scale image
                img /= 255
                # width and height are backwards from typical Keras convention
                # because width is the time dimension when it gets fed into the RNN
                self.images[i, :, :] = img
                self.texts.append(text)
            except Exception:
                # Best effort: skip unreadable images, but let
                # KeyboardInterrupt / SystemExit propagate.
                print('Image not available for image', i, img_path, text)
                bad_records.append(i)
        # update stats to remove bad records with no image data
        self.N = total - len(bad_records)
        self.indexes = list(range(self.N))
        self.images = np.delete(self.images, bad_records, axis=0)

    def get_output_size(self):
        """Return ``len(all_letters) + 1``.

        Presumably the extra class is the CTC blank label; confirm against
        the model definition.
        """
        return len(all_letters) + 1

    def next_sample(self):
        """Return the next ``(image, text)`` pair, reshuffling after each pass.

        The position is advanced before it is read, so the first call after
        construction returns the sample at position 1 of ``self.indexes``.
        Requires ``build_data`` to have been called.
        """
        self.current_index += 1
        if self.current_index >= self.N:
            self.current_index = 0
            random.shuffle(self.indexes)
        sample_idx = self.indexes[self.current_index]
        return self.images[sample_idx], self.texts[sample_idx]

    def next_batch(self):
        """Yield ``(inputs, outputs)`` batches forever.

        ``inputs`` has the keys ``the_input``, ``the_labels``,
        ``input_length`` and ``label_length``; ``outputs`` has the single key
        ``ctc`` holding zeros of length ``batch_size``.
        """
        while True:
            channels_first = K.image_data_format() == 'channels_first'
            # width and height are backwards from typical Keras convention
            # because width is the time dimension when it gets fed into the RNN
            if channels_first:
                X = np.ones([self.batch_size, 1, self.img_width, self.img_height])
            else:
                X = np.ones([self.batch_size, self.img_width, self.img_height, 1])

            y = np.ones([self.batch_size, self.max_text_len])
            input_length = np.ones((self.batch_size, 1)) * (self.img_width // self.downsample_factor - 2)
            label_length = np.zeros((self.batch_size, 1))

            for i in range(self.batch_size):
                img, text = self.next_sample()
                # Stored images are (height, width); transpose to (width, height).
                img = img.T
                if channels_first:
                    img = np.expand_dims(img, 0)
                else:
                    img = np.expand_dims(img, -1)
                X[i] = img

                y_numeric = text_to_labels(text)
                padding = [self.pad_idx] * (self.max_text_len - len(y_numeric))
                # NOTE(review): label_length counts only the real labels, but
                # with pre_pad=True those sit at the END of the row. Confirm
                # that the loss consuming 'the_labels' expects that layout.
                if self.pre_pad:
                    padded_y = padding + y_numeric
                else:
                    padded_y = y_numeric + padding
                y[i] = padded_y
                label_length[i] = len(text)

            inputs = {
                'the_input': X,
                'the_labels': y,
                'input_length': input_length,
                'label_length': label_length,
            }
            outputs = {'ctc': np.zeros([self.batch_size])}
            yield (inputs, outputs)

        