# Params

In [None]:
# Height and width (pixels) used by this notebook; img_height is the height
# word images are resized to and the font size used for rendering targets.
img_height, img_width = 64, 512
# Root folder of the Boise State data ("words" folder and "labels.csv").
boise_state_data_path = "/media/ansary/DriveData/Work/bengalAI/datasets/imgOCR/boise_state/"
# Folder under which the "images", "targets" and "tfrecords" outputs are created.
save_dir = "/media/ansary/DriveData/Work/bengalAI/datasets/imgOCR/boise_state/"


# Imports and Resources

In [None]:
#---------------------
# import
#---------------------
import PIL
import PIL.Image , PIL.ImageDraw , PIL.ImageFont 
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2
import tensorflow as tf
%matplotlib inline


from glob import glob
from ast import literal_eval
from tqdm.auto import tqdm
tqdm.pandas()

from coreLib.store import genTFRecords
from coreLib.utils import create_dir,correctPadding,stripPads
#---------------------
# resources
#---------------------
# dataset inputs: label table and the folder holding the word images
label_csv=os.path.join(boise_state_data_path,"labels.csv")
words_dir=os.path.join(boise_state_data_path,"words")
# output folders for the (image, target) pairs
img_dir=create_dir(save_dir,"images")
tgt_dir=create_dir(save_dir,"targets")
# output folder for the tfrecords
rec_dir=create_dir(save_dir,"tfrecords")
# label table
df=pd.read_csv(label_csv)
# the labels column is stored as text; parse each cell back into a Python literal
df["labels"]=df["labels"].progress_apply(literal_eval)
# absolute path of every word image
df["img_path"]=df["filename"].progress_apply(lambda name: os.path.join(words_dir,name))
df

# Font: "Bangla.ttf" is looked up in the current working directory
font_path=os.path.join(os.getcwd(),"Bangla.ttf")
# font sized to the image height, used to render the target text
font=PIL.ImageFont.truetype(font_path, size=img_height)

# Create Image->Word (target data)

In [None]:
# Builds one (image, target) pair per dataframe row:
#   image  : the binarised word image, resized to img_height
#   target : the label text rendered with `font`, resized to the image size
# Rows that fail are reported and skipped.
# `i` counts the pairs written so far and doubles as the output file stem,
# so images/<i>.png and targets/<i>.png always belong together.
i=0
# extra margin (pixels) added to the side of the square canvas the text is drawn on
min_offset=100
# columns are read by name so the loop does not depend on the csv column order
for img_path,comps in tqdm(zip(df["img_path"],df["labels"]),total=len(df)):
    try:
        # image and label
        img=cv2.imread(img_path,0)
        # cv2.imread signals an unreadable/missing file by returning None
        if img is None:
            raise FileNotFoundError("could not read image")
        # binarise: everything that is not pure white becomes 0
        img[img<255]=0
        # NOTE(review): img is presumably uint8 here, so this subtraction wraps
        # (0 -> 1, 255 -> 0) and yields a 0/1 mask. If an inverted 0/255 image
        # was intended this should be `255-img` -- confirm against
        # correctPadding and the downstream consumers before changing.
        img=img-255
        # resize (height based, aspect ratio preserved)
        h,w=img.shape
        width=int(img_height*w/h)
        img=cv2.resize(img,(width,img_height),fx=0,fy=0, interpolation = cv2.INTER_NEAREST)
        #----------------------
        # target
        #----------------------
        label="".join(comps)
        # shape of the resized image
        h,w=img.shape
        max_dim=h+w+min_offset
        # draw the label in white on a black square canvas
        image = PIL.Image.new(mode='L', size=(max_dim,max_dim))
        draw = PIL.ImageDraw.Draw(image)
        draw.text(xy=(0, 0), text=label, fill=255, font=font)
        # create target (stripPads is a coreLib helper; presumably it crops the
        # empty border around the drawn text -- verify in coreLib.utils)
        tgt=np.array(image)
        tgt=stripPads(tgt,0)
        # resize the target to the image size
        tgt=cv2.resize(tgt,(w,h),fx=0,fy=0, interpolation = cv2.INTER_NEAREST)

        # pad correction (coreLib helper, applied identically to both)
        img=correctPadding(img)
        tgt=correctPadding(tgt)
        # save
        cv2.imwrite(os.path.join(img_dir,f"{i}.png"),img)
        cv2.imwrite(os.path.join(tgt_dir,f"{i}.png"),tgt)
        i+=1
    except Exception as e:
        # best effort: report which sample failed and continue with the next one
        print(f"{img_path}: {e}")
   

In [None]:
# ---------------------------------------------------------
# globals
# ---------------------------------------------------------
# Maximum number of image/target pairs stored in one tfrecord file;
# genTFRecords slices the list of image paths into chunks of this size.
DATA_NUM  = 1024

#---------------------------------------------------------------
def _bytes_feature(value):
    '''
      Wraps a single value in a tf.train.Feature backed by a one-element BytesList
      args:
        value   :   the payload to store (callers in this file pass raw file bytes)
    '''
    bytes_list=tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)


def _target_path_for(image_path):
    '''
      Derives the target path that belongs to an image path.
      Only the last directory component that is exactly 'images' is renamed to
      'targets'; parent folders and the file name are left untouched even if
      they contain the text 'images'.
      args:
        image_path  :   path of an image stored under an 'images' folder
      returns:
        the corresponding path under the sibling 'targets' folder
    '''
    directory,file_name=os.path.split(str(image_path))
    parts=directory.split(os.sep)
    for pos in reversed(range(len(parts))):
        if parts[pos]=='images':
            parts[pos]='targets'
            return os.path.join(os.sep.join(parts),file_name)
    # no directory component is exactly 'images': keep the previous behaviour
    # (plain substring replacement) so existing layouts continue to work
    return str(image_path).replace('images','targets')


def to_tfrecord(image_paths,save_dir,r_num):
    '''
      Creates one tfrecord from the provided image paths.
      Every example holds the raw file bytes of an image ('image') and of its
      target ('target').
      args:
        image_paths     :   specific number of image paths
        save_dir        :   location to save the tfrecords
        r_num           :   record number (used as the file name: <r_num>.tfrecord)
    '''
    # record name
    tfrecord_name='{}.tfrecord'.format(r_num)
    # path
    tfrecord_path=os.path.join(save_dir,tfrecord_name)
    with tf.io.TFRecordWriter(tfrecord_path) as writer:
        for image_path in image_paths:
            target_path=_target_path_for(image_path)
            # image
            with open(image_path,'rb') as fid:
                image_bytes=fid.read()
            # target
            with open(target_path,'rb') as fid:
                target_bytes=fid.read()

            data ={ 'image':_bytes_feature(image_bytes),
                    'target':_bytes_feature(target_bytes),
            }
            # write
            features=tf.train.Features(feature=data)
            example= tf.train.Example(features=features)
            serialized=example.SerializeToString()
            writer.write(serialized)


def genTFRecords(_paths,mode_dir):
    '''
        Splits the image paths into consecutive chunks of DATA_NUM entries and
        writes one tfrecord per chunk (records are numbered 0, 1, 2, ...)
        args:
            _paths    :   all image paths for a mode
            mode_dir  :   location to save the tfrecords
    '''
    chunk_starts=range(0,len(_paths),DATA_NUM)
    for r_num,start in enumerate(tqdm(chunk_starts)):
        # paths of this record (the last chunk may be shorter)
        chunk=_paths[start:start+DATA_NUM]
        # create tfrecord
        to_tfrecord(chunk,mode_dir,r_num)

In [None]:
# paths
# glob returns matches in arbitrary order; sorting keeps the content of each
# tfrecord identical between runs
img_paths=sorted(glob(os.path.join(img_dir,"*.*")))
# tfrecords
genTFRecords(img_paths,rec_dir)