## Set the following parameters correctly
* **data_path**  : Path to the **converted** folder
* **save_path**  : Path where the processed data will be saved
* **split_test** : Percentage of the source images to hold out as test data (e.g. `20` means 20%)

In [None]:
# folder that is searched for the "*.jpg" images to process
data_path = "/media/ansary/DriveData/Work/bengalAI/datasets/__raw__/converted/"
# root folder under which the processed output is created
save_path = "/media/ansary/DriveData/Work/bengalAI/datasets/Recognition/"
# share of the source images, in percent, assigned to the test split
split_test = 20

In [None]:
import sys
sys.path.append('../')
#--------------------
# imports
#--------------------
import os 
import json
import cv2
import numpy as np
import pandas as pd 
import string
import random
from glob import glob
from tqdm.auto import tqdm
from coreLib.utils import stripPads,LOG_INFO,GraphemeParser,create_dir
# register the tqdm-backed `progress_apply` method on pandas objects
tqdm.pandas()
# fix the seed of the `random` module so that shuffling is repeatable
random.seed(42)

In [None]:
#--------------------
# GLOBALS
#--------------------
# characters that disqualify a label: ASCII letters followed by the digits 0-9
SYMBOLS=list(string.ascii_letters+string.digits)
# single grapheme parser instance reused across the notebook
GP=GraphemeParser()
#--------------------------------images2words------------------------------------------------------------
#--------------------
# helper functions
#--------------------

def extract_word_images_and_labels(img_path):
    '''
        extracts word images and labels from a given image
        args:
            img_path : path of the image; the annotation is read from the
                       file with the same base name and a ".json" extension
        returns:
            (images,labels)
            list of cropped grayscale word images and the matching list of
            label strings. Shapes whose label contains any character from
            SYMBOLS are skipped. Both lists are empty when the image
            itself cannot be read.
    '''
    imgs=[]
    labels=[]
    # json_path: swap only the extension so a "jpg" elsewhere in the path is untouched
    json_path=os.path.splitext(img_path)[0]+".json"
    # read image as grayscale
    data=cv2.imread(img_path,0)
    if data is None:
        # cv2.imread signals an unreadable file by returning None, not by raising
        LOG_INFO(f"could not read image:{img_path}",mcolor='red')
        return imgs,labels
    # label: explicit utf-8 and a context manager so the handle is always closed
    with open(json_path,'r',encoding='utf-8') as json_file:
        label_json=json.load(json_file)
    for shape in label_json['shapes']:
        # label
        label=str(shape['label'])
        # special character negation
        if any(symbol in label for symbol in SYMBOLS):
            continue
        # crop bbox: only the first two points are used as opposite corners
        xy=shape['points']
        # round to pixels and clamp at 0 so a negative coordinate cannot
        # wrap around as a negative index
        corner_x=[max(0,int(np.round(xy[k][0]))) for k in (0,1)]
        corner_y=[max(0,int(np.round(xy[k][1]))) for k in (0,1)]
        # order the corners so the crop works whichever corner comes first
        x1,x2=min(corner_x),max(corner_x)
        y1,y2=min(corner_y),max(corner_y)
        # image
        imgs.append(data[y1:y2,x1:x2])
        labels.append(label)
    return imgs,labels

In [None]:
# parallel lists, one entry per saved word image
img_idens=[]    # saved file name ("<i>.png")
img_labels=[]   # word label of the crop
src=[]          # base name of the image the crop came from
i=0             # running index used to name the saved files
main_path=create_dir(save_path,"bw")
# NOTE(review): save_path is rebound to the images folder here, so running
# this cell a second time nests a new "bw/images" inside the previous one
save_path=create_dir(main_path,"images")
LOG_INFO(save_path)
# get image paths (glob order is arbitrary; sorting keeps the numbering reproducible)
img_paths=sorted(glob(os.path.join(data_path,"*.jpg")))
# iterate
for img_path in tqdm(img_paths):
    # extract images and labels
    imgs,labels=extract_word_images_and_labels(img_path)
    for img,label in zip(imgs,labels):
        try:
            # thresh: blur, then Otsu binarisation
            blur = cv2.GaussianBlur(img,(5,5),0)
            _,img = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
            img=stripPads(img,255)
            img_save_path=os.path.join(save_path,f"{i}.png")
            # save: imwrite reports a failed write through its return value,
            # so turn it into an error instead of recording a missing file
            if not cv2.imwrite(img_save_path,img):
                raise IOError(f"could not write {img_save_path}")
            # append
            img_idens.append(f"{i}.png")
            img_labels.append(label)
            src.append(os.path.basename(img_path))
            i=i+1

        except Exception as e:
            # best effort: log the failing crop and continue with the next one
            LOG_INFO(f"error in creating image:{img_path} label:{label},error:{e}",mcolor='red')


In [None]:
# dataframe: one row per saved word image
df              =   pd.DataFrame({"filename":img_idens,"word":img_labels,"src":src})
# graphemes
df["graphemes"] =   df.word.progress_apply(GP.word2grapheme)
# drop every row that has a missing value in any column
df.dropna(inplace=True)

# test train split: done per source image, so all words cropped from one
# image end up in the same split
# sort before shuffling so the result depends only on the random state,
# not on the order in which the rows were collected
srcs=sorted(df.src.unique())
random.shuffle(srcs)
eval_len=int(len(srcs)*split_test/100)
eval_srcs=srcs[:eval_len]
# set for constant-time membership tests while labelling every row
eval_src_set=set(eval_srcs)
df["mode"]=df.src.progress_apply(lambda x: "test" if x in eval_src_set else "train")



# keep only the columns that are written out
df=df[["filename","word","graphemes","mode"]]
df

In [None]:
# write the table as "data.csv" inside main_path, without the index column
csv_path=os.path.join(main_path,"data.csv")
df.to_csv(csv_path,index=False)

In [None]:
# point the user at the folder to pass to tools/process.py
process_msg=f"IMPORTANT: PATH TO USE FOR tools/process.py:{main_path}"
LOG_INFO(process_msg,"red")