## Set the following params correctly
* **data_path**  : Path to the folder that contains the sub-folders named **1 to 150**
* **save_path**  : Path to save the processed data
* **split_test** : Percentage of the data to hold out as test data  

In [None]:
# Folder holding the raw dataset; it is expected to contain the sub-folders named 1 ... 150
data_path="/media/ansary/DriveData/Work/bengalAI/datasets/__raw__/Dataset"
# Root folder under which the processed data is written
save_path="/media/ansary/DriveData/Work/bengalAI/datasets/Recognition/"
# Percentage (0-100) used to size the test split
split_test=20

In [None]:
import sys
sys.path.append('../')
#--------------------
# imports
#--------------------
import os 
import json
import cv2
import numpy as np
import pandas as pd 
import string
import random
from glob import glob
from tqdm.auto import tqdm
from shutil import copyfile
from PIL import Image, ImageEnhance                                                                
from coreLib.utils import stripPads,LOG_INFO,GraphemeParser,create_dir,WordCleaner
# Enable the `progress_apply` methods on pandas objects (used in later cells)
tqdm.pandas()
# Fixed seed so that the shuffle used for the train/test split is repeatable
random.seed(42)
# Fail fast on a wrong data_path. An explicit raise is used instead of `assert`
# because assertions are removed when Python runs with -O.
_n_entries=len(os.listdir(data_path))
if _n_entries!=150:
    raise ValueError(f"WRONG data_path for folders: expected 150 entries, found {_n_entries}")

In [None]:
#---------------------
# process xlsx
#---------------------
# Every numbered folder <n> carries one sheet "<n>/<n>.xlsx". The identifier and
# label headers are not spelled the same way in all sheets, so the header that
# is actually present is picked per sheet.
dfs=[]
for i in tqdm(range(1,151)):
    sheet=pd.read_excel(os.path.join(data_path,f"{i}",f"{i}.xlsx"))
    id_col="Id" if "Id" in sheet.columns else "ID"
    word_col="Word" if "Word" in sheet.columns else "word"
    # keep only the two columns of interest under uniform names
    dfs.append(pd.DataFrame({"mode":sheet[id_col].tolist(),
                             "word":sheet[word_col].tolist()}))
# one frame for all sheets, with a fresh 0..N-1 index
df=pd.concat(dfs,ignore_index=True)
df

In [None]:
#----------------------------
# graphemes and cleaning
#----------------------------
GP=GraphemeParser()
WC=WordCleaner()
# Run every label through the word cleaner as text, then drop the rows that
# hold missing values.
# NOTE(review): presumably WC.clean returns None for unusable words -- confirm.
df["word"]=df["word"].progress_apply(lambda label: WC.clean(str(label)))
df.dropna(inplace=True)
# Decompose each remaining word with the grapheme parser
df["graphemes"]=df["word"].progress_apply(GP.word2grapheme)
df

In [None]:
#----------------------------
# valid images
#----------------------------
# Collect the paths of all images whose base name (the text before the first
# ".") is one of the identifiers kept in df["mode"].
# The identifiers are held in a set: membership is tested once per image, and a
# list would be scanned in full on every test.
idens=set(df["mode"].tolist())
valid=[]
for i in tqdm(range(1,151)):
    folder=os.path.join(data_path,f"{i}","Words")
    # the pattern matches files one directory level below "Words"
    for src in glob(os.path.join(folder,"*/*.*")):
        base=os.path.basename(src).split(".")[0]
        if base in idens:
            valid.append(src)



In [None]:
# NOTE(review): create_dir comes from coreLib.utils; presumably it creates the
# named sub-folder when missing and returns its path -- confirm.
main_path=create_dir(save_path,"bh")
# save_path is rebound here: from now on it refers to the "images" folder under main_path
save_path=create_dir(main_path,"images")

In [None]:
def removeShadow(img):
    """
    Suppress shadows / uneven background in an image, plane by plane.

    For every plane returned by ``cv2.split`` a background estimate is built
    (7x7 dilation followed by a median blur of aperture 21) and the plane is
    replaced by ``255 - absdiff(plane, background)``.

    args:
        img : image array accepted by cv2.split
              (assumes 8-bit data, as needed by the 21-wide median blur -- TODO confirm)
    returns:
        the merged per-plane difference images; no min-max normalisation is applied
    """
    # structuring element is the same for every plane, so build it once
    kernel = np.ones((7,7), np.uint8)

    result_planes = []
    for plane in cv2.split(img):
        # dilation followed by a median blur gives the background estimate
        dilated_img = cv2.dilate(plane, kernel)
        bg_img = cv2.medianBlur(dilated_img, 21)
        # pixels close to the background estimate end up close to 255
        result_planes.append(255 - cv2.absdiff(plane, bg_img))

    return cv2.merge(result_planes)




In [None]:
# Write every valid image to <save_path>/<running index>.png and collect the
# matching label information for the final data frame.
iden=0

filename=[]
word=[]
graphemes=[]
mode=[]

# identifier -> (word, graphemes), built once so that each image costs a single
# dict lookup and not a scan of the whole frame. setdefault keeps the first row
# when an identifier occurs more than once.
records={}
for _iden,_w,_g in zip(df["mode"],df["word"],df["graphemes"]):
    records.setdefault(_iden,(_w,_g))

for img_path in tqdm(valid):
    img=cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None when the file cannot be read or decoded;
        # such files are reported and left out and do not abort the export
        LOG_INFO(f"skipping unreadable image:{img_path}","red")
        continue
    # base: file name up to the first "."
    base=os.path.basename(img_path).split(".")[0]
    _word,_graphemes=records[base]
    # source tag: the identifier text before the first "_"
    _mode=base.split("_")[0]
    fname=f"{iden}.png"

    cv2.imwrite(os.path.join(save_path,fname),img)
    filename.append(fname)
    word.append(_word)
    graphemes.append(_graphemes)
    mode.append(_mode)

    # the index advances only for images that were written
    iden+=1
    

In [None]:
# Assemble the per-image lists into one frame; the column order is
# filename, word, graphemes, mode.
columns={
    "filename":filename,
    "word":word,
    "graphemes":graphemes,
    "mode":mode,
}
data=pd.DataFrame(columns)
data

In [None]:
data.dropna(inplace=True)

# test train split
# The split is made over the distinct values of "mode" and not over single rows,
# so all rows that share a value land in the same partition.
srcs=list(data["mode"].unique())
random.shuffle(srcs)
# number of distinct values assigned to the test partition (split_test is a percentage)
eval_len=int(len(srcs)*split_test/100)
# held as a set because membership is tested once per row below
eval_srcs=set(srcs[:eval_len])
# "mode" is overwritten with the partition name
data["mode"]=data["mode"].progress_apply(lambda x: "test" if x in eval_srcs else "train")
data


In [None]:
# Store the final table as data.csv in main_path, without the index column
data.to_csv(os.path.join(main_path,"data.csv"),index=False)

In [None]:
# Report the output folder that has to be handed to tools/process.py
# NOTE(review): the second argument is presumably the colour of the message -- confirm in coreLib.utils
LOG_INFO(f"IMPORTANT: PATH TO USE FOR tools/process.py:{main_path}","red")