## Set the following params correctly
* **readme_txt_path**  : Path to the **README.txt** file under the **Boise State Bangla Handwriting Dataset 20200228** folder
* **save_path**  : Path to save the processed data
* **split_test** : Percentage of the source images held out as test data (e.g. `20` means 20%)

In [None]:
# location of the dataset README (adjacent literals are joined by the parser)
readme_txt_path=("/media/ansary/DriveData/Work/bengalAI/datasets/"
                 "__raw__/Boise State Bangla Handwriting Dataset 20200228/README.txt")

# folder under which the processed data is written
save_path="/media/ansary/DriveData/Work/bengalAI/datasets/Recognition/"
# percentage used for the test part of the test/train split
split_test=20

In [None]:
import sys
sys.path.append('../')
#----------------------
# imports
#----------------------
import os 
import pandas as pd 
import numpy as np
import shutil
import cv2
import matplotlib.pyplot as plt
%matplotlib inline
from coreLib.utils import stripPads,LOG_INFO,create_dir
from glob import glob
from tqdm.auto import tqdm
# enable the progress_apply method on pandas objects (used by later cells)
tqdm.pandas()
import random
# fixed seed: the shuffle behind the test/train split repeats when the cells run in the same order
random.seed(42)
# dataset root = folder that contains README.txt
base_path=os.path.dirname(readme_txt_path)
LOG_INFO(base_path)
# sanity check: the dataset root is expected to hold exactly 5 entries
assert len(os.listdir(base_path))==5,"WRONG PATH FOR README.txt"

In [None]:
def extract_info(_dir,coords,fmt):
    '''
        extracts information from boise-state annotations

        For every image matching "*.<fmt>" directly inside _dir, the text file
        "<_dir>/<coords>/<image name up to the first dot>.txt" is read.
        Each whitespace-separated line with more than 4 fields gives one row.

        args:
            _dir    :   directory that holds the images
            coords  :   name of the sub-directory of _dir that holds the .txt files
            fmt     :   image extension without the dot (e.g. "jpg","tif")
        returns:
            dataframe with the columns
            line,word,char,comp,xmin,ymin,xmax,ymax,image (one row per annotation line)
        raises:
            FileNotFoundError   :   an image has no matching .txt file
            ValueError          :   the last field of a line is not 4 comma separated integers
    '''
    columns=["line","word","char","comp","xmin","ymin","xmax","ymax","image"]
    rows=[]
    # images
    for img_path in tqdm(glob(os.path.join(_dir,f"*.{fmt}"))):
        # text path
        _iden=os.path.basename(img_path).split(".")[0]
        text_path=os.path.join(_dir,coords,f"{_iden}.txt")
        # explicit encoding: the labels must decode the same on every platform;
        # "utf-8-sig" also drops a byte-order mark at the start of the file
        with open(text_path,"r",encoding="utf-8-sig") as tf:
            for line in tf:
                parts=line.split()
                # short lines carry no full annotation
                if len(parts)<=4:
                    continue
                # last field: x,y,width,height of the box
                x,y,w,h=[int(i) for i in parts[-1].split(",")]
                rows.append({"line":parts[0].replace("\ufeff",""), # guard against a stray BOM
                             "word":parts[1],
                             "char":parts[2],
                             "comp":parts[3],
                             "xmin":x,
                             "ymin":y,
                             "xmax":x+w,
                             "ymax":y+h,
                             "image":img_path})
    # explicit columns keep the schema even when nothing was found
    return pd.DataFrame(rows,columns=columns)

def check_missing(_dir,coords,fmt):
    '''
        checks for missing data

        For every image "*.<fmt>" in _dir that has no "<_dir>/<coords>/<name>.txt",
        the image path is printed and the image is renamed after the first
        coordinate file whose file name contains the image name.

        args:
            _dir    :   directory that holds the images
            coords  :   name of the sub-directory of _dir that holds the .txt files
            fmt     :   image extension without the dot (e.g. "jpg","tif")
        returns:
            None (files may be renamed on disk)
    '''
    # the glob pattern already restricts the images to the requested format
    img_paths=glob(os.path.join(_dir,f"*.{fmt}"))
    txt_paths=glob(os.path.join(_dir,coords,"*.txt"))
    # error check
    for img_path in tqdm(img_paths):
        _iden=os.path.basename(img_path).split(".")[0]
        txt_path=os.path.join(_dir,coords,f"{_iden}.txt")
        if os.path.exists(txt_path):
            continue
        print(img_path)
        for txt in txt_paths:
            # compare against the file name only, never against the directories
            txt_name=os.path.basename(txt)
            if _iden not in txt_name:
                continue
            niden=txt_name.split('.')[0]
            target=os.path.join(_dir,f"{niden}.{fmt}")
            # os.rename may silently replace an existing file: never overwrite another image
            if os.path.exists(target):
                continue
            print(txt)
            print(f"RENAME:{_iden} to {niden}")
            os.rename(img_path,target)
            # the image is moved now, a second rename would fail
            break
                        
                        
def removeShadow(img):
    '''
        removes slowly varying shading from an image, plane by plane

        args:
            img :   image as accepted by cv2.split (presumably uint8 -- TODO confirm)
        returns:
            merged planes of 255-|plane-background|,
            a pixel equal to its background estimate becomes 255
    '''
    result_planes=[]
    for plane in cv2.split(img):
        # background estimate: 7x7 dilation followed by a median blur of size 21
        dilated_img=cv2.dilate(plane,np.ones((7,7),np.uint8))
        bg_img=cv2.medianBlur(dilated_img,21)
        # inverted distance to the background
        result_planes.append(255-cv2.absdiff(plane,bg_img))
    # NOTE: the un-normalised difference is returned (no min-max stretching)
    return cv2.merge(result_planes)

In [None]:
# NOTE(review): not the last expression of the cell, so the listing is discarded
os.listdir(base_path)
# annotation tables, one per data source; concatenated into a single table later
dfs=[]

## 1. Camera

In [None]:
# essay images under "1. Camera" (jpg) with their character coordinate files
_dir=os.path.join(base_path,'1. Camera','1. Essay')
coords='Character Coordinates_a'
fmt="jpg"
# check image/annotation names first, then collect the annotations
check_missing(_dir,coords,fmt)
dfs.append(extract_info(_dir,coords,fmt))

## 2. Scan

In [None]:
# essay images under "2. Scan" (tif) with their character coordinate files
_dir=os.path.join(base_path,'2. Scan','1. Essay')
coords='Character Coordinates_a'
fmt="tif"
# check image/annotation names first, then collect the annotations
check_missing(_dir,coords,fmt)
dfs.append(extract_info(_dir,coords,fmt))

## 3. Conjunct

In [None]:
# images under "3. Conjunct" (tif); the coordinate folder has no "_a" suffix here
_dir=os.path.join(base_path,'3. Conjunct')
coords='Character Coordinates'
fmt="tif"
# check image/annotation names first, then collect the annotations
check_missing(_dir,coords,fmt)
dfs.append(extract_info(_dir,coords,fmt))

In [None]:
# single annotation table over all sources, with a fresh 0..n-1 index
df=pd.concat(dfs,ignore_index=True)
df

In [None]:
# output root "bs" under save_path (create_dir presumably creates and returns the joined path -- confirm)
main_path=create_dir(save_path,"bs")
# NOTE(review): save_path is rebound to the images folder here, so re-running this cell
# feeds the images folder back into the call above
save_path=create_dir(main_path,"images")

In [None]:
#"filename","word","graphemes","mode"
filename=[]
graphemes=[]
mode=[]
iden=0
for img_path in tqdm(df.image.unique()):
    idf=df.loc[df.image==img_path]
    #-------------
    # image
    #-------------
    img=cv2.imread(img_path)
    
    # charmap
    cimg=removeShadow(img)
    cimg=cv2.cvtColor(cimg, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(cimg,(5,5),0)
    _,img = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
    for line in idf.line.unique():
        linedf=idf.loc[idf.line==line]
        for word in linedf.word.unique():
            wdf=linedf.loc[linedf.word==word]
            # word
            xmin=int(min(wdf.xmin.tolist()))
            xmax=int(max(wdf.xmax.tolist()))

            ymin=int(min(wdf.ymin.tolist()))
            ymax=int(max(wdf.ymax.tolist()))

            data=img[ymin:ymax,xmin:xmax]
            data=stripPads(data,255)
            
            fname=f"{iden}.png"
            filename.append(fname)
            cv2.imwrite(os.path.join(save_path,fname),data)
            graphemes.append(wdf.comp.tolist())
            mode.append(img_path.replace(base_path,""))
            iden+=1
    

In [None]:
# one row per saved word image: file name, component list and source image
data=pd.DataFrame({"filename":filename,"graphemes":graphemes,"mode":mode})
# word text = concatenation of its components
data["word"]=data["graphemes"].progress_apply(lambda x:"".join(x))
# fix the column order
data=data[["filename","word","graphemes","mode"]]
data

In [None]:
data.dropna(inplace=True)

# test train split
# the split is made over source images ("mode" still holds the image path here),
# so every word of one page lands on the same side
# NOTE(review): "mode" is overwritten below, so this cell must run once per freshly built table
srcs=list(data["mode"].unique())
random.shuffle(srcs)
# split_test is a percentage of the number of source images (rounded down)
eval_len=int(len(srcs)*split_test/100)
eval_srcs=srcs[:eval_len]
# set lookup: constant-time membership test per row instead of a list scan
_eval_lookup=set(eval_srcs)
data["mode"]=data["mode"].progress_apply(lambda x: "test" if x in _eval_lookup else "train")
data

In [None]:
# write the word table as data.csv inside main_path, without the index column
data.to_csv(os.path.join(main_path,"data.csv"),index=False)

In [None]:
# report the folder that has to be passed to tools/process.py
LOG_INFO(f"IMPORTANT: PATH TO USE FOR tools/process.py:{main_path}","red")