# Store Dataset

In [1]:
import cv2
import os
from glob import glob
from tqdm.auto import tqdm
import shutil
import matplotlib.pyplot as plt
import numpy as np
import math
import os.path as osp

#-------------------------------
ds_dir   ="/backup/Recognition/NAT/"
src_dir  ="/backup/RAW/DET/DBNet/"
#-------------------------------
def create_dir(base,ext):
    '''
        creates a directory extending base
        args:
            base    =   base path 
            ext     =   the folder to create
    '''
    _path=os.path.join(base,ext)
    if not os.path.exists(_path):
        os.mkdir(_path)
    return _path
#-------------------------------
ds_dir           = create_dir(ds_dir,"scene")
img_dir          = create_dir(ds_dir,"images")
data_csv         = os.path.join(ds_dir,"data.csv")
data_txt         = os.path.join(ds_dir,"data.txt") 
ds_idens = os.listdir(src_dir)
ds_idens

['mlt2017train',
 'icdar2015train',
 'sorieTest',
 'icdar2015test',
 'sorieTrain',
 'totaltext',
 'wildreceipt',
 'mlt2017eval',
 'funsd',
 'ctw']

In [2]:
def get_mini_boxes(contour):
    if not contour.size:
        return None
    bounding_box = cv2.minAreaRect(contour)
    points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

    index_1, index_2, index_3, index_4 = 0, 1, 2, 3
    if points[1][1] > points[0][1]:
        index_1 = 0
        index_4 = 1
    else:
        index_1 = 1
        index_4 = 0
    if points[3][1] > points[2][1]:
        index_2 = 2
        index_3 = 3
    else:
        index_2 = 3
        index_3 = 2

    box = [points[index_1], points[index_2],
        points[index_3], points[index_4]]
    
    return np.array(box,dtype=np.float32)

def get_rotate_crop_image(img, points):
        # Use Green's theory to judge clockwise or counterclockwise
        # author: biyanhua
        d = 0.0
        for index in range(-1, 3):
            d += -0.5 * (points[index + 1][1] + points[index][1]) * (
                        points[index + 1][0] - points[index][0])
        if d < 0: # counterclockwise
            tmp = np.array(points)
            points[1], points[3] = tmp[3], tmp[1]

        img_crop_width = int(
            max(
                np.linalg.norm(points[0] - points[1]),
                np.linalg.norm(points[2] - points[3])))
        img_crop_height = int(
            max(
                np.linalg.norm(points[0] - points[3]),
                np.linalg.norm(points[1] - points[2])))
        pts_std = np.float32([[0, 0], [img_crop_width, 0],
                            [img_crop_width, img_crop_height],
                            [0, img_crop_height]])
        M = cv2.getPerspectiveTransform(points, pts_std)
        dst_img = cv2.warpPerspective(
            img,
            M, (img_crop_width, img_crop_height),
            borderMode=cv2.BORDER_REPLICATE,
            flags=cv2.INTER_CUBIC)
        dst_img_height, dst_img_width = dst_img.shape[0:2]
        if dst_img_height * 1.0 / dst_img_width >= 1.5:
            dst_img = np.rot90(dst_img)
        return dst_img

def extract_data(img_path,ds_iden):
    img=cv2.imread(img_path)
    # gt_path
    if ds_iden in ['sorieTest','sorieTrain']:
        gt_path=img_path.replace("images","gts")
        gt_path=gt_path.split(".")[0]+".txt"
        
    elif ds_iden in ['mlt2017train','mlt2017eval','icdar2015test']:
        gt_path=img_path.replace("images","gts").replace("img","gt_img")
        gt_path=gt_path.split(".")[0]+".txt"
        
    else: #ctw,funsd,icdar2015train,'totaltext',"wildreceipt"
        gt_path=img_path.replace("images","gts")+".txt"

    
    # 8 lenght xy
    len8s=["wildreceipt",
           'mlt2017train',
           'mlt2017eval',
           'icdar2015train',
           'icdar2015test',
           'sorieTrain',
           'sorieTest',
           'funsd']
    
    lines = []
    # ann
    reader = open(gt_path, 'r').readlines()
    for line in reader:
        item = {}
        if ds_iden=="ctw":
            parts=line.strip().split("####")
            label=parts[-1].replace(",","*")
            line=parts[0]+label
    
        
        parts=line.strip().split(",")
        # mlt2017
        if "mlt2017" in ds_iden:
            lang=parts[8]
            if lang not in ["Bangla","Latin"]:
                continue
            label=line.split(f"{lang},")[-1]
        elif ds_iden in len8s:
            label="".join(parts[8:])    
        else:
            label = parts[-1]
        
        # wildreceipt
        if ds_iden=='wildreceipt':
            if len(label)==0:
                continue
        label=label.strip()
        # conversion
        if label=="###":
            continue
        
        #--> poly
        line = [i.strip('\ufeff').strip('\xef\xbb\xbf') for i in parts]
        if  ds_iden in len8s:
            poly = np.array(list(map(np.float32, line[:8]))).reshape((-1, 2))
            crop=get_rotate_crop_image(img, poly)
        else:
            num_points = math.floor((len(line) - 1) / 2) * 2
            poly = np.array(list(map(np.float32, line[:num_points]))).reshape((-1,1, 2))
            if poly.shape[0] < 3:
                continue
            box=get_mini_boxes(poly)
            if box is None:
                continue
            crop=get_rotate_crop_image(img, box)
                
        item['crop'] = crop
        item['text'] = label
    
        lines.append(item)
    
    return lines
    

In [3]:
fiden=0
filepaths=[]
words=[]
def process_dataset(ds_path,ds_iden):
    global fiden
    filepaths=[]
    words=[]
    dataset_path=os.path.join(ds_path,ds_iden)
    img_paths=[img_path for img_path in tqdm(glob(os.path.join(dataset_path,"images","*.*")))]
    # extract anns 
    for img_path in tqdm(img_paths):
        try:
            lines=extract_data(img_path,ds_iden)
            if len(lines)>0:
                for line in lines:
                    filepath= os.path.join(img_dir,f"{fiden}.png")
                    img=line["crop"]
                    word=line["text"]
                    # save
                    cv2.imwrite(filepath,img)
                    filepaths.append(filepath)
                    words.append(word)
                    with open(data_txt,"a+") as f:
                        f.write(f"{fiden}.png,{word}\n")
                    fiden+=1
        except Exception as e:
            print("-------------------------------")
            print(ds_iden,":",img_path)
            print(e)
            print("-------------------------------")
    return filepaths,words
   



In [4]:
for ds_iden in ds_idens:
    print(ds_iden)
    _filepaths,_words=process_dataset(src_dir,ds_iden)
    filepaths+=_filepaths
    words+=_words


mlt2017train


  0%|          | 0/7200 [00:00<?, ?it/s]

  0%|          | 0/7200 [00:00<?, ?it/s]

-------------------------------
mlt2017train : /backup/RAW/DET/DBNet/mlt2017train/images/img_1188.gif
OpenCV(4.5.1) /tmp/pip-req-build-ms668fyv/opencv/modules/imgproc/src/imgwarp.cpp:3144: error: (-215:Assertion failed) _src.total() > 0 in function 'warpPerspective'

-------------------------------
-------------------------------
mlt2017train : /backup/RAW/DET/DBNet/mlt2017train/images/img_401.gif
OpenCV(4.5.1) /tmp/pip-req-build-ms668fyv/opencv/modules/imgproc/src/imgwarp.cpp:3144: error: (-215:Assertion failed) _src.total() > 0 in function 'warpPerspective'

-------------------------------
-------------------------------
mlt2017train : /backup/RAW/DET/DBNet/mlt2017train/images/img_478.gif
OpenCV(4.5.1) /tmp/pip-req-build-ms668fyv/opencv/modules/imgproc/src/imgwarp.cpp:3144: error: (-215:Assertion failed) _src.total() > 0 in function 'warpPerspective'

-------------------------------
-------------------------------
mlt2017train : /backup/RAW/DET/DBNet/mlt2017train/images/img_1187.gif


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

sorieTest


  0%|          | 0/347 [00:00<?, ?it/s]

  0%|          | 0/347 [00:00<?, ?it/s]

-------------------------------
sorieTest : /backup/RAW/DET/DBNet/sorieTest/images/X51006619503.jpg
'utf-8' codec can't decode byte 0xa3 in position 407: invalid start byte
-------------------------------
-------------------------------
sorieTest : /backup/RAW/DET/DBNet/sorieTest/images/X51006328967.jpg
could not convert string to float: ''
-------------------------------
icdar2015test


  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

sorieTrain


  0%|          | 0/712 [00:00<?, ?it/s]

  0%|          | 0/712 [00:00<?, ?it/s]

-------------------------------
sorieTrain : /backup/RAW/DET/DBNet/sorieTrain/images/X51005605333(1).jpg
[Errno 2] No such file or directory: '/backup/RAW/DET/DBNet/sorieTrain/gts/X51005605333(1).txt'
-------------------------------
-------------------------------
sorieTrain : /backup/RAW/DET/DBNet/sorieTrain/images/X51005685355(2).jpg
[Errno 2] No such file or directory: '/backup/RAW/DET/DBNet/sorieTrain/gts/X51005685355(2).txt'
-------------------------------
-------------------------------
sorieTrain : /backup/RAW/DET/DBNet/sorieTrain/images/X51005433492(1).jpg
[Errno 2] No such file or directory: '/backup/RAW/DET/DBNet/sorieTrain/gts/X51005433492(1).txt'
-------------------------------
-------------------------------
sorieTrain : /backup/RAW/DET/DBNet/sorieTrain/images/X51007339647(1).jpg
[Errno 2] No such file or directory: '/backup/RAW/DET/DBNet/sorieTrain/gts/X51007339647(1).txt'
-------------------------------
-------------------------------
sorieTrain : /backup/RAW/DET/DBNet/s

  0%|          | 0/1555 [00:00<?, ?it/s]

  0%|          | 0/1555 [00:00<?, ?it/s]

wildreceipt


  0%|          | 0/1739 [00:00<?, ?it/s]

  0%|          | 0/1739 [00:00<?, ?it/s]

mlt2017eval


  0%|          | 0/1799 [00:00<?, ?it/s]

  0%|          | 0/1799 [00:00<?, ?it/s]

funsd


  0%|          | 0/199 [00:00<?, ?it/s]

  0%|          | 0/199 [00:00<?, ?it/s]

ctw


  0%|          | 0/1500 [00:00<?, ?it/s]

  0%|          | 0/1500 [00:00<?, ?it/s]

-------------------------------
ctw : /backup/RAW/DET/DBNet/ctw/images/0478.jpg
could not convert string to float: 'NEW YORK'
-------------------------------


In [5]:
import pandas as pd
df=pd.DataFrame({"filepath":filepaths,"word":words})
df.to_csv(data_csv,index=False)