In [37]:
## Read the pre-processed PubLayNet test split
## and convert it to the sequence (seq) format, with the discretization step embedded

import torch
import io

# Read the whole serialized split into memory, then deserialize it from the
# in-memory buffer.
# NOTE(review): torch.load deserializes arbitrary Python objects by default;
# only load files from a trusted source.
with open('../raw_datasets/publaynet/pre_processed_20_5/test.pt', 'rb') as f:
    buffer = io.BytesIO(f.read())
    file=torch.load(buffer)

# Display the first sample to inspect its structure.
file[0]

{'name': 'PMC4055390_00006.jpg', 'bboxes': tensor([[0.0842, 0.6198, 0.3996, 0.3184],
        [0.5135, 0.6198, 0.3997, 0.3184],
        [0.2491, 0.0892, 0.4990, 0.0153],
        [0.0842, 0.4991, 0.8288, 0.0285],
        [0.0842, 0.1131, 0.8289, 0.3573],
        [0.0842, 0.5350, 0.8289, 0.0498]]), 'labels': tensor([1, 1, 1, 1, 4, 4]), 'canvas_size': [601.0, 792.0], 'filtered': False}


In [39]:
import math
import copy

class AddGaussianNoise():
    '''
    Perturb the bounding boxes of one layout sample with Gaussian noise.

    A sample is a dict with a ``'bboxes'`` tensor (one row per element) and,
    when ``normalized`` is False, a ``'canvas_size'`` pair used to scale the
    boxes before the noise is added and back afterwards. The clean boxes are
    kept under ``'gold_bboxes'``.

    If the ``std`` passed to the constructor is >= 1.0 it is not used as a
    standard deviation: it selects a "mix" mode in which ``self.std`` is
    re-drawn on every call from a small fixed set of values
    (>= 1.0: three values, >= 2.0: four values, >= 3.0: five values).
    '''

    def __init__(self,
                 mean=0.,
                 std=1.,
                 normalized: bool = True,
                 bernoulli_beta: float = 1.0):
        self.mean = mean
        # ``std`` is overwritten on each call in mix mode, while ``ori_std``
        # keeps the constructor value that selects the mode.
        self.std = std
        self.ori_std = std
        self.normalized = normalized
        # probability that an element receives noise (1.0 = every element)
        self.bernoulli_beta = bernoulli_beta
        print('Noise: mean={0}, std={1}, beta={2}'.format(
            self.mean, self.std, self.bernoulli_beta))
        if self.ori_std >= 1.0:
            print("mix noise!")

    def _resample_std(self):
        '''Re-draw ``self.std`` when a mix mode is active; otherwise do nothing.'''
        if self.ori_std >= 3.0:
            ladder = ((0.2, 0.005), (0.4, 0.01), (0.6, 0.015), (0.8, 0.02))
            last = 0.025
        elif self.ori_std >= 2.0:
            ladder = ((0.25, 0.005), (0.50, 0.01), (0.75, 0.015))
            last = 0.02
        elif self.ori_std >= 1.0:  # mix noise
            ladder = ((0.33, 0.005), (0.66, 0.01))
            last = 0.02
        else:
            return

        draw = torch.rand(1).item()
        # the first threshold that is not exceeded decides the std
        for upper, candidate in ladder:
            if draw <= upper:
                self.std = candidate
                return
        self.std = last

    def __call__(self, data):
        # keep an untouched copy of the boxes the first time the sample is seen
        if 'gold_bboxes' not in data.keys():
            data['gold_bboxes'] = copy.deepcopy(data['bboxes'])

        # per-element 0/1 switch: 1 keeps the noisy row, 0 restores the gold row
        element_count = data['bboxes'].size(0)
        noise_prob = data['bboxes'].new_ones(element_count) * self.bernoulli_beta
        noise_mask = torch.bernoulli(noise_prob).unsqueeze(dim=-1)

        self._resample_std()

        if self.normalized:
            noisy = data['bboxes'] + torch.randn(
                data['bboxes'].size()) * self.std + self.mean
        else:
            canvas_width = data['canvas_size'][0]
            canvas_height = data['canvas_size'][1]
            # columns 0 and 2 are scaled by the width, columns 1 and 3 by the height
            column_scale = (canvas_width, canvas_height,
                            canvas_width, canvas_height)
            noisy = torch.stack(
                [data['bboxes'][:, col] * scale
                 for col, scale in enumerate(column_scale)],
                dim=1)
            noisy = noisy + torch.randn(noisy.size()) * self.std + self.mean
            for col, scale in enumerate(column_scale):
                noisy[:, col] /= scale

        # clamp to the unit range
        noisy[noisy < 0] = 0.0
        noisy[noisy > 1] = 1.0
        data['bboxes'] = noisy * noise_mask + data['gold_bboxes'] * (
            1 - noise_mask)
        return data

    def __repr__(self):
        params = '(mean={0}, std={1}, beta={2})'.format(
            self.mean, self.std, self.bernoulli_beta)
        return self.__class__.__name__ + params
        
def decapulate(bbox):
    """Split a box tensor into its four coordinate components.

    Args:
        bbox (torch.Tensor): tensor whose last dimension has size 4. Any
            number of leading dimensions is accepted: a single box ``(4,)``,
            a layout ``(N, 4)``, a batch ``(B, N, 4)``, and so on.

    Returns:
        tuple of four tensors, one per coordinate, each with the shape of
        ``bbox`` minus its last dimension.

    Raises:
        ValueError: if the last dimension does not have exactly 4 entries.
    """
    # unbind over the last dim gives the same views as ``bbox.T`` (rank 2) and
    # ``bbox.permute(2, 0, 1)`` (rank 3), and also covers every other rank.
    x1, y1, x2, y2 = bbox.unbind(dim=-1)
    return x1, y1, x2, y2
    
def convert_ltwh_to_ltrb(bbox):
    """Convert boxes from (left, top, width, height) to (left, top, right, bottom).

    The four components are taken with ``decapulate`` and re-stacked along a
    new last dimension, with right = left + width and bottom = top + height.
    """
    left, top, width, height = decapulate(bbox)
    right, bottom = left + width, top + height
    return torch.stack((left, top, right, bottom), dim=-1)

class DiscretizeBoundingBox():
    """Quantize continuous box coordinates onto an integer grid.

    Coordinates are clipped to [0, 1], multiplied by ``num_grid - 1`` for the
    matching axis and floored, so the resulting indices lie in
    ``[0, num_grid - 1]``.
    """

    def __init__(self, num_x_grid: int, num_y_grid: int) -> None:
        self.num_x_grid, self.num_y_grid = num_x_grid, num_y_grid
        # largest bin index along each axis
        self.max_x = num_x_grid - 1
        self.max_y = num_y_grid - 1

    def discretize(self, bbox):
        """
        Args:
            continuous_bbox torch.Tensor: N * 4
        Returns:
            discrete_bbox torch.LongTensor: N * 4
        """
        x1, y1, x2, y2 = decapulate(torch.clip(bbox, min=0.0, max=1.0))
        # x components use the x grid, y components use the y grid
        scaled = (x1 * self.max_x, y1 * self.max_y,
                  x2 * self.max_x, y2 * self.max_y)
        binned = torch.stack([torch.floor(part) for part in scaled], dim=-1)
        return binned.long()

    def discretize_num(self, num: float) -> int:
        """Quantize a single scalar; the y-axis grid size is always used."""
        scaled = num * self.max_y
        return int(math.floor(scaled))

    def __call__(self, data):
        """Store the quantized ``data['bboxes']`` under ``'discrete_bboxes'``."""
        data['discrete_bboxes'] = self.discretize(data['bboxes'])
        return data

class LexicographicSort():
    '''
    Sort the elements of one layout by their top, then left, position.

    The sample dict is updated in place and returned:
      * ``'gold_bboxes'`` is created from ``'bboxes'`` if absent;
      * ``'ori_bboxes'`` / ``'ori_labels'`` keep the gold boxes and labels in
        their order before sorting;
      * ``'bboxes'``, ``'labels'`` and ``'gold_bboxes'`` are reordered.

    Elements with identical (top, left) keep their relative order. A layout
    with zero elements is returned with empty tensors instead of raising.
    '''

    def __call__(self, data):
        if 'gold_bboxes' not in data.keys():
            data['gold_bboxes'] = copy.deepcopy(data['bboxes'])

        boxes = data['bboxes']
        # column 0 is left, column 1 is top
        lefts = boxes[:, 0].tolist()
        tops = boxes[:, 1].tolist()
        # Sorting positions (rather than zipping and unzipping sorted pairs)
        # stays well-defined when there are no elements at all.
        order = sorted(range(len(tops)),
                       key=lambda pos: (tops[pos], lefts[pos]))
        # An explicit long tensor keeps the index dtype valid even when empty.
        idx = torch.tensor(order, dtype=torch.long)

        data['ori_bboxes'], data['ori_labels'] = data['gold_bboxes'], data[
            'labels']
        data['bboxes'], data['labels'] = boxes[idx], data['labels'][idx]
        data['gold_bboxes'] = data['gold_bboxes'][idx]
        return data

class LabelDictSort():
    '''
    Sort the elements of one layout by the name of their label.

    ``index2label`` maps each integer label id to the value used as the sort
    key; it is subscripted on every call, so it must be provided.
    '''
    def __init__(self, index2label=None):
        self.index2label = index2label

    def __call__(self, data):
        # NOTE: for refinement -- keep a copy of the clean boxes
        if 'gold_bboxes' not in data.keys():
            data['gold_bboxes'] = copy.deepcopy(data['bboxes'])

        label_ids = data['labels'].tolist()
        sort_keys = [self.index2label[label_id] for label_id in label_ids]
        # sorted() is stable, so elements sharing a label keep their order
        order = sorted(range(len(label_ids)), key=lambda pos: sort_keys[pos])

        reordered_boxes = data['bboxes'][order]
        reordered_labels = data['labels'][order]
        data['bboxes'] = reordered_boxes
        data['labels'] = reordered_labels
        data['gold_bboxes'] = data['gold_bboxes'][order]
        return data

In [40]:
# Category names. Label ids in the samples are 1-based: label id k corresponds
# to labels[k - 1].
labels = [
    'text',
    'title',
    'list',
    'table',
    'figure',
]

In [41]:
# Map 1-based label ids to category names. Deriving the map from `labels`
# itself (instead of a hard-coded count) keeps the two in sync.
idx2label = dict(enumerate(labels, start=1))

# Available element orderings: by (top, left) position, or by label name.
sort=LexicographicSort()
sort_type=LabelDictSort(index2label=idx2label)

# Quantize coordinates onto a 128 x 128 grid.
discrete_fn=DiscretizeBoundingBox(128,128)


In [42]:
gaussian=AddGaussianNoise(mean=0,std=0.005,normalized=True,bernoulli_beta=1.0)

for i in range(len(file)):

    # This cell mutates `file` in place. If a sample was already processed by
    # a previous run (it carries 'discrete_bboxes'), start again from its clean
    # ltwh boxes so that re-running the cell does not stack noise on noise or
    # apply the ltwh -> ltrb conversion to boxes that are already ltrb.
    if 'discrete_bboxes' in file[i] and 'gold_bboxes' in file[i]:
        file[i]['bboxes']=file[i]['gold_bboxes'].clone()

    ### sort ###
    # ordering by label name; swap in the line below for (top, left) ordering
    file[i]=sort_type(file[i])
    # file[i]=sort(file[i])

    ### noise for refine ###
    file[i]=gaussian(file[i])

    ### ltrb ###
    file[i]['bboxes']=convert_ltwh_to_ltrb(file[i]['bboxes'])

    ### discretize ###
    file[i]=discrete_fn(file[i])
    
file[0]

Noise: mean=0, std=0.005, beta=1.0
{'name': 'PMC4055390_00006.jpg', 'bboxes': tensor([[0.0790, 0.1139, 0.9091, 0.4752],
        [0.0889, 0.5447, 0.9125, 0.5990],
        [0.0828, 0.6254, 0.4760, 0.9430],
        [0.5128, 0.6241, 0.9155, 0.9426],
        [0.2496, 0.0859, 0.7424, 0.1052],
        [0.0888, 0.4993, 0.9246, 0.5284]]), 'labels': tensor([4, 4, 1, 1, 1, 1]), 'canvas_size': [601.0, 792.0], 'filtered': False, 'gold_bboxes': tensor([[0.0842, 0.1131, 0.8289, 0.3573],
        [0.0842, 0.5350, 0.8289, 0.0498],
        [0.0842, 0.6198, 0.3996, 0.3184],
        [0.5135, 0.6198, 0.3997, 0.3184],
        [0.2491, 0.0892, 0.4990, 0.0153],
        [0.0842, 0.4991, 0.8288, 0.0285]]), 'discrete_bboxes': tensor([[ 10,  14, 115,  60],
        [ 11,  69, 115,  76],
        [ 10,  79,  60, 119],
        [ 65,  79, 116, 119],
        [ 31,  10,  94,  13],
        [ 11,  63, 117,  67]])}


In [43]:
# Serialize every sample as
#   "<label> x1 y1 x2 y2 | <label> x1 y1 x2 y2 | ..."
all_layout=[]

for sample in file:
    # One bulk tensor -> Python conversion per sample instead of one tiny
    # tensor operation per coordinate; this also avoids rebinding `_`, which
    # IPython uses for the last cell output.
    label_ids = sample['labels'].tolist()
    boxes = sample['discrete_bboxes'].tolist()
    elements = [
        ' '.join([labels[label_id - 1]] + [str(coord) for coord in box])
        for label_id, box in zip(label_ids, boxes)
    ]
    all_layout.append(' | '.join(elements))

all_layout[0]

In [45]:
# Write one layout string per line. The `with` block closes the file even if
# a write fails part-way through.
with open("../data/processed_datasets/PublayNet_ltrb_lex_refine_0.005/src1_test.txt",'w+') as data:
    for layout in all_layout:
        print(layout,file=data)