In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell execution, so edits to local source files (e.g. the `laylm` package used
# later in this notebook) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# import arrow
# import random

# from tqdm import tqdm

# from datetime import datetime
# from pandas.core.common import flatten
# from collections import OrderedDict


# import json
# import chardet

# import torch.nn as nn


In [3]:
from transformers import BertTokenizer, AutoModel
# Hugging Face checkpoint id the tokenizer is loaded from.
INDOBERT_CHECKPOINT = "indobenchmark/indobert-base-p2"
tokenizer = BertTokenizer.from_pretrained(INDOBERT_CHECKPOINT)

In [20]:
import os
import cv2 as cv
from pathlib import Path
import matplotlib.pyplot as plt

import torch
from torch.utils.data import dataset
from sklearn.model_selection import train_test_split

from laylm.data import utils 
import pandas as pd
import numpy as np
import json


class IDCardDataset(dataset.Dataset):
    """Map-style dataset over ID-card samples stored in a single directory.

    Every sample is made of three files that share the same name prefix (the
    part of the file name before the first underscore):

    * ``<name>_json.json``  -- annotation
    * ``<name>_image.jpg``  -- image
    * ``<name>_mask.jpg``   -- mask

    Only names for which all three files exist are kept. The kept samples are
    split once into a train and a test partition with a fixed seed, and the
    instance serves one of the two depending on ``mode``.

    Args:
        root: Directory containing the sample files (str or path-like).
        tokenizer: Tokenizer forwarded to
            ``utils.format_annotation_objects`` for every sample.
        labels: Stored on the instance as ``self.labels``; not used by any
            method of this class.
        mode: ``"train"`` serves the train partition; any other value serves
            the test partition.
        test_size: Forwarded to ``sklearn``'s ``train_test_split``.
        max_seq_length: Forwarded to ``utils.format_annotation_objects``.

    Raises:
        ValueError: If no complete (json, image, mask) triplet is found under
            ``root``, or if ``train_test_split`` rejects the split.
    """

    # Fixed seed so the train/test partition is identical on every run (and
    # therefore consistent between a mode='train' and a mode='test' instance).
    SPLIT_RANDOM_STATE = 1261

    def __init__(self, root, tokenizer, labels=None, mode='train',
                 test_size=0.2, max_seq_length=512):
        self.root = Path(root)
        self.tokenizer = tokenizer
        self.labels = labels
        self.mode = mode
        self.test_size = test_size
        self.max_seq_length = max_seq_length
        self._build_files()

    def _build_files(self):
        """Discover the sample files, split them and select the active frame."""
        names = self._get_names("*_json.json")
        data = self._get_filtered_files(names)
        if not data['name']:
            # Fail here with an actionable message instead of letting
            # train_test_split raise its own ValueError about n_samples=0.
            raise ValueError(
                f"No complete (json, image, mask) triplets found under {self.root}"
            )

        dframe, train, test = self._split_dataset(data)
        self.data_frame = dframe
        self.train_frame = train
        self.test_frame = test

        # Anything other than "train" selects the test partition.
        self.frame = self.train_frame if self.mode == "train" else self.test_frame

    def _split_dataset(self, data):
        """Turn the column dict into a DataFrame and split it.

        Returns:
            ``(full_frame, train_frame, test_frame)``; the two partitions are
            re-indexed from 0 so they can be addressed with ``iloc``/``len``
            independently of the original row order.
        """
        dframe = pd.DataFrame(data)
        train, test = train_test_split(dframe,
                                       test_size=self.test_size,
                                       random_state=self.SPLIT_RANDOM_STATE)
        train = train.reset_index(drop=True)
        test = test.reset_index(drop=True)
        return dframe, train, test

    @staticmethod
    def _name_of(path):
        """Return the sample name of a file: its name up to the first '_'."""
        return path.name.split("_")[0]

    def _index_by_name(self, pattern):
        """Map sample name -> path for every file under root matching pattern.

        If several files share a name prefix, the first one in sorted order
        is kept.
        """
        index = {}
        for path in self._glob_filter(pattern):
            index.setdefault(self._name_of(path), path)
        return index

    def _get_filtered_files(self, names):
        """Collect the samples in ``names`` that have all three files.

        The three file kinds are matched by sample name through per-kind
        lookup tables. They are deliberately NOT paired positionally: zipping
        the three sorted listings breaks as soon as one sample lacks one of
        its files, because every later entry is then shifted by one and no
        longer lines up with its siblings.

        Args:
            names: Iterable of sample names; order is preserved and repeated
                names are processed once.

        Returns:
            Dict of equally long lists under the keys ``'name'``,
            ``'image'``, ``'mask'`` and ``'anno'``.
        """
        data = {'name': [], 'image': [], 'mask': [], 'anno': []}

        annos = self._index_by_name("*_json.json")
        images = self._index_by_name("*_image.jpg")
        masks = self._index_by_name("*_mask.jpg")

        # dict.fromkeys de-duplicates while keeping first-seen order.
        for name in dict.fromkeys(names):
            if name in annos and name in images and name in masks:
                data['name'].append(name)
                data['image'].append(images[name])
                data['mask'].append(masks[name])
                data['anno'].append(annos[name])

        return data

    def _get_names(self, path_pattern):
        """Return the sample names of all files under root matching the pattern."""
        return [self._name_of(file) for file in self._glob_filter(path_pattern)]

    def _glob_filter(self, pattern):
        """Return the paths directly under root matching ``pattern``, sorted."""
        return sorted(self.root.glob(pattern))

    def _load_anno(self, path):
        """Parse the JSON annotation file at ``path`` and return its content."""
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default text encoding.
        with open(str(path), encoding="utf-8") as f:
            return json.load(f)

    def _load_image(self, path):
        """Read an image with OpenCV, keeping its stored channels and depth.

        Raises:
            OSError: If OpenCV cannot read the file. ``cv.imread`` signals
                failure by returning ``None`` rather than raising.
        """
        path = str(path)
        img = cv.imread(path, cv.IMREAD_UNCHANGED)
        if img is None:
            raise OSError(f"cv.imread could not read image: {path}")
        return img

    def __len__(self):
        """Number of samples in the active (train or test) partition."""
        return len(self.frame)

    def __getitem__(self, idx):
        """Load and encode the sample at position ``idx`` of the active partition.

        Returns:
            A 7-tuple ``(input_ids, input_masks, segment_ids, label_ids,
            boxes, img, mask)``. The first five are ``torch.long`` tensors
            built from the five values returned by
            ``utils.format_annotation_objects``; ``img`` and ``mask`` are
            whatever ``cv.imread`` returned for the image and mask files.
        """
        record = self.frame.iloc[idx]
        anno = self._load_anno(record['anno'])
        img = self._load_image(record['image'])
        mask = self._load_image(record['mask'])

        anno_objects = utils.format_annotation_objects(
            anno, self.tokenizer, self.max_seq_length
        )
        input_ids, input_masks, segment_ids, label_ids, boxes = anno_objects

        input_ids = torch.tensor(input_ids, dtype=torch.long)
        input_masks = torch.tensor(input_masks, dtype=torch.long)
        segment_ids = torch.tensor(segment_ids, dtype=torch.long)
        label_ids = torch.tensor(label_ids, dtype=torch.long)
        boxes = torch.tensor(boxes, dtype=torch.long)

        return (
            input_ids, input_masks,
            segment_ids, label_ids, boxes,
            img, mask
        )
        
    

In [21]:
# Build the dataset from one results folder (mode defaults to 'train') and
# fetch its first sample.
# NOTE(review): binding the instance to `dataset` shadows the
# `torch.utils.data.dataset` module imported in the class-definition cell;
# that cell re-imports it when re-run, but a distinct name would be safer.
path = 'data/results/combined/1606064001/'
dataset = IDCardDataset(root=path, tokenizer=tokenizer)
first_sample = dataset[0]
input_ids, input_masks, segment_ids, label_ids, boxes, img, mask = first_sample

In [22]:
# Show the `boxes` tensor as the cell output (bare last expression -> rich repr).
boxes

tensor([[  0,   0,   0,   0],
        [359, 755, 388, 828],
        [377, 829, 395, 857],
        ...,
        [  0,   0,   0,   0],
        [  0,   0,   0,   0],
        [  0,   0,   0,   0]])

In [23]:
from torch.utils.data.dataloader import DataLoader
# Smoke-test batching: wrap the dataset in a DataLoader with its default
# settings and pull the first collated batch.
loader = DataLoader(dataset)
first_batch = next(iter(loader))
input_ids, input_masks, segment_ids, label_ids, boxes, img, mask = first_batch

tensor([[[[ 9, 17, 30],
          [ 9, 17, 30],
          [ 9, 17, 30],
          ...,
          [ 0,  1, 11],
          [ 0,  1, 11],
          [ 0,  1, 11]],

         [[ 9, 17, 30],
          [ 9, 17, 30],
          [ 9, 17, 30],
          ...,
          [ 0,  1, 11],
          [ 0,  1, 11],
          [ 0,  1, 11]],

         [[ 9, 17, 30],
          [ 9, 17, 30],
          [ 9, 17, 30],
          ...,
          [ 0,  1, 11],
          [ 0,  1, 11],
          [ 0,  1, 11]],

         ...,

         [[20, 49, 80],
          [21, 50, 81],
          [24, 53, 84],
          ...,
          [24, 30, 43],
          [24, 29, 44],
          [24, 29, 44]],

         [[18, 47, 78],
          [20, 49, 80],
          [22, 51, 82],
          ...,
          [24, 29, 44],
          [25, 30, 45],
          [25, 30, 45]],

         [[17, 46, 77],
          [18, 47, 78],
          [20, 49, 80],
          ...,
          [24, 29, 44],
          [24, 29, 44],
          [25, 30, 45]]]], dtype=torch.uint8)