In [1]:
import json
import pandas as pd
import numpy as np
import os
from tqdm import tqdm_notebook as tqdm

# Analysis

In [2]:
with open('train/receipt_00329.json') as json_file:
    data = json.load(json_file)

In [3]:
data.keys()

dict_keys(['dontcare', 'valid_line', 'meta', 'roi', 'repeating_symbol'])

In [4]:
data['meta']

{'version': 'v0.1',
 'split': 'train',
 'image_id': 329,
 'image_size': {'width': 1836, 'height': 3264}}

In [5]:
data['roi']

{}

In [6]:
data['repeating_symbol']

[]

In [7]:
len(data['valid_line'])

9

In [8]:
data['valid_line'][0]

{'words': [{'quad': {'x2': 958,
    'y3': 2094,
    'x3': 958,
    'y4': 2094,
    'x1': 819,
    'y1': 2033,
    'x4': 819,
    'y2': 2033},
   'is_key': 1,
   'row_id': 2163225,
   'text': 'TOTAL'},
  {'quad': {'x2': 1373,
    'y3': 2101,
    'x3': 1373,
    'y4': 2101,
    'x1': 1173,
    'y1': 1989,
    'x4': 1173,
    'y2': 1989},
   'is_key': 0,
   'row_id': 2163225,
   'text': '30.000'}],
 'category': 'total.total_price',
 'group_id': 26}

In [9]:
data['valid_line'][1]

{'words': [{'quad': {'x2': 430,
    'y3': 2283,
    'x3': 437,
    'y4': 2296,
    'x1': 300,
    'y1': 2226,
    'x4': 307,
    'y2': 2213},
   'is_key': 1,
   'row_id': 2163226,
   'text': 'CASH'},
  {'quad': {'x2': 1370,
    'y3': 2247,
    'x3': 1370,
    'y4': 2247,
    'x1': 1183,
    'y1': 2182,
    'x4': 1183,
    'y2': 2182},
   'is_key': 0,
   'row_id': 2163226,
   'text': '50.000'}],
 'category': 'total.cashprice',
 'group_id': 26}

In [10]:
data['valid_line'][2]

{'words': [{'quad': {'x2': 512,
    'y3': 2340,
    'x3': 522,
    'y4': 2372,
    'x1': 299,
    'y1': 2303,
    'x4': 309,
    'y2': 2272},
   'is_key': 1,
   'row_id': 2163227,
   'text': 'CHANGED'},
  {'quad': {'x2': 1371,
    'y3': 2319,
    'x3': 1369,
    'y4': 2314,
    'x1': 1186,
    'y1': 2249,
    'x4': 1185,
    'y2': 2254},
   'is_key': 0,
   'row_id': 2163227,
   'text': '20,000'}],
 'category': 'total.changeprice',
 'group_id': 26}

In [11]:
data['valid_line'][6]

{'words': [{'quad': {'x2': 329,
    'y3': 1948,
    'x3': 329,
    'y4': 1948,
    'x1': 292,
    'y1': 1887,
    'x4': 292,
    'y2': 1887},
   'is_key': 0,
   'row_id': 2163224,
   'text': '1'}],
 'category': 'menu.cnt',
 'group_id': 28}

# Generating the dataset

In [12]:
def scale(box, size):
    return (1000 * box) / size

## Train

In [20]:
tokens_train = []
boxes_train = []
labels_train = []
json_train = []
path = 'train/'
for js in tqdm(os.listdir(path)):
    with open(path + js) as f:
        json_train.append(json.load(f))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for js in tqdm(os.listdir(path)):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=800.0), HTML(value='')))




In [21]:
for js in tqdm(json_train):
    labels = []
    tokens = []
    boxes = []
    width, height = js['meta']['image_size']['width'], js['meta']['image_size']['height']
    for elem in js['valid_line']:
        for word in elem['words']:
            labels.append(elem['category'])
            box = word['quad']
            y1, y3 = int(scale(box['y1'], height)), int(scale(box['y3'], height))
            x1, x3 = int(scale(box['x1'], width)), int(scale(box['x3'], width))
            txt = word['text']
            tokens.append(txt)
            boxes.append([x1, y1, x3, y3])
    tokens_train.append(tokens)
    labels_train.append(labels)
    boxes_train.append(boxes)
            
            

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for js in tqdm(json_train):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=800.0), HTML(value='')))




In [22]:
labels_train[0]

['menu.nm',
 'menu.nm',
 'menu.cnt',
 'menu.price',
 'total.total_price',
 'total.total_price',
 'total.cashprice',
 'total.cashprice',
 'total.menuqty_cnt']

In [23]:
tokens_train[0]

['Tebu', 'Lemon', '1', '22.000', 'Total', '22.000', 'CASH', '22.000', '1']

In [24]:
boxes_train[0]

[[225, 587, 293, 563],
 [306, 587, 390, 562],
 [195, 587, 205, 564],
 [759, 584, 861, 557],
 [224, 669, 310, 623],
 [761, 670, 865, 622],
 [156, 751, 294, 707],
 [661, 755, 866, 704],
 [191, 671, 204, 625]]

## Validation

In [26]:
tokens_val = []
labels_val = []
boxes_val = []
json_val = []
path = 'valid/'
for js in tqdm(os.listdir(path)):
    with open(path + js) as f:
        json_val.append(json.load(f))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for js in tqdm(os.listdir(path)):


HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




In [27]:
for js in tqdm(json_val):
    labels = []
    tokens = []
    boxes = []
    width, height = js['meta']['image_size']['width'], js['meta']['image_size']['height']
    for elem in js['valid_line']:
        for word in elem['words']:
            labels.append(elem['category'])
            box = word['quad']
            y1, y3 = int(scale(box['y1'], height)), int(scale(box['y3'], height))
            x1, x3 = int(scale(box['x1'], width)), int(scale(box['x3'], width))
            txt = word['text']
            tokens.append(txt)
            boxes.append([x1, y1, x3, y3])
    tokens_val.append(tokens)
    labels_val.append(labels)
    boxes_val.append(boxes)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for js in tqdm(json_val):


HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




In [28]:
labels_val[0]

['menu.cnt',
 'menu.nm',
 'menu.price',
 'menu.cnt',
 'menu.nm',
 'menu.nm',
 'menu.price',
 'sub_total.subtotal_price',
 'sub_total.subtotal_price',
 'sub_total.tax_price',
 'sub_total.tax_price',
 'sub_total.tax_price',
 'sub_total.tax_price',
 'total.total_price',
 'total.total_price',
 'total.menuqty_cnt',
 'total.cashprice',
 'total.cashprice',
 'total.changeprice',
 'total.changeprice']

In [29]:
tokens_val[0]

['1',
 'SAYAP',
 '13,636',
 '1',
 'PAHA',
 'BAWAH',
 '13,636',
 'Subtotal',
 '27,272',
 'P.',
 'Resto',
 '10%',
 '2,700',
 'Total',
 '30,000',
 '2',
 'Tendered',
 '50,000',
 'Kembali',
 '20,000']

In [31]:
boxes_val[0]

[[300, 422, 311, 411],
 [330, 423, 409, 410],
 [590, 428, 684, 413],
 [298, 435, 311, 424],
 [327, 437, 395, 424],
 [411, 439, 494, 425],
 [590, 440, 681, 428],
 [297, 461, 429, 446],
 [586, 464, 684, 450],
 [294, 475, 325, 460],
 [329, 475, 411, 461],
 [427, 475, 477, 463],
 [600, 477, 681, 464],
 [330, 500, 411, 477],
 [588, 504, 686, 478],
 [297, 500, 311, 475],
 [297, 531, 430, 515],
 [587, 530, 686, 517],
 [291, 571, 405, 557],
 [588, 571, 686, 557]]

## Test

In [32]:
tokens_test = []
labels_test = []
boxes_test = []
json_test = []
path = 'test/'
for js in tqdm(os.listdir(path)):
    with open(path + js) as f:
        json_test.append(json.load(f))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for js in tqdm(os.listdir(path)):


HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




In [33]:
for js in tqdm(json_test):
    labels = []
    tokens = []
    boxes = []
    width, height = js['meta']['image_size']['width'], js['meta']['image_size']['height']
    for elem in js['valid_line']:
        for word in elem['words']:
            labels.append(elem['category'])
            box = word['quad']
            y1, y3 = int(scale(box['y1'], height)), int(scale(box['y3'], height))
            x1, x3 = int(scale(box['x1'], width)), int(scale(box['x3'], width))
            txt = word['text']
            tokens.append(txt)
            boxes.append([x1, y1, x3, y3])
    tokens_test.append(tokens)
    labels_test.append(labels)
    boxes_test.append(boxes)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for js in tqdm(json_test):


HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




In [34]:
labels_test[0]

['menu.nm',
 'menu.nm',
 'menu.cnt',
 'menu.unitprice',
 'menu.price',
 'menu.nm',
 'menu.nm',
 'menu.cnt',
 'menu.unitprice',
 'menu.price',
 'sub_total.subtotal_price',
 'sub_total.subtotal_price',
 'sub_total.tax_price',
 'sub_total.tax_price',
 'sub_total.tax_price',
 'sub_total.tax_price',
 'total.total_price',
 'total.total_price',
 'total.total_price',
 'total.cashprice',
 'total.cashprice',
 'total.cashprice',
 'total.cashprice']

In [35]:
tokens_test[0]

['BASO',
 'TAHU',
 '1',
 '43,181',
 '43,181',
 'ES',
 'JERUK',
 '1',
 '13,000',
 '13,000',
 'TOTAL',
 '56,181',
 'TAX',
 '10.00',
 '%',
 '5,618',
 'GRAND',
 'TOTAL',
 '61,799',
 'TUNAI',
 '62,000',
 'KEMBALI',
 '201']

In [36]:
boxes_test[0]

[[145, 510, 233, 483],
 [243, 505, 333, 483],
 [528, 496, 554, 472],
 [564, 496, 686, 471],
 [771, 492, 888, 466],
 [145, 547, 195, 514],
 [206, 543, 312, 517],
 [533, 527, 560, 503],
 [570, 529, 695, 503],
 [777, 529, 902, 497],
 [138, 580, 254, 550],
 [792, 564, 917, 529],
 [133, 655, 206, 625],
 [217, 652, 332, 624],
 [342, 648, 370, 618],
 [834, 637, 935, 604],
 [119, 691, 239, 665],
 [253, 688, 370, 659],
 [820, 671, 952, 640],
 [119, 733, 239, 700],
 [831, 712, 978, 678],
 [109, 775, 278, 743],
 [902, 750, 981, 718]]

# Saving

In [39]:
import pickle
with open('train.pkl', 'wb') as t:
    pickle.dump([tokens_train, labels_train, boxes_train], t)
with open('val.pkl', 'wb') as t:
    pickle.dump([tokens_val, labels_val, boxes_val], t)
with open('test.pkl', 'wb') as t:
    pickle.dump([tokens_test, labels_test, boxes_test], t)