In [42]:
from PIL import Image, ImageDraw, ImageFont
from PIL import ImageDraw
from datasets.features import ClassLabel
from transformers import AutoModelForTokenClassification, AutoProcessor
from datasets import Dataset
import torch
import numpy as np

In [192]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Token-classification checkpoint pulled from the Hub (local_files_only=False permits a download),
# placed on the selected device.
model = AutoModelForTokenClassification.from_pretrained("bmeisburger/datathon", local_files_only=False)
model.to(device)

# Bill dataset previously saved to disk with the `datasets` library.
dataset = Dataset.load_from_disk("bill_dataset/")

# apply_ocr=False: words and boxes are passed in explicitly when the processor is called,
# so the processor does not run OCR on the images itself.
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)

In [193]:
# Display the dataset schema (column names and feature types) as the cell output.
dataset.features

{'id': Value(dtype='string', id=None),
 'words': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'bboxes': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None),
 'image': Image(decode=True, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['S-TOTAL', 'S-DATE', 'S-ADDRESS', 'S-COMPANY', 'O'], id=None), length=-1, id=None)}

In [194]:
def iob_to_label(label):
    """Drop the first two characters of a tag (e.g. the ``S-`` prefix).

    Returns the remainder of the string, or ``'other'`` when nothing is left
    after the prefix (as happens for the bare ``'O'`` tag).
    """
    return label[2:] or 'other'

# Outline/text colour used for each (lower-cased, prefix-stripped) label name.
label2color = dict([
    ('total', 'blue'),
    ('company', 'green'),
    ('date', 'orange'),
    ('address', 'violet'),
    ('other', 'grey'),
])

# Class index -> tag name, in the same order as the ClassLabel names of `ner_tags`.
id2label = dict(enumerate(['S-TOTAL', 'S-DATE', 'S-ADDRESS', 'S-COMPANY', 'O']))

In [195]:
def unnormalize_box(bbox, width, height):
    """Rescale a box whose coordinates are expressed in thousandths.

    Entries 0 and 2 of ``bbox`` are divided by 1000 and multiplied by
    ``width``; entries 1 and 3 are divided by 1000 and multiplied by
    ``height``. The four results are returned as a tuple in the same order.
    """
    extents = (width, height, width, height)
    return tuple(extent * (bbox[k] / 1000) for k, extent in enumerate(extents))

In [198]:
def process_example(i:int):
    """Run the token classifier on dataset row ``i`` and return its word-level tags.

    Args:
        i: Row index into the module-level ``dataset``.

    Returns:
        A ``(words, true_predictions)`` tuple: the row's ``words`` list and the
        predicted tag names (values of ``id2label``) for every token whose
        label is not -100.
        NOTE(review): presumably the processor labels one token per word and
        masks the rest with -100, so the two lists line up word-for-word;
        with ``truncation=True`` the prediction list may be shorter than
        ``words`` for long documents - confirm.
    """
    sample = dataset[i]

    encoded_inputs = processor(sample['image'].convert("RGB"), sample['words'], boxes=sample['bboxes'], word_labels=sample['ner_tags'],
                            padding="max_length", truncation=True, return_tensors="pt")

    # Index the batch axis explicitly: unlike squeeze(), [0] cannot collapse any other axis.
    labels = encoded_inputs.pop('labels')[0].tolist()

    # The model may have been moved to the GPU; its inputs must live on the same device.
    target_device = next(model.parameters()).device
    model_inputs = {name: tensor.to(target_device) for name, tensor in encoded_inputs.items()}

    # Pure inference: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**model_inputs)
    predictions = outputs.logits.argmax(-1)[0].tolist()

    true_predictions = [id2label[prediction] for prediction, label in zip(predictions, labels) if label != -100]

    # Optional visual check (disabled): outline every non-"other" prediction on a copy of the page.
    # image = sample['image'].copy().convert("RGB")  # copy so we don't ruin good art
    # width, height = image.size
    # token_boxes = encoded_inputs['bbox'][0].tolist()
    # true_labels = [id2label[label] for prediction, label in zip(predictions, labels) if label != -100]
    # true_boxes = [unnormalize_box(box, width, height) for box, label in zip(token_boxes, labels) if label != -100]
    # draw = ImageDraw.Draw(image)
    # font = ImageFont.truetype("Aeonik-Regular.otf", 32)
    # for word, prediction, box in zip(sample['words'], true_predictions, true_boxes):
    #     predicted_label = iob_to_label(prediction).lower()
    #     if predicted_label != "other":
    #         draw.rectangle(box, outline=label2color[predicted_label], width=4)
    #         # draw.rectangle(box)
    #         # draw.text((box[0] + 10, box[1] - 10), text=predicted_label, font=font)
    #         draw.text((box[2] + 10, box[1] - 3), text=predicted_label, fill=label2color[predicted_label], font=font)
    # image.show()

    return sample['words'], true_predictions

In [199]:
# Spot-check the extraction on a single row (indices noted for manual inspection: 6, 7, 8).
process_example(6)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


(['PROSPER NIAGA',
  'COMPANY NO : SA0099552-P',
  'LOT PT 1138',
  'BANDAR MAHKOTA CHERAS',
  '43200 CHERAS',
  'SITE : 2365',
  'TEL NO: 03-90199450',
  'GST NO : 000534347776',
  'PRE-AUTHORISATION',
  'PRE AUTH CODE A02A1530024046',
  '38.61 LITRE PUMP # 07',
  'V-POWER 97',
  'RM',
  '100.00 K',
  '2.590',
  'RM',
  '/ LITRE',
  'TOTAL',
  'RM',
  '100.00',
  'CASH',
  'RM',
  '100.00',
  '0.00% SR GST',
  'K',
  'RM',
  '0.00',
  'TOTAL GROSS',
  'K',
  'RM',
  '100.00',
  'CASHIER:',
  'MFIKRI2',
  'THIS IS NOT THE FINAL FISCAL RECEIPT',
  'DATE',
  'TIME',
  'NUM',
  'POS CNO SHIFT',
  '26/06/18 22:40 45107 02 9577',
  '659',
  'DIESEL & PETROL RON95 GIVEN RELIEF',
  'UNDER SECTION 56(3)(B) GST ACT 2014',
  'THANK YOU AND PLEASE COME AGAIN'],
 ['S-COMPANY',
  'O',
  'S-ADDRESS',
  'S-ADDRESS',
  'S-ADDRESS',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O

In [93]:
import pandas as pd

users = pd.read_csv("../data/Users.csv")
test = pd.read_csv("../data/test_transactions.csv")

# Stack both tables so a single lookup covers documents from either file.
total = pd.concat([users, test])

# documentid -> paymentid lookup, built straight from the two columns instead of
# iterating row by row. If a document id occurs more than once, the last
# occurrence wins (same as repeated dict assignment).
doc2payid = dict(zip(total['documentid'], total['paymentid']))

In [95]:
# Number of distinct document ids in the lookup.
print(len(doc2payid))

625


In [97]:
# One dict per document; converted to a DataFrame in a later cell.
df = []

id2label = {0: 'S-TOTAL', 1: 'S-DATE', 2: 'S-ADDRESS', 3: 'S-COMPANY', 4: 'O'}

# Tags that become columns of the output table ("O" is deliberately excluded).
FIELD_LABELS = ("S-TOTAL", "S-DATE", "S-ADDRESS", "S-COMPANY")

# Read the id column once rather than materialising each full row just for its id.
for i, doc_id in enumerate(dataset['id']):
    # Skip documents without a known payment id *before* running the model on them.
    if doc_id not in doc2payid:
        continue

    words, preds = process_example(i)

    # Gather the words predicted for each field, in reading order.
    collected = {field: [] for field in FIELD_LABELS}
    for word, pred in zip(words, preds):
        if pred != "O":
            collected[pred].append(word)

    row_data = {"doc_id": doc_id, "payment_id": doc2payid[doc_id]}
    for field, parts in collected.items():
        # Joining avoids the leading space that incremental `value + ' ' + word` produced;
        # fields with no predicted words are recorded as missing.
        row_data[field] = ' '.join(parts) if parts else pd.NA

    df.append(row_data)

In [106]:
# Turn the accumulated per-document dicts into a DataFrame (one row per dict).
# NOTE(review): this rebinds `df` from the list built in the collection cell to a DataFrame,
# so the list is no longer reachable afterwards; re-run the collection cell before re-running this one.
df = pd.DataFrame.from_records(df)



Unnamed: 0,doc_id,payment_id,S-TOTAL,S-DATE,S-ADDRESS,S-COMPANY
0,00d0377951457,00p0693154757,7.60,,NO 14& 16 JALAN PERMAS 4/3 BANDAR BARU PERMAS...,TRIPLE SIX POINT ENTERPRISE 666
1,00d0660852423,00p0965564934,10.30,,NO 3 BANDAR BARU PERMAS JAYA,LEMON TREE RESTAURANT
2,00d0803190070,00p0247334622,,,NO. 31G&33G 40170 SETIA ALAM,SANYU STATIONERY SHOP
3,00d0259552507,00p0876493998,,05-JAN-2017 03:17:50 PM,BANDAR PINGGIRAN SUBANG SEKSYEN U5 40150 SHAH...,S&Y STATIONERY
4,00d0886341707,00p0151413045,,,LOT S25 NO 1 81100 JOHOR BAHRU,DIMILIKI OLEH : DOVE HOLDINGS SDN BHD
...,...,...,...,...,...,...
557,00d0880901672,00p0741339461,,19 APR 2018 18:22,,UNIHAKKA INTERNATIONAL SDN BHD
558,00d0677299556,00p0504275506,,11-05-16,LOT P.T. 33198 JALAN KAPAR 42100 KLANG 1605-T...,99 SPEED MART S/B (519537-X)
559,00d0242366463,00p0778749877,RM7.90 RM8.35,,NO.145G LAMAN RIMBUNAN KEPONG 52100 KUALA LUMPUR,YAM FRESH
560,00d0504003816,00p0361841562,4.60,,NO.2 SEKSYEN 9 43200 CHERAS,RESTORAN WAN SHENG


In [124]:
# Boolean frame marking the missing entries of every column.
missing = df.isna()

# Caption / column pairs for the per-field missing-value fractions.
report = [
    ("Fraction of entries w/o total:", 'S-TOTAL'),
    ("Fraction of entries w/o date:", 'S-DATE'),
    ("Fraction of entries w/o address:", 'S-ADDRESS'),
    ("Fraction of entries w/o company:", 'S-COMPANY'),
]
for caption, column in report:
    print(caption, missing[column].sum() / len(df))

# Rows where both the total and the date are missing.
both = missing['S-TOTAL'] & missing['S-DATE']
print("Fraction of entries w/o total AND date:", int(both.sum()) / len(df))

Fraction of entries w/o total: 0.2846975088967972
Fraction of entries w/o date: 0.5124555160142349
Fraction of entries w/o address: 0.07651245551601424
Fraction of entries w/o company: 0.014234875444839857
Fraction of entries w/o total AND date: 0.0800711743772242


In [188]:
# Show the non-missing values of the first row only (nothing is printed for an empty frame).
first_entry = next(df.iterrows(), None)
if first_entry is not None:
    i, row = first_entry
    present = row.notna()
    print(list(row[present]))

['00d0377951457', '00p0693154757', ' 7.60', ' NO 14& 16 JALAN PERMAS 4/3 BANDAR BARU PERMAS JAY', ' TRIPLE SIX POINT ENTERPRISE 666']


In [189]:
# Persist the extracted fields. The separator is a tab even though the file name ends in ".csv",
# so anything reading this file back must pass sep='\t'.
df.to_csv("./dataframe.csv", sep='\t')