In [2]:
import re
import torch
import numpy as np
import pandas as pd
from glob import glob
from transformers import *
from sklearn.model_selection import train_test_split
from itertools import zip_longest

In [3]:
pd.options.display.max_rows = 999

## BERT tokenizer loading

In [4]:
pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)

In [5]:
max_len = tokenizer.max_len
print(max_len)

512




In [7]:
tokenizer.tokenize('float')
tokenizer.tokenize('total')
tokenizer.tokenize('gst')
tokenizer.tokenize('round')
tokenizer.tokenize('10%') 
tokenizer.tokenize('9.24')
tokenizer.tokenize('273500') 
tokenizer.tokenize('currency')
tokenizer.tokenize('row')
tokenizer.tokenize('date')

['date']

## data loading

In [8]:
data_dict = torch.load('/home/long8v/ICDAR-2019-SROIE/task3/data/data_dict4.pth')
zipped_data = list(zip(*data_dict.values()))
texts = zipped_data[0]
labels = zipped_data[1]

In [10]:
(texts[0], labels[0])

('TAN WOON YANN\nBOOK TA .K(TAMAN DAYA) SDN BND\n789417-W\nNO.53 55,57 & 59, JALAN SAGU 18,\nTAMAN DAYA,\n81100 JOHOR BAHRU,\nJOHOR.\nDOCUMENT NO : TD01167104\nDATE:\t25/12/2018 8:13:39 PM\nCASHIER:\tMANIS\nMEMBER:\nCASH BILL\nCODE/DESC\tPRICE\tDISC\tAMOUNT\nQTY\tRM\tRM\n9556939040116\tKF MODELLING CLAY KIDDY FISH\n1 PC\t*\t9.000\t0.00\t9.00\nTOTAL:\t9.00\nROUNDING ADJUSTMENT:\t0.00\nROUNDED TOTAL (RM):\t9.00\nCASH\t10.00\nCHANGE\t1.00\nGOODS SOLD ARE NOT RETURNABLE OR\nEXCHANGEABLE\n***\n***\nTHANK YOU\nPLEASE COME AGAIN !',
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0

## replace special token 

In [17]:
re_int = re.compile('\d+')
re_float = re.compile('(\d+\.\d+)')
re_percent = re.compile('(\d+.?\d+%)')
re_date = re.compile('(\d{2}[/-]\d{2}[/-]\d{2,4})')
re_row = re.compile('\n')

In [19]:
re_date.findall('20-02-2020')

['20-02-2020']

In [20]:
re_dict = {re_float:'float', re_percent:'percent', re_date:'date', re_int:'int', re_row:' row '}

In [21]:
corpus = ' '.join(texts)
for _ in re_dict.values():
    print('{} : {}'.format(_, sum([_ in corpus])))

float : 0
percent : 0
date : 0
int : 0
 row  : 0


In [22]:
re_int.search(texts[1]).group()

'27'

In [23]:
for text, label in zip(texts, labels):
    if len(text)!=len(label):
        print(len(text), len(label))

In [37]:
def replace_digits(text):
    for key, value in re_dict.items():
        text = key.sub(value, text)
    return text

In [24]:
def replace_special_token(text, label):
    text_copy = text
    label = list(label)
    label_copy = label[:]
    for re_exp, special_token in re_dict.items():
        while re_exp.search(text):
            match = re_exp.search(text)
            span_start, span_end = match.span()
            word = match.group()
            len_diff = len(special_token) - len(word)
            if len_diff > 0:
                while len_diff:
                    len_diff -= 1
                    try:
                        label_copy.insert(span_start, label_copy[span_start])
                    except:
                        print(len(label), span_start)
            elif len_diff < 0: 
                del label_copy[span_start:span_start-len_diff]
            else:
                pass
            text = list(text)
            del text[span_start:span_end]
            text.insert(span_start, special_token)
            text = ''.join(text)
    assert len(text) == len(label_copy)
    return text, label_copy

In [25]:
replace_list = dict([replace_special_token(text, label) for text, label in zip(texts, labels)])

In [26]:
texts = list(replace_list.keys())
labels = list(replace_list.values())

## bio-tagging for bert tokenizer

In [27]:
def get_tokenized_word(text):
    token_word = tokenizer.tokenize(text)
    return token_word

In [28]:
def get_token_labels(token_word, text, label):
    index = 0
    token_labels = []
    label_clean = [lbl for txt, lbl in list(zip(text, label)) if txt.strip()]
    for token in token_word[:max_len]:
        token_clean = token.replace('##', '')
        token_labels.append(label_clean[index:index+len(token_clean)])
        index += len(token_clean)
    return token_labels

In [29]:
def get_bio_tag(token_labels):
    label_dict = {0: 'O', 1: 'COMPANY', 2:'DATE', 3:'ADDRESS', 4:'TOTAL'}
    token_label_bio = []
    current = 0 
    for token_label in token_labels:
        try:
            temp_label = token_label[0]
        except IndexError as e:
            pass
        if temp_label == 0:
            token_label_bio.append(label_dict[temp_label])
        elif temp_label != current:
            token_label_bio.append('B-{}'.format(label_dict[temp_label]))
        else:
            token_label_bio.append('I-{}'.format(label_dict[temp_label]))
        current = temp_label
    return token_label_bio

In [30]:
def get_paired_token(text, label):
    token_word = get_tokenized_word(text)
    token_labels = get_token_labels(token_word, text, label)
    token_label_bio = get_bio_tag(token_labels)
    return pd.DataFrame(zip(token_word, token_label_bio))

In [38]:
def get_paired_token_text_label(texts, labels):
    df_list = []
    for text, label in zip(texts, labels):
        df = pd.DataFrame()
        df = df.append({0:'-DOCSTART-', 1: 'O'}, ignore_index=True)
        df = df.append(get_paired_token(text, label))
        df = df.append({0:'', 1:'O'}, ignore_index=True)
        df[0] = df[0].apply(lambda e: replace_digits(e))
        df_list.append(df)
    return df_list

In [39]:
train_text, test_text,  train_label, test_label = train_test_split(texts, labels)

In [40]:
train_text, val_text,  train_label, val_label = train_test_split(train_text, train_label)

In [41]:
train_df = get_paired_token_text_label(train_text, train_label)
val_df = get_paired_token_text_label(val_text, val_label)
test_df = get_paired_token_text_label(test_text, test_label)

In [42]:
from functools import reduce

In [43]:
train_df_long = reduce(lambda a, b: pd.concat([a,b]), train_df)
val_df_long = reduce(lambda a, b: pd.concat([a,b]), val_df)
test_df_long = reduce(lambda a, b: pd.concat([a,b]), test_df)

In [48]:
train_df[0]

Unnamed: 0,0,1
0,-DOCSTART-,O
1,popular,B-COMPANY
2,book,I-COMPANY
3,row,I-COMPANY
4,co,I-COMPANY
5,.,I-COMPANY
6,(,I-COMPANY
7,m,I-COMPANY
8,),I-COMPANY
9,sd,I-COMPANY


In [44]:
train_df_long.to_csv('data/train.txt', sep=' ', index=False, header=False)
val_df_long.to_csv('data/valid.txt', sep=' ', index=False, header=False)
test_df_long.to_csv('data/test.txt', sep=' ', index=False, header=False)