In [42]:
import json
import math
import os
import pandas as pd
from dateparser_data.settings import default_parsers
from dateparser.search import search_dates 
date_vocab=json.load(open('date_vocab.json'))
parsers = [parser for parser in default_parsers if parser not in ['relative-time','timestamp','negative-timestamp']]
def words_to_id(text):
    if pd.isna(text):
        return None
    if text in date_vocab:
        return date_vocab[text] 
    return date_vocab["rare"]  #rare means the words that does not exist in vocabulary             

In [43]:
def get_date_candidates(df,correct_date):
    text=' '.join(df['text'])
    size=len(df['text'])
    date_candidates=pd.DataFrame(columns=['field_id','candidate_position','neighbour_id','neighbour_relative_position','correct_candidate','left','top','width','height','text'])
    dates = search_dates(text,settings={'STRICT_PARSING': True,'DATE_ORDER': 'DMY','PARSERS': parsers})
    if dates==None:
        return date_candidates
    date_candidates['field_id']=[2]*len(dates)
    max_date_tokens=0
    date_info=[]
    for date in dates:  
        tokens=len(date[0].split())
        index=text.find(date[0])
        if index!=0 and text[index-1]!=' ':
            tokens-=1
        start_index=len(text[:index].split())
        end_index=start_index+tokens
        date_info.append([start_index,end_index,date[1]])
    ind=0
    for info in date_info:
        start=info[0]
        end=info[1]
        if end>size:
            continue
        date_string=info[2]
        left=df.iloc[start]['left']
        top=df.iloc[start]['top']
        width=df.iloc[end-1]['left']+df.iloc[end-1]['width']-left
        height=df.iloc[start]['height']
        for i in range(start+1,end):
            height=max(height,df.iloc[i]['height'])
            top=min(top,df.iloc[i]['top'])
        date_candidates.at[ind,'left']=left
        date_candidates.at[ind,'top']=top 
        date_candidates.at[ind,'width']=width
        date_candidates.at[ind,'height']=height 
        date_candidates.at[ind,'text']=date_string.strftime("%d-%m-%Y")
        ind+=1
    if correct_date!=None:
        date_candidates['correct_candidate']=date_candidates['text'].apply(lambda txt:txt==correct_date)
    return date_candidates

In [44]:
def generate(df,num_neighbours,correct_date,height,width):
    df.dropna(inplace=True)
    df['text']=df['text'].apply(lambda word:word.lower())    
    df.reset_index(inplace=True)
    df['left']=df['left']/width 
    df['top']=df['top']/height
    df['width']=df['width']/width 
    df['height']=df['height']/height
    candidates_df=get_date_candidates(df,correct_date)
    if len(candidates_df['text'])==0:
        return candidates_df 
    df['text']=df['text'].apply(words_to_id)    
    #Example: for each number get it's closest neighbour words with their positional features for model training  
    for i,cand_row in candidates_df.iterrows():               
        neighbour=dict()
        x1=cand_row['left']+cand_row['width']/2
        y1=cand_row['top']+cand_row['height']/2
        for j,neigh_row in df.iterrows():
#             if x1>=neigh_row['left'] and x1<=(neigh_row['left']+neigh_row['width']) and y1>=neigh_row['top'] and y1<=(neigh_row['top']+neigh_row['height']):
#                 continue
            id=neigh_row['text'] # earlier each word was converted to it's numerical value , so used here 
            # positions of words need to be normalized 
            # there centroid coordinate is taken in consideration
            x2=neigh_row['left']+neigh_row['width']/2
            y2=neigh_row['top']+neigh_row['height']/2
            #Ex. neighbours are searched towards left and half page upwards to the amount
            if x2>x1+0.04 or y2>y1+.02 or y2<y1-0.1:  
                continue
            distance=math.dist([x1,y1],[x2,y2])
            if id in neighbour:
                if distance<neighbour[id]['dist']:
                    neighbour[id]={
                        'dist':distance,
                        'left':x2,'top':y2
                    }
            else:
                neighbour[id]={
                    'dist':distance,
                    'left':x2,'top':y2
                }     
        # if an entity has no neighbours, then there is no point to train it so continue .
        if len(neighbour)==0:
            continue
        # sort to form n closest neighbours
        neighbour=dict(sorted(neighbour.items(), key=lambda item: item[1]['dist'])[:num_neighbours])
        neighbours_remaining=num_neighbours-len(neighbour)
        neighbour_positions=list()
        neighbour_id=list()
        num_valid_values=0
        for key in neighbour:
            if key!=3: 
                num_valid_values+=1 
            neighbour_id.append(key)
            neighbour_positions.append([neighbour[key]['left']-x1,neighbour[key]['top']-x2])
        
        if candidates_df.at[i,'correct_candidate']==True and num_valid_values==0:
            candidates_df.at[i,'correct_candidate']=False
        # To make the data consistent , like if 10 neighbours are needed and only 4 neighbours are present then other values
        # need to be padded with zero's to feed machine learning model.
        while neighbours_remaining: #used for masking
            neighbour_id.append(0)  
            neighbour_positions.append([-1,-1]) 
            neighbours_remaining-=1

        candidates_df.at[i,'neighbour_id']=neighbour_id
        candidates_df.at[i,'neighbour_relative_position']=neighbour_positions
        candidates_df.at[i,'candidate_position']=list([float(x1),float(y1)]) 
        
    # remove the invalid rows from the dataframe , the invalid rows has undefined values . 
    candidates_df.dropna(subset=['field_id','candidate_position','neighbour_id','neighbour_relative_position','left','top','width','height'],inplace=True)
    return candidates_df

In [45]:
def generate_date_dataset(dir,annotated_file,num_neighbours):
        candidates=None
        invoices=os.listdir(dir)
        annotated=json.load(open(annotated_file,'r+'))
        for invoice_dir in invoices:
            inv_csv=os.listdir(f'{dir}/{invoice_dir}')
            file=invoice_dir+'.pdf'
            true_candidate=annotated[file]['date'] 
            for inv in inv_csv:
               df=pd.read_csv(f'{dir}/{invoice_dir}/{inv}')
               height=max(df['top'].max(),df['height'].max())
               width=max(df['left'].max(),df['width'].max())
               df=generate(df,num_neighbours,true_candidate,height,width)
               if type(candidates)!=None:
                 if not df.empty:
                    candidates=pd.concat([candidates,df])
               else:
                 candidates=df  
        candidates.reset_index(inplace=True) 
        return candidates

In [46]:
dataset=generate_date_dataset('./DATA','./annotated.json',10)

In [15]:
# data=json.load(open('annotated.json','r+'))
# df=pd.read_csv('./DATA/OLA_1/1.csv')
# height=max(df['top'].max(),df['height'].max())
# width=max(df['left'].max(),df['width'].max())
# df2=generate(df,10,data['OLA_1.pdf']['date'],height,width)

In [47]:
dataset[dataset['correct_candidate']==True]
# dataset

Unnamed: 0,index,field_id,candidate_position,neighbour_id,neighbour_relative_position,correct_candidate,left,top,width,height,text
0,0,2,"[0.8078629032258065, 0.22214306070105444]","[1.0, 3.0, 0, 0, 0, 0, 0, 0, 0, 0]","[[0.0, 0.04512693166879639], [-0.0449596774193...",True,0.792339,0.218011,0.031048,0.008264,30-04-2023
1,0,2,"[0.5203629032258064, 0.5889142205756626]","[3.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0]","[[-0.007862903225806406, 0.3246103337960452], ...",True,0.441532,0.583642,0.157661,0.010544,30-04-2023
2,1,2,"[0.8667338709677419, 0.18196067255628384]","[3.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0]","[[0.0, -0.08134577905661938], [-0.058870967741...",True,0.828629,0.177828,0.07621,0.008264,30-04-2023
6,0,2,"[0.8078629032258065, 0.20133941293815905]","[1.0, 3.0, 0, 0, 0, 0, 0, 0, 0, 0]","[[0.0, -0.6615638128682926], [-0.0449596774193...",True,0.792339,0.197207,0.031048,0.008264,29-06-2019
8,0,2,"[0.8667338709677419, 0.18196067255628384]","[3.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0]","[[0.0, -0.699490940346942], [-0.05887096774193...",True,0.828629,0.177828,0.07621,0.008264,29-06-2019
15,0,2,"[0.8112141992739008, 0.18523795953263036]","[1.0, 3.0, 0, 0, 0, 0, 0, 0, 0, 0]","[[0.0, -0.6747862437751551], [-0.0467930617184...",True,0.795482,0.181248,0.031464,0.007979,10-12-2018
17,0,2,"[0.8713190802743042, 0.1818181818181818]","[3.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0]","[[0.0, -0.030162455535589883], [-0.06010488100...",True,0.838241,0.177828,0.066156,0.007979,10-12-2018
18,0,2,"[0.5144, 0.12097971301335972]","[1.0, 3.0, 0, 0, 0, 0, 0, 0, 0, 0]","[[-0.27264, -0.4231950915388422], [-0.27711999...",True,0.22656,0.116032,0.57568,0.009896,01-11-2022
20,0,2,"[0.5144, 0.13236021771400297]","[1.0, 3.0, 0, 0, 0, 0, 0, 0, 0, 0]","[[-0.27264, -0.411567184562098], [-0.277119999...",True,0.22656,0.127412,0.57568,0.009896,03-11-2022
22,0,2,"[0.5144, 0.25903018307768433]","[3.0, 1.0, 0, 0, 0, 0, 0, 0, 0, 0]","[[-0.06255999999999995, -0.34080870856011874],...",True,0.22656,0.254082,0.57568,0.009896,03-11-2022


In [9]:
import tensorflow as tf
from functional_model import Model
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam

In [48]:
VOCAB_SIZE=4
EMBEDDING_SIZE=100
NEIGHBOURS=10
HEADS=4
df=dataset
y_train=tf.convert_to_tensor(list(df['correct_candidate']))
cand_pos=tf.convert_to_tensor(list(df['candidate_position']))
neighbours=tf.convert_to_tensor(list(df['neighbour_id']))
neighbour_positions=tf.convert_to_tensor(list(df['neighbour_relative_position']))
field_id=tf.convert_to_tensor(list(df['field_id']))
model = Model(VOCAB_SIZE, EMBEDDING_SIZE, NEIGHBOURS, HEADS)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(0.0001),
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.AUC(),
    ],
)

In [49]:
history=model.fit((field_id,cand_pos,neighbours,neighbour_positions),y_train,epochs=90)

Epoch 1/90
Epoch 2/90
Epoch 3/90
Epoch 4/90
Epoch 5/90
Epoch 6/90
Epoch 7/90
Epoch 8/90
Epoch 9/90
Epoch 10/90
Epoch 11/90
Epoch 12/90
Epoch 13/90
Epoch 14/90
Epoch 15/90
Epoch 16/90
Epoch 17/90
Epoch 18/90
Epoch 19/90
Epoch 20/90
Epoch 21/90
Epoch 22/90
Epoch 23/90
Epoch 24/90
Epoch 25/90
Epoch 26/90
Epoch 27/90
Epoch 28/90
Epoch 29/90
Epoch 30/90
Epoch 31/90
Epoch 32/90
Epoch 33/90
Epoch 34/90
Epoch 35/90
Epoch 36/90
Epoch 37/90
Epoch 38/90
Epoch 39/90
Epoch 40/90
Epoch 41/90
Epoch 42/90
Epoch 43/90
Epoch 44/90
Epoch 45/90
Epoch 46/90
Epoch 47/90
Epoch 48/90
Epoch 49/90
Epoch 50/90
Epoch 51/90
Epoch 52/90


Epoch 53/90
Epoch 54/90
Epoch 55/90
Epoch 56/90
Epoch 57/90
Epoch 58/90
Epoch 59/90
Epoch 60/90
Epoch 61/90
Epoch 62/90
Epoch 63/90
Epoch 64/90
Epoch 65/90
Epoch 66/90
Epoch 67/90
Epoch 68/90
Epoch 69/90
Epoch 70/90
Epoch 71/90
Epoch 72/90
Epoch 73/90
Epoch 74/90
Epoch 75/90
Epoch 76/90
Epoch 77/90
Epoch 78/90
Epoch 79/90
Epoch 80/90
Epoch 81/90
Epoch 82/90
Epoch 83/90
Epoch 84/90
Epoch 85/90
Epoch 86/90
Epoch 87/90
Epoch 88/90
Epoch 89/90
Epoch 90/90


In [52]:
dir='./DATA'
invoices=os.listdir('./DATA')
annotated=json.load(open('annotated.json','r+'))
total=0
correct=0
for invoice_dir in invoices:
    total+=1
    inv_csv=os.listdir(f'{dir}/{invoice_dir}')
    file=invoice_dir+'.pdf'
    true_candidate=annotated[file]['date'] 
    candidates=None
    for inv in inv_csv:
       df=pd.read_csv(f'{dir}/{invoice_dir}/{inv}')
       height=max(df['top'].max(),df['height'].max())
       width=max(df['left'].max(),df['width'].max())
       df=generate(df,10,'',height,width)
       if type(candidates)!=None:
         if not df.empty:
            candidates=pd.concat([candidates,df])
       else:
         candidates=df  
    if candidates is None:
        print('none')
        continue
    print('invoice ',len(candidates))
    candidates.reset_index(inplace=True)
    cand_pos=tf.convert_to_tensor(list(candidates['candidate_position']))
    neighbours=tf.convert_to_tensor(list(candidates['neighbour_id']))
    neighbour_positions=tf.convert_to_tensor(list(candidates['neighbour_relative_position']))
    field_id=tf.convert_to_tensor(list(candidates['field_id']))
    data=model.predict((field_id,cand_pos,neighbours,neighbour_positions))
    date_predicted=''
    score=0
    for i in range(len(data)):
        if data[i]>score:
            date_predicted=candidates.at[i,'text']
            score=data[i]
    if date_predicted==annotated[invoice_dir+'.pdf']['date']:
        correct+=1
print(correct,total)
    

invoice  3
invoice  1
invoice  2
invoice  3
invoice  1
none
invoice  1
invoice  4
none
invoice  3
invoice  2
invoice  3
invoice  2
invoice  1
invoice  2
invoice  5
invoice  3
invoice  5
invoice  1
invoice  1
invoice  2
invoice  1
invoice  2
invoice  3
invoice  2
invoice  1
invoice  1
invoice  1
invoice  1
invoice  2
invoice  1
invoice  2
none
invoice  2
invoice  2
invoice  2
invoice  2
invoice  2
invoice  2
invoice  1
invoice  2
invoice  3
invoice  2
invoice  2
invoice  2
invoice  2
invoice  2
invoice  2
invoice  2
invoice  1
invoice  3
invoice  2
invoice  1
invoice  5
invoice  2
invoice  4
invoice  2
invoice  3
invoice  3
invoice  2
invoice  2
invoice  2
none
invoice  2
invoice  3
invoice  2
invoice  2
invoice  2
none
invoice  2
invoice  2
invoice  3
invoice  3
invoice  2
invoice  3
invoice  2
invoice  2
invoice  2
invoice  2
invoice  1
invoice  2
none
invoice  2
65 83


In [51]:
model.save('date_model_2.h5')

  saving_api.save_model(
