In [212]:
import cv2
import pytesseract
import time
import os
import numpy as np
import re
import pandas as pd
import json
import math
vocab_file='./amount_vocab.json'

In [165]:
def preprocess_img(img):
    # Decode and convert to grayscale
    img = cv2.imdecode(np.frombuffer(img, np.uint8), cv2.IMREAD_GRAYSCALE)
    # Denoising Image
    img = cv2.fastNlMeansDenoising( img, None, 15, 7, 21 )   
    # Image Binarization
    img=cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    # Perform morphological operations (erosion and dilation)
    kernel = np.ones((1, 1), np.uint8)
    return cv2.morphologyEx(img, cv2.MORPH_DILATE, kernel)

In [180]:
vocab=json.load(open(vocab_file,'r+'))
def words_to_id(text):
    if pd.isna(text):
        return None
    if re.fullmatch(r"[0-9]*\.?[0-9]+", text):
        return vocab["number"]
    if text in vocab:
        return vocab[text] 
    return vocab["rare"]  #rare means the words that does not exist in vocabulary     


def detect_candidate(text):
  if pd.isna(text):
     return None
  if re.fullmatch(r"[0-9]*\.?[0-9]+",text): 
      return 1  # amount type = 1
  return None

def preprocess(text):
    if pd.isna(text) or text=='':
        return None
    text=str(text)
    #remove punctuation mark from the text
    text=text.translate(str.maketrans('','',''',!"#%&'()*+-/:;<=>?@[\]^_`{|}~₹$'''))
    # lower case each letter of the word
    text=text.lower()
    if re.fullmatch(r"rm\s*[0-9]*\.?[0-9]+",text):
        text=re.sub(r'[^\d\.]','',text)
    return text

# check if amount is the total amount of invoice or not with the help of json labels. 
def check_correctness(text,amount):
    if pd.isna(text):
        return None
    if re.fullmatch(r"[0-9]*\.?[0-9]+", text):
        if float(text) ==float(amount):
            return True
    return False

# Form candidates dataframe which contains features of it's position and neighbour words.
def get_candidates(df,correct_amount):
    cand=pd.DataFrame(columns=['field_id','candidate_position','neighbour_id','neighbour_relative_position','correct_candidate','left','top','width','height','text'])
    cand['left']=df['left']
    cand['top']=df['top']
    cand['width']=df['width']
    cand['height']=df['height']
    cand['field_id']=df['text'].apply(detect_candidate)
    cand['text']=df['text']
    cand['correct_candidate']=df['text'].apply(check_correctness,amount=correct_amount)
    cand.dropna(subset=['field_id','top','width','height','left','text'],inplace=True)
    return cand


In [181]:
 def generate(df,num_neighbours,correct_amount,height,width):
        df['text']=df['text'].apply(preprocess)   #preprocess all the words 
        candidates_df=get_candidates(df,correct_amount)
        df['text']=df['text'].apply(words_to_id)
        df.dropna(subset=['text'],inplace=True)
        
        #Example: for each number get it's closest neighbour words with their positional features for model training  
        for i,cand_row in candidates_df.iterrows():               
            neighbour=dict()
            x1=(cand_row['left']+cand_row['width']/2)/width
            y1=(cand_row['top']+cand_row['height']/2)/height
            for j,neigh_row in df.iterrows():
                id=neigh_row['text'] # earlier each word was converted to it's numerical value , so used here 
                # positions of words need to be normalized 
                # there centroid coordinate is taken in consideration
                if id==vocab["number"]:
                    continue
                x2=(neigh_row['left']+neigh_row['width']/2)/width
                y2=(neigh_row['top']+neigh_row['height']/2)/height
                #Ex. neighbours are searched towards left and half page upwards to the amount
                if x2 > x1 or y2 > y1+0.02 or y2 < y1-0.1:
                    continue
                distance=math.dist([x1,y1],[x2,y2])
                if id in neighbour:
                    if distance<neighbour[id]['dist']:
                        neighbour[id]={
                            'dist':distance,
                            'left':x2,'top':y2
                        }
                else:
                    neighbour[id]={
                        'dist':distance,
                        'left':x2,'top':y2
                    }     
            # if an entity has no neighbours, then there is no point to train it so continue .
            if len(neighbour)==0:
                continue
            # sort to form n closest neighbours
            neighbour=dict(sorted(neighbour.items(), key=lambda item: item[1]['dist'])[:num_neighbours])
            neighbours_remaining=num_neighbours-len(neighbour)
            neighbour_positions=list()
            neighbour_id=list()
            num_valid_values=0
            for key in neighbour:
                if key!=vocab['rare']: 
                   num_valid_values+=1 
                neighbour_id.append(key)
                neighbour_positions.append([neighbour[key]['left']-x1,neighbour[key]['top']-x2])
            
            # if a number is true amount and it does not has valid neighbours then do not take it into consideration for training
            if candidates_df.at[i,'correct_candidate']==True and num_valid_values==0:
                continue
            # To make the data consistent , like if 10 neighbours are needed and only 4 neighbours are present then other values
            # need to be padded with zero's to feed machine learning model.
            while neighbours_remaining: #used for masking
                neighbour_id.append(0)  
                neighbour_positions.append([-1,-1]) 
                neighbours_remaining-=1
    
            candidates_df.at[i,'neighbour_id']=neighbour_id
            candidates_df.at[i,'neighbour_relative_position']=neighbour_positions
            candidates_df.at[i,'candidate_position']=list([float(x1),float(y1)]) 
            
        # remove the invalid rows from the dataframe , the invalid rows has undefined values . 
        candidates_df.dropna(subset=['field_id','candidate_position','neighbour_id','neighbour_relative_position','left','top','width','height'],inplace=True)
        return candidates_df

In [197]:
def generate_dataset(csv_dataset,key,num_neighbours):
        candidates=None
        invoices=os.listdir(csv_dataset)
        length=len(invoices)
        for inv_csv in range(0,length): 
           file_name=str(inv_csv)
           file_name=file_name.zfill(3)
           data=json.load(open(key+'/'+file_name+'.json','r'))
           if 'amount' in data:
               total=str(data['amount'])
           else:
               total=str(data['total'])
           if total=='':
              continue
           total=preprocess(total)
           df=pd.read_csv(f'{csv_dataset}/{file_name}.csv')
           height=max(df['top'].max(),df['height'].max())
           width=max(df['left'].max(),df['width'].max())
           df=generate(df,num_neighbours,total,height,width)
           if type(candidates)!=None :
             if not df.empty:
                candidates=pd.concat([candidates,df])
                if len(candidates[candidates['correct_candidate']==True])==0:
                    print('incorrect')
             else:
                print('a empty dataframe for file',inv_csv)
           else:
             candidates=df  
        candidates.reset_index(inplace=True) 
        return candidates

In [198]:
key='./json_keys'
csv_dataset='./tesseract_invoice_csv_dataset'
dataset=generate_dataset(csv_dataset,key,10)

a empty dataframe for file 249
a empty dataframe for file 413


In [199]:
print('total candidates: ',len(dataset))
print('positive candidates: ',len(dataset[dataset['correct_candidate']==True]))

total candidates:  17088
positive candidates:  913


In [200]:
import tensorflow as tf
from functional_train import Model
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.optimizers import Adam

In [201]:
VOCAB_SIZE=28
EMBEDDING_SIZE=100
NEIGHBOURS=10
HEADS=4
df=dataset
y_train=tf.convert_to_tensor(list(df['correct_candidate']))
cand_pos=tf.convert_to_tensor(list(df['candidate_position']))
neighbours=tf.convert_to_tensor(list(df['neighbour_id']))
neighbour_positions=tf.convert_to_tensor(list(df['neighbour_relative_position']))
field_id=tf.convert_to_tensor(list(df['field_id']))
model = Model(VOCAB_SIZE, EMBEDDING_SIZE, NEIGHBOURS, HEADS)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(0.0001),
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.AUC(),
    ],
)

In [202]:
history=model.fit((field_id,cand_pos,neighbours,neighbour_positions),y_train,epochs=150)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


In [207]:
key='./json_keys'
csv_dataset='./tesseract_invoice_csv_dataset'
invoices=os.listdir(csv_dataset)
total_check=0
correct=0
for inv_csv in range(626,723): #these are cab aggregator service invoice from 626 to 723
   total_check+=1
   file_name=str(inv_csv)
   file_name=file_name.zfill(3) 
   data=json.load(open(key+'/'+file_name+'.json','r'))
   if 'amount' in data:
       total=str(data['amount'])
   else:
       total=str(data['total'])
   if total=='':
      continue
   total=preprocess(total)
   df=pd.read_csv(f'{csv_dataset}/{file_name}.csv')
   height=max(df['top'].max(),df['height'].max())
   width=max(df['left'].max(),df['width'].max())
   candidates=generate(df,10,'-1',height,width)
   if len(candidates)==0:
      continue
   candidates.reset_index(inplace=True)
   cand_pos=tf.convert_to_tensor(list(candidates['candidate_position']))
   neighbours=tf.convert_to_tensor(list(candidates['neighbour_id']))
   neighbour_positions=tf.convert_to_tensor(list(candidates['neighbour_relative_position']))
   field_id=tf.convert_to_tensor(list(candidates['field_id']))
   data=model.predict((field_id,cand_pos,neighbours,neighbour_positions))
   amount_predicted=''
   score=0
   for i in range(len(data)):
      if data[i]>score:
          amount_predicted=candidates.at[i,'text']
          score=data[i]
   if float(amount_predicted)==float(total):
      correct+=1 
print(correct,total_check)

93 97


In [206]:
model.save('amount_large.h5')

  saving_api.save_model(
