In [None]:
from paddleocr import PaddleOCR, draw_ocr
from PIL import Image, ImageDraw, ImageFont
import numpy as np
from shapely.geometry import Polygon
import json
import re

In [None]:
import os
os.getcwd()

In [None]:
# Root directory of the project; the config, inbox, output and template paths
# used in the cells below are all built from it.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
work_dir = "E:/Works/narendra_tasks/invoice-ocr/docai/pocr"

In [None]:
# OCR engine used by the page-scanning loop below; configured for English
# text with GPU execution requested.
ocr_model = PaddleOCR(lang='en', use_gpu=True)

### Read Template Configuration file template_config.json

In [343]:
# Load the template definitions: a list of templates, each carrying a name,
# its identifying keywords and the file names of its per-page master JSON.
template_config_file = f"{work_dir}/template_config.json"
# JSON is UTF-8 by specification; being explicit avoids depending on the
# platform's default encoding (for example cp1252 on Windows).
with open(template_config_file, encoding="utf-8") as f:
    configdata = json.load(f)


<div class="alert alert-block alert-danger"> <br> 
    - Read PDF 
    - Convert to Image
    - Scan all pages/images
</div>

#### Read the folder for PDFs and convert them to images

In [344]:
import fitz
import shutil

# Destination for the rendered page images and for PDFs that have been converted.
processed_folder = f"{work_dir}/processed_files"
# Inbox folder that is scanned for PDFs awaiting conversion.
temp_dir = f"{work_dir}/temp_files"

def convert_pdf_to_image(temp_dir, output_dir=None, dpi=300):
    """Render every page of every PDF found in ``temp_dir`` to a PNG file.

    Each page is written to the output folder as ``<pdf stem>_page_<n>.png``
    (``n`` starts at 1). Once all of its pages are rendered, the PDF itself is
    moved into the output folder so it is not converted again on the next run.

    Args:
        temp_dir: Folder to scan for ``*.pdf`` files (the extension match is
            case-insensitive; sub-folders are not searched).
        output_dir: Folder receiving the PNGs and the converted PDFs. Defaults
            to the notebook-level ``processed_folder``.
        dpi: Rendering resolution. PDF user space is 72 units per inch, so the
            zoom factor applied to each page is ``dpi / 72``.

    Returns:
        File names (not full paths) of the PNGs that were written, grouped by
        PDF in alphabetical order and by page number within each PDF. An empty
        list when ``temp_dir`` contains no PDF.
    """
    # Sort so that the processing order does not depend on the file system.
    pdf_names = sorted(
        name for name in os.listdir(temp_dir) if name.lower().endswith(".pdf")
    )
    if not pdf_names:
        return []

    target_dir = processed_folder if output_dir is None else output_dir
    # Saving a pixmap fails when the folder is missing, so create it up front.
    os.makedirs(target_dir, exist_ok=True)

    zoom = dpi / 72
    magnify = fitz.Matrix(zoom, zoom)
    file_array = []
    for filename in pdf_names:
        pdf_path = os.path.join(temp_dir, filename)
        stem = os.path.splitext(filename)[0]

        doc = fitz.open(pdf_path)
        try:
            for count, page in enumerate(doc, start=1):
                image_name = f"{stem}_page_{count}.png"
                pix = page.get_pixmap(matrix=magnify)
                pix.save(os.path.join(target_dir, image_name))
                file_array.append(image_name)
        finally:
            # Always release the handle: a document left open can prevent the
            # move below and would leak if rendering raised.
            doc.close()
        shutil.move(pdf_path, os.path.join(target_dir, filename))
    return file_array

# ----------------------------------------------------

# Convert whatever is waiting in the inbox folder; an empty inbox simply
# leaves the list of page images empty.
files = os.listdir(temp_dir)
image_files = convert_pdf_to_image(temp_dir) if files else []

print(image_files)

['Adani_Commercial_HT_II_page_1.png', 'Adani_Commercial_HT_II_page_2.png']


#### Loop over all converted images - find the template on the first page; once a template is found, process all images for invoice field extraction
<div class="alert alert-block alert-danger"> <br> 
    Execute Safely - OCR Data
</div>

In [345]:
# Run OCR on every rendered page and keep the raw result together with the
# page dimensions, which are needed later to normalise the bounding boxes.
page_data = []
for image_file in image_files:
    image_file_path = f"{processed_folder}/{image_file}"
    print(image_file_path)

    # Open through a context manager so the file handle is released as soon
    # as the pixels have been copied into the array.
    with Image.open(image_file_path) as page_image:
        pimg = np.asarray(page_image)
    img_height, img_width = pimg.shape[:2]

    ocrdata = ocr_model.ocr(pimg)
    image_result = {
        "image": image_file,
        "ocrdata": ocrdata,
        "image_height": img_height,
        "image_width": img_width,
    }
    page_data.append(image_result)


E:/Works/narendra_tasks/invoice-ocr/docai/pocr/processed_files/Adani_Commercial_HT_II_page_1.png
[2024/03/18 15:12:48] ppocr DEBUG: dt_boxes num : 156, elapsed : 4.127188682556152
[2024/03/18 15:13:53] ppocr DEBUG: rec_res num  : 156, elapsed : 65.04003882408142
E:/Works/narendra_tasks/invoice-ocr/docai/pocr/processed_files/Adani_Commercial_HT_II_page_2.png
[2024/03/18 15:13:54] ppocr DEBUG: dt_boxes num : 219, elapsed : 0.8634653091430664
[2024/03/18 15:15:04] ppocr DEBUG: rec_res num  : 219, elapsed : 69.34561491012573


In [346]:
import pprint
# Inspect the raw OCR output: per page, a list of
# [four corner points, (text, confidence)] entries plus the page dimensions.
pprint.pprint(page_data)

[{'image': 'Adani_Commercial_HT_II_page_1.png',
  'image_height': 3509,
  'image_width': 2475,
  'ocrdata': [[[[[940.0, 36.0], [1194.0, 44.0], [1192.0, 96.0], [939.0, 87.0]],
                (' BILL OF SUPPLY', 0.9744073748588562)],
               [[[92.0, 58.0], [284.0, 58.0], [284.0, 135.0], [92.0, 135.0]],
                ('adani', 0.9991832971572876)],
               [[[2199.0, 58.0],
                 [2416.0, 58.0],
                 [2416.0, 95.0],
                 [2199.0, 95.0]],
                ('Scan QR code for', 0.9979081153869629)],
               [[[1657.0, 77.0],
                 [1819.0, 77.0],
                 [1819.0, 113.0],
                 [1657.0, 113.0]],
                ('Electric ', 0.9574827551841736)],
               [[[1805.0, 77.0],
                 [1967.0, 77.0],
                 [1967.0, 117.0],
                 [1805.0, 117.0]],
                ('429510', 0.999929666519165)],
               [[[1668.0, 110.0],
                 [1783.0, 110.0],
           

#### Transform the raw PaddleOCR data into normalised bounding boxes

In [347]:
# Convert each page's raw OCR result into a list of
# {"bbox": {x, y, width, height}, "text_result": text} entries whose box
# values are percentages of the page width / height.
bill_ocr_data = []

for data_dict in page_data:
    image = data_dict["image"]
    image_width = data_dict["image_width"]
    image_height = data_dict["image_height"]
    ocrdata = data_dict["ocrdata"]

    # The detections of a page sit in the first element of the OCR result.
    # That element can be None when nothing was detected on the page, so fall
    # back to an empty list instead of failing on iteration.
    detections = (ocrdata[0] if ocrdata else None) or []

    ocr_box = []
    for box, (text, confidence) in detections:
        # `box` holds four corner points ordered top-left, top-right,
        # bottom-right, bottom-left.
        # NOTE(review): x is taken from the top-left corner but y from the
        # top-right one, so skewed boxes are only approximated. Left unchanged
        # here since template boxes are compared against these exact values.
        four_co_ord = [
            box[0][0],
            box[1][1],
            box[2][0] - box[0][0],
            box[2][1] - box[1][1],
        ]
        target_bbox = {
            'x': 100 * four_co_ord[0] / image_width,
            'y': 100 * four_co_ord[1] / image_height,
            'width': 100 * four_co_ord[2] / image_width,
            'height': 100 * four_co_ord[3] / image_height,
        }
        ocr_box.append({"bbox": target_bbox, "text_result": text})

    transform_data = {
        "image": image,
        "image_width": image_width,
        "image_height": image_height,
        "ocrdata": ocr_box,
    }
    bill_ocr_data.append(transform_data)
    

In [348]:
# Show the normalised entries of the first page; box values are percentages
# of the page width / height.
pprint.pprint(bill_ocr_data[0]['ocrdata'])
#pprint.pprint(bill_ocr_data[1])

[{'bbox': {'height': 1.4819036762610431,
           'width': 10.181818181818182,
           'x': 37.97979797979798,
           'y': 1.2539184952978057},
  'text_result': ' BILL OF SUPPLY'},
 {'bbox': {'height': 2.19435736677116,
           'width': 7.757575757575758,
           'x': 3.717171717171717,
           'y': 1.6528925619834711},
  'text_result': 'adani'},
 {'bbox': {'height': 1.0544314619549728,
           'width': 8.767676767676768,
           'x': 88.84848484848484,
           'y': 1.6528925619834711},
  'text_result': 'Scan QR code for'},
 {'bbox': {'height': 1.0259333143345684,
           'width': 6.545454545454546,
           'x': 66.94949494949495,
           'y': 2.19435736677116},
  'text_result': 'Electric '},
 {'bbox': {'height': 1.139925904816187,
           'width': 6.545454545454546,
           'x': 72.92929292929293,
           'y': 2.19435736677116},
  'text_result': '429510'},
 {'bbox': {'height': 1.0259333143345684,
           'width': 4.646464646464646,
     

### Search keyword in Template
#### Search only in first page (bill_ocr_data[0])

In [349]:
def search_all_keys(target_data, substrings):
    """Tell whether every keyword occurs in at least one OCR text.

    Args:
        target_data: OCR items, each a dict with a 'text_result' string.
        substrings: Keywords to look for; matching is literal and
            case-sensitive.

    Returns:
        True when each keyword is contained in the 'text_result' of some
        item (trivially True for an empty keyword list), otherwise False.
    """
    for keyword in substrings:
        for item in target_data:
            if keyword in item['text_result']:
                break
        else:
            # No item contained this keyword, so the overall answer is settled.
            return False
    return True

# Pick the template whose keywords all occur on the first page of the bill.
# Every template is tested, so if several qualify the last one in the
# configuration is kept; an empty dict means nothing matched.
found_template = {}

for configitem in configdata:
    substrings = [keyword_cfg['key'] for keyword_cfg in configitem['keywords']]
    if not search_all_keys(bill_ocr_data[0]['ocrdata'], substrings):
        continue
    found_template = configitem.copy()

print(found_template)

{'name': 'Adani Commercial HT II', 'keywords': [{'key': 'adani', 'required': True, 'box': {'x': 3.715670436187399, 'y': 1.6524216524216524, 'width': 7.754442649434572, 'height': 2.2222222222222223}}, {'key': 'COMMERCIAL', 'required': True, 'box': {'x': 39.17609046849758, 'y': 3.4188034188034186, 'width': 8.319870759289175, 'height': 1.0826210826210827}}, {'key': 'HT II', 'required': True, 'box': {'x': 36.42972536348948, 'y': 11.794871794871773, 'width': 5.217422472544567, 'height': 1.1396011396011383}}], 'pages': ['Adani_Commercial_HT_II_page_1.json', 'Adani_Commercial_HT_II_page_2.json']}


In [350]:
def calculate_iou(bbox1, bbox2):
    """Compute the Intersection over Union (IoU) of two axis-aligned boxes.

    Args:
        bbox1: Dict with keys 'x', 'y', 'width' and 'height'; ('x', 'y') is
            the corner from which width and height extend.
        bbox2: Second box, same layout as ``bbox1``.

    Returns:
        Intersection area divided by union area, or 0 when the union area is
        not positive.
    """
    # Far edges of both boxes.
    right1 = bbox1['x'] + bbox1['width']
    bottom1 = bbox1['y'] + bbox1['height']
    right2 = bbox2['x'] + bbox2['width']
    bottom2 = bbox2['y'] + bbox2['height']

    # Edges of the candidate overlap rectangle.
    inter_left = max(bbox1['x'], bbox2['x'])
    inter_top = max(bbox1['y'], bbox2['y'])
    inter_right = min(right1, right2)
    inter_bottom = min(bottom1, bottom2)

    # The rectangle only exists when its edges are not inverted.
    if inter_right >= inter_left and inter_bottom >= inter_top:
        intersection_area = (inter_right - inter_left) * (inter_bottom - inter_top)
    else:
        intersection_area = 0

    area1 = bbox1['width'] * bbox1['height']
    area2 = bbox2['width'] * bbox2['height']
    union_area = area1 + area2 - intersection_area

    if union_area > 0:
        return intersection_area / union_area
    return 0

#### New Comparison Logic with Required and Optional Keywords

In [351]:
def find_matching_template(templates, target_data, iou_threshold=0.3):
    """Return the first template whose required keywords all occur in the OCR data.

    A keyword occurs when its ``key`` is a substring (case-sensitive) of some
    item's ``text_result``. For every occurrence that is examined, the box
    stored in the template is compared with the OCR box and both boxes plus
    their IoU are printed.

    Optional keywords never decide whether a template matches; they are only
    reported and take part in the positional (IoU) check.

    Args:
        templates: Template dicts with a "name" and a "keywords" list; each
            keyword is a dict with "key", "required" and "box".
        target_data: OCR items, each a dict with "text_result" and "bbox".
        iou_threshold: Minimum IoU for a keyword to count as correctly
            positioned.

    Returns:
        A tuple ``(template, matched_optional_keywords, iou_match)``:
        the matching template, the optional keys found (one entry per OCR
        item containing the key) and whether every examined occurrence
        reached ``iou_threshold``. When no template matches the result is
        ``(None, [], False)`` so callers can always unpack three values.
    """
    for template in templates:
        print("Searching Template:",template["name"])
        matched_optional_keywords = []
        iou_match = True
        all_required_found = True

        for template_keyword in template['keywords']:
            key = template_keyword["key"]

            if template_keyword["required"]:
                # Only the first OCR item containing the key is examined.
                hit = next(
                    (item for item in target_data if key in item["text_result"].strip()),
                    None,
                )
                if hit is None:
                    # A required keyword is missing: this template cannot match.
                    all_required_found = False
                    break
                print(template_keyword["box"])
                print(hit["bbox"])
                iou = calculate_iou(template_keyword["box"], hit["bbox"])
                if iou < iou_threshold:
                    iou_match = False
                print("iou:",iou)
            else:
                # Optional keyword: every OCR item containing it is examined.
                for item in target_data:
                    if key not in item["text_result"]:
                        continue
                    print(template_keyword["box"])
                    print(item["bbox"])
                    o_iou = calculate_iou(template_keyword["box"], item["bbox"])
                    if o_iou < iou_threshold:
                        iou_match = False
                    print("oiu: ",o_iou)
                    matched_optional_keywords.append(key)

        if all_required_found:
            return template, matched_optional_keywords, iou_match

    return None, [], False

# Identify the template from the first page of the bill.
match_result = find_matching_template(configdata, bill_ocr_data[0]['ocrdata'])
# The finder may report "no match" as None; substitute an empty result so the
# unpacking below never raises and the "not found" branch stays reachable.
if match_result is None:
    match_result = (None, [], False)
matched_template, matched_optional_keywords, iou_match = match_result

if matched_template:
    print(f"Matching template: {matched_template['name']}")
    if matched_optional_keywords:
        print(f"Matched optional keywords: {matched_optional_keywords}")
    else:
        print("No optional keywords matched")
    print("IOU MATCH: ",iou_match)
else:
    print("No matching template found")

Searching Template: Adani Commercial HT II
{'x': 3.715670436187399, 'y': 1.6524216524216524, 'width': 7.754442649434572, 'height': 2.2222222222222223}
{'x': 3.717171717171717, 'y': 1.6528925619834711, 'width': 7.757575757575758, 'height': 2.19435736677116}
iou: 0.9866873472836515
{'x': 39.17609046849758, 'y': 3.4188034188034186, 'width': 8.319870759289175, 'height': 1.0826210826210827}
{'x': 39.15151515151515, 'y': 3.4482758620689653, 'width': 8.323232323232324, 'height': 1.0259333143345684}
iou: 0.9425837166579175
{'x': 36.42972536348948, 'y': 11.794871794871773, 'width': 5.217422472544567, 'height': 1.1396011396011383}
{'x': 36.44444444444444, 'y': 11.883727557708749, 'width': 3.1515151515151514, 'height': 0.8264462809917356}
iou: 0.438051477788025
Matching template: Adani Commercial HT II
No optional keywords matched
IOU MATCH:  True


### Match position of keywords. If all matched then finalise Template

In [None]:
# For each keyword of the matched template, print the box stored in the
# template next to the box found on the first page of the bill.
for keys in (matched_template or {}).get("keywords", []):
    # Substring match, consistent with how the template itself is matched:
    # OCR text can carry surrounding characters (e.g. ' BILL OF SUPPLY').
    target_bbox_dict = result_dict = next(
        (item for item in bill_ocr_data[0]['ocrdata'] if keys['key'] in item['text_result']),
        None,
    )
    if target_bbox_dict is None:
        print(f"Keyword not found on first page: {keys['key']}")
        continue
    print(keys['box'], target_bbox_dict['bbox'])


#### Load Template Json file having master data for matching fields
Example : Adani_Commercial_HT_II_page_1.json, Adani_Commercial_HT_II_page_2.json

In [None]:

def extract_data(master_data,bill_data):
    """Match the OCR items of one page against the fields of a master template.

    Every annotated field whose ``box`` is a dict is compared with every OCR
    item by IoU. An IoU of at least 0.3 is reported as a find; an IoU between
    0.1 and 0.3 is reported as a near miss; lower values are silent.
    Annotations without a usable box are reported once and skipped.

    Args:
        master_data: Master template dict; its "annotations" entry maps a
            variable name to a dict that should contain a "box" dict
            ('x', 'y', 'width', 'height').
        bill_data: OCR items, each a dict with "bbox" and "text_result".

    Returns:
        A list with one ``{"variable", "text", "iou"}`` dict per reported
        find, in the order they were printed.
    """
    matches = []
    annotations = master_data.get("annotations", {})
    for variable_name, master_variable_data in annotations.items():
        # Validate the annotation once, before scanning the OCR items, so a
        # malformed entry yields a single message instead of one per item.
        if not isinstance(master_variable_data, dict) or 'box' not in master_variable_data:
            print("No box found: ",master_variable_data)
            continue
        master_box = master_variable_data['box']
        if not isinstance(master_box, dict):
            print("Not dict box:", master_variable_data)
            continue

        for bill_data_item in bill_data:
            try:
                iou = calculate_iou(master_box, bill_data_item["bbox"])
            except TypeError:
                # Non-numeric box values: keep going with the next variable
                # rather than aborting the whole extraction.
                print("Invalid box values: ", master_variable_data)
                break
            if iou >= 0.3:
                print(f"Found: {variable_name}: {bill_data_item['text_result']} == {iou}")
                matches.append({
                    "variable": variable_name,
                    "text": bill_data_item['text_result'],
                    "iou": iou,
                })
            elif iou >= 0.1:
                print("No match found",iou)
    return matches

In [None]:
template_folder = f"{work_dir}/template_data/master_templates"

# Each entry of the template's "pages" list is the master JSON for the bill
# page at the same index. An empty dict (no template found) has no pages.
template_pages = found_template.get('pages', [])
if not template_pages:
    print("No template pages available for extraction")

for i,json_template in enumerate(template_pages):
    if i >= len(bill_ocr_data):
        # The template describes more pages than the bill contains.
        print(f"Skipping {json_template}: the bill has no page {i + 1}")
        continue

    json_file_path = f"{template_folder}/{json_template}"
    # Read the file and close it before the extraction starts.
    with open(json_file_path, 'r', encoding="utf-8") as file:
        master_data = json.load(file)

    # Compare the bill's OCR data for this page with the master variable data.
    print(f"Extracting using {json_template}")
    extract_data(master_data,bill_ocr_data[i]['ocrdata'])
        