## Sample dataset contains pdf files of same type

### Transform pdf files into ocr data in json format

In [9]:
import os
import sys
from glob import glob
import pandas as pd
sys.path.append('/home/nikhil/development/projects/document-extraction/lib/')
import json

from extraction.pdf.PDFExtractor import PDFExtractor


In [79]:
def train(label_texts, value_texts):
    for label in label_texts:
        for value in value_texts:            
            dLeft = int(label['left']) - int(value['left'])
            dTop = int(label['top']) - int(value['top']))
            print("({},{}) - ({},{}) = ({},{})".format(label['left'], label['top'], value['left'], value['top'], dLeft, dTop))
        
    
    

In [80]:
invoice_pdf_files_glob = glob("./document_extraction/data/invoicefiles/*.pdf")
for invoice_pdf_file_path in invoice_pdf_files_glob:
    pdf_file_name = os.path.basename(invoice_pdf_file_path)
    
    print("Extracting OCR data from PDF File \n{}".format(invoice_pdf_file_path))
    pdf_extractor = PDFExtractor(invoice_pdf_file_path)
    text_boxes = pdf_extractor.extract_data()
    print("OCR Extraction Completed")
    
    print("Saving extracted OCR to JSON File")
    text_boxes_df = pd.DataFrame(text_boxes)
    ocr_json_file_name = str(pdf_file_name) + '.json'
    print(ocr_json_file_name)
    text_boxes_df.to_json('./document_extraction/data/ocrfiles/' + ocr_json_file_name, orient='records')
    print("Saved extracted OCR to JSON File\n")
    

Extracting OCR data from PDF File 
./document_extraction/data/invoicefiles/2168.pdf
PDF Data Extracted
OCR Extraction Completed
Saving extracted OCR to JSON File
2168.pdf.json
Saved extracted OCR to JSON File

Extracting OCR data from PDF File 
./document_extraction/data/invoicefiles/2126.pdf
PDF Data Extracted
OCR Extraction Completed
Saving extracted OCR to JSON File
2126.pdf.json
Saved extracted OCR to JSON File

Extracting OCR data from PDF File 
./document_extraction/data/invoicefiles/2178.pdf
PDF Data Extracted
OCR Extraction Completed
Saving extracted OCR to JSON File
2178.pdf.json
Saved extracted OCR to JSON File

Extracting OCR data from PDF File 
./document_extraction/data/invoicefiles/2124.pdf
PDF Data Extracted
OCR Extraction Completed
Saving extracted OCR to JSON File
2124.pdf.json
Saved extracted OCR to JSON File

Extracting OCR data from PDF File 
./document_extraction/data/invoicefiles/2123.pdf
PDF Data Extracted
OCR Extraction Completed
Saving extracted OCR to JSON Fil

### Read annotated data and train model

In [81]:
annotations_file = open("./document_extraction/data/annotations.json", "r")

In [82]:
annotations_json_string = annotations_file.read()
annotations_file.close()

In [83]:
annotations_json = json.loads(annotations_json_string)

In [85]:
for annotatted_invoice in annotations_json:    
    print(annotatted_invoice['invoice_file_name'])
    invoice_json_file = open("./document_extraction/data/ocrfiles/{}.json".format(annotatted_invoice['invoice_file_name']), "r")
    invoice_json_string = invoice_json_file.read()
    invoice_json = json.loads(invoice_json_string)
    invoice_json_file.close()
    
    invoice_text_map = {}
    for invoice_text in invoice_json:
        text = invoice_text.get('text', '')
        texts = invoice_text_map.get(text, [])
        texts.append(invoice_text)    
        invoice_text_map[text] = texts
        
    
    for field in annotatted_invoice['invoice_info_fields']:
        label_text = None;
        value_text = None;
        print("Searching for field label={}".format(field['label']))
        for text, invoice_text in invoice_text_map.items():
            if text == field['label']:
                print('Label found')
                label_text = invoice_text
        
        print("Searching for field value={}".format(field['value']))
        for text, invoice_text in invoice_text_map.items():
            if text == field['value']:
                print('Value found')
                value_text = invoice_text      
                
        train(label_text, value_text)    
        print("\n")

2122.pdf
Searching for field label=Invoice No.
Label found
Searching for field value=SEPLOWN1819-2122
Value found
(451,118) - (414,137) = (37,-19)


2123.pdf
Searching for field label=Invoice No.
Label found
Searching for field value=SEPLOWN1819-2123
Value found
(451,118) - (414,137) = (37,-19)


2124.pdf
Searching for field label=Invoice No.
Label found
Searching for field value=SEPLOWN1819-2124
Value found
(451,118) - (414,137) = (37,-19)


2125.pdf
Searching for field label=Invoice No.
Label found
Searching for field value=SEPLOWN1819-2125
Value found
(451,118) - (414,137) = (37,-19)
(451,118) - (937,63) = (-486,55)
(451,118) - (937,63) = (-486,55)


2126.pdf
Searching for field label=Invoice No.
Label found
Searching for field value=SEPLOWN1819-2126
Value found
(451,118) - (414,137) = (37,-19)
(451,118) - (937,63) = (-486,55)


2167.pdf
Searching for field label=Invoice No.
Label found
Searching for field value=SEPLOWN1819-2167
Value found
(451,118) - (414,137) = (37,-19)
(451,118)

In [93]:
invoice_ocr_files_glob = glob("./document_extraction/data/ocrfiles/*.pdf.json")
for invoice_ocr_file_path in invoice_ocr_files_glob:
    ocr_file_name = os.path.basename(invoice_ocr_file_path)
    ocr_json_file =  open(invoice_ocr_file_path, "r")
    ocr_json_file_string = ocr_json_file.read()
    ocr_json_file.close()
    ocr_json = json.loads(ocr_json_file_string)
    
    for text in ocr_json:
        # Finding Invoice Label
        if text['left'] == "451" and text['top'] == "118":
            print("Invoice Label = ", text.get('text', ''))        
        # Finding Invoice Value
        elif text['left'] == "414" and text['top'] == "137":
            print("Invoice Value = ", text.get('text', ''))
            
    

Invoice Value =  SEPLOWN1819-2125
Invoice Label =  Invoice No.
Invoice Value =  SEPLOWN1819-2123
Invoice Label =  Invoice No.
Invoice Value =  SEPLOWN1819-2124
Invoice Label =  Invoice No.
Invoice Value =  SEPLOWN1819-2126
Invoice Label =  Invoice No.
Invoice Value =  SEPLOWN1819-2167
Invoice Label =  Invoice No.
Invoice Value =  SEPLOWN1819-2122
Invoice Label =  Invoice No.
Invoice Value =  SEPLOWN1819-2168
Invoice Label =  Invoice No.
Invoice Value =  SEPLOWN1819-2178
Invoice Label =  Invoice No.
