In [117]:
import os
import glob
from subprocess import run
import logging
from lxml import etree
import pandas as pd
logger = logging.getLogger(__name__)
def extract_file(pdf_file_path):
    """
    Convert a PDF file into XML, plain-text and TIFF outputs with the poppler tools.

    A directory named after the PDF (file name without its extension) is
    created under ``OUTPUT_DIR`` and ``pdftohtml``, ``pdftotext`` and
    ``pdftoppm`` are invoked with output paths inside that directory.

    Args:
        pdf_file_path: Path of the PDF file to convert.

    Returns:
        A dict with the keys ``'output_xml_file_path'``,
        ``'output_txt_file_path'`` and ``'output_image_file_path'`` holding
        the output paths that were passed to the respective tools.

    Raises:
        Exception: Any error raised while preparing the output directory or
            launching a tool is logged with its traceback and re-raised.
    """
    # Output directory prefix; an empty prefix makes the paths relative.
    OUTPUT_DIR = ''

    try:
        # Extract the name of the pdf file from file_path
        pdf_file_name = os.path.basename(pdf_file_path)
        logger.debug('pdf_file_name : {}'.format(pdf_file_name))

        # Create the per-document output directory
        output_directory_name = os.path.splitext(pdf_file_name)[0]
        logger.debug('output_directory_name : {}'.format(output_directory_name))

        output_path = os.path.join(OUTPUT_DIR, output_directory_name)
        logger.debug('output_path : {}'.format(output_path))

        os.makedirs(output_path, exist_ok=True)

        output_xml_file_path = os.path.join(output_path, pdf_file_name + '.xml')
        output_txt_file_path = os.path.join(output_path, pdf_file_name + '.txt')
        output_image_file_path = os.path.join(output_path, pdf_file_name)

        logger.debug('output_xml_file_path : {}'.format(output_xml_file_path))
        logger.debug('output_txt_file_path : {}'.format(output_txt_file_path))

        commands = [
            # Convert PDF to XML using poppler
            ['pdftohtml', pdf_file_path, '-c', '-hidden', '-xml', '-s',
             '-nomerge', output_xml_file_path],
            # Convert PDF to Text using poppler
            ['pdftotext', pdf_file_path, output_txt_file_path],
            # Convert PDF to Image using poppler
            ['pdftoppm', '-rx', '300', '-ry', '300', '-tiff', pdf_file_path,
             output_image_file_path],
        ]

        for command in commands:
            completed = run(command)
            # A failing tool does not raise, so surface its exit status in the
            # log instead of ignoring it.
            if completed.returncode != 0:
                logger.warning(
                    '%s exited with status %s for %s',
                    command[0], completed.returncode, pdf_file_path,
                )

        # Return generated output file paths
        return {
            'output_xml_file_path': output_xml_file_path,
            'output_txt_file_path': output_txt_file_path,
            'output_image_file_path': output_image_file_path,
        }
    except Exception:
        # logger.exception() requires a message; calling it without one raised
        # a TypeError that hid the real failure.
        logger.exception('Failed to extract {}'.format(pdf_file_path))
        raise
        
def read_text_boxes_from_xml(xml_file_path):
    """
    Read every text box from an XML file made of <page> and <text> elements.

    Only <page> elements that are direct children of the root, and <text>
    elements that are direct children of a page, are considered.

    Args:
        xml_file_path: Path of the XML file to parse.

    Returns:
        A list with one row per <text> element, laid out as
        ``[text, top, left, width, height, page number, page width,
        page height]``. ``text`` is the full text content of the element;
        the remaining entries are the attribute values as returned by
        ``Element.get`` (``None`` when the attribute is absent).
    """
    xml_root = etree.parse(xml_file_path).getroot()

    # Read the XML root tag
    logger.debug("Root Tag {}".format(xml_root.tag))

    # Page tags directly below the root
    pages = xml_root.findall('page')
    logger.debug("Pages : {}".format(pages))

    text_boxes = []

    # Iterate each page to extract all text boxes in that page
    for page in pages:
        page_number = page.get('number')
        page_width = page.get('width')
        page_height = page.get('height')

        for text_tag in page.findall('text'):
            # itertext() also yields the text that follows inline markup and
            # the text of nested markup, both of which are lost when only
            # the .text of the element and of its direct children is read.
            text_content = ''.join(text_tag.itertext())
            logger.debug("Text: {}\n".format(text_content))

            text_boxes.append([
                text_content,
                text_tag.get('top'),
                text_tag.get('left'),
                text_tag.get('width'),
                text_tag.get('height'),
                page_number,
                page_width,
                page_height,
            ])

    return text_boxes


def find_common_words(text_boxes):
    """
    Count how many times each text value occurs in the given text boxes.

    Args:
        text_boxes: Iterable of rows whose first item is the text of the box.

    Returns:
        A dict mapping each distinct first item to its number of occurrences,
        keyed in order of first appearance.
    """
    occurrences = {}

    for row in text_boxes:
        key = row[0]
        if key in occurrences:
            occurrences[key] += 1
        else:
            occurrences[key] = 1

    return occurrences

In [120]:
# Extract the text boxes of every invoice PDF matched by the glob, show them
# as a DataFrame and export them to an Excel workbook.
file_name_glob = '../../data/train/request_history/8ef79f79-1697-4710-b1b9-d004006a21ab/invoicefiles/*.pdf'

pdf_file_names = glob.glob(file_name_glob)

all_text_boxes = []

for pdf_file_path in pdf_file_names:
    output_files = extract_file(pdf_file_path)
    text_boxes = read_text_boxes_from_xml(output_files['output_xml_file_path'])
    # Extend in place instead of rebuilding the whole list for every file.
    all_text_boxes.extend(text_boxes)

all_text_boxes_df = pd.DataFrame(all_text_boxes)

print(all_text_boxes_df)

# The context manager saves and closes the workbook on exit;
# ExcelWriter.save() is no longer available in pandas 2.x.
with pd.ExcelWriter('text_boxes.xlsx') as writer:
    all_text_boxes_df.to_excel(writer, sheet_name='Sheet1')



                                                      0     1    2    3   4  \
0                                           Tax Invoice    64  393  110  19   
1     (See Section 31 of GST Act & Rule 1 of Invoice...    86  267  362  14   
2                           Emtex Engineering Pvt. Ltd.   129  327  209  15   
3                                  Khasra No. 401 & 402   146  376  111  10   
4                            MG Road, New Delhi-110030,   159  357  150  10   
5                                       State Code : 07   172  393   78  10   
6                                Original For Recipient   141  723  109  10   
7                             Duplicate For Transporter   178  723  128  10   
8                               Triplicate For Supplier   215  723  110  10   
9                                            Extra Copy   252  723   56  10   
10                                                PAN :   288   39   29  10   
11                                           AACCE84