In [1]:
import webbrowser, os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint
import pdf2image
import os
import csv

In [2]:
def extract_jpg(file_name,csv_output_file):
    table_csv = get_table_csv_results(file_name)
    with open(csv_output_file, "w") as fout:
        fout.write(table_csv)
    
    print('CSV OUTPUT FILE: ',csv_output_file)

In [3]:
def extract_pdf(file_name,csv_output_file):    
    from pdf2image import convert_from_path
    pages = convert_from_path(file_name, 500)
    table_csv=""
    for page in pages:
        page.save('out.jpeg', 'JPEG')
        table_csv+= get_table_csv_results('out.jpeg')
        os.remove("out.jpeg")
         
    with open(csv_output_file, "w") as fout:
        fout.write(table_csv)
        
    print('CSV OUTPUT FILE: ',csv_output_file)

In [4]:
def get_rows_columns_map(table_result, blocks_map):
    rows = {}
    for relationship in table_result['Relationships']:
        if relationship['Type'] == 'CHILD':
            for child_id in relationship['Ids']:
                cell = blocks_map[child_id]
                if cell['BlockType'] == 'CELL':
                    row_index = cell['RowIndex']
                    col_index = cell['ColumnIndex']
                    if row_index not in rows:
                        # create new row
                        rows[row_index] = {}
                        
                    # get the text value
                    rows[row_index][col_index] = get_text(cell, blocks_map)
    return rows


def get_text(result, blocks_map):
    text = ''
    if 'Relationships' in result:
        for relationship in result['Relationships']:
            if relationship['Type'] == 'CHILD':
                for child_id in relationship['Ids']:
                    word = blocks_map[child_id]
                    if word['BlockType'] == 'WORD':
                        text += word['Text'] + ' '
                    if word['BlockType'] == 'SELECTION_ELEMENT':
                        if word['SelectionStatus'] =='SELECTED':
                            text +=  'X '    
    return text


def get_table_csv_results(file_name):

    with open(file_name, 'rb') as file:
        img_test = file.read()
        bytes_test = bytearray(img_test)
        print('Image loaded', file_name)

    # process using image bytes
    # get the results
    client = boto3.client('textract')

    response = client.analyze_document(Document={'Bytes': bytes_test}, FeatureTypes=['TABLES'])

    # Get the text blocks
    blocks=response['Blocks']
    pprint(blocks)

    blocks_map = {}
    table_blocks = []
    for block in blocks:
        blocks_map[block['Id']] = block
        if block['BlockType'] == "TABLE":
            table_blocks.append(block)

    if len(table_blocks) <= 0:
        return "<b> NO Table FOUND </b>"

    csv = ''
    for index, table in enumerate(table_blocks):
        csv += generate_table_csv(table, blocks_map, index +1)
        csv += '\n\n'

    return csv

def generate_table_csv(table_result, blocks_map, table_index):
    rows = get_rows_columns_map(table_result, blocks_map)

    table_id = 'Table_' + str(table_index)
    
    # get cells.
    csv = 'Table: {0}\n\n'.format(table_id)

    for row_index, cols in rows.items():
        
        for col_index, text in cols.items():
            csv += '{}'.format(text) + ","
        csv += '\n'
        
    csv += '\n\n\n'
    return csv

In [5]:
def csv_json(file_name):
    file_name= csv_output_file
    rows=[]
    with open(file_name, 'r',encoding="utf8") as csvfile:        
        csvreader = csv.reader(csvfile) 

        # extracting field names through first row 
        fields = next(csvreader) 

        # extracting each data row one by one 
        for row in csvreader: 
            rows.append(row) 
            
    #removing empty lines
    master_list = []
    for i in range(len(rows)):  
        if(len(rows[i])>=2):
            master_list.append(rows[i])
            
    master_dict = {}
    for i in master_list:
        master_dict[i[0]] = i[1:]
        
    json_object = json.dumps(master_dict, indent = 4)  
    json_output_file = output_file_name+'.json'
    with open(json_output_file,'w') as outfile:
        json.dump(json_object, outfile)
        
    #print(json_object)    
    return json_object

In [6]:
def core_file_name(file_path):
    # This function has been created to get the actaul name of the file
    # i.e irrespective of any file extensions(.pdf/.jpg etc.) and path extensions(//.../../)
    # E.G: from asdf//dsfdsf/xyz.pdf it will return xyz only 
    
    #CAUTION : VERY NOOB CODE :P
    
    #removing the extension 
    file_path = file_path[::-1]
    file_less_ext = file_path[file_path.index('.')+1:][::-1]
    
    #print(file_less_ext)
    
    #removing the path to get the actual name
    if(file_less_ext[::-1].index("/")):
        #path_index = file_less_ext[::-1].index("/")        
        temp_less_path = file_less_ext[::-1]        
        temp_less_path=temp_less_path[:temp_less_path.index('/')]
        final_name = temp_less_path[::-1]
        
    else:
        final_name = file_less_ext
        
    return final_name

In [7]:
file_name = 'Forms/Land Owner Consent Form.pdf'
output_file_name = core_file_name(file_name)
#print(output_file_name)

csv_output_file = output_file_name+'.csv'
if(file_name[-3:]=="pdf"):
    extract_pdf(file_name,csv_output_file)
    
else:
    extract_jpg(file_name,csv_output_file)
    
    
result = csv_json(csv_output_file)
print(result)


Land Owner Consent Form
Image loaded out.jpeg
[{'BlockType': 'PAGE',
  'Geometry': {'BoundingBox': {'Height': 1.0,
                               'Left': 0.0,
                               'Top': 0.0,
                               'Width': 1.0},
               'Polygon': [{'X': 0.0, 'Y': 0.0},
                           {'X': 1.0, 'Y': 0.0},
                           {'X': 1.0, 'Y': 1.0},
                           {'X': 0.0, 'Y': 1.0}]},
  'Id': 'f35a91dc-3b08-4cae-96a0-e3396f51da33',
  'Relationships': [{'Ids': ['a0d47da6-d43c-4abd-9322-aaa3ced0ed79',
                             '3aa34057-9791-4bd2-9d5d-799d49df99ef',
                             'cb9dde6c-3fdd-40ca-b5da-8262271498f6',
                             '28114df6-3947-4f6e-9afb-c260d1c9b0b7',
                             '2ecfa449-583f-4af7-9c93-bf164acfdb28',
                             'd8788e4f-e374-4740-9011-bb31be393c6d',
                             '1b89adee-6863-424a-95e9-0488345dd150',
                       

  'Id': '0da7824a-ad69-4176-b9bd-0fc0ccddecdf',
  'Relationships': [{'Ids': ['a7d0de24-03a2-4438-b8e9-cd755d5d2bfb',
                             '69c09bce-a976-4044-be89-91396885ddec'],
                     'Type': 'CHILD'}],
  'Text': 'In progress'},
 {'BlockType': 'WORD',
  'Confidence': 99.86685180664062,
  'Geometry': {'BoundingBox': {'Height': 0.0159549992531538,
                               'Left': 0.32446542382240295,
                               'Top': 0.0700971707701683,
                               'Width': 0.0667736828327179},
               'Polygon': [{'X': 0.32446542382240295, 'Y': 0.0700971707701683},
                           {'X': 0.39123910665512085, 'Y': 0.0700971707701683},
                           {'X': 0.39123910665512085, 'Y': 0.08605217188596725},
                           {'X': 0.32446542382240295,
                            'Y': 0.08605217188596725}]},
  'Id': '32f6be7b-1cee-42a5-91bb-1426ffb428dd',
  'Text': 'Land'},
 {'BlockType': 'WORD',
  'Conf

                               'Left': 0.30708590149879456,
                               'Top': 0.23962008953094482,
                               'Width': 0.05429128184914589},
               'Polygon': [{'X': 0.30708590149879456, 'Y': 0.23962008953094482},
                           {'X': 0.36137717962265015, 'Y': 0.23962008953094482},
                           {'X': 0.36137717962265015, 'Y': 0.2506343126296997},
                           {'X': 0.30708590149879456,
                            'Y': 0.2506343126296997}]},
  'Id': '4e648b9f-5e6d-42fe-8ef4-0acc2befeebc',
  'Text': 'Owner'},
 {'BlockType': 'WORD',
  'Confidence': 99.62513732910156,
  'Geometry': {'BoundingBox': {'Height': 0.011650090105831623,
                               'Left': 0.4177606403827667,
                               'Top': 0.23917478322982788,
                               'Width': 0.10524725914001465},
               'Polygon': [{'X': 0.4177606403827667, 'Y': 0.23917478322982788},
                  

               'Polygon': [{'X': 0.4188205897808075, 'Y': 0.42663347721099854},
                           {'X': 0.4357873201370239, 'Y': 0.42663347721099854},
                           {'X': 0.4357873201370239, 'Y': 0.4375911355018616},
                           {'X': 0.4188205897808075, 'Y': 0.4375911355018616}]},
  'Id': 'a7d0de24-03a2-4438-b8e9-cd755d5d2bfb',
  'Text': 'In'},
 {'BlockType': 'WORD',
  'Confidence': 99.95259857177734,
  'Geometry': {'BoundingBox': {'Height': 0.011057649739086628,
                               'Left': 0.4403235614299774,
                               'Top': 0.42909950017929077,
                               'Width': 0.06709001213312149},
               'Polygon': [{'X': 0.4403235614299774, 'Y': 0.42909950017929077},
                           {'X': 0.5074135661125183, 'Y': 0.42909950017929077},
                           {'X': 0.5074135661125183, 'Y': 0.4401571452617645},
                           {'X': 0.4403235614299774, 'Y': 0.440157145261764

                             '3986c390-f361-4c6c-9554-4b516bec61dc',
                             '7dcaaba0-423d-46c5-b891-7dcb5d088c89'],
                     'Type': 'CHILD'}],
  'RowIndex': 9,
  'RowSpan': 1},
 {'BlockType': 'CELL',
  'ColumnIndex': 2,
  'ColumnSpan': 1,
  'Confidence': 99.98334503173828,
  'Geometry': {'BoundingBox': {'Height': 0.021754659712314606,
                               'Left': 0.4165653586387634,
                               'Top': 0.3141297698020935,
                               'Width': 0.227045476436615},
               'Polygon': [{'X': 0.4165653586387634, 'Y': 0.3141297698020935},
                           {'X': 0.6436108350753784, 'Y': 0.3141297698020935},
                           {'X': 0.6436108350753784, 'Y': 0.3358844220638275},
                           {'X': 0.4165653586387634, 'Y': 0.3358844220638275}]},
  'Id': '5ebe1540-fda2-4b89-95a3-8f110e9aa8b2',
  'Relationships': [{'Ids': ['8242ce01-c47f-443d-b8fd-5517c9e44179'],
              