In [1]:
import webbrowser
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint
import pdf2image
import os
import csv
import shutil

In [2]:
def extract_jpg(file_name,csv_output_file):
    table_csv = get_table_csv_results(file_name)
    with open(csv_output_file, "w") as fout:
        fout.write(table_csv)
    
    print('CSV OUTPUT FILE: ',csv_output_file)

In [3]:
def extract_pdf(file_name,csv_output_file):    
    from pdf2image import convert_from_path
    pages = convert_from_path(file_name, 500)
    table_csv=""
    for page in pages:
        page.save('out.jpeg', 'JPEG')
        table_csv+= get_table_csv_results('out.jpeg')
        os.remove("out.jpeg")
         
    with open(csv_output_file, "w") as fout:
        fout.write(table_csv)
        
    print('CSV OUTPUT FILE: ',csv_output_file)

In [4]:
def get_rows_columns_map(table_result, blocks_map):
    rows = {}
    for relationship in table_result['Relationships']:
        if relationship['Type'] == 'CHILD':
            for child_id in relationship['Ids']:
                cell = blocks_map[child_id]
                if cell['BlockType'] == 'CELL':
                    row_index = cell['RowIndex']
                    col_index = cell['ColumnIndex']
                    if row_index not in rows:
                        # create new row
                        rows[row_index] = {}
                        
                    # get the text value
                    rows[row_index][col_index] = get_text(cell, blocks_map)
    return rows


def get_text(result, blocks_map):
    text = ''
    if 'Relationships' in result:
        for relationship in result['Relationships']:
            if relationship['Type'] == 'CHILD':
                for child_id in relationship['Ids']:
                    word = blocks_map[child_id]
                    if word['BlockType'] == 'WORD':
                        text += word['Text'] + ' '
                    if word['BlockType'] == 'SELECTION_ELEMENT':
                        if word['SelectionStatus'] =='SELECTED':
                            text +=  'X '    
    return text


def get_table_csv_results(file_name):

    with open(file_name, 'rb') as file:
        img_test = file.read()
        bytes_test = bytearray(img_test)
        print('Image loaded', file_name)

    # process using image bytes
    # get the results
    client = boto3.client('textract')

    response = client.analyze_document(Document={'Bytes': bytes_test}, FeatureTypes=['TABLES'])

    # Get the text blocks
    blocks=response['Blocks']
    pprint(blocks)

    blocks_map = {}
    table_blocks = []
    for block in blocks:
        blocks_map[block['Id']] = block
        if block['BlockType'] == "TABLE":
            table_blocks.append(block)

    if len(table_blocks) <= 0:
        return "<b> NO Table FOUND </b>"

    csv = ''
    for index, table in enumerate(table_blocks):
        csv += generate_table_csv(table, blocks_map, index +1)
        csv += '\n\n'

    return csv

def generate_table_csv(table_result, blocks_map, table_index):
    rows = get_rows_columns_map(table_result, blocks_map)

    table_id = 'Table_' + str(table_index)
    
    # get cells.
    csv = 'Table: {0}\n\n'.format(table_id)

    for row_index, cols in rows.items():
        
        for col_index, text in cols.items():
            csv += '{}'.format(text) + ","
        csv += '\n'
        
    csv += '\n\n\n'
    return csv

In [5]:
def csv_json(file_name,json_output_file):
    file_name= csv_output_file
    rows=[]
    with open(file_name, 'r',encoding="utf8") as csvfile:        
        csvreader = csv.reader(csvfile) 

        # extracting field names through first row 
        fields = next(csvreader) 

        # extracting each data row one by one 
        for row in csvreader: 
            rows.append(row) 
            
    #removing empty lines
    master_list = []
    for i in range(len(rows)):  
        if(len(rows[i])>=2):
            master_list.append(rows[i])
            
    master_dict = {}
    for i in master_list:
        master_dict[i[0]] = i[1:]
        
    json_object = json.dumps(master_dict, indent = 4)    
    with open(json_output_file,'w') as outfile:
        json.dump(json_object, outfile)
        
    #print(json_object)    
    return json_object

In [6]:
def core_file_name(file_path):
    # This function has been created to get the actaul name of the file
    # i.e irrespective of any file extensions(.pdf/.jpg etc.) and path extensions(//.../../)
    # E.G: from asdf//dsfdsf/xyz.pdf it will return xyz only 
    
    #CAUTION : VERY NOOB CODE :P
    
    #removing the extension 
    file_path = file_path[::-1]
    file_less_ext = file_path[file_path.index('.')+1:][::-1]
    
    #print(file_less_ext)
    
    #removing the path to get the actual name
    if(file_less_ext[::-1].index("/")):
        #path_index = file_less_ext[::-1].index("/")        
        temp_less_path = file_less_ext[::-1]        
        temp_less_path=temp_less_path[:temp_less_path.index('/')]
        final_name = temp_less_path[::-1]
        
    else:
        final_name = file_less_ext
        
    return final_name

In [7]:
file_name = 'Forms/Land Owner Consent Form.pdf'
output_file_name = core_file_name(file_name)
#print(output_file_name)

csv_output_file = output_file_name+'.csv'
if(file_name[-3:]=="pdf"):
    extract_pdf(file_name,csv_output_file)
    
else:
    extract_jpg(file_name,csv_output_file)
    
json_output_file = output_file_name+'.json'    
result = csv_json(csv_output_file,json_output_file)
print(result)


Image loaded out.jpeg
[{'BlockType': 'PAGE',
  'Geometry': {'BoundingBox': {'Height': 1.0,
                               'Left': 0.0,
                               'Top': 0.0,
                               'Width': 1.0},
               'Polygon': [{'X': 0.0, 'Y': 0.0},
                           {'X': 1.0, 'Y': 0.0},
                           {'X': 1.0, 'Y': 1.0},
                           {'X': 0.0, 'Y': 1.0}]},
  'Id': '842b07eb-c5cb-42ed-ab72-87d3f1821722',
  'Relationships': [{'Ids': ['0525b0e3-a9ac-403f-8a6d-d9fc8a95d430',
                             '5add532e-7341-432b-88ad-98885b68eb7f',
                             '8ffe8061-8b0a-4cf9-a791-f147ed57bb0b',
                             'a2e26e82-bf92-4336-8bd0-f60b0375db44',
                             '3763124e-1197-4bdc-8bb9-dac69135abf2',
                             '5d889194-d28f-4648-bab1-a5b343d98be6',
                             'c95fe97d-13a6-4fd1-b09e-e76568adec85',
                             '5490d259-2ac1-49b

  'Confidence': 99.91378021240234,
  'Geometry': {'BoundingBox': {'Height': 0.01121996995061636,
                               'Left': 0.24668622016906738,
                               'Top': 0.16449524462223053,
                               'Width': 0.06491845846176147},
               'Polygon': [{'X': 0.24668622016906738, 'Y': 0.16449524462223053},
                           {'X': 0.31160467863082886, 'Y': 0.16449524462223053},
                           {'X': 0.31160467863082886, 'Y': 0.17571520805358887},
                           {'X': 0.24668622016906738,
                            'Y': 0.17571520805358887}]},
  'Id': '9c7826ce-6e6c-42f7-ada1-0e46476e91ec',
  'Text': 'Address'},
 {'BlockType': 'WORD',
  'Confidence': 99.95372772216797,
  'Geometry': {'BoundingBox': {'Height': 0.013653839938342571,
                               'Left': 0.41905486583709717,
                               'Top': 0.1646329164505005,
                               'Width': 0.08206573128700256

                            'Y': 0.35217827558517456}]},
  'Id': '40bb6bdc-79b8-4a53-b889-69a429f656e7',
  'Text': 'Mandir'},
 {'BlockType': 'WORD',
  'Confidence': 99.9496078491211,
  'Geometry': {'BoundingBox': {'Height': 0.014194340445101261,
                               'Left': 0.15870344638824463,
                               'Top': 0.36207500100135803,
                               'Width': 0.07447335124015808},
               'Polygon': [{'X': 0.15870344638824463, 'Y': 0.36207500100135803},
                           {'X': 0.2331767976284027, 'Y': 0.36207500100135803},
                           {'X': 0.2331767976284027, 'Y': 0.3762693405151367},
                           {'X': 0.15870344638824463,
                            'Y': 0.3762693405151367}]},
  'Id': 'b9c746d5-9504-46a1-983f-667a98c0d75b',
  'Text': 'Approval'},
 {'BlockType': 'WORD',
  'Confidence': 99.90777587890625,
  'Geometry': {'BoundingBox': {'Height': 0.01132460031658411,
                               '

               'Polygon': [{'X': 0.15010227262973785, 'Y': 0.15933698415756226},
                           {'X': 0.4165653586387634, 'Y': 0.15933698415756226},
                           {'X': 0.4165653586387634, 'Y': 0.1969892829656601},
                           {'X': 0.15010227262973785,
                            'Y': 0.1969892829656601}]},
  'Id': 'a8800f05-67e6-4d23-84e9-a19683b1f1a6',
  'Relationships': [{'Ids': ['94a33dd8-c2f1-49f9-b99a-5395f17417c2',
                             '9c7826ce-6e6c-42f7-ada1-0e46476e91ec',
                             'a5f63a5f-bcfb-4cf5-a013-9ddeddc269da',
                             '7370ddc8-35cf-4eff-ab3a-b40d3a421289',
                             'd6c97c5d-235c-46c6-9e86-19948267e5a5'],
                     'Type': 'CHILD'}],
  'RowIndex': 4,
  'RowSpan': 1},
 {'BlockType': 'CELL',
  'ColumnIndex': 2,
  'ColumnSpan': 1,
  'Confidence': 99.98334503173828,
  'Geometry': {'BoundingBox': {'Height': 0.03765229508280754,
                       

{
    "Land ": [
        "Owner Consent Form ",
        "",
        ""
    ],
    "Name of Owner ": [
        "Shambu Sharma ",
        "",
        ""
    ],
    "Phone Number Of Land Owner ": [
        "0120-1234569 ",
        "",
        ""
    ],
    "Permanent Address of Land Owner ": [
        "Balampur",
        " Uttar Pradesh ",
        "",
        ""
    ],
    "Name of buyer organization/ Individual ": [
        "AAI ",
        "",
        ""
    ],
    "Phone Number Of Owner ": [
        "011-5669742 ",
        "",
        ""
    ],
    "Permanent Address organization Individual ": [
        "New Delhi ",
        "A-123",
        " Safdarjung Enclave ",
        ""
    ],
    "Total Area of Land ": [
        "20acres ",
        "",
        ""
    ],
    "Total Cost of Land ": [
        "2cr ",
        "",
        ""
    ],
    "Address of Plot ": [
        "Jewar Village ",
        "X-987 opposite Mandir ",
        ""
    ],
    "Approval from Local Authority ": [
        "Do

In [8]:
# Moving generated file to specific folder of that name
os.mkdir(output_file_name)
shutil.move(csv_output_file,output_file_name+'/'+csv_output_file)
shutil.move(json_output_file,output_file_name+'/'+json_output_file)

'Land Owner Consent Form/Land Owner Consent Form.json'