In [None]:
import webbrowser
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint
import pdf2image
import os
import csv
import shutil

In [None]:
def extract_jpg(file_name,csv_output_file):
    """Extract the tables of a single image file and save them as CSV text.

    Parameters
    ----------
    file_name : str
        Path of the image handed to ``get_table_csv_results``.
    csv_output_file : str
        Path of the CSV file to create; an existing file is overwritten.
    """
    table_csv = get_table_csv_results(file_name)
    # Written as UTF-8 explicitly: csv_json() reads this file back with
    # encoding="utf8", and the platform default encoding may differ.
    with open(csv_output_file, "w", encoding="utf8") as fout:
        fout.write(table_csv)

In [9]:
def extract_pdf(file_name,csv_output_file,dpi=500):
    """Extract the tables of every page of a PDF and save them as one CSV file.

    Each page is rendered to a JPEG, passed to ``get_table_csv_results``, and
    the per-page CSV text is concatenated in page order.

    Parameters
    ----------
    file_name : str
        Path of the PDF to convert.
    csv_output_file : str
        Path of the CSV file to create; an existing file is overwritten.
    dpi : int, optional
        Resolution used when rasterising the pages (default 500, the value
        this function previously hard-coded).
    """
    import tempfile
    from pdf2image import convert_from_path

    pages = convert_from_path(file_name, dpi)
    page_tables = []
    # The scratch image lives in a private temporary directory so that it
    # cannot overwrite an 'out.jpeg' in the working directory, and it is
    # removed even if the extraction of a page raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        page_image = os.path.join(scratch_dir, 'out.jpeg')
        for page in pages:
            page.save(page_image, 'JPEG')
            page_tables.append(get_table_csv_results(page_image))

    # UTF-8 to match the encoding csv_json() uses when reading the file back.
    with open(csv_output_file, "w", encoding="utf8") as fout:
        fout.write("".join(page_tables))

    print('CSV OUTPUT FILE: ',csv_output_file)

In [10]:
def get_rows_columns_map(table_result, blocks_map):
    """Collect the cell texts of one table block, keyed by row and column.

    Parameters
    ----------
    table_result : dict
        The table block; its 'CHILD' relationships list the ids of its cells.
    blocks_map : dict
        Mapping of block id -> block for every block of the response.

    Returns
    -------
    dict
        ``{row_index: {column_index: text}}``; empty when the table has no
        cell children.
    """
    rows = {}
    # A block may carry no 'Relationships' key at all (get_text guards against
    # the same situation), so a missing or empty entry means "no cells".
    for relationship in table_result.get('Relationships') or ():
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            cell = blocks_map[child_id]
            if cell['BlockType'] != 'CELL':
                continue
            # setdefault creates the row dict the first time a row is seen.
            row = rows.setdefault(cell['RowIndex'], {})
            row[cell['ColumnIndex']] = get_text(cell, blocks_map)
    return rows


def get_text(result, blocks_map):
    """Build the text of a block from its child blocks.

    Every 'WORD' child contributes its text followed by a space, and every
    'SELECTION_ELEMENT' child whose status is 'SELECTED' contributes 'X '.
    A block without a 'Relationships' entry yields an empty string.
    """
    if 'Relationships' not in result:
        return ''

    fragments = []
    for link in result['Relationships']:
        if link['Type'] != 'CHILD':
            continue
        for block_id in link['Ids']:
            child = blocks_map[block_id]
            if child['BlockType'] == 'WORD':
                fragments.append(child['Text'] + ' ')
            if child['BlockType'] == 'SELECTION_ELEMENT':
                # Only ticked selection elements leave a mark in the text.
                if child['SelectionStatus'] == 'SELECTED':
                    fragments.append('X ')
    return ''.join(fragments)


def get_table_csv_results(file_name):
    """Analyse an image with the Textract client and return its tables as CSV text.

    The file is read as raw bytes and sent to ``analyze_document`` with the
    'TABLES' feature. Every block of type "TABLE" in the response is rendered
    with ``generate_table_csv`` (tables are numbered from 1) and followed by
    two newlines. When the response contains no table block, the marker
    string "<b> NO Table FOUND </b>" is returned instead.
    """
    with open(file_name, 'rb') as image_file:
        payload = bytearray(image_file.read())

    # Submit the raw image bytes for table analysis.
    textract = boto3.client('textract')
    response = textract.analyze_document(Document={'Bytes': payload}, FeatureTypes=['TABLES'])

    # Index every block by id and remember the table blocks in response order.
    blocks_by_id = {}
    tables = []
    for block in response['Blocks']:
        blocks_by_id[block['Id']] = block
        if block['BlockType'] == "TABLE":
            tables.append(block)

    if not tables:
        return "<b> NO Table FOUND </b>"

    return ''.join(
        generate_table_csv(table, blocks_by_id, number) + '\n\n'
        for number, table in enumerate(tables, start=1)
    )

def generate_table_csv(table_result, blocks_map, table_index):
    """Render one table block as CSV text.

    The output starts with a 'Table: Table_<table_index>' title line and a
    blank line, then one line per table row, then three newlines. As before,
    every row ends with a trailing comma (i.e. a final empty field).

    Parameters
    ----------
    table_result : dict
        The table block to render.
    blocks_map : dict
        Mapping of block id -> block, used to resolve the cell texts.
    table_index : int
        Number used in the title line.

    Returns
    -------
    str
        The CSV text of the table.
    """
    rows = get_rows_columns_map(table_result, blocks_map)

    buffer = io.StringIO()
    buffer.write('Table: {0}\n\n'.format('Table_' + str(table_index)))

    # csv.writer quotes cells that contain a comma, a quote or a newline;
    # joining raw text with "," would split such a cell into several fields
    # when the file is parsed again.
    writer = csv.writer(buffer, lineterminator='\n')
    # Iterate by index rather than insertion order so rows and columns come
    # out in table order regardless of the order the cells were listed in.
    for row_index in sorted(rows):
        cols = rows[row_index]
        cells = ['{}'.format(cols[col_index]) for col_index in sorted(cols)]
        # The extra empty field reproduces the historical trailing comma.
        writer.writerow(cells + [''])

    buffer.write('\n\n\n')
    return buffer.getvalue()

In [None]:
def csv_json(file_name,json_output_file):
    """Convert an extracted-table CSV file into a JSON object of key -> values.

    The first CSV row is consumed and discarded (in files produced by
    ``generate_table_csv`` it is the 'Table: ...' title line). Of the
    remaining rows, those with fewer than two fields are ignored. In every
    other row, fields of length <= 1 are dropped and the rest are stripped;
    the first remaining field becomes the key and the others its list of
    values. A later row with the same key replaces the earlier one.

    Parameters
    ----------
    file_name : str
        Path of the CSV file to read (UTF-8).
    json_output_file : str
        Path of the JSON file to write; an existing file is overwritten.

    Returns
    -------
    str
        The JSON text (indent of 4) that was written to ``json_output_file``.
    """
    # Read from the path that was passed in; previously the argument was
    # overwritten with the notebook global `csv_output_file`.
    with open(file_name, 'r', encoding="utf8", newline='') as csvfile:
        csvreader = csv.reader(csvfile)
        # Skip the first row; the default avoids StopIteration on an empty file.
        next(csvreader, None)
        rows = list(csvreader)

    master_dict = {}
    for row in rows:
        if len(row) < 2:
            # Blank lines and single-field lines carry no key/value pair.
            continue
        values = [field.strip() for field in row if len(field) > 1]
        if not values:
            # Nothing usable is left, so there is no key to store under.
            continue
        master_dict[values[0]] = values[1:]

    json_object = json.dumps(master_dict, indent = 4)
    # json_object is already JSON text: write it verbatim. Passing it through
    # json.dump again would store a quoted, escaped string instead of an object.
    with open(json_output_file, 'w', encoding="utf8") as outfile:
        outfile.write(json_object)

    return json_object

# csv_json function checking code starts

In [32]:
# Manual check of csv_json: both paths are relative, so they are resolved
# against the current working directory.
file_name = 'Aerodrome_and_Site_Clearance_Form.csv'
test_json = 'test.json'
# Print the JSON text returned by csv_json.
print(csv_json(file_name,test_json))

{
    "": [
        "Aerodrome and Site Clearance ",
        "Form ",
        ""
    ],
    "Name of the applicant ": [
        "Sudhanshu Chaudhary ",
        "",
        ""
    ],
    "Full Address ": [
        "Anmol Ratan Building ",
        "3rd Floor ",
        ""
    ],
    "Te No./Fax No. ": [
        "9876543210 ",
        "",
        ""
    ],
    "Permanent E-Mail Address ": [
        "Sudhanshu@indianair.com ",
        "",
        ""
    ],
    "Name of the Directors/ Partners/Promoters ": [
        "None ",
        "",
        ""
    ],
    "Nature of the applicant firm or company ": [
        "Public Limited Company ",
        "",
        ""
    ],
    "Location of Aerodrame ": [
        "Greater Noida ",
        "",
        ""
    ],
    "Name of nearest civilian airport ": [
        "IGI ",
        "",
        ""
    ],
    "Nearest Civilain airport distance from proposed airport ": [
        "84.2 kms ",
        "",
        ""
    ],
    "Tota Land Area identified ": [

In [28]:
# List the entries of the current working directory.
os.listdir('.')

['Aerodrome_and_Site_Clearance_Form.csv',
 'Aerodrome_and_Site_Clearance_Form.json']

# csv_json function checking code ends

In [12]:
def core_file_name(file_path):
    """Return the bare name of a file: no directory part and no extension.

    Only the last extension is removed, so 'a/b/archive.tar.gz' gives
    'archive.tar'. Paths without any directory part ('xyz.pdf') or without
    an extension ('dir/name') are handled as well.

    Example: 'asdf//dsfdsf/xyz.pdf' -> 'xyz'

    Parameters
    ----------
    file_path : str
        Path of the file, absolute or relative.

    Returns
    -------
    str
        The file name without directories and without its last extension.
    """
    # basename drops the directories, splitext drops the last extension.
    return os.path.splitext(os.path.basename(file_path))[0]

In [31]:
# Document to process; the generated files are named after it.
file_name = '../Forms/Aerodrome_and_Site_Clearance_Form.pdf'
output_file_name = core_file_name(file_name)

csv_output_file = output_file_name+'.csv'
# Compare the extension case-insensitively so that '.PDF' files are also
# rendered page by page; every other file is treated as a single image.
if file_name.lower().endswith('.pdf'):
    extract_pdf(file_name,csv_output_file)
else:
    extract_jpg(file_name,csv_output_file)

# Convert the extracted CSV into JSON and show the result.
json_output_file = output_file_name+'.json'
result = csv_json(csv_output_file,json_output_file)
print(result)


CSV OUTPUT FILE:  Aerodrome_and_Site_Clearance_Form.csv
{
    "": [
        "Aerodrome and Site Clearance ",
        "Form ",
        ""
    ],
    "Name of the applicant ": [
        "Sudhanshu Chaudhary ",
        "",
        ""
    ],
    "Full Address ": [
        "Anmol Ratan Building ",
        "3rd Floor ",
        ""
    ],
    "Te No./Fax No. ": [
        "9876543210 ",
        "",
        ""
    ],
    "Permanent E-Mail Address ": [
        "Sudhanshu@indianair.com ",
        "",
        ""
    ],
    "Name of the Directors/ Partners/Promoters ": [
        "None ",
        "",
        ""
    ],
    "Nature of the applicant firm or company ": [
        "Public Limited Company ",
        "",
        ""
    ],
    "Location of Aerodrame ": [
        "Greater Noida ",
        "",
        ""
    ],
    "Name of nearest civilian airport ": [
        "IGI ",
        "",
        ""
    ],
    "Nearest Civilain airport distance from proposed airport ": [
        "84.2 kms ",
        "

In [8]:
# Move the generated files into a folder named after the document.
# exist_ok=True lets this cell be re-run without failing on the existing folder.
os.makedirs(output_file_name, exist_ok=True)
shutil.move(csv_output_file, os.path.join(output_file_name, csv_output_file))
shutil.move(json_output_file, os.path.join(output_file_name, json_output_file))

'Land Owner Consent Form/Land Owner Consent Form.json'