In [1]:
import pdf2image
from pdf2image import convert_from_path
pages = convert_from_path('AAI_doc.pdf', 500)

In [2]:
for page in pages:
    page.save('out.png', 'JPEG')    
    break

In [6]:
import webbrowser, os
import json
import boto3
import io
from io import BytesIO
import sys
from pprint import pprint

In [7]:
def get_rows_columns_map(table_result, blocks_map):
    rows = {}
    for relationship in table_result['Relationships']:
        if relationship['Type'] == 'CHILD':
            for child_id in relationship['Ids']:
                cell = blocks_map[child_id]
                if cell['BlockType'] == 'CELL':
                    row_index = cell['RowIndex']
                    col_index = cell['ColumnIndex']
                    if row_index not in rows:
                        # create new row
                        rows[row_index] = {}
                        
                    # get the text value
                    rows[row_index][col_index] = get_text(cell, blocks_map)
    return rows


def get_text(result, blocks_map):
    text = ''
    if 'Relationships' in result:
        for relationship in result['Relationships']:
            if relationship['Type'] == 'CHILD':
                for child_id in relationship['Ids']:
                    word = blocks_map[child_id]
                    if word['BlockType'] == 'WORD':
                        text += word['Text'] + ' '
                    if word['BlockType'] == 'SELECTION_ELEMENT':
                        if word['SelectionStatus'] =='SELECTED':
                            text +=  'X '    
    return text


def get_table_csv_results(file_name):

    with open(file_name, 'rb') as file:
        img_test = file.read()
        bytes_test = bytearray(img_test)
        print('Image loaded', file_name)

    # process using image bytes
    # get the results
    client = boto3.client('textract')

    response = client.analyze_document(Document={'Bytes': bytes_test}, FeatureTypes=['TABLES'])

    # Get the text blocks
    blocks=response['Blocks']
    pprint(blocks)

    blocks_map = {}
    table_blocks = []
    for block in blocks:
        blocks_map[block['Id']] = block
        if block['BlockType'] == "TABLE":
            table_blocks.append(block)

    if len(table_blocks) <= 0:
        return "<b> NO Table FOUND </b>"

    csv = ''
    for index, table in enumerate(table_blocks):
        csv += generate_table_csv(table, blocks_map, index +1)
        csv += '\n\n'

    return csv

def generate_table_csv(table_result, blocks_map, table_index):
    rows = get_rows_columns_map(table_result, blocks_map)

    table_id = 'Table_' + str(table_index)
    
    # get cells.
    csv = 'Table: {0}\n\n'.format(table_id)

    for row_index, cols in rows.items():
        
        for col_index, text in cols.items():
            csv += '{}'.format(text) + ","
        csv += '\n'
        
    csv += '\n\n\n'
    return csv

In [15]:
file_name = 'last_page.png'
table_csv = get_table_csv_results(file_name)
output_file = 'output.csv'


# replace content
with open(output_file, "w") as fout:
    fout.write(table_csv)

# show the results
print('CSV OUTPUT FILE: ', output_file)

Image loaded last_page.png
[{'BlockType': 'PAGE',
  'Geometry': {'BoundingBox': {'Height': 1.0,
                               'Left': 0.0,
                               'Top': 0.0,
                               'Width': 1.0},
               'Polygon': [{'X': 0.0, 'Y': 0.0},
                           {'X': 1.0, 'Y': 0.0},
                           {'X': 1.0, 'Y': 1.0},
                           {'X': 0.0, 'Y': 1.0}]},
  'Id': 'b7d7dd0c-9fdb-4f57-9b00-6a68f60d3759',
  'Relationships': [{'Ids': ['913a712b-681c-497d-af42-d3d7fae9330b',
                             '57148d7a-d082-4217-9fe1-ff323edde366',
                             'a290a06e-3ad9-409b-b46c-e4399afb86d0',
                             '95dfbf01-92a7-4e4a-896b-055edee97843',
                             '38a4c4fb-56ba-4464-b05e-1fd95bc05bdf',
                             '10cb6836-3714-4f82-869c-ae876cd9b7d3',
                             '37ad900d-8129-41f9-a935-11b1b6c6b428',
                             '56477e7a-eb2

  'Geometry': {'BoundingBox': {'Height': 0.014669179916381836,
                               'Left': 0.707362711429596,
                               'Top': 0.5791321396827698,
                               'Width': 0.1384093165397644},
               'Polygon': [{'X': 0.707362711429596, 'Y': 0.5791321396827698},
                           {'X': 0.8457720279693604, 'Y': 0.5791321396827698},
                           {'X': 0.8457720279693604, 'Y': 0.5938013195991516},
                           {'X': 0.707362711429596, 'Y': 0.5938013195991516}]},
  'Id': 'a0e18886-4e83-4d83-bcd0-c0c75b3ae21b',
  'Relationships': [{'Ids': ['35b7e299-04f0-4518-9780-1d21ee74a350'],
                     'Type': 'CHILD'}],
  'Text': 'PROFORMA-II'},
 {'BlockType': 'LINE',
  'Confidence': 99.35279846191406,
  'Geometry': {'BoundingBox': {'Height': 0.015090377070009708,
                               'Left': 0.2515445053577423,
                               'Top': 0.6179522275924683,
                      

                               'Top': 0.047546178102493286,
                               'Width': 0.05303024873137474},
               'Polygon': [{'X': 0.6738726496696472, 'Y': 0.047546178102493286},
                           {'X': 0.7269029021263123, 'Y': 0.047546178102493286},
                           {'X': 0.7269029021263123, 'Y': 0.062345895916223526},
                           {'X': 0.6738726496696472,
                            'Y': 0.062345895916223526}]},
  'Id': '343abab1-53c6-4438-9d7d-b0c1e6a02c34',
  'Text': 'Delhi.'},
 {'BlockType': 'WORD',
  'Confidence': 99.50025939941406,
  'Geometry': {'BoundingBox': {'Height': 0.014680723659694195,
                               'Left': 0.7334243655204773,
                               'Top': 0.04770456254482269,
                               'Width': 0.034804828464984894},
               'Polygon': [{'X': 0.7334243655204773, 'Y': 0.04770456254482269},
                           {'X': 0.7682291865348816, 'Y': 0.0477045625448

  'Text': 'Office'},
 {'BlockType': 'WORD',
  'Confidence': 99.48426818847656,
  'Geometry': {'BoundingBox': {'Height': 0.013376199640333652,
                               'Left': 0.30161526799201965,
                               'Top': 0.31175294518470764,
                               'Width': 0.06986668705940247},
               'Polygon': [{'X': 0.30161526799201965, 'Y': 0.31175294518470764},
                           {'X': 0.3714819550514221, 'Y': 0.31175294518470764},
                           {'X': 0.3714819550514221, 'Y': 0.3251291513442993},
                           {'X': 0.30161526799201965,
                            'Y': 0.3251291513442993}]},
  'Id': '0b9bb289-6d9d-4887-be8f-4742f88eaa81',
  'Text': 'address'},
 {'BlockType': 'WORD',
  'Confidence': 96.42147064208984,
  'Geometry': {'BoundingBox': {'Height': 0.016675639897584915,
                               'Left': 0.18535426259040833,
                               'Top': 0.3324514925479889,
                  

                               'Width': 0.07408212125301361},
               'Polygon': [{'X': 0.24519215524196625, 'Y': 0.7382687926292419},
                           {'X': 0.31927427649497986, 'Y': 0.7382687926292419},
                           {'X': 0.31927427649497986, 'Y': 0.7527811527252197},
                           {'X': 0.24519215524196625,
                            'Y': 0.7527811527252197}]},
  'Id': '7b7995e7-50fc-4755-92eb-09d453301772',
  'Text': 'Address'},
 {'BlockType': 'WORD',
  'Confidence': 64.80642700195312,
  'Geometry': {'BoundingBox': {'Height': 0.017188429832458496,
                               'Left': 0.18847784399986267,
                               'Top': 0.7603040337562561,
                               'Width': 0.024724500253796577},
               'Polygon': [{'X': 0.18847784399986267, 'Y': 0.7603040337562561},
                           {'X': 0.2132023423910141, 'Y': 0.7603040337562561},
                           {'X': 0.2132023423910141, 'Y':

                               'Top': 0.3090805411338806,
                               'Width': 0.23692914843559265},
               'Polygon': [{'X': 0.23665261268615723, 'Y': 0.3090805411338806},
                           {'X': 0.4735817611217499, 'Y': 0.3090805411338806},
                           {'X': 0.4735817611217499, 'Y': 0.32856816053390503},
                           {'X': 0.23665261268615723,
                            'Y': 0.32856816053390503}]},
  'Id': '23a50c68-9225-44f0-ac7d-3fa2783fe5b7',
  'Relationships': [{'Ids': ['ec607301-854f-45fb-a029-15b7e81e4c3f',
                             '0b9bb289-6d9d-4887-be8f-4742f88eaa81'],
                     'Type': 'CHILD'}],
  'RowIndex': 7,
  'RowSpan': 1},
 {'BlockType': 'CELL',
  'ColumnIndex': 3,
  'ColumnSpan': 1,
  'Confidence': 99.99893951416016,
  'Geometry': {'BoundingBox': {'Height': 0.01948760636150837,
                               'Left': 0.4735817611217499,
                               'Top': 0.30908054113

  'RowSpan': 1},
 {'BlockType': 'CELL',
  'ColumnIndex': 1,
  'ColumnSpan': 1,
  'Confidence': 99.99847412109375,
  'Geometry': {'BoundingBox': {'Height': 0.020678702741861343,
                               'Left': 0.16440127789974213,
                               'Top': 0.777752697467804,
                               'Width': 0.07438138872385025},
               'Polygon': [{'X': 0.16440127789974213, 'Y': 0.777752697467804},
                           {'X': 0.23878267407417297, 'Y': 0.777752697467804},
                           {'X': 0.23878267407417297, 'Y': 0.798431396484375},
                           {'X': 0.16440127789974213, 'Y': 0.798431396484375}]},
  'Id': '8fafa15e-37b5-4b6e-a10b-80396f19280b',
  'Relationships': [{'Ids': ['9cbbbf8c-423f-495c-90d4-27cdbb3e6abe'],
                     'Type': 'CHILD'}],
  'RowIndex': 5,
  'RowSpan': 1},
 {'BlockType': 'CELL',
  'ColumnIndex': 2,
  'ColumnSpan': 1,
  'Confidence': 99.99847412109375,
  'Geometry': {'BoundingBox': {'Heigh

In [9]:
print(type(table_csv))

<class 'str'>


In [16]:
with open(output_file, "w") as fout:
    fout.write("main hoon gian")

PermissionError: [Errno 13] Permission denied: 'output.csv'

In [11]:
import csv
with open('output_file.csv', mode='w') as csv_file:
    
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

In [17]:
type(table_csv)

str