In [2]:
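# Analyzes a document stored in an S3 bucket with Amazon Textract (TABLES and FORMS features),
# prints every detected block, and draws bounding boxes around key-value sets, tables, cells
# and selected selection elements.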
import boto3
import io
from io import BytesIO
import sys

import math
from PIL import Image, ImageDraw, ImageFont

def ShowBoundingBox(draw,box,width,height,boxColor):
    """Outline a bounding box on the drawing surface.

    Args:
        draw: Object exposing ``rectangle(xy, outline=...)`` (the caller in
            this file passes a PIL ``ImageDraw`` instance).
        box: Mapping with 'Left', 'Top', 'Width' and 'Height' entries; each
            is multiplied by the image dimensions below.
        width: Horizontal scale factor (the caller passes the image width).
        height: Vertical scale factor (the caller passes the image height).
        boxColor: Colour used for the rectangle outline.
    """
    # Top-left corner, scaled by the supplied dimensions.
    x0 = width * box['Left']
    y0 = height * box['Top']

    # Bottom-right corner is the top-left corner offset by the scaled size.
    x1 = x0 + (width * box['Width'])
    y1 = y0 + (height * box['Height'])

    draw.rectangle([x0, y0, x1, y1], outline=boxColor)

def ShowSelectedElement(draw,box,width,height,boxColor):
    """Paint a filled rectangle over a bounding box.

    Args:
        draw: Object exposing ``rectangle(xy, fill=...)`` (the caller in this
            file passes a PIL ``ImageDraw`` instance).
        box: Mapping with 'Left', 'Top', 'Width' and 'Height' entries; each
            is multiplied by the image dimensions below.
        width: Horizontal scale factor (the caller passes the image width).
        height: Vertical scale factor (the caller passes the image height).
        boxColor: Colour used to fill the rectangle.
    """
    origin_x, origin_y = width * box['Left'], height * box['Top']
    corner_x = origin_x + (width * box['Width'])
    corner_y = origin_y + (height * box['Height'])
    draw.rectangle([origin_x, origin_y, corner_x, corner_y], fill=boxColor)

# Displays information about a block returned by text detection and text analysis
def DisplayBlockInformation(block):
    """Print a human-readable summary of one text-analysis block.

    Always prints the block's id, type and geometry. Text, confidence,
    relationships and page are printed only when the corresponding key is
    present. CELL, KEY_VALUE_SET and SELECTION_ELEMENT blocks get extra,
    type-specific lines.

    Args:
        block: Mapping describing one block. Must contain 'Id', 'BlockType'
            and 'Geometry' (with 'BoundingBox' and 'Polygon'); CELL blocks
            must also carry 'ColumnIndex', 'RowIndex', 'ColumnSpan' and
            'RowSpan'; KEY_VALUE_SET blocks 'EntityTypes'; SELECTION_ELEMENT
            blocks 'SelectionStatus'.

    Raises:
        KeyError: If a key required for the block's type is missing.
    """
    block_type = block['BlockType']

    print('Id: {}'.format(block['Id']))
    if 'Text' in block:
        print('    Detected: ' + block['Text'])
    print('    Type: ' + block_type)

    if 'Confidence' in block:
        print('    Confidence: ' + "{:.2f}".format(block['Confidence']) + "%")

    if block_type == 'CELL':
        print("    Cell information")
        print("        Column:" + str(block['ColumnIndex']))
        print("        Row:" + str(block['RowIndex']))
        print("        Column Span:" + str(block['ColumnSpan']))
        # Fixed: this line used to print ColumnSpan under the RowSpan label.
        print("        RowSpan:" + str(block['RowSpan']))

    if 'Relationships' in block:
        print('    Relationships: {}'.format(block['Relationships']))
    print('    Geometry: ')
    print('        Bounding Box: {}'.format(block['Geometry']['BoundingBox']))
    print('        Polygon: {}'.format(block['Geometry']['Polygon']))

    if block_type == "KEY_VALUE_SET":
        print ('    Entity Type: ' + block['EntityTypes'][0])

    if block_type == 'SELECTION_ELEMENT':
        print('    Selection element detected: ', end='')

        if block['SelectionStatus'] =='SELECTED':
            print('Selected')
        else:
            print('Not selected')

    if 'Page' in block:
        # Fixed: Page may be an integer, so it must be converted before
        # concatenation (str + int raises TypeError).
        print('Page: ' + str(block['Page']))
    print()

def process_text_analysis(bucket, document):
    """Analyze an image stored in S3 with Textract and show it annotated.

    Downloads the object, sends its bytes to ``analyze_document`` with the
    TABLES and FORMS features, prints every returned block, draws a
    rectangle for key-value sets (red for KEY, green otherwise), tables
    (blue) and cells (yellow), fills selected selection elements in blue,
    and finally opens the annotated image with ``image.show()``.

    Args:
        bucket: Name of the S3 bucket holding the document.
        document: Key of the document object inside the bucket.

    Returns:
        The number of blocks in the analysis response.
    """
    # Fetch the document once; the same bytes feed both PIL and Textract.
    s3_object = boto3.resource('s3').Object(bucket, document)
    image_binary = s3_object.get()['Body'].read()
    image = Image.open(io.BytesIO(image_binary))

    # Analyze the document.
    # Alternatively, let Textract read the object itself by passing
    # Document={'S3Object': {'Bucket': bucket, 'Name': document}}.
    client = boto3.client('textract')
    response = client.analyze_document(
        Document={'Bytes': image_binary},
        FeatureTypes=["TABLES", "FORMS"])

    blocks = response['Blocks']
    width, height = image.size
    # One drawing context is enough for the whole image; it was previously
    # re-created for every block inside the loop.
    draw = ImageDraw.Draw(image)
    print ('Detected Document Text')

    # Block types that are simply outlined in a fixed colour.
    outline_colors = {'TABLE': 'blue', 'CELL': 'yellow'}

    for block in blocks:
        DisplayBlockInformation(block)

        block_type = block['BlockType']
        if block_type == "KEY_VALUE_SET":
            color = 'red' if block['EntityTypes'][0] == "KEY" else 'green'
            ShowBoundingBox(draw, block['Geometry']['BoundingBox'], width, height, color)
        elif block_type in outline_colors:
            ShowBoundingBox(draw, block['Geometry']['BoundingBox'], width, height,
                            outline_colors[block_type])
        elif block_type == 'SELECTION_ELEMENT':
            if block['SelectionStatus'] == 'SELECTED':
                ShowSelectedElement(draw, block['Geometry']['BoundingBox'], width, height, 'blue')

        # To outline the polygon of every block instead, build the point list
        #   [(width * p['X'], height * p['Y']) for p in block['Geometry']['Polygon']]
        # and pass it to draw.polygon(points, outline='blue').

    # Display the image
    image.show()
    return len(blocks)


def main():
    """Analyze the sample document and report how many blocks were found."""
    block_count = process_text_analysis('textract-manohar', 'sample.jpg')
    print("Blocks detected: {}".format(block_count))
    
# Run the analysis only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Detected Document Text
Id: c902ea92-03f2-4a93-b688-b53d0b5f7f37
    Type: PAGE
    Relationships: [{'Type': 'CHILD', 'Ids': ['a2dd6958-308e-43f7-8289-36ea69bfc208', '4893247c-6894-4c36-9958-b9fcb55c54e7', '46bb2283-d42f-4cd2-9bbd-ee2bf08971fa', '0567bcd2-fe6e-446b-8379-b748f9e21ac0', '57d1066b-0c09-4cab-8c24-b0c7df0a8726', '1fd81148-2741-4bdb-8d37-4120e559628a', '50de5c44-13a5-4d06-bab6-186e9b0aed36', 'e000c5f8-015a-452a-93b8-a9b25f2ce502', '70a3b78e-d26b-47f0-a5bc-b43646dc4d28', '5bc55626-9d66-4f02-a79e-f60dd3e3c9a5', '2c2023fc-9203-442d-a072-fe43db254355', '2dca6b00-5896-41c8-bd30-77707581f02f', '73749ea6-bc15-4c8a-adb9-ab94506f7346', 'e016d1bb-6949-4939-a42e-34362bac885a', 'd4d5e61a-1c21-4c5f-b186-0cd5d5dcf4b8', 'addfd632-0ac1-4e24-9d04-0426f3b6f503', 'bf622a6c-6f39-4fea-927c-4d37e4d4d21a', '9aa755a3-10f4-4050-a2d0-42e60f202fba', '3ef62345-9ce2-4f7a-a2be-a40f9fe8eeb8', '8ccef41c-2b4c-4afb-91ad-0f97780fe19d', '5ab74cdb-060b-4d12-ad56-c7aa642a3a51', 'ce11960c-f94e-4add-a458-c5c8df7eae

Id: c8c014ab-f8f5-49f8-a873-8e51b54499b7
    Type: CELL
    Confidence: 99.99%
    Cell information
        Column:1
        Row:1
        Column Span:1
        RowSpan:1
    Relationships: [{'Type': 'CHILD', 'Ids': ['58d75a9b-fccf-4d3e-a11c-78e0c99bfa60']}]
    Geometry: 
        Bounding Box: {'Width': 0.11863131076097488, 'Height': 0.025557780638337135, 'Left': 0.10873619467020035, 'Top': 0.3246380388736725}
        Polygon: [{'X': 0.10873619467020035, 'Y': 0.3246380388736725}, {'X': 0.22736750543117523, 'Y': 0.3246380388736725}, {'X': 0.22736750543117523, 'Y': 0.35019582509994507}, {'X': 0.10873619467020035, 'Y': 0.35019582509994507}]

Id: fa278f80-207e-4347-bb72-f1df63d6db95
    Type: CELL
    Confidence: 99.99%
    Cell information
        Column:2
        Row:1
        Column Span:1
        RowSpan:1
    Relationships: [{'Type': 'CHILD', 'Ids': ['e7ee7847-95d1-4307-84c7-b9e3a447a752']}]
    Geometry: 
        Bounding Box: {'Width': 0.24413980543613434, 'Height': 0.0255577806383

Blocks detected: 318
