In [1]:
# Analyzes a document stored in an S3 bucket with Amazon Textract and draws bounding boxes around the detected key/value sets, tables, cells and selected selection elements.
import boto3
import io
from io import BytesIO
import sys

import math
from PIL import Image, ImageDraw, ImageFont

In [6]:
def ShowBoundingBox(draw,box,width,height,boxColor):
    """Outline a bounding box on the drawing surface.

    The 'Left'/'Top'/'Width'/'Height' entries of ``box`` are multiplied by
    ``width`` and ``height``, i.e. they are treated as fractions of the
    image dimensions and converted to pixel coordinates here.

    Args:
        draw: object exposing ``rectangle(xy, outline=...)`` (e.g. a PIL ImageDraw).
        box: mapping with 'Left', 'Top', 'Width' and 'Height' entries.
        width: image width used to scale horizontal values.
        height: image height used to scale vertical values.
        boxColor: colour passed through as the rectangle outline.
    """
    # Upper-left corner in pixel space.
    x0 = width * box['Left']
    y0 = height * box['Top']
    # Lower-right corner is the upper-left corner plus the scaled extent.
    x1 = x0 + (width * box['Width'])
    y1 = y0 + (height * box['Height'])
    draw.rectangle([x0, y0, x1, y1], outline=boxColor)

def ShowSelectedElement(draw,box,width,height,boxColor):
    """Draw a filled rectangle over a bounding box on the drawing surface.

    The 'Left'/'Top'/'Width'/'Height' entries of ``box`` are multiplied by
    ``width`` and ``height``, i.e. they are treated as fractions of the
    image dimensions and converted to pixel coordinates here.

    Args:
        draw: object exposing ``rectangle(xy, fill=...)`` (e.g. a PIL ImageDraw).
        box: mapping with 'Left', 'Top', 'Width' and 'Height' entries.
        width: image width used to scale horizontal values.
        height: image height used to scale vertical values.
        boxColor: colour passed through as the rectangle fill.
    """
    # Upper-left corner in pixel space.
    x0 = width * box['Left']
    y0 = height * box['Top']
    # Lower-right corner is the upper-left corner plus the scaled extent.
    x1 = x0 + (width * box['Width'])
    y1 = y0 + (height * box['Height'])
    draw.rectangle([x0, y0, x1, y1], fill=boxColor)

def DisplayBlockInformation(block):
    """Print a readable summary of one block returned by text detection/analysis.

    Always printed: 'Id', 'BlockType' and the 'Geometry' bounding box and
    polygon. Printed only when present in ``block``: 'Text', 'Confidence',
    'Relationships' and 'Page'. Extra details are printed for CELL,
    KEY_VALUE_SET and SELECTION_ELEMENT blocks.

    Args:
        block: dict describing a single block; must contain 'Id',
            'BlockType' and 'Geometry' (with 'BoundingBox' and 'Polygon').

    Raises:
        KeyError: if a required key for the block's type is missing.
    """
    print('Id: {}'.format(block['Id']))
    if 'Text' in block:
        print('    Detected: ' + block['Text'])
    print('    Type: ' + block['BlockType'])

    if 'Confidence' in block:
        print('    Confidence: ' + "{:.2f}".format(block['Confidence']) + "%")

    if block['BlockType'] == 'CELL':
        print("    Cell information")
        print("        Column:" + str(block['ColumnIndex']))
        print("        Row:" + str(block['RowIndex']))
        print("        Column Span:" + str(block['ColumnSpan']))
        # Report the row span from 'RowSpan' (the column span was printed here by mistake).
        print("        RowSpan:" + str(block['RowSpan']))

    if 'Relationships' in block:
        print('    Relationships: {}'.format(block['Relationships']))
    print('    Geometry: ')
    print('        Bounding Box: {}'.format(block['Geometry']['BoundingBox']))
    print('        Polygon: {}'.format(block['Geometry']['Polygon']))

    if block['BlockType'] == "KEY_VALUE_SET":
        print ('    Entity Type: ' + block['EntityTypes'][0])

    if block['BlockType'] == 'SELECTION_ELEMENT':
        print('    Selection element detected: ', end='')

        if block['SelectionStatus'] =='SELECTED':
            print('Selected')
        else:
            print('Not selected')

    if 'Page' in block:
        # 'Page' is numeric in the service response; str + int concatenation
        # would raise TypeError, so format it instead.
        print('Page: {}'.format(block['Page']))
    print()

def process_text_analysis(bucket, document):
    """Run Textract analysis on an image stored in S3 and annotate the image.

    The object is downloaded from S3, sent as raw bytes to Textract
    ``analyze_document`` with the TABLES and FORMS features, and every
    returned block is printed via ``DisplayBlockInformation``. Boxes are
    drawn on the image as follows:

    * KEY_VALUE_SET: red outline for KEY entities, green otherwise
    * TABLE: blue outline
    * CELL: yellow outline
    * SELECTION_ELEMENT with status SELECTED: filled blue

    Finally the annotated image is opened with ``image.show()``.

    Args:
        bucket: name of the S3 bucket holding the document.
        document: key of the image object inside ``bucket``.

    Returns:
        The list stored under 'Blocks' in the Textract response.
    """
    # Get the document from S3
    s3_connection = boto3.resource('s3')
    s3_object = s3_connection.Object(bucket, document)
    s3_response = s3_object.get()

    # Buffer the payload once: PIL decodes it for drawing and the very same
    # bytes are sent to Textract below.
    stream = io.BytesIO(s3_response['Body'].read())
    image = Image.open(stream)

    # Analyze the document
    client = boto3.client('textract')
    response = client.analyze_document(
        Document={'Bytes': stream.getvalue()},
        FeatureTypes=["TABLES", "FORMS"])

    # Alternatively, process using the S3 object directly:
    # response = client.analyze_document(
    #     Document={'S3Object': {'Bucket': bucket, 'Name': document}},
    #     FeatureTypes=["TABLES", "FORMS"])

    # Get the text blocks
    blocks = response['Blocks']
    width, height = image.size
    # A single drawing context serves the whole image; there is no need to
    # rebuild it for every block.
    draw = ImageDraw.Draw(image)
    print ('Detected Document Text')

    # Create image showing bounding boxes for the detected structures
    for block in blocks:
        DisplayBlockInformation(block)

        block_type = block['BlockType']
        bounding_box = block['Geometry']['BoundingBox']

        # A block has exactly one BlockType, so the branches are exclusive.
        if block_type == "KEY_VALUE_SET":
            color = 'red' if block['EntityTypes'][0] == "KEY" else 'green'
            ShowBoundingBox(draw, bounding_box, width, height, color)
        elif block_type == 'TABLE':
            ShowBoundingBox(draw, bounding_box, width, height, 'blue')
        elif block_type == 'CELL':
            ShowBoundingBox(draw, bounding_box, width, height, 'yellow')
        elif block_type == 'SELECTION_ELEMENT':
            if block['SelectionStatus'] == 'SELECTED':
                ShowSelectedElement(draw, bounding_box, width, height, 'blue')

        # Uncomment to draw the polygon of every block
        # points = [(width * point['X'], height * point['Y'])
        #           for point in block['Geometry']['Polygon']]
        # draw.polygon(points, outline='blue')

    # Display the image
    image.show()
    return blocks

In [3]:
# Print the version of the AWS CLI available to this notebook's shell.
!aws --version

aws-cli/1.16.251 Python/3.7.4 Darwin/19.0.0 botocore/1.12.241


In [32]:
%timeit
# https://search-text-bucket.s3-eu-west-1.amazonaws.com/14jun201911.46.pdf.jpeg
bucket = 'search-text-bucket'
document = '14jun201911.46.pdf.jpeg'
blocks=process_text_analysis(bucket,document)
#print("Blocks detected: " + str(block_count))

Detected Document Text
Id: 295681dd-6caa-4aee-aad2-ecaaeb895f48
    Type: PAGE
    Relationships: [{'Type': 'CHILD', 'Ids': ['640273c1-e92c-4b79-89ba-2e20bef5f174', '96f3aa64-0316-4213-9472-1413e5afe947', '77c31b29-b196-4d83-8b69-50bac0d829b4', 'fce182c4-9e8e-4128-b1a5-823b141d2b7c', '6e3745f6-8ada-462c-9fb9-4c75efa9c428', '5e7cba13-4b50-4296-9006-6c3064b30174', 'd11afc06-a03e-4aa1-93a3-335e27211e3b', 'c78dc102-3c5b-4864-b345-adfdaf742abd', 'bcc75742-b2e7-439a-830a-a436b93882ca', '7708781f-3628-439d-87a0-cb5d65e05528', 'fdbc3750-285f-4311-bc47-d2c2785af941', '0d3e9c50-720c-4d1d-a7bb-73b6e869043b', '749ecec6-e450-4df9-8c36-08141c78a20f', 'fed8a561-aa23-4e73-9977-05e49f2f022f', '905d569c-ab19-476b-89af-eaff82f75e70', '3bc3f591-4ff2-4cd2-8f2b-c703b06e8dc9', 'dcf00969-09d1-4a36-92a2-ea15fb724672', '4c72fabb-bdb7-4d78-8e59-5610f8564d21', 'b2ed0772-36be-4de1-881b-2210ebbf9aba', '127a682a-fd1b-4785-a78e-20aa98d85f5e', '03e957dd-2914-469e-ba34-950fd82d970e', 'c004bb8a-e79a-4aa9-a01f-b88ba1d35a

        Bounding Box: {'Width': 0.08031994104385376, 'Height': 0.012651631608605385, 'Left': 0.31164559721946716, 'Top': 0.8336882591247559}
        Polygon: [{'X': 0.31164559721946716, 'Y': 0.8336882591247559}, {'X': 0.3919655382633209, 'Y': 0.8336882591247559}, {'X': 0.3919655382633209, 'Y': 0.8463398814201355}, {'X': 0.31164559721946716, 'Y': 0.8463398814201355}]

Id: 86d54aa1-6181-4af8-9716-509f2699ccd4
    Detected: 307029
    Type: WORD
    Confidence: 99.64%
    Geometry: 
        Bounding Box: {'Width': 0.13265855610370636, 'Height': 0.013399077579379082, 'Left': 0.41836076974868774, 'Top': 0.8337184190750122}
        Polygon: [{'X': 0.41836076974868774, 'Y': 0.8337184190750122}, {'X': 0.5510193705558777, 'Y': 0.8337184190750122}, {'X': 0.5510193705558777, 'Y': 0.8471174836158752}, {'X': 0.41836076974868774, 'Y': 0.8471174836158752}]

Id: 6ce97acb-44de-4c2f-bd10-ad0e2fd42436
    Detected: N.C:
    Type: WORD
    Confidence: 96.91%
    Geometry: 
        Bounding Box: {'Width': 

In [33]:
from describe_variable import describe

# Optional structural dump of the response:
# describe(blocks, 2)

# Keep only the LINE blocks, then show their text as one comma-separated string.
elems = [blk for blk in blocks if blk['BlockType'] == "LINE"]
",".join(blk['Text'] for blk in elems)

'MERCADONA S S.A. A -O,C/ MAYOR, 7 ESPINARDO,MURCIA,TELEFONO,968307114,NIF,A-46103834,07/03/2019 19:51,OP: 105936,FACTURA SIMPLIFICADA 2308-011-043010,Precio,Importe,escripcion,unidad,(e),1 B, ALMENDRA S/A,8,40,4 L SEMI S/LACTO,4,50,18,00,3 GALLETA RELIEV,1,22,3,66,1 COPOS AVENA,0,81,1 COSTILLA BARB,3,99,1 ZANAHORIA BOLS,0,69,2 VENTRESCA ATUN,2,15,4,30,1 PAPEL HIGIENIC,2,70,1 HIGIENICO DOBL,2,07,1 PEPINO,0,478 kg,1,89 e/kg,0,90,PLATANO,0,616 kg,2,29 e/kg,1,4,TOTAL,46,93,TARJETA. BANCARIA,46,93,DETALLE (e),IVA,BASE IMPONIBLE,CUOTA,4%,20,19,0,81,10%,19,24,1,92,21%,3,94,0,83,TOTAL,43,37,3,56,TARJ,9016,AUT 307029,N.C: 44101236,* PAGO TARJETA BANCARIA,A0000000031010,VISA CLASICA,3030,SE ADMITEN DEVOLUCIONES CON TICKET'

In [31]:
# Text of every LINE block in `elems`, in order.
# NOTE(review): the saved output for this cell comes from execution In [31],
# earlier than the In [33] run that defines `elems` above, and it shows a
# different receipt -- re-run the notebook top to bottom to refresh it.
[line['Text'] for line in elems]

['Cm)',
 'Buelo. lniteligtiile',
 'Y al meor precto.',
 'Ede',
 'LIDL SUPERMERCADOS S.A.U.',
 'Avenida Miguel de Cervantes No 110',
 '30009 Murcia',
 'NIF A60195278',
 'www.lidl.es',
 'EUR',
 'Flauta de cereales',
 '1,50 B',
 '2 X 0,75',
 'Pechugas codorniz',
 '3,69 B',
 'Alcachofa unidad',
 '1,16 A',
 '4 X 0,29',
 'Capsulas Intenso',
 '1,99 B',
 'Capsu IS Ristretto',
 '1,99 B',
 'Alitas de pollo',
 '2,39 B',
 'Champ non',
 '0,69 A',
 'Total',
 '13,41',
 'Entregado',
 '20,42',
 'Cambio',
 '-7,01',
 'IVA%',
 'IVA',
 '+',
 'P Neto',
 '=',
 'PVP',
 'A 4 %',
 '0,07',
 '1,78',
 '1,85',
 'B 10 %',
 '1,05',
 '10,51',
 '11,56',
 'Suma',
 '1,12',
 '1?,29',
 '13,41',
 '3508 039467/02',
 '12.01.19 12:55',
 'Devoluciones articulos de bazar con',
 'ticket de compra y embalaje original',
 'en un plazo maximo de 30 dias sin',
 'perjuicio de la ley de garantias.',
 'Horario Tienda Lu a Sa 09:00 a 22:00',
 'Atencion al cliente',
 'WWW lidl.es/contacto Tel 1.900958311',
 'GRACIAS POR SU VISITA']