# AWS Textract for data extraction from a PDF

References:
- https://www.gormanalysis.com/blog/connecting-to-aws-s3-with-python/
- https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html
- https://docs.aws.amazon.com/textract/latest/dg/examples-blocks.html
- https://github.com/aws-samples/amazon-textract-pdf-text-extractor
- https://github.com/aws-samples/amazon-textract-code-samples/blob/master/python/Amazon-Textract-Pdf.pdf

In [1]:
# import libraries
import os
import openai
from dotenv import load_dotenv, find_dotenv

from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential

# Read the local .env file (located via find_dotenv) before touching os.environ.
_ = load_dotenv(find_dotenv())

# AWS connection settings. Subscripting os.environ (rather than .get) makes a
# missing variable fail immediately with KeyError instead of later on.
my_region_name = os.environ["AWS_DEFAULT_REGION"]
my_aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
my_aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]

In [2]:
import boto3

# S3 resource handle, configured with the region and key pair read from the
# environment above.
s3 = boto3.resource(
    's3',
    region_name=my_region_name,
    aws_access_key_id=my_aws_access_key_id,
    aws_secret_access_key=my_aws_secret_access_key,
)


In [3]:
# Print the name of every bucket yielded by s3.buckets.all(), one per line.
for bucket_name in (entry.name for entry in s3.buckets.all()):
    print(bucket_name)

letsstem.org
mahtabsyed
mahtabsyed.com
mytexttractbucket
www.letsstem.org
www.mahtabsyed.com


In [4]:
import boto3
import time

def start_job(client, s3_bucket_name, object_name):
    """Start a document text-detection job and return its job id.

    Args:
        client: Object exposing ``start_document_text_detection`` (here a
            boto3 Textract client).
        s3_bucket_name: Name of the S3 bucket holding the document.
        object_name: Key of the document inside that bucket.

    Returns:
        The ``"JobId"`` value from the service response.
    """
    s3_object = {'Bucket': s3_bucket_name, 'Name': object_name}
    job = client.start_document_text_detection(
        DocumentLocation={'S3Object': s3_object})
    return job["JobId"]


def is_job_complete(client, job_id, poll_interval=1):
    """Poll a text-detection job until it leaves ``IN_PROGRESS``.

    The job status is fetched repeatedly, sleeping ``poll_interval`` seconds
    before every request, and each observed status is printed.

    Note that despite the name, the return value is the final status *string*
    (never a bool), so it is always truthy. Callers that need to know whether
    the job succeeded must compare it against the expected status, e.g.
    ``status == "SUCCEEDED"``.

    Args:
        client: Object exposing ``get_document_text_detection`` (here a
            boto3 Textract client).
        job_id: Identifier returned when the job was started.
        poll_interval: Seconds to wait before each status request.
            Defaults to 1, matching the previously hard-coded delay.

    Returns:
        The first ``"JobStatus"`` value that is not ``"IN_PROGRESS"``.
    """
    while True:
        # Sleep first so even the initial request gives the job time to start.
        time.sleep(poll_interval)
        status = client.get_document_text_detection(JobId=job_id)["JobStatus"]
        print("Job status: {}".format(status))
        if status != "IN_PROGRESS":
            return status


def get_job_results(client, job_id, page_delay=1):
    """Fetch every result page of a text-detection job.

    Requests are repeated, passing along the ``"NextToken"`` from the previous
    response, until a response carries no (or an empty) ``"NextToken"``. A
    progress line is printed after each page is received.

    Args:
        client: Object exposing ``get_document_text_detection`` (here a
            boto3 Textract client).
        job_id: Identifier of the job whose results are fetched.
        page_delay: Seconds to wait before each request. Defaults to 1,
            matching the previously hard-coded delay.

    Returns:
        A list of the raw response dicts, in the order they were received.
    """
    pages = []
    next_token = None
    while True:
        time.sleep(page_delay)
        # Only the follow-up requests carry a pagination token.
        request = {"JobId": job_id}
        if next_token:
            request["NextToken"] = next_token
        response = client.get_document_text_detection(**request)
        pages.append(response)
        print("Resultset page received: {}".format(len(pages)))

        next_token = response.get("NextToken")
        if not next_token:
            return pages

In [5]:
if __name__ == "__main__":
    # Document to analyse: an object already uploaded to the S3 bucket.
    s3_bucket_name = "mytexttractbucket"
    document_name = "GHI QUOTE.pdf"
    region = my_region_name
    client = boto3.client('textract', region_name=region)

    job_id = start_job(client, s3_bucket_name, document_name)
    print("Started job with id: {}".format(job_id))

    # is_job_complete returns the final status string, which is always truthy,
    # so the status has to be compared explicitly: a failed job must not fall
    # through to result fetching and the ["Blocks"] lookup below.
    status = is_job_complete(client, job_id)
    if status not in ("SUCCEEDED", "PARTIAL_SUCCESS"):
        raise RuntimeError(
            "Textract job {} ended with status {}".format(job_id, status))

    response = get_job_results(client, job_id)

    # Print detected text: one line per LINE block, wrapped in ANSI escape
    # codes (\033[94m ... \033[0m) so it shows up coloured in the output.
    for result_page in response:
        for item in result_page["Blocks"]:
            if item["BlockType"] == "LINE":
                print('\033[94m' + item["Text"] + '\033[0m')

Started job with id: 0d94e48f22c16a7a0ac20c2171232fcbb24e527744b2237d6d2be614a78b4867
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: IN_PROGRESS
Job status: SUCCEEDED
Resultset page received: 1
Resultset page received: 2
Resultset page received: 3
[94mICICI LOMBARD GIC LTD[0m
[94mICICI CLombard[0m
[94mGROUP HEALTH INSURANCE[0m
[94mNibhaye Vaade[0m
[94mQuote cum proposal format[0m
[94mQuote No.(4016/IP-03843503/001)[0m
[94mProposer Details[0m
[94mPolicy Coverages[0m
[94mName of the Proposer[0m
[94mELIXIR ENTERPRISES[0m
[94mPolicy Construct[0m
[94mEmployer - Employee[0m
[94mAND HOTELS PRIVATE[0m
[94mOPD/IPD[0m
[94mIPD[0m
[94mLIMITED[0m
[94mService Category[0m
[94mBoth(Cashless +[0m
[94mLocation of Proposer[0m
[94mBANGALORE[0m
[94mReimbursement)[0m
[94mIndustry Type[0