# Using Textract

In [1]:
# Load up the usual packages
import pandas as pd
from collections import OrderedDict
import requests

In [2]:
# AWS Python SDK
import boto3
# Create a Textract client bound explicitly to the us-east-1 region
textract = boto3.client(service_name='textract', region_name='us-east-1')

## From the command line

In [3]:
# Use Textract from the CLI
# The IPython `!` prefix runs the AWS CLI in a shell; assigning the result
# captures the command's output as a list of lines in `text`.
# The request asks for TABLES and FORMS analysis of the object
# wsj_text.jpeg stored in the nlp-course S3 bucket.
text = !aws textract analyze-document \
 --document '{"S3Object":{"Bucket":"nlp-course","Name":"wsj_text.jpeg"}}' \
 --feature-types '["TABLES","FORMS"]'

text

['{',
 '    "DocumentMetadata": {',
 '        "Pages": 1',
 '    },',
 '    "Blocks": [',
 '        {',
 '            "BlockType": "PAGE",',
 '            "Geometry": {',
 '                "BoundingBox": {',
 '                    "Width": 1.0,',
 '                    "Height": 1.0,',
 '                    "Left": 0.0,',
 '                    "Top": 0.0',
 '                },',
 '                "Polygon": [',
 '                    {',
 '                        "X": 0.0,',
 '                        "Y": 0.0',
 '                    },',
 '                    {',
 '                        "X": 1.0,',
 '                        "Y": 0.0',
 '                    },',
 '                    {',
 '                        "X": 1.0,',
 '                        "Y": 1.0',
 '                    },',
 '                    {',
 '                        "X": 0.0,',
 '                        "Y": 1.0',
 '                    }',
 '                ]',
 '            },',
 '            "Id": "7513eae2-0adf-

## Use the Python interface with a Bytes object

In [4]:
# Use a function for processing images in S3 using Python
import io
import io
from io import BytesIO
import sys
import math
from PIL import Image, ImageDraw, ImageFont


def process_text_analysis(bucket, document):
    """Analyze an S3-hosted document with Textract, sending it as raw bytes.

    The object is fetched from S3 and its content is passed inline to
    Textract's ``analyze_document`` call through the ``Bytes`` document field.

    Args:
        bucket: Name of the S3 bucket that holds the document.
        document: Key (file name) of the document inside ``bucket``.

    Returns:
        The response of ``analyze_document`` with the ``TABLES`` and
        ``FORMS`` feature types requested.
    """
    # Get the document from S3
    s3_connection = boto3.resource('s3')
    s3_object = s3_connection.Object(bucket, document)
    s3_response = s3_object.get()

    # Read the payload once; Textract takes the raw bytes as-is, so there is
    # no need to decode the image locally or buffer it a second time.
    image_binary = s3_response['Body'].read()

    # Analyze the document using a Bytes object
    client = boto3.client('textract')
    response = client.analyze_document(
        Document={'Bytes': image_binary},
        FeatureTypes=["TABLES", "FORMS"])

    return response

In [5]:
# Sample document location in S3: bucket name and object key
bucket, document = 'nlp-course', 'wsj_text.jpeg'

# Run the analysis and show the raw Textract response
res = process_text_analysis(bucket=bucket, document=document)
res

{'DocumentMetadata': {'Pages': 1},
 'Blocks': [{'BlockType': 'PAGE',
   'Geometry': {'BoundingBox': {'Width': 1.0,
     'Height': 1.0,
     'Left': 0.0,
     'Top': 0.0},
    'Polygon': [{'X': 0.0, 'Y': 0.0},
     {'X': 1.0, 'Y': 0.0},
     {'X': 1.0, 'Y': 1.0},
     {'X': 0.0, 'Y': 1.0}]},
   'Id': 'cb234e5e-e946-49a1-bf02-9d360866b0b2',
   'Relationships': [{'Type': 'CHILD',
     'Ids': ['b5aa0691-400e-46cc-90ed-985916b06700',
      'aa4bdf5e-8f9d-45e9-a8f4-3619a3e57b63',
      '3371bdf9-b9d2-4000-ba71-4a66c42af995',
      '2e1f7c78-8414-4ca6-abf2-001633627bb7',
      '571ddc5f-3d7b-49e2-bcd3-40b76e014eec',
      '52ba24c4-18c2-4482-b73f-8828fcfe3f18',
      'ec14dc8b-5d74-4edc-a683-f275dd35dfdb',
      '7e453ff9-d751-4640-9501-c1bf1036b35f',
      '46085027-02e3-44c8-823c-a0c6df89a583',
      'e59480ca-9a5d-4c15-875a-6e59023543e9',
      'c8f7912e-55fd-491b-acbf-d989be8b281d',
      'b96daa85-10b7-4194-bd07-34944a1c2c0b',
      'a6c86682-6a9e-40ca-95cf-0d9f5fb5fce1',
      '3bccf8bb

## Use the Python interface with an S3 Object

In [6]:
def process_text_analysis(bucket, document):
    """Analyze a document with Textract by referencing it as an S3 object.

    Only the bucket name and object key are sent in the request; the
    document content itself is not downloaded by this function.

    Args:
        bucket: Name of the S3 bucket that holds the document.
        document: Key (file name) of the document inside ``bucket``.

    Returns:
        The response of ``analyze_document`` with the ``TABLES`` and
        ``FORMS`` feature types requested.
    """
    # Analyze the document
    client = boto3.client('textract')

    # Textract is handed an S3 reference, so fetching and decoding the object
    # locally beforehand would only add an unused download.
    response = client.analyze_document(
        Document={'S3Object': {'Bucket': bucket, 'Name': document}},
        FeatureTypes=["TABLES", "FORMS"])

    return response

In [7]:
# Bucket name and object key of the sample document
bucket, document = 'nlp-course', 'wsj_text.jpeg'

# Call the analysis function and display the response it returns
res = process_text_analysis(bucket=bucket, document=document)
res

{'DocumentMetadata': {'Pages': 1},
 'Blocks': [{'BlockType': 'PAGE',
   'Geometry': {'BoundingBox': {'Width': 1.0,
     'Height': 1.0,
     'Left': 0.0,
     'Top': 0.0},
    'Polygon': [{'X': 0.0, 'Y': 0.0},
     {'X': 1.0, 'Y': 0.0},
     {'X': 1.0, 'Y': 1.0},
     {'X': 0.0, 'Y': 1.0}]},
   'Id': 'b0b85fdb-e41d-4508-93e5-b493ad2442bd',
   'Relationships': [{'Type': 'CHILD',
     'Ids': ['f127c48f-d420-4e1b-871a-a8cce580c980',
      'cbe6b405-a2f8-4f4a-947f-2429f064fcc9',
      'cd6c50fa-6dbe-474a-981d-ce5f0afdd3f4',
      '80bb4962-e0f4-4adc-9736-6e6f5d816358',
      'c0d13b82-d13f-40be-adbf-dfe12bd86223',
      '96a71bfc-111d-4ebb-9577-5293c2d2b352',
      '184819bb-9289-4601-b5af-c8f92a94ed7f',
      '39bcb1e9-3eaa-4b14-8626-c1568c8aec1e',
      '93771086-5bd5-4173-bf84-2f3ee4e67af4',
      '78b2bf44-9e99-40e8-9490-fedd91ef5fc5',
      'a0362e0d-ee5b-416a-af1f-654b096c3a2d',
      '2b119ffd-cca7-4c55-9116-ab8945e53a86',
      'b3481a99-9e93-4c44-8055-3d433646e415',
      'd0efa8ce

## Download a file from S3

In [8]:
# For copying files from S3
import os
import sys
import io
import logging
import argparse
import json
from botocore.exceptions import ClientError
import subprocess
import urllib.request as ureq
import datetime as datetime
import boto3
import botocore

# Source object in S3: replace with your own bucket name and object key (file name)
BUCKET_NAME = 'nlp-course'
KEY = 'wsj_text.jpeg'

s3 = boto3.resource('s3')

# Download the object as a local file. A 404 error code is reported with a
# message; every other client error is re-raised unchanged.
try:
    s3.Bucket(BUCKET_NAME).download_file(KEY, 'my_local_image.jpeg')
except botocore.exceptions.ClientError as err:
    if err.response['Error']['Code'] != "404":
        raise
    print("The object does not exist.")

## Upload a file to S3

In [None]:
# Local files path in notebook instance
file_path_root = '/home/ec2-user/SageMaker/'

BUCKET_NAME = 'nlp-course' # replace with your bucket name
KEY = 'wsj_text.jpeg' # replace with your object key (file name)

# Create the S3 client in this cell so the upload also works on a fresh kernel
s3_client = boto3.client('s3')

# The local file is expected under file_path_root, named after the object key
file_path = file_path_root + KEY

# Arguments are: local file name, destination bucket, destination object key
s3_client.upload_file(file_path, BUCKET_NAME, KEY)