# In [1]:
import boto3
import numpy as np
import os
import pandas as pd

from collections import defaultdict
from PIL import Image, ImageDraw


def get_kv_map(file_name, client):
    """Analyze one document and index the blocks that come back.

    The file at ``file_name`` is read as raw bytes and passed to
    ``client.analyze_document`` with the ``FORMS`` feature type.

    Args:
        file_name: Path of the document, opened in binary mode.
        client: Object exposing ``analyze_document`` (``main`` passes a boto3
            ``textract`` client).

    Returns:
        A ``(key_map, value_map, block_map)`` tuple of dicts keyed by block
        ``Id``. ``block_map`` contains every returned block; ``key_map``
        contains the ``KEY_VALUE_SET`` blocks whose ``EntityTypes`` include
        ``"KEY"``; ``value_map`` contains the other ``KEY_VALUE_SET`` blocks.
    """
    with open(file_name, "rb") as handle:
        payload = bytearray(handle.read())
        response = client.analyze_document(
            Document={"Bytes": payload}, FeatureTypes=["FORMS"]
        )

        key_map, value_map, block_map = {}, {}, {}
        for entry in response["Blocks"]:
            ident = entry["Id"]
            # Every block is indexed, whatever its type.
            block_map[ident] = entry
            if entry["BlockType"] != "KEY_VALUE_SET":
                continue
            # A KEY_VALUE_SET block is either the key side or the value side.
            target = key_map if "KEY" in entry["EntityTypes"] else value_map
            target[ident] = entry

        return key_map, value_map, block_map

def get_kv_relationship(key_map, value_map, block_map):
    """Pair the text of every key block with the text of its value block.

    Args:
        key_map: Dict of key blocks, keyed by block id.
        value_map: Dict of value blocks, keyed by block id.
        block_map: Dict of all blocks, keyed by block id; used to resolve
            the child blocks that carry the text.

    Returns:
        A ``defaultdict(list)`` mapping each key's text to the list of value
        texts found for it, in the iteration order of ``key_map``. A key
        text that occurs more than once accumulates several values.
    """
    pairs = defaultdict(list)
    # Only the blocks are needed; the ids in key_map are not used here.
    for key_block in key_map.values():
        matched_value = find_value_block(key_block, value_map)
        label = get_text(key_block, block_map)
        content = get_text(matched_value, block_map)
        pairs[label].append(content)
    return pairs

def find_value_block(key_block, value_map):
    """Return the value block that ``key_block`` points to.

    Args:
        key_block: A key block; its ``"Relationships"`` entries of type
            ``"VALUE"`` list the ids of the associated value blocks.
        value_map: Dict of value blocks, keyed by block id.

    Returns:
        The value block for the last id listed in the key's ``"VALUE"``
        relationships, or an empty dict when the key block has no
        ``"Relationships"`` entry or no ``"VALUE"`` relationship.

    Raises:
        KeyError: If a referenced value id is not present in ``value_map``.
    """
    # Start from an empty block so a key without a value yields a result
    # (with no text) instead of raising UnboundLocalError.
    value_block = {}
    for relationship in key_block.get("Relationships", []):
        if relationship["Type"] == "VALUE":
            for value_id in relationship["Ids"]:
                # When several ids are listed, the last one wins.
                value_block = value_map[value_id]
    return value_block

def get_text(result, blocks_map):
    """Assemble the text carried by the children of a block.

    Args:
        result: A block dict; only its ``"CHILD"`` relationships are read.
        blocks_map: Dict of all blocks, keyed by block id.

    Returns:
        The ``Text`` of each ``WORD`` child followed by a space, and ``"X "``
        for each ``SELECTION_ELEMENT`` child whose ``SelectionStatus`` is
        ``"SELECTED"``, concatenated in relationship order. The result
        therefore keeps a trailing space; it is ``""`` when the block has no
        ``"Relationships"`` entry or no matching children.
    """
    if "Relationships" not in result:
        return ""

    fragments = []
    for link in result["Relationships"]:
        if link["Type"] != "CHILD":
            continue
        for child_id in link["Ids"]:
            child = blocks_map[child_id]
            kind = child["BlockType"]
            if kind == "WORD":
                fragments.append(child["Text"] + " ")
            elif kind == "SELECTION_ELEMENT":
                # A ticked selection element is rendered as an "X" mark.
                if child["SelectionStatus"] == "SELECTED":
                    fragments.append("X ")

    return "".join(fragments)

def main(
    image_path="/home/miggy/Desktop/apps/amazon-textract/images/",
    profile_name="kadakareer-dev",
    region_name="ap-southeast-2",
    output_csv="output.csv",
):
    """Extract form key/value pairs from every file in a folder into a CSV.

    Each regular file in ``image_path`` (in sorted name order) is sent
    through ``get_kv_map`` and ``get_kv_relationship``; the per-file results
    are merged into one column per key text, printed as a DataFrame and
    written to ``output_csv``.

    Args:
        image_path: Directory holding the documents to analyze.
        profile_name: AWS profile used to create the boto3 session.
        region_name: AWS region of the Textract client.
        output_csv: Path of the CSV file to write.

    Returns:
        None. When no key/value pair is found, a message is printed and no
        CSV is written.
    """
    session = boto3.Session(profile_name=profile_name)
    client = session.client("textract", region_name=region_name)

    combined_data = {}
    for name in sorted(os.listdir(image_path)):
        # os.path.join works whether or not image_path ends with a separator.
        document_path = os.path.join(image_path, name)
        # os.listdir also returns sub-directories, which cannot be opened
        # as documents.
        if not os.path.isfile(document_path):
            continue

        key_map, value_map, block_map = get_kv_map(document_path, client)
        kvs = get_kv_relationship(key_map, value_map, block_map)

        # NOTE(review): values are appended per key, so a row of the final
        # table does not necessarily correspond to a single document when
        # documents expose different keys or repeat a key.
        for key, values in kvs.items():
            combined_data.setdefault(key, []).extend(values)

    # max() below has nothing to work on if no document produced any pair.
    if not combined_data:
        print("No key-value pairs found; nothing to write.")
        return

    # Find the maximum column length
    max_len = max(len(values) for values in combined_data.values())

    # Pad the shorter columns with NaN so all columns have the same length
    for values in combined_data.values():
        values.extend([np.nan] * (max_len - len(values)))

    # Convert the dictionary into a DataFrame
    df = pd.DataFrame(combined_data)

    # Print the DataFrame
    print(df)

    # Save the DataFrame to a CSV file
    df.to_csv(output_csv, index=False)


# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
