In [1]:
import os
from docai import utils, llm, stream

# Environment variables handed to the docai helpers below. Each value looks
# like a parameter-store path for the "dev" environment rather than the
# resource name itself (presumably AWS SSM parameters — TODO confirm).
env = {
    "FILES_BUCKET_PARAMETER_NAME": "/dev/bucket/files_bucket_name",
    "OPEN_API_KEY_PARAMETER_NAME": "/dev/secret/openai/api_key",
    "EXTRACT_BATCH_QUEUE_PARAMETER_NAME": "/dev/queue/batch_data_queue_name",
    "SCHEMA_TABLE_PARAMETER_NAME": "/dev/table/schema_table_name"
}
# Export the variables into this process. Kept ahead of the helper
# construction below; presumably the helpers read os.environ — TODO confirm.
os.environ.update(env)

# Project helpers; each is called further down with one of the variable
# names defined in `env`.
config = utils.Config()
secrets = utils.Secrets()
resource = utils.Resources()

# Look up the files bucket name and the OpenAI API key by variable name.
# NOTE(review): the key is spelled "OPEN_API_KEY..." (not "OPENAI"); it is
# consistent within this notebook, so it is left as is.
bucket_name = config("FILES_BUCKET_PARAMETER_NAME")
openai_api_key = secrets("OPEN_API_KEY_PARAMETER_NAME")

# Handles to the backing resources. `batch_queue` is not referenced by the
# cells visible in this notebook.
s3_client = resource.get_s3()
batch_queue = resource.get_queue("EXTRACT_BATCH_QUEUE_PARAMETER_NAME")
schema_table = resource.get_table("SCHEMA_TABLE_PARAMETER_NAME")

# Callable LLM client; the next cell invokes it once per prepared request.
client = llm.LLMClient(api_key=openai_api_key)

# Sample documents to run through extraction: (path to a text file, MIME type).
# NOTE(review): "image/jpg" is not the IANA-registered name ("image/jpeg").
# Left unchanged here — confirm what prepare_extraction_request expects.
media = [
    ("../images/sample.jpg.txt", "image/jpg"),
    ("../images/sample.png.txt", "image/png"),
    ("../images/sample.pdf.txt", "application/pdf")
]

# Fetch the extraction schema by name and version. `.get("Item")` yields None
# rather than raising when the response carries no "Item" key, so `schema` may
# be None if the lookup misses — verify before trusting the results below.
schema = schema_table.get_item(
    Key={"schema_name": "test_loe", "schema_version": "9jBVrISmXA"}
).get("Item")

# Build one extraction request per sample document, in the order of `media`.
params = []
for filename, mime_type in media:
    # Read inside a context manager so the file handle is closed right away
    # instead of lingering in the long-lived notebook kernel.
    with open(filename, "r") as sample_file:
        content = sample_file.read()
    document = dict(content=content, mime_type=mime_type)
    param = stream.prepare_extraction_request(schema, document, s3_client, bucket_name)
    params.append(param)

In [2]:
# Run every prepared request through the LLM client, keeping the replies in
# the same order as `params`. An explicit loop (rather than a comprehension)
# leaves the replies gathered so far in `responses` if a call fails midway.
shared_kwargs = dict(s3=s3_client, bucket_name=bucket_name)
responses = []
for param in params:
    responses.append(client(**param, **shared_kwargs))

In [6]:
from rich.pretty import pprint

# Pretty-print each extraction result under a banner that names the source
# document; `responses` and `media` are index-aligned.
for i, response in enumerate(responses):
    document_number = i + 1
    source = media[i]
    pprint(f"------------------ Document {document_number} :: {source} ------------------")
    pprint(response["result"])