In [1]:
from langchain_community.document_loaders import AmazonTextractPDFLoader
from langchain_community.document_loaders import PDFPlumberLoader
from langchain.prompts import PromptTemplate
from langchain.llms.bedrock import Bedrock
from langchain.chains import LLMChain
import boto3
import botocore

# botocore settings for the Bedrock client: 900-second connect/read timeouts
# and a retry policy with max_attempts set to 3.
# NOTE(review): the long timeouts are presumably there for slow generations
# on large documents — confirm.
config = botocore.config.Config(
    retries={"max_attempts": 3},
    connect_timeout=900,
    read_timeout=900,
)

# Bedrock runtime client in us-east-1, built with the settings above.
bedrock_client = boto3.client(
    "bedrock-runtime",
    region_name="us-east-1",
    config=config,
)

# Textract client in us-west-2.
# NOTE(review): this is a different region from the Bedrock client;
# presumably it matches the region of the S3 bucket below — confirm.
textract_client = boto3.client(service_name="textract", region_name="us-west-2")

# Ask Textract for the LAYOUT feature and load the PDF straight from S3.
textract_features = ["LAYOUT"]
file_path = "s3://my-s3-doc-loader/2667551.pdf"
loader = AmazonTextractPDFLoader(
    file_path,
    textract_features=textract_features,
    client=textract_client,
)
docs = loader.load()

In [3]:
# Merge the text of every loaded document into a single string for the prompt.
# The pieces are joined with a newline so that the last token of one piece
# cannot fuse with the first token of the next one (plain `+=` concatenation
# left no separator between them). An empty `docs` yields an empty string.
all_page_content = "\n".join(doc.page_content for doc in docs)

In [6]:
# Prompt text sent to the model. The single placeholder `datas` at the end is
# filled with the extracted document text when the chain is run.
# The literal is whitespace-sensitive (it contains indented and blank lines),
# so editing it changes the exact prompt the model receives.
# NOTE(review): the template never asks for JSON-only output; the response
# captured in this notebook begins with a prose sentence and a Markdown code
# fence, so it presumably needs stripping before it can be parsed as JSON.
template = """
    Must convert datas into JSON format

    Instructions:
        - Do not skip any datas while converting into JSON
        - I need all the datas to be in JSON format
        - Avoid mismatching of the datas
    
    Important Instructions:
        - Strictly print all the terms like as it is in the document while converting JSON, Do not make abbreviation on your own for all the text
        

    
{datas}
"""

In [7]:
# Wrap the raw template; `datas` is its only input variable.
qa_prompt = PromptTemplate(template=template, input_variables=["datas"])

# Claude 2.1 served through the Bedrock runtime client.
# - temperature is kept at a near-zero value (1e-10), as before.
# - max_tokens_to_sample is 4096, the documented maximum for the Claude
#   text-completions API on Bedrock; the previous value of 40000 was outside
#   that documented range.
# NOTE(review): a long document converted to JSON may still exceed 4096
# output tokens and come back truncated — check the end of `result`.
llm = Bedrock(
    model_id="anthropic.claude-v2:1",
    client=bedrock_client,
    model_kwargs={"temperature": 1e-10, "max_tokens_to_sample": 4096},
)
llm_chain = LLMChain(prompt=qa_prompt, llm=llm, verbose=False)

# Run the chain on the aggregated document text and show the raw response.
result = llm_chain.run(datas=all_page_content)
print(result)

 Here is the JSON format of the given document:

```json
{
    "document_name": "Indian Customs EDI System - Imports",
    "document_version": "V1.5R001",
    "bill_of_entry_type": "BILL OF ENTRY FOR HOME CONSUMPTION",
    "customs_station": "INMAA1",
    "cha_details": "AGGPP7274BCH002 [SUPREME C & F SERVICES ]",
    "bill_of_entry_details": {
        "be_number": "2667551",
        "be_date": "20/03/2024",
        "be_type": "N/H"
    },
    "importer_details": {
        "iec_code": "0798000066", 
        "pan": "AAACL2937JFT001",
        "ad_code": "0510010",
        "name": "LIFE STYLE INTERNATIONAL PRIVATE LIMITED",
        "address": "14 : CHENNAI IMPORT WAREHOUSE NO.31/1, POOCHIATHIPET VILLAGE OFF REDHILLS, THIRUVALLUVAR RD REDHILLS CHENNAI 600052"
    },
    "payment_method": "Transaction",
    "igm_details": {
        "igm_number": "2371888/19/03/2024",
        "country_of_origin": "CHINA",
        "bill_of_lading_number": "SZXCB24007583",
        "bill_of_lading_date": "08/03