# Speak with your documents: extract structured data

- https://python.langchain.com/docs/tutorials/extraction/

- https://python.langchain.com/docs/how_to/structured_output/

- https://python.langchain.com/docs/how_to/output_parser_structured/

- https://api.python.langchain.com/en/latest/core/output_parsers/langchain_core.output_parsers.pydantic.PydanticOutputParser.html

In [None]:
# ! python -m pip install -r requirements.txt

In [None]:
import os
from dotenv import dotenv_values

In [None]:
# Read the settings stored in the local .env file into `config`;
# later cells look individual values up with config[...].
config = dotenv_values("./keys/.env")

In [None]:
from dotenv import dotenv_values
import json
import vertexai
from google.oauth2 import service_account
from vertexai.generative_models import GenerationConfig, GenerativeModel, Image, Part
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

from langchain_community.document_loaders import PyPDFLoader
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List, Optional
import pprint

In [None]:
import getpass
import os

# Turn on LangChain tracing for the calls made in this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# Ask for the API key only when it is not already present in the environment,
# so re-running this cell (or exporting the key beforehand) does not force the
# key to be typed again. The explicit prompt says which secret is expected.
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LANGCHAIN_API_KEY: ")

In [None]:
import pydantic

In [None]:
pydantic.__version__

In [None]:

# Load the Google service-account key (a JSON file) from the local keys/ folder.
# The encoding is stated explicitly so the read does not depend on the
# platform's default text encoding.
# NOTE(review): the key-file path is hard-coded — confirm it stays out of version control.
with open("./keys/complete-tube-421007-208a4862c992.json", encoding="utf-8") as source:
    info = json.load(source)

# Build credentials from the key and initialise the Vertex AI SDK with the
# project and region taken from the .env settings.
vertex_credentials = service_account.Credentials.from_service_account_info(info)
vertexai.init(
    project=config["PROJECT"],
    location=config["REGION"],
    credentials=vertex_credentials,
)

# Publish the Gemini API key from the .env settings as an environment variable.
# The .env entry is spelled with hyphens, the environment variable with underscores.
google_api_key = config["GEMINI-API-KEY"]
os.environ["GEMINI_API_KEY"] = google_api_key

In [None]:
# Load the source PDF; later cells read the text via pages[0].page_content.
# presumably `pages` holds one document per PDF page — confirm against the PyPDFLoader docs.
loader = PyPDFLoader("data/PArser Source 1.pdf")
pages = loader.load()

In [None]:
pages[0]

In [None]:

# To help construct our Chat Messages
from langchain.schema import HumanMessage
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate

# To parse outputs and get structured data back
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

In [None]:
len(pages)

In [None]:
from typing import List, Optional, Union

from pydantic import BaseModel, Field

class EntityDataExtraction(BaseModel):
    # Schema for one company record extracted from a registry document.
    #
    # Every field defaults to None, so a value that is absent from the document
    # does not fail validation. The Field descriptions are part of the schema
    # handed to the language model and are therefore kept verbatim.
    #
    # A `#` comment is used here instead of a docstring on purpose: pydantic
    # copies a class docstring into the generated schema's description, which
    # would alter the prompt built from this schema.
    entity_name: Optional[str] = Field(default=None, description="Name of Entity")
    legal_company_type: Optional[str] = Field(default=None, description="Legal Company Type")
    status: Optional[str] = Field(default=None, description="Status entity")
    # Registration numbers and tax IDs may contain letters or leading zeros,
    # which an int-only field rejects or mangles. Strings are accepted as-is;
    # purely numeric values still validate.
    registration_number_tax_id: Optional[Union[str, int]] = Field(
        default=None,
        description="Registration Number or Tax ID. This could be empty",
    )
    incorporation_date: Optional[str] = Field(
        default=None,
        description="The incorporation Date of the time period in ISO format.",
    )
    country: Optional[str] = Field(default=None, description="Country")
    region_state: Optional[str] = Field(default=None, description="Region or State")
    dissolved_date: Optional[str] = Field(
        default=None,
        description="The Dissolved Date of the time period in ISO format. This could be empty",
    )
    # Kept as text ("TRUE"/"FALSE" per the description), not converted to bool.
    historical: Optional[str] = Field(default=None, description="it is Historical TRUE or FALSE")
    registered_office_address: Optional[str] = Field(
        default=None,
        description="The Registered Office Address of the company",
    )
    main_address_line: Optional[str] = Field(
        default=None,
        description="The Main Address Line of the company",
    )

class Data(BaseModel):
    """Extracted data about Companies."""

    # Wrapper holding a list of EntityDataExtraction records, so a single
    # extraction result can carry several companies instead of just one.
    companies: List[EntityDataExtraction]


In [None]:
EntityDataExtraction.schema()

In [None]:

# Bundle a short description, the JSON schema of EntityDataExtraction and a
# one-line instruction into a single dict.
data = {
    "description": "Company registry Information.",
    "schema": EntityDataExtraction.schema(),
    "instruction": "Extract data according to the schema ",
}

In [None]:

# Parser that validates the model's reply against EntityDataExtraction.
parser = PydanticOutputParser(pydantic_object=EntityDataExtraction)

# Chat prompt made of one human message: the task statement, then the parser's
# format instructions (filled in up front), then the caller's question.
prompt = ChatPromptTemplate(
    input_variables=["question"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    messages=[
        HumanMessagePromptTemplate.from_template(
            "Answer the users question as best as possible about the name of the company Requested.\n"
            "{format_instructions}\n"
            "{question}"
        ),
    ],
)

In [None]:
parser.get_format_instructions()

In [None]:
# Gemini chat model used by the extraction cells below; it is handed the
# service-account credentials built in an earlier cell.
model = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro-001",
    credentials=vertex_credentials,
)

In [None]:
# Ask for one company's record, supplying the text of the first loaded page.
document_query = f"Extract information of company Yuga Studios from this document report: {pages[0].page_content}"

# Fill the prompt, call the model, then parse the reply into an
# EntityDataExtraction instance.
_input = prompt.format_prompt(question=document_query)
output = model.invoke(_input.to_messages())
parsed = parser.parse(output.content)

pprint.pprint(parsed)

In [None]:
# Serialise the parsed model to JSON and load it back to view it as plain Python data.
# NOTE(review): `.json()` is the pydantic v1 spelling; on pydantic v2 it is deprecated — confirm the installed version.
json.loads(parsed.json())

In [None]:
output.content

In [None]:
# Same extraction as before, this time for a different company on the first loaded page.
document_query = f"Extract information of company 'Youthcoin Ltd' from this document report: {pages[0].page_content}"

# Fill the prompt, call the model, then parse the reply into an
# EntityDataExtraction instance.
_input = prompt.format_prompt(question=document_query)
output = model.invoke(_input.to_messages())
parsed = parser.parse(output.content)

pprint.pprint(parsed)

In [None]:
# Serialise the parsed model to JSON and load it back to view it as plain Python data.
# NOTE(review): `.json()` is the pydantic v1 spelling; on pydantic v2 it is deprecated — confirm the installed version.
json.loads(parsed.json())

# Multiple Objects (Under Construction)

In [None]:
# prompt2 = ChatPromptTemplate.from_messages(
#     [
#         (
#             "system",
#             "You are an expert extraction algorithm. "
#             "Only extract relevant information from the text. "
#             "If you do not know the value of an attribute asked to extract, "
#             "return null for the attribute's value.",
#         ),
#         # Please see the how-to about improving performance with
#         # reference examples.
#         # MessagesPlaceholder('examples'),
#         ("human", "{text}"),
#     ]
# )

In [None]:
# runnable = prompt2 | model.with_structured_output(schema=Data)

In [None]:
# runnable.invoke({"text": pages[0].page_content})