# Speak with your documents. Extract structured data

- https://python.langchain.com/docs/tutorials/extraction/

- https://python.langchain.com/docs/how_to/structured_output/

- https://python.langchain.com/docs/how_to/output_parser_structured/

- https://api.python.langchain.com/en/latest/core/output_parsers/langchain_core.output_parsers.pydantic.PydanticOutputParser.html

In [36]:
# ! python -m pip install -r requirements.txt

In [1]:
import os
from dotenv import dotenv_values

In [2]:
# Read settings from the local .env file; later cells look up the "PROJECT",
# "REGION" and "GEMINI-API-KEY" entries from this mapping.
config = dotenv_values("./keys/.env")

In [3]:
from dotenv import dotenv_values
import json
import vertexai
from google.oauth2 import service_account
from vertexai.generative_models import GenerationConfig, GenerativeModel, Image, Part
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

from langchain_community.document_loaders import PyPDFLoader
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List, Optional
import pprint

In [4]:
import getpass
import os

# Set the LangChain tracing flag and API-key environment variables. The key is
# prompted for interactively so it is not stored in the notebook source.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass()

 ········


In [5]:
import pydantic

In [6]:
pydantic.__version__

'2.9.2'

In [7]:

# Load the service-account key file as JSON.
# NOTE(review): the key filename is hard-coded here — consider moving it to the
# .env config alongside PROJECT/REGION.
with open("./keys/complete-tube-421007-208a4862c992.json") as source:
    info = json.load(source)

# Build credentials from the parsed key and initialise Vertex AI with the
# project and region taken from the .env config.
vertex_credentials = service_account.Credentials.from_service_account_info(info)
vertexai.init(
    project=config["PROJECT"],
    location=config["REGION"],
    credentials=vertex_credentials,
)
# The .env entry is spelled with hyphens ("GEMINI-API-KEY") while the
# environment variable uses underscores ("GEMINI_API_KEY").
# NOTE(review): confirm which environment variable the Gemini client actually
# reads for its API key.
google_api_key = config["GEMINI-API-KEY"]
os.environ["GEMINI_API_KEY"] = google_api_key

In [41]:
# Load the source PDF. Later cells read the text of the first entry via
# pages[0].page_content.
loader = PyPDFLoader("data/PArser Source 1.pdf")
pages = loader.load()

In [52]:
pages[0]

Document(metadata={'source': 'data/PArser Source 1.pdf', 'page': 0}, page_content='Entity Code Entity Name (required) Legal Company Type Type Status Registration Number / Tax ID Incorporation Date Country Region / State Dissolved Date Historical? Registered Office Address Main Address Line 1\nText (12) Text (160) Text (60) Text (30) Text (30) yyyy-mm-dd yyyy-mm-dd TRUE or FALSE Text (60) Text (60)\nIRC Holdings Audit Committee Active 123456789 2020-08-08 RU Moscow Oblast FALSE 15 Red Square, Moscow, 101000, Russia 10 Arbat Street, Moscow, 119019, Russia\nABC Company BCA Company Prior 65545646 2018-07-31 MX Jalisco 2022-07-20 TRUE 789 Avenida Revolución, Mexico City, CDMX 03840, Mexico 456 Calle Insurgentes, Guadalajara, Jalisco 44100, Mexico\nYuga Studios Committee Active 54646111 2017-04-09 RU Saint Petersburg FALSE 23 Nevsky Prospekt, Saint Petersburg, 191186, Russia 78 Bolshaya Morskaya Street, Saint Petersburg, 190000, Russia\nPartner Markets Committee Active 2022-08-13 US Californ

In [9]:

# To help construct our Chat Messages
from langchain.schema import HumanMessage
from langchain.prompts import PromptTemplate, ChatPromptTemplate, HumanMessagePromptTemplate

# To parse outputs and get structured data back
from langchain.output_parsers import StructuredOutputParser, ResponseSchema

In [10]:
len(pages)

1

In [25]:
from typing import List, Optional

from pydantic import BaseModel, Field

# Schema for one company row of the registry report.
# Documented with `#` comments on purpose: a class docstring would be emitted
# as the schema's top-level description and so change the prompt built from it.
class EntityDataExtraction(BaseModel):
    # Every field is optional and defaults to None, so a value that is absent
    # from the document does not fail validation.
    entity_name: Optional[str] = Field(
        default=None,
        description="Name of Entity",
    )
    legal_company_type: Optional[str] = Field(
        default=None,
        description="Legal Company Type",
    )
    status: Optional[str] = Field(
        default=None,
        description="Status entity",
    )
    # NOTE(review): typed as int, so an ID with leading zeros or letters would
    # not round-trip — confirm whether str is the intended type.
    registration_number_tax_id: Optional[int] = Field(
        default=None,
        description="Registration Number or Tax ID. This could be empty",
    )
    incorporation_date: Optional[str] = Field(
        default=None,
        description="The incorporation Date of the time period in ISO format.",
    )
    country: Optional[str] = Field(
        default=None,
        description="Country",
    )
    region_state: Optional[str] = Field(
        default=None,
        description="Region or State",
    )
    dissolved_date: Optional[str] = Field(
        default=None,
        description="The Dissolved Date of the time period in ISO format. This could be empty",
    )
    # Kept as the literal text "TRUE"/"FALSE" rather than a bool.
    historical: Optional[str] = Field(
        default=None,
        description="it is Historical TRUE or FALSE",
    )
    registered_office_address: Optional[str] = Field(
        default=None,
        description="The Registered Office Address of the company",
    )
    main_address_line: Optional[str] = Field(
        default=None,
        description="The Main Address Line of the company",
    )

class Data(BaseModel):
    """Extracted data about Companies."""

    # Wrapper model holding a list of entities so that several companies can be
    # extracted in one pass. In the visible notebook it is referenced only by
    # the commented-out `with_structured_output(schema=Data)` cell.
    companies: List[EntityDataExtraction]


In [26]:
# `.schema()` is deprecated in Pydantic v2 (this notebook runs 2.9.2);
# `model_json_schema()` is the supported call and returns the same JSON schema.
EntityDataExtraction.model_json_schema()

{'properties': {'entity_name': {'anyOf': [{'type': 'string'},
    {'type': 'null'}],
   'default': None,
   'description': 'Name of Entity',
   'title': 'Entity Name'},
  'legal_company_type': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
   'default': None,
   'description': 'Legal Company Type',
   'title': 'Legal Company Type'},
  'status': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
   'default': None,
   'description': 'Status entity',
   'title': 'Status'},
  'registration_number_tax_id': {'anyOf': [{'type': 'integer'},
    {'type': 'null'}],
   'default': None,
   'description': 'Registration Number or Tax ID. This could be empty',
   'title': 'Registration Number Tax Id'},
  'incorporation_date': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
   'default': None,
   'description': 'The incorporation Date of the time period in ISO format.',
   'title': 'Incorporation Date'},
  'country': {'anyOf': [{'type': 'string'}, {'type': 'null'}],
   'default': None,
   'descri

In [37]:

# Bundle describing the extraction task: a short description, the JSON schema
# of a single entity, and the instruction text.
# `model_json_schema()` replaces the Pydantic-v1-style `.schema()`, which is
# deprecated under the Pydantic 2.x version this notebook runs.
data = {
    "description": "Company registry Information.",
    "schema": EntityDataExtraction.model_json_schema(),
    "instruction": "Extract data according to the schema ",
}

In [38]:

# Parser that validates the model's reply against EntityDataExtraction and
# supplies the matching JSON-format instructions.
parser = PydanticOutputParser(pydantic_object=EntityDataExtraction)

# Single human turn: task sentence, then the format instructions, then the
# caller's question.
human_turn = HumanMessagePromptTemplate.from_template(
    "Answer the users question as best as possible about the name of the company Requested.\n{format_instructions}\n{question}"
)

# `format_instructions` is bound once here; only `question` is supplied per call.
prompt = ChatPromptTemplate(
    messages=[human_turn],
    input_variables=["question"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

In [53]:
parser.get_format_instructions()

'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"entity_name": {"anyOf": [{"type": "string"}, {"type": "null"}], "default": null, "description": "Name of Entity", "title": "Entity Name"}, "legal_company_type": {"anyOf": [{"type": "string"}, {"type": "null"}], "default": null, "description": "Legal Company Type", "title": "Legal Company Type"}, "status": {"anyOf": [{"type": "string"}, {"type": "null"}], "default": null, "description": "Status entity", "title": "Status"}, "registration_number_tax_id": {"anyOf": [{"type": "integer"}, {"type": "null"}], "default": null, 

In [54]:
# Gemini chat model, authenticated with the service-account credentials
# created earlier in the notebook.
model = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro-001",
    credentials=vertex_credentials,
)

In [55]:
# Ask for a single company by name, passing the first page's text as context.
document_query = (
    "Extract information of company Yuga Studios from this document report: "
    f"{pages[0].page_content}"
)

# Render the prompt, call the model, then validate the reply into the
# Pydantic model.
_input = prompt.format_prompt(question=document_query)
output = model.invoke(_input.to_messages())
parsed = parser.parse(output.content)

pprint.pprint(parsed)

EntityDataExtraction(entity_name='Yuga Studios', legal_company_type='Committee', status='Active', registration_number_tax_id=54646111, incorporation_date='2017-04-09', country='RU', region_state='Saint Petersburg', dissolved_date=None, historical='FALSE', registered_office_address='23 Nevsky Prospekt, Saint Petersburg, 191186, Russia', main_address_line='78 Bolshaya Morskaya Street, Saint Petersburg, 190000, Russia')


In [56]:
# `.json()` is deprecated in Pydantic v2; `model_dump(mode="json")` yields the
# same JSON-compatible dict without serialising to a string and parsing it back.
parsed.model_dump(mode="json")

{'entity_name': 'Yuga Studios',
 'legal_company_type': 'Committee',
 'status': 'Active',
 'registration_number_tax_id': 54646111,
 'incorporation_date': '2017-04-09',
 'country': 'RU',
 'region_state': 'Saint Petersburg',
 'dissolved_date': None,
 'historical': 'FALSE',
 'registered_office_address': '23 Nevsky Prospekt, Saint Petersburg, 191186, Russia',
 'main_address_line': '78 Bolshaya Morskaya Street, Saint Petersburg, 190000, Russia'}

In [49]:
output.content

'```json\n{"entity_name": "Yuga Studios", "legal_company_type": "Committee", "status": "Active", "registration_number_tax_id": 54646111, "incorporation_date": "2017-04-09", "country": "RU", "region_state": "Saint Petersburg", "dissolved_date": null, "historical": "FALSE", "registered_office_address": "23 Nevsky Prospekt, Saint Petersburg, 191186, Russia", "main_address_line": "78 Bolshaya Morskaya Street, Saint Petersburg, 190000, Russia"}\n```'

In [57]:
# Same extraction as before, this time for a different company name.
document_query = (
    "Extract information of company 'Youthcoin Ltd' from this document report: "
    f"{pages[0].page_content}"
)

# Render the prompt, call the model, then validate the reply into the
# Pydantic model.
_input = prompt.format_prompt(question=document_query)
output = model.invoke(_input.to_messages())
parsed = parser.parse(output.content)

pprint.pprint(parsed)

EntityDataExtraction(entity_name='Youthcoin Ltd.', legal_company_type='Trust', status='Dormant', registration_number_tax_id=None, incorporation_date='2013-02-16', country='UK', region_state='Greater London', dissolved_date='2022-06-14', historical='TRUE', registered_office_address='221B Baker Street, London NW1 6XE, United Kingdom', main_address_line='12 Downing Street, London SW1A 2AA, United Kingdom')


In [58]:
# `.json()` is deprecated in Pydantic v2; `model_dump(mode="json")` yields the
# same JSON-compatible dict without serialising to a string and parsing it back.
parsed.model_dump(mode="json")

{'entity_name': 'Youthcoin Ltd.',
 'legal_company_type': 'Trust',
 'status': 'Dormant',
 'registration_number_tax_id': None,
 'incorporation_date': '2013-02-16',
 'country': 'UK',
 'region_state': 'Greater London',
 'dissolved_date': '2022-06-14',
 'historical': 'TRUE',
 'registered_office_address': '221B Baker Street, London NW1 6XE, United Kingdom',
 'main_address_line': '12 Downing Street, London SW1A 2AA, United Kingdom'}

# Multiple Objects (Under Construction)

In [28]:
# prompt2 = ChatPromptTemplate.from_messages(
#     [
#         (
#             "system",
#             "You are an expert extraction algorithm. "
#             "Only extract relevant information from the text. "
#             "If you do not know the value of an attribute asked to extract, "
#             "return null for the attribute's value.",
#         ),
#         # Please see the how-to about improving performance with
#         # reference examples.
#         # MessagesPlaceholder('examples'),
#         ("human", "{text}"),
#     ]
# )

In [29]:
# runnable = prompt2 | model.with_structured_output(schema=Data)

In [27]:
# runnable.invoke({"text": pages[0].page_content})