## Approach 1: inject the generated JSON schema into the prompt

In [None]:
from typing import List
from pydantic import BaseModel, Field
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain

# One row of the ownership ledger produced by the extraction.
# Every field is required; the descriptions are part of the generated JSON schema.
# NOTE: documented with comments rather than a class docstring, because pydantic
# copies a class docstring into the schema that is rendered into the prompt.
class LedgerEntry(BaseModel):
    LedgerEntryID: int = Field(
        ...,
        description="Sequential ledger entry ID starting from 1.",
    )
    DeedID: str = Field(
        ...,
        description="Deed identifier (e.g., 'DEED001').",
    )
    OwnerName: str = Field(
        ...,
        description="Full uppercase name of the owner.",
    )
    # Signed share delta: negative for the party giving up the share,
    # positive for each party receiving part of it.
    ChangeInSharePercentage: float = Field(
        ...,
        description=(
            "Change in share percentage. "
            "Use -1.00 for the transferor and a positive fraction for each recipient "
            "so that the sum for recipients equals 1.00."
        ),
    )
    # Kept as a plain string; the ISO format is requested via the description only.
    EffectiveDate: str = Field(
        ...,
        description="Effective date in ISO format (YYYY-MM-DD).",
    )
    DocRef: str = Field(
        ...,
        description="Short document reference (e.g., 'R.1').",
    )

# Top-level container for the extraction output: a single required key,
# "entries", holding the list of LedgerEntry rows.
class ExtractionResult(BaseModel):
    entries: List[LedgerEntry] = Field(
        ...,
        description="List of ledger entries extracted from the document.",
    )

# Instantiate a chat model that supports JSON mode.
# The base "gpt-4" model rejects `response_format={"type": "json_object"}`,
# which is what `method="json_mode"` sends, so a JSON-mode-capable model
# from the same family is used instead.
# NOTE(review): `with_structured_output` is provided by the `langchain_openai`
# ChatOpenAI class; confirm the ChatOpenAI imported above exposes it.
llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)

# Bind the structured output to the LLM using the Pydantic schema.
# The returned object parses the model's JSON reply into an ExtractionResult.
structured_llm = llm.with_structured_output(ExtractionResult, method="json_mode")

# Render the JSON schema straight from the Pydantic model so the prompt carries
# the model's own definition instead of a hand-written example output.
schema_json = ExtractionResult.schema_json(indent=2)

# Raw prompt text. It has two placeholders that are filled at call time:
# {schema} (the JSON schema rendered above) and {text} (the document to process).
_SCHEMA_PROMPT_TEXT = """
You are an expert extraction algorithm specialized in legal document amendments.
Extract all ownership transfer events from the text below.
For each event, output a ledger entry that conforms exactly to the following JSON schema:
{schema}

Do not include any extra text or commentary.
Text:
{text}
    """

# Wrap the raw text in a chat prompt template.
prompt_template = ChatPromptTemplate.from_template(_SCHEMA_PROMPT_TEXT)

# Build the chain by piping the prompt into the structured LLM.
# `with_structured_output` returns a runnable that yields a parsed Pydantic
# object, so it cannot be driven through LLMChain / `.run()`, which only handle
# text generations; composing with `|` keeps the parsed object intact.
chain = prompt_template | structured_llm

# Example document text (replace with your actual input).
input_text = (
    "R.01/97.956 Data: 15/maio/1989 Pelo formal de partilha passado em 09 de fevereiro de 1982, "
    "pelo Cartório do 6.º Ofício ou 2. de Direito da 8.a Vara, ambos da Família e Sucessões desta Capital, "
    "o Espólio de SIMON KUCZYNSKI -(falecido em 20 de abril de 1973, no estado civil de casado pelo regime da comunhão de bens com Elza Nazareth Kuczynski) "
    "transmitiu por partilha a 1) ELZA NAZARETH KUCZYNSKI, brasileira, viúva, do lar, RG. 584.724, CIC. n.º 000.814.868-63, domiciliada nesta Capital, à Rua Dr. Numa Pereira do Vale, 272; "
    "2) RUTH NAZARETH KUCZYNSKI, brasileira, solteira, maior, estudante, RG. n.º 5.333.362, CIC. sob o n.º 033.850.628-45, domiciliada nesta Capital, à Rua Numa Pereira do Vale, 272; "
    "3) PAULO NAZARETH KUCZYNSKI, brasileiro, solteiro, maior, proprietário, RG. 3.951.101, C.I.C.M. F. sob n.º 531.813.108-59, domiciliado nesta Capital, à Rua Numa Pereira do Vale, 272; "
    "e, 4) SERGIO NAZARETH KUCZYNSKI, brasileiro, solteiro, maior, estudante, CIC. 034.354.838-06, domiciliado nesta Capital, à Rua Numa Pereira do Vale, 272, "
    "o imóvel, pelo valor de Cr$51.656,00, na proporção de 1/2 à viúva; e, 1/6 a cada um dos demais herdeiros.-"
)

# Invoke the chain with the input text.
# The input dict must supply every placeholder used by the prompt template.
result = chain.invoke({"text": input_text, "schema": schema_json})

# The structured LLM parses the reply into an ExtractionResult Pydantic object.
print(result.json(indent=2))


## Approach 2: describe the schema in the prompt by hand

In [None]:
from typing import List
from pydantic import BaseModel, Field
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain

# One row of the ownership ledger produced by the extraction.
# Every field is required; `...` spells that out instead of relying on the
# absence of a default.
class LedgerEntry(BaseModel):
    LedgerEntryID: int = Field(
        ...,
        description="Sequential ledger entry ID starting from 1.",
    )
    DeedID: str = Field(
        ...,
        description="Deed identifier, e.g., 'DEED001'.",
    )
    OwnerName: str = Field(
        ...,
        description="Full uppercase name of the owner.",
    )
    # Signed share delta: negative for the party giving up the share,
    # positive for each party receiving part of it.
    ChangeInSharePercentage: float = Field(
        ...,
        description=(
            "Change in share percentage. "
            "For the transferor, use -1.00, and for recipients, provide the positive "
            "fraction of shares received such that the sum equals 1.00."
        ),
    )
    # Kept as a plain string; the ISO format is requested via the description only.
    EffectiveDate: str = Field(
        ...,
        description="Effective date in ISO format (YYYY-MM-DD).",
    )
    DocRef: str = Field(
        ...,
        description="Short document reference, e.g., 'R.1'.",
    )

# Top-level container for the extraction output: a single required key,
# "entries", holding the list of LedgerEntry rows.
class ExtractionResult(BaseModel):
    entries: List[LedgerEntry] = Field(
        ...,
        description="List of ledger entries extracted from the document.",
    )

# Instantiate a chat model that supports JSON mode.
# The base "gpt-4" model rejects `response_format={"type": "json_object"}`,
# which is what `method="json_mode"` sends, so a JSON-mode-capable model
# from the same family is used instead.
# NOTE(review): `with_structured_output` is provided by the `langchain_openai`
# ChatOpenAI class; confirm the ChatOpenAI imported above exposes it.
llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)

# Bind the structured output using the Pydantic schema.
# In JSON mode the model is asked for a valid JSON object, and the reply is
# parsed into an ExtractionResult; the prompt itself must describe the keys.
structured_llm = llm.with_structured_output(ExtractionResult, method="json_mode")

# Raw prompt text. The expected JSON layout is spelled out key by key rather
# than shown as an example output; {text} is the only placeholder and is filled
# with the document to process at call time.
_FIELD_LIST_PROMPT_TEXT = """
You are an expert extraction algorithm specializing in legal document amendments.
Extract all ownership transfer events from the text as a JSON object that strictly conforms to the following schema:

- The JSON object must have one key "entries", which is a list.
- Each list item must be an object with the following keys:
    - LedgerEntryID: A sequential integer starting from 1.
    - DeedID: For this document, use "DEED001".
    - OwnerName: The full uppercase name of the owner.
    - ChangeInSharePercentage: For the transferor, use -1.00. For each recipient, provide a positive fraction such that all recipient percentages sum to 1.00.
    - EffectiveDate: The effective date extracted from the text in ISO format (YYYY-MM-DD).
    - DocRef: A short document reference, e.g., "R.1".

Do not include any additional text or formatting. Respond with only the JSON.

Text:
{text}
    """

# Wrap the raw text in a chat prompt template.
prompt_template = ChatPromptTemplate.from_template(_FIELD_LIST_PROMPT_TEXT)

# Build the extraction chain by piping the prompt into the structured LLM.
# `with_structured_output` returns a runnable that yields a parsed Pydantic
# object, so it cannot be driven through LLMChain / `.run()`, which only handle
# text generations; composing with `|` keeps the parsed object intact.
chain = prompt_template | structured_llm

# Example input text (replace with your actual document text)
input_text = (
    "R.01/97.956 Data: 15/maio/1989 Pelo formal de partilha passado em 09 de fevereiro de 1982, "
    "pelo Cartório do 6.º Ofício ou 2. de Direito da 8.a Vara, ambos da Família e Sucessões desta Capital, "
    "o Espólio de SIMON KUCZYNSKI -(falecido em 20 de abril de 1973, no estado civil de casado pelo regime da comunhão de bens com Elza Nazareth Kuczynski) "
    "transmitiu por partilha a 1) ELZA NAZARETH KUCZYNSKI, brasileira, viúva, do lar, RG. 584.724, CIC. n.º 000.814.868-63, domiciliada nesta Capital, à Rua Dr. Numa Pereira do Vale, 272; "
    "2) RUTH NAZARETH KUCZYNSKI, brasileira, solteira, maior, estudante, RG. n.º 5.333.362, CIC. sob o n.º 033.850.628-45, domiciliada nesta Capital, à Rua Numa Pereira do Vale, 272; "
    "3) PAULO NAZARETH KUCZYNSKI, brasileiro, solteiro, maior, proprietário, RG. 3.951.101, C.I.C.M. F. sob n.º 531.813.108-59, domiciliado nesta Capital, à Rua Numa Pereira do Vale, 272; "
    "e, 4) SERGIO NAZARETH KUCZYNSKI, brasileiro, solteiro, maior, estudante, CIC. 034.354.838-06, domiciliado nesta Capital, à Rua Numa Pereira do Vale, 272, "
    "o imóvel, pelo valor de Cr$51.656,00, na proporção de 1/2 à viúva; e, 1/6 a cada um dos demais herdeiros.-"
)

# Invoke the chain; the input dict supplies the prompt's only placeholder.
result = chain.invoke({"text": input_text})

# Print the structured extraction result (an ExtractionResult instance).
print(result)
