In [23]:
import json
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
import os
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
import openai
from pathlib import Path
import base64
import dotenv
import pydantic
from pydantic import BaseModel, Field, create_model, TypeAdapter
from openai.types.chat.chat_completion_content_part_param import File



In [2]:
from langchain_anthropic import ChatAnthropic


In [4]:
# Load API keys (and optional overrides such as RECEIPT_PDF_PATH) from a local .env file.
dotenv.load_dotenv()

# Receipt to analyse. The default is the original local file; set the
# RECEIPT_PDF_PATH environment variable to run the notebook on another
# machine or document without editing this cell.
pdf_path = Path(
    os.getenv(
        "RECEIPT_PDF_PATH",
        r'C:\Users\sirius\Downloads\pdf_receipts\archive\2018\de\hotel\20180915_THE MADISON HAMBURG.pdf',
    )
)

# OpenAI chat model; the key is read from the environment, never hard-coded.
llm = ChatOpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    model='gpt-5-mini'
)

In [5]:
# Anthropic chat model used by the cells that send the PDF; the key comes from the environment.
llm2 = ChatAnthropic(
    model_name="claude-3-5-sonnet-20240620",
    api_key=os.getenv("ANTHROPIC_API_KEY"),
)

In [10]:
class State:
    """Typed container for the artefacts of the PDF-extraction workflow.

    Only annotations are declared: no attribute has a default value and the
    class defines no ``__init__``, so a fresh instance carries none of these
    attributes until they are assigned.

    NOTE(review): ``dataclass`` is imported by the notebook but not applied
    here — confirm whether ``@dataclass`` was intended.
    """
    pdf: Path  # presumably the input PDF file — TODO confirm
    json_schema: Dict[str, Any]  # presumably the JSON Schema generated for the PDF — TODO confirm
    end_schema: pydantic.BaseModel  # presumably the Pydantic model derived from the schema — TODO confirm
    output_md: str  # presumably the rendered Markdown report — TODO confirm

# Read the PDF once; the handle is only needed for the raw bytes.
with open(pdf_path, "rb") as pdf_handle:
    pdf_bytes = pdf_handle.read()

# Base64 text form of the PDF, as embedded into the chat messages below.
file_base64 = base64.b64encode(pdf_bytes).decode("utf-8")
file_name = pdf_path.name


In [None]:
# Single user message with two content parts: the instruction text asking for
# a JSON Schema (with a per-field "confidence_level"), and the PDF itself.
message = HumanMessage(content=[
    {"type": "text", "text": '''
    Generate a JSON Schema that describes the structured data extracted from the PDF.

    Requirements:
    - The schema must be valid for Pydantic parsing.
    - Use proper Pydantic-compatible types.
    - Provide a clear `"description"` for every field.
    - No fields are required (`required` is empty or omitted).
    - Allow `additionalProperties: true`.
    - Keep the schema as simple and human-readable as possible — merge closely related information into a single field when appropriate.
    - The language of all field names and descriptions should match the language used in the PDF.
    - **For every extracted item (field), include an additional property named `"confidence_level"`**, which is a float between 0 and 1 indicating how certain you are about the correctness of the extracted value (1 = very certain, 0 = uncertain).
    - The `"confidence_level"` property must appear alongside the main value in the same object (not globally).
    - Output only the JSON Schema — no explanations, no extra text.

    Example structural pattern for reference (do not output this example literally):
    {
    "type": "object",
    "properties": {
        "invoice_number": {
        "type": "object",
        "properties": {
            "value": {"type": "string", "description": "Invoice number text"},
            "confidence_level": {"type": "number", "description": "Confidence from 0 to 1"}
        }
        },
        ...
    },
    "additionalProperties": true
    }'''
},
    # The PDF, passed inline as a base64-encoded document content part.
    {
        "type": "document",
            "source": {
                "type": "base64",
                "media_type": "application/pdf",
                "data": file_base64,
            }
    }
    ])
# Send the message to the Anthropic model. The variable name `reponse` (sic)
# is read by later cells, so it must keep this spelling.
reponse = llm2.invoke([message])

In [73]:
# Show the raw model reply (the prompt asked for the JSON Schema text only).
print(reponse.content)

{
  "type": "object",
  "properties": {
    "rechnungsnummer": {
      "type": "object",
      "properties": {
        "value": {"type": "string", "description": "Rechnungsnummer"},
        "confidence_level": {"type": "number", "description": "Konfidenz von 0 bis 1"}
      }
    },
    "rechnungsdatum": {
      "type": "object",
      "properties": {
        "value": {"type": "string", "format": "date", "description": "Datum der Rechnung"},
        "confidence_level": {"type": "number", "description": "Konfidenz von 0 bis 1"}
      }
    },
    "hotel": {
      "type": "object",
      "properties": {
        "value": {"type": "string", "description": "Name und Adresse des Hotels"},
        "confidence_level": {"type": "number", "description": "Konfidenz von 0 bis 1"}
      }
    },
    "kunde": {
      "type": "object",
      "properties": {
        "value": {"type": "string", "description": "Name und Adresse des Kunden"},
        "confidence_level": {"type": "number", "description": 

In [None]:
# JSON Schema fragment, kept as raw text, describing a "confidence_level" property.
# NOTE(review): no other visible cell reads this variable — confirm it is still needed.
confidence_level = '"confidence_level":{"type":"number","description":"The confidence level of how certain you are about the accuracy of this transcription from 0 to 1"}'

In [74]:
import json

# The prompt asks for bare JSON, but chat models often wrap their answer in a
# Markdown code fence; strip one if present so json.loads does not fail on it.
schema_text = reponse.content.strip()
if schema_text.startswith("```json"):
    schema_text = schema_text[7:]
elif schema_text.startswith("```"):
    schema_text = schema_text[3:]
if schema_text.endswith("```"):
    schema_text = schema_text[:-3]

json_schema = json.loads(schema_text.strip())
type(json_schema)

dict

In [75]:
# Persist the schema so the code generator can read it from disk.
schema_path = Path("schema.json")
schema_json = json.dumps(json_schema, indent=2)
schema_path.write_text(schema_json, encoding="utf-8")

from datamodel_code_generator import InputFileType, generate

# Generate Pydantic model source code from the JSON Schema file.
output_path = Path("User_models.py")
generate(input_=schema_path, input_file_type=InputFileType.JsonSchema, output=output_path)

In [78]:
import importlib

# User_models.py is (re)generated while the kernel is running. Refresh the
# import system's directory caches so a freshly written file is found.
importlib.invalidate_caches()
import User_models

# A plain `import` returns the copy cached in sys.modules; reload so a
# regenerated User_models.py is picked up without restarting the kernel.
User_models = importlib.reload(User_models)

Schema = User_models.Model
# include_raw=True makes invoke() return a dict that also carries the raw message.
llm_structured = llm2.with_structured_output(Schema, include_raw=True, strict=True)

response = llm_structured.invoke([message])

In [79]:
raw_msg = response['raw']
parsed_msg = response['parsed']

# With include_raw=True a failed parse yields parsed=None instead of raising;
# fail here with the recorded reason rather than an opaque AttributeError below.
if parsed_msg is None:
    raise ValueError(
        f"Structured output could not be parsed: {response.get('parsing_error')}"
    )

print(parsed_msg.model_dump())

{'rechnungsnummer': {'value': '474081', 'confidence_level': 0.95}, 'rechnungsdatum': {'value': datetime.date(2018, 9, 14), 'confidence_level': 0.95}, 'hotel': {'value': 'MADISON Hotel GmbH, Schaarsteinweg 4, 20459 Hamburg, Germany', 'confidence_level': 0.95}, 'kunde': {'value': 'APImeister Consulting GmbH, Friedrichstr. 123, 10117 Berlin', 'confidence_level': 0.95}, 'zimmernummer': {'value': '539', 'confidence_level': 0.95}, 'anreisedatum': {'value': datetime.date(2018, 9, 9), 'confidence_level': 0.95}, 'abreisedatum': {'value': datetime.date(2018, 9, 14), 'confidence_level': 0.95}, 'gastname': {'value': 'Herr Jens Walter', 'confidence_level': 0.95}, 'uebernachtungen': {'value': [{'datum': datetime.date(2018, 9, 9), 'preis': 110.0}, {'datum': datetime.date(2018, 9, 10), 'preis': 110.0}, {'datum': datetime.date(2018, 9, 11), 'preis': 110.0}, {'datum': datetime.date(2018, 9, 12), 'preis': 110.0}, {'datum': datetime.date(2018, 9, 13), 'preis': 110.0}], 'confidence_level': 0.95}, 'gesamtbe

In [89]:
# Plain-dict form of the parsed model, used as input for the Markdown rendering.
data = parsed_msg.model_dump()
# Manual override: 0.5 is below the 0.8 threshold used by the rendering code,
# presumably to try out the low-confidence highlighting — TODO confirm.
data['rechnungsnummer']['confidence_level'] = 0.5


In [90]:
# Second manual override below the 0.8 rendering threshold (presumably for the
# same highlighting check — TODO confirm). Mutates `data` in place.
data['zahlungsmethode']['confidence_level'] = 0.5


In [91]:

def render_value(val, conf):
    """Format one extracted value as Markdown, in red when confidence is low.

    Args:
        val: The value to render. A list becomes one ``  - `` bullet per
            element (dict elements are flattened to ``key: value`` pairs
            joined by ``", "``); a dict becomes one ``    - **key**: value``
            line per entry; anything else is rendered via ``str``.
        conf: Confidence of the value. Text is wrapped in a red ``<span>``
            when ``conf < 0.8``.

    Returns:
        The rendered text; multiple lines are joined with ``"\\n"``.
    """

    def highlight(text):
        # The threshold is checked per call, so an empty list or dict never
        # touches `conf` at all.
        if conf < 0.8:
            return f"<span style='color:red'>{text}</span>"
        return str(text)

    if isinstance(val, list):
        bullets = []
        for element in val:
            if isinstance(element, dict):
                pairs = ", ".join(f"{name}: {entry}" for name, entry in element.items())
                bullets.append("  - " + highlight(pairs))
            else:
                bullets.append(f"  - {highlight(element)}")
        return "\n".join(bullets)

    if isinstance(val, dict):
        return "\n".join(
            f"    - **{name}**: {highlight(entry)}" for name, entry in val.items()
        )

    return highlight(val)


def dict_to_markdown(data):
    """Render extracted fields as a Markdown report, flagging low confidence in red.

    Args:
        data: Mapping of field name to ``{"value": ..., "confidence_level": ...}``.
            An entry that is not a dict (for example ``None`` for an optional
            field the model did not fill in) is rendered as a bare value.
            A missing or ``None`` confidence is treated as ``1.0``.

    Returns:
        The Markdown document as one string: a title line followed by a
        ``###`` heading and a ``**Value:**`` entry per field. Heading and
        value are wrapped in a red ``<span>`` when confidence is below 0.8.
    """
    lines = ["# 📄 PDF Structured Data\n"]
    for key, info in data.items():
        if isinstance(info, dict):
            value = info.get("value")
            conf = info.get("confidence_level")
        else:
            # Every schema field is optional, so a dumped model can contain
            # None (or a bare value) instead of a {"value", "confidence_level"} dict.
            value, conf = info, None

        # A key that is present but None must not reach the `<` comparison.
        if conf is None:
            conf = 1.0

        key_display = (
            f"<span style='color:red'>{key}</span>" if conf < 0.8 else key
        )

        lines.append(f"### {key_display}")
        lines.append(f"- **Value:**\n{render_value(value, conf)}\n")

    return "\n".join(lines)


# Render the extraction result and save the report next to the notebook.
markdown_output = dict_to_markdown(data)
with open("output.md", mode="w", encoding="utf-8") as md_file:
    md_file.write(markdown_output)

In [40]:
from typing import Any, Dict, Type

def json_schema_to_pydantic(schema: Dict[str, Any]) -> Type[BaseModel]:
    """
    Convert a JSON schema dict to a Pydantic model class dynamically.

    Only the top-level ``properties`` are inspected: each property's JSON
    Schema ``type`` is translated to a Python type through a lookup table.
    Nested object/array structure is not expanded.

    Args:
        schema: JSON Schema as a dict. ``title`` names the model (default
            ``"DynamicModel"``); ``required`` lists the mandatory properties.

    Returns:
        A new ``BaseModel`` subclass with one field per property. Required
        properties have no default; all others are ``Optional`` with a
        default of ``None``. Unknown JSON types map to ``Any``.
    """
    # Use a notebook-level `type_mapping` when one exists. None is defined in
    # the visible cells, so fall back to the JSON Schema primitives instead of
    # failing with NameError on a fresh kernel.
    try:
        mapping = type_mapping
    except NameError:
        mapping = {
            "string": str,
            "integer": int,
            "number": float,
            "boolean": bool,
            "array": list,
            "object": dict,
        }

    fields = {}
    required = set(schema.get("required", []))
    props = schema.get("properties", {})

    for field_name, field_info in props.items():
        json_type = field_info.get("type", "string")
        if isinstance(json_type, list):
            # JSON Schema allows unions such as ["string", "null"]; a list is
            # unhashable as a dict key, so pick the first non-null member.
            json_type = next((t for t in json_type if t != "null"), "string")
        field_type = mapping.get(json_type, Any)

        if field_name in required:
            fields[field_name] = (field_type, ...)
        else:
            # Optional[...] so that an explicit null validates, matching the
            # `Optional[...] = None` fields emitted by generate_pydantic_code.
            fields[field_name] = (Optional[field_type], None)

    model = create_model(schema.get("title", "DynamicModel"), **fields)
    return model


In [43]:
# Build a Pydantic model class at runtime from the `json_schema` dict.
UserModel = json_schema_to_pydantic(json_schema)

In [None]:
def generate_pydantic_code(schema: Dict[str, Any]) -> str:
    """Render Python source code for a Pydantic model described by a JSON Schema.

    Only the top-level ``properties`` are inspected; nested structure is not
    expanded.

    Args:
        schema: JSON Schema as a dict. ``title`` becomes the class name
            (default ``"DynamicModel"``); ``required`` lists the mandatory
            properties.

    Returns:
        Source text with the imports and one class. Required properties are
        emitted as ``name: type``; all others as
        ``name: Optional[type] = None``. Unknown JSON types become ``Any``.
        A schema without properties yields a class whose body is ``pass``.
    """
    # Use a notebook-level `type_mapping` when one exists. None is defined in
    # the visible cells, so fall back to the JSON Schema primitives instead of
    # failing with NameError on a fresh kernel.
    try:
        mapping = type_mapping
    except NameError:
        mapping = {
            "string": "str",
            "integer": "int",
            "number": "float",
            "boolean": "bool",
            "array": "list",
            "object": "dict",
        }

    class_name = schema.get("title", "DynamicModel")
    props = schema.get("properties", {})
    required = set(schema.get("required", []))

    lines = [
        "from pydantic import BaseModel\n",
        # `Any` is the fallback annotation below, so the generated module must import it.
        "from typing import Any, Optional\n\n",
        f"class {class_name}(BaseModel):"
    ]

    for name, info in props.items():
        json_type = info.get("type", "string")
        if isinstance(json_type, list):
            # JSON Schema allows unions such as ["string", "null"]; a list is
            # unhashable as a dict key, so pick the first non-null member.
            json_type = next((t for t in json_type if t != "null"), "string")
        py_type = mapping.get(json_type, "Any")
        # Accept a mapping to type objects as well as to type names, so a
        # table of classes does not leak "<class 'str'>" into the source.
        type_name = py_type.__name__ if isinstance(py_type, type) else str(py_type)

        if name in required:
            lines.append(f"    {name}: {type_name}")
        else:
            lines.append(f"    {name}: Optional[{type_name}] = None")

    if not props:
        # A class statement needs a body to be valid Python.
        lines.append("    pass")

    return "\n".join(lines)

# Generate the model source from the parsed schema.
code = generate_pydantic_code(json_schema)

# Save it as an importable module next to the notebook.
output_file = "user_model.py"
with open(output_file, mode="w", encoding="utf-8") as model_file:
    model_file.write(code)

In [7]:
def generate_json_schema(file_base64) -> Dict[str, Any]:
    """Ask the Anthropic model (``llm2``) for a JSON Schema describing a PDF.

    Args:
        file_base64: The PDF content as base64 text.

    Returns:
        The schema parsed into a dict.

    Raises:
        json.JSONDecodeError: If the reply, after removing an optional
            Markdown code fence, is not valid JSON.
    """
    message = HumanMessage(content=[
        {"type": "text", "text": '''generate a json schema for the pdf. 
            The schema should be able to be parsed by pydantic. 
            no required fields needed, allow additionalProperties. 
            The schema should be as simple as possible, combine the related information in one field. 
            The output language should match the language of the pdf. The schema should be as simple as possible. 
            Output only the json schema, nothing else.'''},
        # Send the PDF as a document part, the same form the notebook's
        # top-level request to `llm2` uses, rather than as an "image_url"
        # carrying an application/pdf data URL.
        {
            "type": "document",
            "source": {
                "type": "base64",
                "media_type": "application/pdf",
                "data": file_base64,
            },
        },
    ])

    response = llm2.invoke([message])

    # LangChain chat models return a message object with `.content`; the
    # OpenAI-SDK shape `response.choices[0].message` does not exist on it.
    content = response.content
    if isinstance(content, list):
        # Content may arrive as a list of parts; keep only the text.
        content = "".join(
            part.get("text", "") if isinstance(part, dict) else str(part)
            for part in content
        )
    json_schema_text = content.strip()

    # Clean up response (remove markdown code blocks if present)
    if json_schema_text.startswith("```json"):
        json_schema_text = json_schema_text[7:]
    if json_schema_text.startswith("```"):
        json_schema_text = json_schema_text[3:]
    if json_schema_text.endswith("```"):
        json_schema_text = json_schema_text[:-3]

    json_schema = json.loads(json_schema_text.strip())
    return json_schema
