In [1]:
import logging

from langchain_anthropic import ChatAnthropic

from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.documents import Document

from langchain_community.document_loaders import PyPDFLoader

from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_text_splitters import TokenTextSplitter


from sentence_transformers import SentenceTransformer

from typing import Optional

from tqdm import tqdm


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Configure the notebook-wide "LOGZ" logger: INFO and above are echoed to the
# console, and everything from DEBUG upwards is also written to app.log.
logger = logging.getLogger("LOGZ")
logger.setLevel(logging.DEBUG)  # Set level to capture all messages

# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise attach a second pair of handlers and emit every
# message twice. Detach and close whatever a previous run attached first.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Create handlers
console_handler = logging.StreamHandler()  # For console output
file_handler = logging.FileHandler("app.log")  # Save logs to file

# Set levels for handlers
console_handler.setLevel(logging.INFO)  # Console shows INFO and above
file_handler.setLevel(logging.DEBUG)  # File stores everything

# Define a log format shared by both handlers
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)
file_handler.setFormatter(formatter)

# Add handlers to logger
logger.addHandler(console_handler)
logger.addHandler(file_handler)


logger.info("-------------- New session --------------")

2025-02-24 19:48:40,878 - LOGZ - INFO - -------------- New session --------------


In [3]:
# Read the Anthropic API key from a text file located one directory above the
# working directory, so the secret is not written into the notebook itself.
with open("../api_key.txt") as f:
    # Strip surrounding whitespace: a key file saved from an editor usually
    # ends with a newline, which would otherwise be sent as part of the key.
    api_key = f.read().strip()

# Chat model used for the extraction.
llm = ChatAnthropic(model="claude-3-sonnet-20240229",
                    temperature=0,    # lowest sampling temperature
                    max_tokens=512,   # upper bound on the length of each answer
                    timeout=None,     # presumably: no client-side timeout - confirm
                    max_retries=2,
                    api_key=api_key)

In [65]:
# PDF to process. The commented-out assignments below are alternative input
# files; uncomment exactly one of them to switch documents.
file_path = "../Data/Personal Auto/Tesla Property & Casualty, Inc/TPC Rule Manual.pdf"
#file_path = "../Data/Personal Auto/Tesla Property & Casualty, Inc/TX RV 201401 Rules.pdf"

#file_path = "../Data/Personal Auto/test/TX PPA APCIC Rules 101518RB.pdf"

#file_path = "../Data/Personal Auto/Tesla Property & Casualty, Inc/TPC Rate Manual.pdf"

# Leftover alternative that would load a directory of .txt files instead of a
# single PDF. NOTE(review): DirectoryLoader and TextLoader are not imported in
# the visible import cell, so these lines cannot simply be uncommented.
#text_loader_kwargs = {"autodetect_encoding": True}
#loader = DirectoryLoader("corpus/", glob='**/**/*.txt', loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)

logger.info(f"Starting load and extraction of {file_path}")

loader = PyPDFLoader(file_path)

# `doc` is the list of documents returned by the loader (presumably one per PDF
# page, hence the name `nb_page` - confirm). The count is displayed as the
# cell output.
doc = loader.load()
nb_page = len(doc)
nb_page

2025-02-24 20:05:26,009 - LOGZ - INFO - Starting load and extraction of ../Data/Personal Auto/Tesla Property & Casualty, Inc/TPC Rule Manual.pdf


30

Claude Pro has a context window of 200k+ tokens (about 500 pages of text or 100 images), so every chunk sent to the model must stay below that limit. To leave a safety margin, we will aim for at most half of that number per chunk.

In [66]:
# Reuse the documents already loaded into `doc` instead of calling
# loader.load_and_split(): that call parses the PDF a second time and then runs
# a default text splitter whose overlapping chunks would be duplicated once the
# pieces are concatenated into a single string.
pages = doc

In [67]:
# Concatenate the content of every loaded page into one string, separating
# pages with a single space, then display the total character count.
text = " ".join(page.page_content for page in pages)
len(text)

18380

In [68]:
# Splitter used to cut the document into pieces small enough for the model.
# Each chunk holds at most `chunk_size` units and consecutive chunks share
# `chunk_overlap` units (presumably tokens, given the splitter type - confirm).
text_splitter = TokenTextSplitter(chunk_size=10000, chunk_overlap=20)

In [69]:
# Split the whole document text into chunks and display how many were produced.
# The result is copied into a fresh list so later cells can modify it without
# touching whatever object the splitter returned.
# (Alternative kept for reference: split page by page with
#  `for page in doc: text_processed += text_splitter.split_text(page.page_content)`.)
text_processed = list(text_splitter.split_text(text))
len(text_processed)

1

In [70]:
#text_processed = []
#
##for each documents
##for page in doc:
##    text_processed += text_splitter.split_text(page.page_content)
#
#text_processed += text_splitter.split_documents(doc)
#
#
#text_processed = [doc.page_content for doc in text_processed]
#text_processed

In [71]:
# Define a custom prompt to provide instructions and any additional context.
# 1) You can add examples into the prompt template to improve extraction quality
# 2) Introduce additional parameters to take context into account (e.g., include metadata
#    about the document from which the text was extracted.)
#
# The template has two messages: a fixed system instruction and a human message
# whose only variable, {text}, receives the chunk to analyse.
# NOTE(review): the system message asks for "null" when a value is unknown, and
# the recorded output of this notebook shows a field coming back as the string
# 'null' rather than an empty value - presumably a consequence of this wording
# combined with string-typed schema fields; confirm before relying on it.
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. "
            "Only extract relevant information from the text. "
            "If you do not know the value of an attribute asked to extract, "
            "return null for the attribute's value.",
        ),
        # Please see the how-to about improving performance with
        # reference examples.
        # MessagesPlaceholder('examples'),
        ("human", "{text}"),
    ]
)

#claude pro context window : 200k+ tokens (about 500 pages of text or 100 images)

In [72]:
# Keyword arguments apparently meant to be shared between field declarations.
# NOTE(review): nothing in the visible part of the notebook reads this dict.
all_args = dict(allow_inf_nan=True, default=0)

# default_factory -> callable when default is needed

# use nested class for organisation

#---------------------------------------------------------------------------
class InsuranceExtraction_Rule(BaseModel):
    '''Different informations extracted from a company insurance's rule & rate files.'''
    # This doc-string is sent to the LLM as the description of the schema,
    # and it can help to improve extraction results. The same goes for every
    # field `description` below: they are part of what the model reads, so
    # their wording is deliberately left untouched here.
    #
    # Every optional field declares its default explicitly. The two "comments"
    # fields previously relied on the implicit None that pydantic v1 gives to
    # Optional fields; with `from pydantic import BaseModel` (the migration the
    # import warning recommends) they would silently become required.

    # The only mandatory field.
    company_name: str = Field(
        description="The company that wrote this file."
    )
    min_premium: Optional[str] = Field(
        description="Minimum premium (The lowest amount the insurer will charge for coverage) in dollars, if there is multiple possible values, take the lowest.",
        default=""
    )
    min_premium_comments: Optional[str] = Field(
        description="Minimum premium commentaries or additionnal informations",
        default=None
    )
    policy_period: Optional[str] = Field(
        description="Policy period in month, if there is multiple possible values, take the lowest.",
        default=""
    )
    policy_period_comments: Optional[str] = Field(
        description="Policy period commentaries or additionnal informations.",
        default=None
    )
    territory_factor_BI: Optional[str] = Field(
        description="The company's insurance territory factor for BI (Bodily Injury) insurance.",
        default=""
    )

#---------------------------------------------------------------------------



In [73]:
# Wrap the chat model so that its answers are returned as
# InsuranceExtraction_Rule objects. With include_raw=False only the parsed
# object is returned, not the underlying raw model message.
structured_llm_rule = llm.with_structured_output(
    schema=InsuranceExtraction_Rule,
    include_raw=False,
)


In [74]:
# Extraction chain: the prompt template is filled with {text}, and the
# resulting messages are passed to the structured-output model.
extractor = prompt_template | structured_llm_rule

In [75]:
# Run the extraction chain over every chunk, at most 5 requests at a time.
# we could possibly chunk 'text' here to stay in the context window
total = len(text_processed)

logger.info(f"Starting extraction on {total} chunks.")

# Start from an empty result: if the batch call fails, the cells below must not
# hit an undefined name or silently display results left over from a previous
# run of this cell.
extractions = []
try:
    extractions = extractor.batch(
        [{"text": chunk} for chunk in text_processed],
        {"max_concurrency": 5},
    )
except Exception as e:
    # Deliberately best-effort: log the full traceback and carry on with the
    # empty result instead of stopping the notebook.
    logger.error(f"An error occurred: {e}", exc_info=True)

2025-02-24 20:05:26,825 - LOGZ - INFO - Starting extraction on 1 chunks.


In [76]:
# Display the raw extraction results returned by the batch call
# (presumably one entry per input chunk - confirm).
extractions

[InsuranceExtraction_Rule(company_name='Tesla Property &amp; Casualty, Inc.', min_premium='100', min_premium_comments='The minimum premium for a six-month policy is $100, which cannot be reduced except in the event of a cancellation.', policy_period='6', policy_period_comments='Personal Auto Policies may be written for policy periods for 6 months or less.', territory_factor_BI='null')]

In [82]:
# Convert every extraction to a plain dictionary through the model's public
# export method rather than reaching into the instance's private __dict__,
# which exposes the internal storage itself instead of a copy.
[extraction.dict() for extraction in extractions]

[{'company_name': 'Tesla Property &amp; Casualty, Inc.',
  'min_premium': '100',
  'min_premium_comments': 'The minimum premium for a six-month policy is $100, which cannot be reduced except in the event of a cancellation.',
  'policy_period': '6',
  'policy_period_comments': 'Personal Auto Policies may be written for policy periods for 6 months or less.',
  'territory_factor_BI': 'null'}]