In [1]:
from langchain_anthropic import ChatAnthropic

from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.documents import Document

from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from sentence_transformers import SentenceTransformer

from typing import Optional


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face model id used for embeddings.
# Alternative kept for reference: "thenlper/gte-large"
EMBEDDING_MODEL_NAME = "thenlper/gte-small"


In [3]:
# Embedding model wrapper built from the configured model name.
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs=dict(device="cuda"),
    # Normalized embeddings are what the cosine-similarity search relies on.
    encode_kwargs=dict(normalize_embeddings=True),
    multi_process=True,
)

  embedding_model = HuggingFaceEmbeddings(


In [4]:
# Read the Anthropic API key from a file kept outside the notebook.
with open("../api_key.txt") as f:
    # Strip surrounding whitespace: a trailing newline left by an editor would
    # otherwise become part of the key and break authentication.
    api_key = f.read().strip()

# Chat model used for every extraction below; temperature 0 for stable output.
llm = ChatAnthropic(model="claude-3-sonnet-20240229",
                    temperature=0,
                    max_tokens=512,
                    timeout=None,
                    max_retries=2,
                    api_key=api_key)

In [5]:
# Rule manual to analyse. Other inputs that were tried:
#   "../Data/Personal Auto/Tesla Property & Casualty, Inc/TX RV 201401 Rules.pdf"
#   "../Data/Personal Auto/test/TX PPA APCIC Rules 101518RB.pdf"
file_path = "../Data/Personal Auto/Tesla Property & Casualty, Inc/TPC Rule Manual.pdf"

# Rate manual path (defined here, not loaded in this cell).
file_path_rate = "../Data/Personal Auto/Tesla Property & Casualty, Inc/TPC Rate Manual.pdf"

# Earlier directory-based loading, kept for reference:
#   text_loader_kwargs = {"autodetect_encoding": True}
#   loader = DirectoryLoader("corpus/", glob='**/**/*.txt', loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)

# Load the rule manual and display how many documents the loader returned
# (presumably one per PDF page, hence the name `nb_page`).
loader = PyPDFLoader(file_path)
doc = loader.load()

nb_page = len(doc)
nb_page

30

In [None]:
# Load the same PDF through the loader's load-and-split helper; `pages` is
# joined into the single `text` string used by the non-RAG extractions below.
pages = loader.load_and_split()


In [6]:
from langchain_text_splitters import TokenTextSplitter

# Token-based splitter: `chunk_size` controls the size of each chunk and
# `chunk_overlap` controls the overlap between chunks.
text_splitter = TokenTextSplitter(chunk_size=2000, chunk_overlap=20)

In [None]:
# Split the loaded documents into chunks and keep only the raw text of each
# chunk (this is what gets embedded into the vector store).
# Per-page alternative: text_splitter.split_text(page.page_content) for each page in `doc`.
text_processed = [
    chunk.page_content for chunk in text_splitter.split_documents(doc)
]
text_processed

['PRIVATE PASSENGER AUTO \nRULES \nTEXAS',
 'Tesla Property & Casualty, Inc. \nPersonal Auto Program Rules  \n \nUnderwritten by: 1 Texas \nTesla Property & Casualty, Inc. Edition 2023-03 \n \nTable of Contents \nApplicability of Manual Rules ..................................................................................................................... 2 \nBinding Instructions .................................................................................................................................... 3 \nPolicy Program ............................................................................................................................................. 4 \nFinancial Responsibility Filings .................................................................................................................................... 4 \nPolicy Period ....................................................................................................................................

In [15]:
# FAISS is imported from `langchain_community`, like the other integrations in
# this notebook; the old `langchain.vectorstores` path is deprecated.
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

# Embed every text chunk and index it in an in-memory FAISS store, using
# cosine distance (the embeddings are normalized when they are created).
vectorstore  = FAISS.from_texts(
    text_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
)

In [44]:
# Extraction prompt: a fixed system instruction followed by the text to analyse.
# Ways to improve extraction quality later:
#   1) add reference examples to the template (e.g. MessagesPlaceholder('examples'));
#   2) add parameters carrying context, such as metadata about the source document.
prompt_template = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. Only extract relevant "
            "information from the text. If you do not know the value of an "
            "attribute asked to extract, return null for the attribute's value.",
        ),
        # The document text is injected as the human message.
        ("human", "{text}"),
    ]
)

# Note: Claude Pro context window is 200k+ tokens (about 500 pages of text or 100 images).

In [17]:
# Concatenate the content of every entry in `pages` into one space-separated
# string and display its length in characters.
text = " ".join(page.page_content for page in pages)
len(text)

18380

TODO: split `text` into chunks and loop over them so that each request stays within the model's context window.

In [None]:
# Keyword arguments collected in one place (not referenced by the schemas below;
# presumably intended for `Field(...)` - TODO confirm).
all_args = dict(allow_inf_nan=True, default=0)

# default_factory -> callable when default is needed

# use nested class for organisation

#---------------------------------------------------------------------------
class InsuranceExtraction_Rule(BaseModel):
    '''Different informations extracted from a company insurance's rule file.'''
    # The docstring above is sent to the LLM as the description of this schema,
    # and the field descriptions below are sent as well, so their wording
    # affects extraction results.
    #
    # Every Optional field declares an explicit default so the model still
    # validates when the LLM omits it, whichever pydantic major version
    # provides BaseModel (pydantic 2 treats Optional-without-default as required).

    # Required: the insurer that issued the document.
    company_name: str = Field(
        description="The company that wrote this file."
    )
    # Lowest premium amount, kept as a string; empty string when not extracted.
    min_premium: Optional[str] = Field(
        description="Minimum premium (The lowest amount the insurer will charge for coverage) in dollars, if there is multiple possible values, take the lowest.", default=""
    )
    # Free-text context for the minimum premium; None when not extracted.
    min_premium_comments: Optional[str] = Field(
        description="Minimum premium commentaries or additionnal informations", default=None
    )
    # Policy period in months, kept as a string; empty string when not extracted.
    policy_period: Optional[str] = Field(
        description="Policy period in month, if there is multiple possible values, take the lowest.", default=""
    )
    # Free-text context for the policy period; None when not extracted.
    policy_period_comments: Optional[str] = Field(
        description="Policy period commentaries or additionnal informations.", default=None
    )


class InsuranceExtraction_Rate(BaseModel):
    '''Different informations extracted from a company insurance's rate file.'''

    # Required: territory factor for Bodily Injury coverage, kept as a string.
    territory_factor_BI: str = Field(
        description="The company's insurance territory factor for BI (Bodily Injury) insurance.",
    )


#---------------------------------------------------------------------------



In [20]:
# Wrap the chat model once per schema so that it returns parsed schema
# instances directly (include_raw=False drops the raw model message).
structured_llm_rule = llm.with_structured_output(
    schema=InsuranceExtraction_Rule,
    include_raw=False,
)
structured_llm_rate = llm.with_structured_output(
    schema=InsuranceExtraction_Rate,
    include_raw=False,
)

### basic

In [18]:
# Fill the template's {text} slot with the full concatenated document text.
prompt = prompt_template.invoke({"text": text})

In [None]:
# Single direct call: send the rendered prompt to the rule-schema model and
# display the parsed InsuranceExtraction_Rule instance.
result = structured_llm_rule.invoke(prompt)
result

InsuranceExtraction_Rule(company_name='Texas Tesla Property &amp; Casualty, Inc.', min_premium='100', min_premium_comments='The minimum premium for a six-month policy is $100, which cannot be reduced except in the event of a cancellation.', policy_period='6', policy_period_comments='Personal Auto Policies may be written for policy periods for 6 months or less.')

### brute force

In [21]:
# Chain the prompt template into the rule-schema model: the chain takes a
# {"text": ...} mapping as input and produces the structured extraction.
extractor = prompt_template | structured_llm_rule

In [None]:
# Batch call with a single input holding the whole text. `text` could be
# chunked into several inputs here to stay within the context window.
extractions = extractor.batch(
    [{"text": text}],
    config={"max_concurrency": 5},
)
extractions

[InsuranceExtraction_Rule(company_name='Texas Tesla Property &amp; Casualty, Inc.', min_premium='100', min_premium_comments='The minimum premium for a six-month policy is $100, which cannot be reduced except in the event of a cancellation.', policy_period='6', policy_period_comments='Personal Auto Policies may be written for policy periods for 6 months or less.')]

### rag

In [40]:
# Retriever over the FAISS store; `k` asks for up to 20 chunks per query.
retriever = vectorstore.as_retriever(
    search_kwargs={"k": 20}
)  # k=20 results are requested, not only the first document


In [41]:
# RAG chain: the query goes to the retriever, and the retrieved chunks become
# the {text} of the extraction prompt. The retrieved documents are joined on
# their `page_content` so the prompt receives plain passage text rather than
# the Python repr of the retriever output (which also carries metadata).
rag_extractor = {
    "text": retriever | (lambda docs: "\n\n".join(d.page_content for d in docs))
} | extractor

In [42]:
# Run the RAG chain; the query string is the same text as the rule schema's
# docstring, used here to find the chunks to extract from.
extractions = rag_extractor.invoke("Different informations extracted from a company insurance's rule file.")
extractions

InsuranceExtraction_Rule(company_name='Tesla Property &amp; Casualty, Inc.', min_premium='null', min_premium_comments='The minimum premium is not explicitly stated in the provided documents.', policy_period='6', policy_period_comments='The policy period is 6 months, as stated in the section on Auto Loan/Lease Gap Coverage: "Loan/Lease Gap Coverage will continue to apply, unless otherwise requested to be removed by the insured, until the first renewal when the vehicle attains a vehicle age of 6."')

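RAG may be the slowest approach, but it is probably the best choice in the long run; in my opinion it should be used with one schema per attribute, so that each retrieval query targets a single piece of information.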

______________________________________________________________________

test on document without the information -> error handling

convert to float

automatic document search, indexing, downloading

tag documents ?

few shot prompting with examples ?
