In [1]:
import os
import pandas as pd

from langchain_anthropic import ChatAnthropic

from logger import logger
from modules import extractor


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from modules import extractor


Add the parent directory to `sys.path` so that modules located there can be imported.

In [2]:
import sys

# Add the parent directory (my_app/) to sys.path so modules that live one level
# above the notebook's working directory can be imported.
# NOTE(review): the first cell already imports `logger` and `modules` before
# this path tweak runs; on a fresh kernel that ordering may fail - confirm.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Guard the append so re-running this cell does not stack duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

________________________________________________________

In [3]:
# Emit a separator line so each notebook run is easy to spot in the log output.
session_banner = "-------------- New Notebook Session --------------"
logger.info(session_banner)

2025-02-25 18:19:02,991 - LOGZ - INFO - -------------- New Notebook Session --------------


In [4]:
# Load the Anthropic API key from a file kept outside the notebook so the
# secret never appears in a cell.
# - "utf-8-sig" transparently drops a byte-order mark if the file has one.
# - strip() removes the trailing newline most editors add; without it the
#   newline would be passed along as part of the key.
with open("../api_key.txt", encoding="utf-8-sig") as f:
    api_key = f.read().strip()

# Fail here with a clear message rather than later inside the first API call.
if not api_key:
    raise ValueError("../api_key.txt is empty; expected an Anthropic API key.")

anthropic_model = "claude-3-7-sonnet-20250219"
logger.info(f"Using {anthropic_model} model.")

# Chat model handle shared by the extraction cells below.
llm = ChatAnthropic(model=anthropic_model,
                    temperature=0,      # favour repeatable extraction output
                    max_tokens=512,     # cap on the length of each response
                    timeout=None,
                    max_retries=2,
                    api_key=api_key)

2025-02-25 18:19:02,999 - LOGZ - INFO - Using claude-3-7-sonnet-20250219 model.


In [None]:
def process_folder(folder_path, **kwargs):
    """Run the PDF extractor on every PDF file directly inside a folder.

    Args:
        folder_path: Path of the directory to scan. Only direct children whose
            name ends with ".pdf" (case-insensitive) are processed.
        **kwargs: Forwarded unchanged to ``extractor.process_pdf`` for each file.

    Returns:
        A list with one ``extractor.process_pdf`` result per PDF, in sorted
        file-name order, or ``None`` when ``folder_path`` is not an existing
        directory or contains no PDF files.
    """
    logger.info(f"Starting load and extraction of {folder_path} folder.")

    # isdir (rather than exists) also rejects a path that points at a regular
    # file, which would otherwise make os.listdir raise.
    if not os.path.isdir(folder_path):
        logger.error(f"Error: Folder {folder_path} does not exist.")
        return

    # os.listdir returns entries in arbitrary order; sorting keeps the order of
    # the results reproducible across runs and platforms.
    pdf_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith(".pdf"))

    if not pdf_files:
        logger.info("No PDF files found.")
        return

    total = len(pdf_files)
    logger.info(f"Found {total} PDF(s). Processing...\n")

    results = []

    # Start counting at 1 so the progress log reads "1/N" ... "N/N".
    for position, pdf in enumerate(pdf_files, start=1):
        logger.info(f"Processing {position}/{total}.")
        pdf_path = os.path.join(folder_path, pdf)
        results.append(extractor.process_pdf(pdf_path, **kwargs))

    logger.info("Finished processing all PDFs.")
    return results

In [6]:
# Directory holding the PDFs to run the extraction on.
folder = "../Data/AllPDF"

# Keyword arguments handed through process_folder to the extractor.
args = dict(llm=llm)

results = process_folder(folder, **args)

2025-02-25 18:19:03,020 - LOGZ - INFO - Starting load and extraction of ../Data/AllPDF folder.
2025-02-25 18:19:03,021 - LOGZ - INFO - Found 26 PDF(s). Processing...

2025-02-25 18:19:03,022 - LOGZ - INFO - Processing 0/26.
2025-02-25 18:19:03,022 - LOGZ - INFO - Starting load and extraction of ../Data/AllPDF\Aegis Security Underwriting Guide Clean ver 5 2023.10.31.pdf
2025-02-25 18:19:04,226 - LOGZ - INFO - Starting extraction on 2 chunks.
2025-02-25 18:19:11,183 - LOGZ - INFO - Processing 1/26.
2025-02-25 18:19:11,183 - LOGZ - INFO - Starting load and extraction of ../Data/AllPDF\Algorithm.pdf
2025-02-25 18:19:11,365 - LOGZ - INFO - Starting extraction on 1 chunks.
2025-02-25 18:19:16,706 - LOGZ - INFO - Processing 2/26.
2025-02-25 18:19:16,709 - LOGZ - INFO - Starting load and extraction of ../Data/AllPDF\Aspen MGA Rating and Rule Guide 12.1.2016 v2.1.17.pdf CLEAN.pdf
2025-02-25 18:19:17,050 - LOGZ - INFO - Starting extraction on 1 chunks.
2025-02-25 18:19:23,686 - LOGZ - INFO - Pro

In [7]:
# Stack the per-PDF frames into one table with a single concat call; growing
# the frame pairwise inside a loop re-copies all accumulated rows each time.
if results:
    final_df = pd.concat(results, ignore_index=True)
else:
    # process_folder returns None when the folder is missing or holds no PDFs;
    # fall back to an empty frame so the cells below still run.
    final_df = pd.DataFrame()

final_df

End of list reached


Unnamed: 0,company_name,min_premium,min_premium_comments,policy_period,policy_period_comments,supp_fee_policy,coverage_BI,coverage_PD,coverage_MED,coverage_UM_UIMBI,coverage_UMPD,coverage_COMP,coverage_COLL,coverage_GAP
0,Aegis Security Insurance Company,$90.00,The policy fee is fully earned at inception an...,6.0,Semi-annual (6 month) policy term is acceptabl...,$90.00,mandatory,mandatory,optional,optional,optional,optional,optional,none
1,"Triton General Insurance Agency, LLC. - ASIC B...",,No minimum premium specified in the document.,6.0,Policy period is inferred to be 6 months based...,,mandatory,mandatory,optional,optional,optional,optional,optional,none
2,Affirmative,,No specific minimum premium mentioned in the d...,,"Policy period is not explicitly stated, but te...",Policy Fees + MVR Fee (per policy),mandatory,mandatory,mandatory,mandatory,mandatory,mandatory,mandatory,none
3,Home State County Mutual Insurance Company (Ad...,,No specific minimum premium mentioned in the d...,1.0,The document mentions both monthly (1-month) a...,$72,mandatory,mandatory,optional,optional,optional,optional,optional,none
4,Access General Agency Insurance Agency of Texas,,No specific minimum premium mentioned in the d...,1.0,The document mentions a one month policy perio...,$10,mandatory,mandatory,optionnal,optionnal,optionnal,optionnal,optionnal,none
5,Clear Blue Insurance Company,,No minimum premium specified in the document.,1.0,"Policies are written for One (1), Three (3) an...",$12,mandatory,mandatory,optional,optional,optional,optional,optional,none
6,"Insurance Services Office, Inc.","$2,500",The minimum limit of liability for Personal In...,6.0,Standard policy period appears to be 6 months ...,Refer to company for the appropriate fee that ...,mandatory,mandatory,optional,optional,optional,optional,optional,none
7,"Insurance Services Office, Inc.",,No minimum premium information is provided in ...,,No policy period information is provided in th...,,none,none,none,none,none,none,none,none
8,"Insurance Services Office, Inc.",,No specific minimum premium mentioned in the d...,12.0,The document mentions that the SRIP does not a...,,mandatory,mandatory,optional,optional,optional,optional,optional,none
9,ISO (Insurance Services Office),,No specific minimum premium mentioned in the d...,12.0,The document references a policy period in Rul...,,mandatory,mandatory,mandatory,optional,optional,optional,optional,none


In [8]:
# Save the combined table to disk; index=False leaves the row index out of the file.
output_csv = "extracted.csv"
final_df.to_csv(output_csv, index=False)

**TODO:** How am I going to handle tables?