Add the parent directory to `sys.path` so that the project's `modules` package (one level up) can be imported from this notebook.

In [1]:
import sys
import os

# Resolve the directory one level above the current working directory.
# Note: ".." is interpreted relative to the process's cwd, not to this file.
parent_dir = os.path.abspath(os.pardir)

# Register it on the import path only once, so re-running the cell does not
# append duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
import pandas as pd

from langchain_anthropic import ChatAnthropic

from modules import logger
from modules import extractor


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from modules import extractor


________________________________________________________

In [3]:
# Emit a banner line so the start of this notebook run is easy to find in the log.
logger.info("-------------- New Notebook Session --------------")

2025-02-26 10:11:31,869 - LOGZ - INFO - -------------- New Notebook Session --------------


In [4]:
# Load the Anthropic API key from a file kept outside the notebook so the
# secret never appears in the notebook source or its outputs.
with open("../api_key.txt") as f:
    # strip(): text files usually end with a newline, and any surrounding
    # whitespace would otherwise be sent as part of the credential.
    api_key = f.read().strip()

anthropic_model = "claude-3-7-sonnet-20250219"
logger.info(f"Using {anthropic_model} model.")

# temperature=0 requests the least-random sampling; max_tokens caps the length
# of each reply; timeout=None leaves the request timeout unset.
llm = ChatAnthropic(model=anthropic_model,
                    temperature=0,
                    max_tokens=512,
                    timeout=None,
                    max_retries=2,
                    api_key=api_key)

2025-02-26 10:11:31,879 - LOGZ - INFO - Using claude-3-7-sonnet-20250219 model.


In [5]:
def process_folder(folder_path, **kwargs):
    """Run the extractor over every PDF found directly inside a folder.

    Args:
        folder_path: Path of the folder to scan. Only entries whose name ends
            with ".pdf" (case-insensitive) are processed; sub-folders are not
            walked.
        **kwargs: Forwarded unchanged to ``extractor.process_pdf``.

    Returns:
        A list with the value returned by ``extractor.process_pdf`` for each
        PDF, in sorted file-name order, or ``None`` when the folder is missing
        or contains no PDF.
    """
    logger.info(f"Starting load and extraction of {folder_path} folder.")

    # isdir (rather than exists) also rejects a path that points at a regular
    # file, which would otherwise make os.listdir raise.
    if not os.path.isdir(folder_path):
        logger.error(f"Error: Folder {folder_path} does not exist.")
        return None

    # os.listdir returns entries in arbitrary order; sorting keeps the order
    # of the results reproducible from one run to the next.
    pdf_files = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")
    )

    if not pdf_files:
        logger.info("No PDF files found.")
        return None

    total = len(pdf_files)
    logger.info(f"Found {total} PDF(s). Processing...\n")

    results = []
    for position, pdf in enumerate(pdf_files, start=1):
        logger.info(f"Processing {position}/{total}.")
        pdf_path = os.path.join(folder_path, pdf)
        results.append(extractor.process_pdf(pdf_path, **kwargs))

    logger.info("Finished processing all PDFs.")
    return results

In [6]:
# Folder holding the PDFs to process, relative to the notebook's working directory.
folder = "../AllPDF"

# Keyword arguments that process_folder forwards to extractor.process_pdf.
args = dict(llm=llm)

results = process_folder(folder, **args)

2025-02-26 10:11:31,911 - LOGZ - INFO - Starting load and extraction of ../AllPDF folder.
2025-02-26 10:11:31,913 - LOGZ - INFO - Found 26 PDF(s). Processing...

2025-02-26 10:11:31,914 - LOGZ - INFO - Processing 1/26.
2025-02-26 10:11:31,915 - LOGZ - INFO - Starting load and extraction of ../AllPDF\Aegis Security Underwriting Guide Clean ver 5 2023.10.31.pdf
2025-02-26 10:11:33,125 - LOGZ - INFO - Starting extraction on 2 chunks.
2025-02-26 10:11:41,182 - LOGZ - INFO - Processing 2/26.
2025-02-26 10:11:41,183 - LOGZ - INFO - Starting load and extraction of ../AllPDF\Algorithm.pdf
2025-02-26 10:11:41,369 - LOGZ - INFO - Starting extraction on 1 chunks.
2025-02-26 10:11:47,292 - LOGZ - INFO - Processing 3/26.
2025-02-26 10:11:47,293 - LOGZ - INFO - Starting load and extraction of ../AllPDF\Aspen MGA Rating and Rule Guide 12.1.2016 v2.1.17.pdf CLEAN.pdf
2025-02-26 10:11:47,599 - LOGZ - INFO - Starting extraction on 1 chunks.
2025-02-26 10:11:53,797 - LOGZ - INFO - Processing 4/26.
2025-0

In [7]:
# process_folder returns None when the folder is missing or holds no PDF;
# fail with an explicit message instead of an opaque TypeError/StopIteration.
if not results:
    raise ValueError("No extraction results to combine.")

# Concatenate all per-PDF frames in one call: repeated pd.concat inside a loop
# re-copies the accumulated frame on every iteration (quadratic work).
final_df = pd.concat(results, ignore_index=True)

final_df

End of list reached


Unnamed: 0,company_name,min_premium,min_premium_comments,policy_period,policy_period_comments,supp_fee_policy,coverage_BI,coverage_PD,coverage_MED,coverage_UM_UIMBI,coverage_UMPD,coverage_COMP,coverage_COLL,coverage_GAP
0,Aegis Security Insurance Company,$90.00,The policy fee is fully earned at inception an...,6.0,Semi-annual (6 month) policy term is acceptabl...,$5.00,mandatory,mandatory,optional,optional,optional,optional,optional,none
1,"Triton General Insurance Agency, LLC. - ASIC B...",,No minimum premium specified in the document.,6.0,Policy period is inferred to be 6 months based...,,mandatory,mandatory,optional,optional,optional,optional,optional,none
2,Affirmative,,No specific minimum premium mentioned in the d...,,"Policy period not explicitly mentioned, but te...",Policy Fees + MVR Fee (per policy),mandatory,mandatory,mandatory,mandatory,mandatory,mandatory,mandatory,none
3,Home State County Mutual Insurance Company,,No specific minimum premium mentioned in the d...,1.0,The document mentions both monthly (1-month) a...,$72,mandatory,mandatory,optional,optional,optional,optional,optional,none
4,Access General Agency Insurance Agency of Texas,,No specific minimum premium mentioned in the d...,1.0,Document mentions one month policy in section ...,$10,mandatory,mandatory,none,optional,optional,optional,optional,none
5,Clear Blue Insurance Company,,No specific minimum premium mentioned in the d...,1.0,"Policies are written for One (1), Three (3) an...",$12,mandatory,mandatory,optional,optional,optional,optional,optional,none
6,"Insurance Services Office, Inc.",,No specific minimum premium amount mentioned i...,,No specific policy period mentioned in the doc...,,mandatory,mandatory,optionnal,optionnal,optionnal,optionnal,optionnal,none
7,"Insurance Services Office, Inc.",,No minimum premium information is provided in ...,,No policy period information is provided in th...,,none,none,none,none,none,none,none,none
8,"Insurance Services Office, Inc.",,No specific minimum premium mentioned in the d...,12.0,The document mentions that SRIP does not apply...,,mandatory,mandatory,optional,optional,optional,optional,optional,none
9,ISO (Insurance Services Office),,No specific minimum premium mentioned in the d...,12.0,The document refers to a 12-month policy perio...,,mandatory,mandatory,mandatory,optional,optional,optional,optional,none


In [8]:
# Persist the combined results; index=False leaves the row index out of the file.
final_df.to_csv("extracted.csv", index=False)

**TODO:** How am I going to handle tables?