In [31]:
# # Pypdf extraction and OpenAI classification script
# import pandas as pd
# import pypdf
# import os
# import json
# import time
# import logging
# from openai import OpenAI # Use the updated OpenAI library import
# from typing import Dict, Any

# # === LOGGING SETUP ===
# # Sets up basic configuration for logging messages.
# logging.basicConfig(
#     level=logging.INFO,
#     format="%(asctime)s | %(levelname)s | %(message)s"
# )
# logger = logging.getLogger(__name__)

# # === CONFIGURATION ===
# # It's better to manage paths and model names in a central place.
# # The API key is read from a local file for better security.
# API_KEY_PATH = "/Users/jake/Documents/Key/OPENAI_KEY.txt" # <--- ADJUST IF NEEDED
# MODEL = "gpt-4o" # Using the latest model as specified

# def get_openai_client(api_key_path: str) -> OpenAI:
#     """Reads the OpenAI API key from a file and returns an OpenAI client."""
#     try:
#         with open(api_key_path, 'r') as f:
#             api_key = f.read().strip()
#         if not api_key:
#             raise ValueError("API key file is empty.")
#         logger.info("Successfully loaded OpenAI API key.")
#         return OpenAI(api_key=api_key)
#     except FileNotFoundError:
#         logger.error(f"API key file not found at: {api_key_path}")
#         return None
#     except Exception as e:
#         logger.error(f"An error occurred while reading the API key: {e}")
#         return None

# def extract_text_from_pdf(pdf_path: str) -> pd.DataFrame:
#     """
#     Extracts all text from a given PDF file, page by page.
#     """
#     if not os.path.exists(pdf_path):
#         logger.error(f"Error: File not found at {pdf_path}")
#         return pd.DataFrame()

#     logger.info(f"Starting text extraction from '{os.path.basename(pdf_path)}'...")
#     all_pages_data = []

#     try:
#         with open(pdf_path, 'rb') as file:
#             reader = pypdf.PdfReader(file)
#             num_pages = len(reader.pages)
#             logger.info(f"Found {num_pages} pages in the document.")

#             for i, page in enumerate(reader.pages):
#                 page_number = i + 1
#                 text = page.extract_text() or "" # Ensure text is a string
                
#                 all_pages_data.append({
#                     'page_number': page_number,
#                     'text': text.strip()
#                 })
#                 if not text.strip():
#                     logger.info(f"  - No text found on page {page_number}.")

#     except Exception as e:
#         logger.error(f"An error occurred while processing the PDF: {e}")
#         return pd.DataFrame()

#     if all_pages_data:
#         df = pd.DataFrame(all_pages_data)
#         logger.info("Text extraction complete.")
#         return df
#     else:
#         logger.warning("Warning: No text was extracted from the document.")
#         return pd.DataFrame()

# def classify_page_text(client: OpenAI, page_number: int, page_text: str) -> Dict[str, Any]:
#     """
#     Classifies the text of a single page using the OpenAI API.
#     """
#     # The new system prompt with updated instructions and JSON format.
#     # Note: The model is only asked for classification details.
#     # Page number and text are added back into the final dictionary later.
#     sys_prompt = """
# You are an expert insurance regulatory analyst reviewing a state commercial auto insurance rate and rule filing.

# Your job is to classify each page into a single best-fitting category ("bucket"). The following buckets are examples of likely categories, but you are allowed to invent and assign a new, appropriate bucket name, if the existing examples do not fit.

# BUCKET EXAMPLES (use or invent as needed):

# - intro information: Cover letters, summaries, company info, administrative headers.
# - table of contents: Tables/indexes listing sections, rules, forms.
# - correspondence: Letters, memos, formal or informal communication (including with a regulator).
# - rule: Detailed rating rules, eligibility, underwriting guidelines, standard operating instructions.
# - factor table: Tabular lists of rating factors—e.g. for zones, territories, drivers, vehicles.
# - actuarial support: Mathematical or statistical justification, trend documentation, loss ratios, exhibits.
# - form: Complete forms, endorsements, schedules, specimen policy wordings.
# - rating example: A worked example showing how premium/rate is calculated.
# - exhibit: Graphs, charts, additional annotated attachments or appendices.
# - crossed_out: (binary) Use ONLY if the entire page is covered with a line, annotated "withdrawn," or has visible strikethrough/crossout. Otherwise, do not use.
# - other: Use only if the page fits none of the above and you cannot reasonably propose a more accurate new bucket name.

# BUCKET FLEXIBILITY:
# - If none of the above buckets are a good fit, make up an appropriate, concise, descriptive bucket name and use it as the "bucket". Do NOT use "llm_new_category" as a category name—use your proposed name directly (e.g. "signature page", "state certification", etc).

# CATEGORIZATION INSTRUCTIONS:
# - Assign exactly one bucket per page.
# - Always provide a 10 word "explanation" of why you selected—or if new, created—this bucket.
# - If "crossed_out" is chosen, no substantive explanation is needed—just state "Entire page was striked out or withdrawn."
# - Otherwise, explain the dominant content and your reasoning for the bucket chosen in precisely 10 words.

# OUTPUT FORMAT (respond with a single valid JSON object only):

# {
#   "bucket": "<bucket_name>",
#   "confidence": <probability 0-1>,
#   "explanation": "<10 word explanation of categorization>"
# }

# If uncertain, favor "other", but prefer to create (with reasoned explanation) a new appropriate bucket when justified.
# """
#     # Create the full result dictionary here, starting with known values.
#     # This ensures page_number and text are always present, even on error.
#     result_payload = {
#         "page_number": page_number,
#         "bucket": "processing_error",
#         "confidence": 0.0,
#         "explanation": "An error occurred before the API call.",
#         "text": page_text
#     }

#     if not page_text:
#         result_payload.update({
#             "bucket": "other",
#             "confidence": 1.0,
#             "explanation": "Page is blank or contains no extractable text."
#         })
#         return result_payload

#     for attempt in range(3): # Retry logic for transient API errors
#         try:
#             chat_completion = client.chat.completions.create(
#                 messages=[
#                     {"role": "system", "content": sys_prompt},
#                     {"role": "user", "content": page_text[:16000]}, # Increased token limit for gpt-4o
#                 ],
#                 model=MODEL,
#                 response_format={"type": "json_object"},
#                 temperature=0.0,
#             )
#             response_content = chat_completion.choices[0].message.content
#             # Parse the JSON from the model
#             api_result = json.loads(response_content)
#             # Update the payload with the model's response
#             result_payload.update(api_result)
#             return result_payload
#         except Exception as e:
#             logger.warning(f"API call failed on attempt {attempt + 1} for page {page_number}: {e}. Retrying in {2 ** attempt}s...")
#             time.sleep(2 ** attempt)

#     logger.error(f"API call failed after multiple retries for page {page_number}.")
#     result_payload.update({
#         "bucket": "api_error",
#         "explanation": "API call failed after multiple retries."
#     })
#     return result_payload

# # --- Main Script Execution ---
# if __name__ == "__main__":
#     # --- USER INPUT ---
#     input_pdf_path = "./Inputs/PGR_Ohio_BNIC-134120828_trimmed.pdf"
#     output_csv_path = "./Output/classified_pdf_text_aiv2.csv"

#     # --- SCRIPT LOGIC ---
#     # 1. Initialize OpenAI Client
#     openai_client = get_openai_client(API_KEY_PATH)
    
#     if openai_client:
#         # 2. Extract text from PDF
#         pdf_dataframe = extract_text_from_pdf(input_pdf_path)

#         if not pdf_dataframe.empty:
#             # 3. Classify each page
#             logger.info("Starting page classification process...")
            
#             results = []
#             total_pages = len(pdf_dataframe)
#             for index, row in pdf_dataframe.iterrows():
#                 logger.info(f"Classifying page {row['page_number']}/{total_pages}...")
#                 # Pass page number and text to the classification function
#                 result = classify_page_text(
#                     client=openai_client, 
#                     page_number=row['page_number'], 
#                     page_text=row['text']
#                 )
#                 results.append(result)

#             # 4. Create the final DataFrame from the list of result dictionaries
#             final_df = pd.DataFrame(results)

#             # 5. Save the final results to CSV
#             try:
#                 # Reorder columns for clarity in the output CSV
#                 column_order = ['page_number', 'bucket', 'confidence', 'explanation', 'text']
#                 final_df = final_df[column_order]
                
#                 final_df.to_csv(output_csv_path, index=False, encoding='utf-8')
#                 logger.info(f"\nSuccessfully saved classified text to '{output_csv_path}'")
                
#                 logger.info("\n--- Sample of Final Data (text column omitted for brevity) ---")
#                 print(final_df.drop(columns=['text']).head().to_string())

#             except Exception as e:
#                 logger.error(f"\nAn error occurred while saving the CSV file: {e}")

In [37]:
# Pypdf extraction and OpenAI classification script
import pandas as pd
import pypdf
import os
import re
import json
import time
import logging
from openai import OpenAI # Use the updated OpenAI library import
from typing import Dict, Any, List, Tuple

# === LOGGING SETUP ===
# Configure logging once for the whole script: records at INFO level and above
# are emitted, each rendered as "timestamp | level | message".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)
# Module-level logger shared by the functions defined in this script.
logger = logging.getLogger(__name__)

# === CONFIGURATION ===
# Paths and model names are kept in one central place so they are easy to change.
# The API key is read from the local file at this path instead of being hard-coded here.
API_KEY_PATH = "/Users/jake/Documents/Key/OPENAI_KEY.txt" # <--- ADJUST IF NEEDED
MODEL = "gpt-4o" # Model name passed to the chat-completions calls in this script


In [None]:

def get_openai_client(api_key_path: str) -> OpenAI:
    """
    Build an OpenAI client from an API key stored in a text file.

    The file at ``api_key_path`` is read and surrounding whitespace is
    stripped. Returns the client on success. Returns ``None`` (after logging
    an error) when the file is missing, is empty, or any other exception is
    raised while reading the key or constructing the client.
    """
    client = None
    try:
        with open(api_key_path, 'r') as key_file:
            secret = key_file.read().strip()
        if secret:
            logger.info("Successfully loaded OpenAI API key.")
            client = OpenAI(api_key=secret)
        else:
            # Routed through the generic handler below so it is logged there.
            raise ValueError("API key file is empty.")
    except FileNotFoundError:
        logger.error(f"API key file not found at: {api_key_path}")
    except Exception as err:
        logger.error(f"An error occurred while reading the API key: {err}")
    return client

def extract_text_from_pdf(pdf_path: str) -> pd.DataFrame:
    """
    Read a PDF and return its text, one row per page.

    The result has the columns ``page_number`` (1-based) and ``text``
    (whitespace-stripped; an empty string when a page yields no text).
    An empty DataFrame is returned when the file does not exist, when
    reading raises any exception, or when the document has no pages.
    """
    if not os.path.exists(pdf_path):
        logger.error(f"Error: File not found at {pdf_path}")
        return pd.DataFrame()

    logger.info(f"Starting text extraction from '{os.path.basename(pdf_path)}'...")
    records = []

    try:
        with open(pdf_path, 'rb') as handle:
            pdf = pypdf.PdfReader(handle)
            logger.info(f"Found {len(pdf.pages)} pages in the document.")

            for page_number, page in enumerate(pdf.pages, start=1):
                # extract_text() may give None; normalise to a stripped string.
                cleaned = (page.extract_text() or "").strip()
                records.append({'page_number': page_number, 'text': cleaned})
                if not cleaned:
                    logger.info(f"  - No text found on page {page_number}.")
    except Exception as err:
        logger.error(f"An error occurred while processing the PDF: {err}")
        return pd.DataFrame()

    if not records:
        logger.warning("Warning: No text was extracted from the document.")
        return pd.DataFrame()

    pages_df = pd.DataFrame(records)
    logger.info("Text extraction complete.")
    return pages_df

def extract_company_name(df: pd.DataFrame) -> str:
    """
    Find the filing company's name in the first pages of the document.

    Looks at the ``text`` column of up to the first three rows for a
    "Filing Company:" label (case-insensitive) and returns the text that
    follows it, cut off at the first run of two or more whitespace
    characters or at "Project Name". Returns "Unknown_Company" when the
    DataFrame is empty or no non-empty name is found.
    """
    fallback = "Unknown_Company"
    if df.empty:
        return fallback

    label_pattern = re.compile(r"Filing Company:\s*(.*)", re.IGNORECASE)
    trailer_pattern = re.compile(r'\s{2,}|Project Name')

    # Only the first 3 pages are inspected.
    for _, page in df.head(3).iterrows():
        found = label_pattern.search(page['text'])
        if not found:
            continue
        # Drop anything that trails the name on the same line.
        candidate = trailer_pattern.split(found.group(1).strip())[0].strip()
        if candidate:
            return candidate

    logger.warning("Could not find company name. Defaulting to 'Unknown_Company'.")
    return fallback


def classify_page_text(client: OpenAI, page_number: int, page_text: str) -> Dict[str, Any]:
    """
    Classifies the text of a single page using rules and the OpenAI API.

    Two local rules run before any API call: a page with no text is bucketed
    as "other", and a page whose text contains "redline" (case-insensitive)
    is bucketed as "redline". Every other page is sent to the chat-completions
    API (only the first 16000 characters of the text), with up to three
    attempts and exponential back-off between attempts.

    Args:
        client: OpenAI client used for the chat-completions request.
        page_number: Page number, copied into the result unchanged.
        page_text: Extracted text of the page.

    Returns:
        A dict that always holds "page_number", "bucket", "confidence",
        "explanation" and "text". "bucket" is "api_error" when every attempt
        failed. No exception from the API call is propagated to the caller.
    """
    # Create the base payload. This ensures all keys are present, even on error.
    result_payload = {
        "page_number": page_number,
        "bucket": "processing_error",
        "confidence": 0.0,
        "explanation": "An error occurred during processing.",
        "text": page_text
    }

    # RULE #1: Handle blank pages first.
    if not page_text:
        result_payload.update({
            "bucket": "other",
            "confidence": 1.0,
            "explanation": "Page is blank or contains no extractable text."
        })
        return result_payload

    # RULE #2: Handle "redline" pages before calling the API.
    if 'redline' in page_text.lower():
        result_payload.update({
            "bucket": "redline",
            "confidence": 1.0,
            "explanation": "Page contains 'redline' text, indicating document revisions."
        })
        return result_payload

    # If no rules match, proceed with API call.
    sys_prompt = """
You are an expert insurance regulatory analyst reviewing a state commercial auto insurance rate and rule filing.

Your job is to classify each page into a single best-fitting category ("bucket"). The following buckets are examples of likely categories, but you are allowed to invent and assign a new, appropriate bucket name, if the existing examples do not fit.

BUCKET EXAMPLES (use or invent as needed):

- intro information: Cover letters, summaries, company info, administrative headers.
- table of contents: Tables/indexes listing sections, rules, forms.
- correspondence: Letters, memos, formal or informal communication (including with a regulator).
- rule: Detailed rating rules, eligibility, underwriting guidelines, standard operating instructions.
- factor table: Tabular lists of rating factors—e.g. for zones, territories, drivers, vehicles.
- actuarial support: Mathematical or statistical justification, trend documentation, loss ratios, exhibits.
- form: Complete forms, endorsements, schedules, specimen policy wordings.
- rating example: A worked example showing how premium/rate is calculated.
- exhibit: Graphs, charts, additional annotated attachments or appendices.
- crossed_out: (binary) Use ONLY if the entire page is covered with a line, annotated "withdrawn," or has visible strikethrough/crossout. Otherwise, do not use.
- other: Use only if the page fits none of the above and you cannot reasonably propose a more accurate new bucket name.

BUCKET FLEXIBILITY:
- If none of the above buckets are a good fit, make up an appropriate, concise, descriptive bucket name and use it as the "bucket". Do NOT use "llm_new_category" as a category name—use your proposed name directly (e.g. "signature page", "state certification", etc).

CATEGORIZATION INSTRUCTIONS:
- Assign exactly one bucket per page.
- Always provide a 10 word "explanation" of why you selected—or if new, created—this bucket.
- If "crossed_out" is chosen, no substantive explanation is needed—just state "Entire page was striked out or withdrawn."
- Otherwise, explain the dominant content and your reasoning for the bucket chosen in precisely 10 words.

OUTPUT FORMAT (respond with a single valid JSON object only):

{
  "bucket": "<bucket_name>",
  "confidence": <probability 0-1>,
  "explanation": "<10 word explanation of categorization>"
}

If uncertain, favor "other", but prefer to create (with reasoned explanation) a new appropriate bucket when justified.
"""
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": sys_prompt},
                    # Slice is by characters, not tokens; it caps the request size.
                    {"role": "user", "content": page_text[:16000]},
                ],
                model=MODEL,
                response_format={"type": "json_object"},
                temperature=0.0,
            )
            response_content = chat_completion.choices[0].message.content
            api_result = json.loads(response_content)
            if not isinstance(api_result, dict):
                # Treated like any other failed attempt by the handler below.
                raise ValueError("Model response was not a JSON object.")
            result_payload.update(api_result)
            # The model is only asked for classification fields; never let a
            # stray key in its answer replace the page identity or text.
            result_payload["page_number"] = page_number
            result_payload["text"] = page_text
            return result_payload
        except Exception as e:
            if attempt + 1 < max_attempts:
                delay = 2 ** attempt
                logger.warning(f"API call failed on attempt {attempt + 1} for page {page_number}: {e}. Retrying in {delay}s...")
                time.sleep(delay)
            else:
                # Last attempt: there is nothing left to retry, so do not
                # sleep or announce a retry.
                logger.warning(f"API call failed on attempt {attempt + 1} for page {page_number}: {e}.")

    logger.error(f"API call failed after multiple retries for page {page_number}.")
    result_payload.update({
        "bucket": "api_error",
        "explanation": "API call failed after multiple retries."
    })
    return result_payload

def parse_single_page_tables(client: OpenAI, page_text: str) -> Dict[str, Any]:
    """
    Uses the AI to extract all tables from a SINGLE page's text.

    Args:
        client: OpenAI client used for the chat-completions request.
        page_text: Full extracted text of one page (sent without truncation).

    Returns:
        The parsed JSON object from the model, in which "tables" is always a
        list of dicts. ``{"tables": []}`` is returned when the request or the
        JSON parsing fails, or when the response does not have that shape.
        No exception is propagated to the caller.
    """
    sys_prompt = """
You are an expert data extraction assistant. Your task is to analyze the provided text from a SINGLE document page and extract ALL tables present.

INSTRUCTIONS:
1.  **Identify ALL Distinct Tables**: Scrutinize the text to identify every individual table. A new table is often indicated by a new title or header row (e.g., "Table 2-1...").
2.  **Handle Continuations**: The text might contain the start of a new table, the end of a table from a previous page, or both. Parse what is present. If a block of text looks like table rows but has no header, treat it as a table and extract the rows as-is.
3.  **Extract Titles**: For each distinct table you identify, find its title. If a table has no clear title, create a concise, descriptive one. If it's a continuation, the title might be absent; in this case, use a placeholder like "Continuation Table".
4.  **Recreate Each Table**: For each table, parse the text to reconstruct its data, including headers (if present) and all data rows.
5.  **Return a List of JSON Objects**: Your output must be a single, valid JSON object containing a list of all tables found on the page.

EXAMPLE OUTPUT for a page with two tables:
{
  "tables": [
    { "table_title": "Table 1-2. Average Driver Experience Score", "table_data": [["Greater than", "Factor"], ["0", "0.989"]] },
    { "table_title": "Table 2-1. Number of Violations", "table_data": [["Violations", "Factor"], ["0", "1.000"]] }
  ]
}
"""
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "system", "content": sys_prompt},
                {"role": "user", "content": page_text},
            ],
            model=MODEL,
            response_format={"type": "json_object"},
            temperature=0.0,
        )
        response_content = chat_completion.choices[0].message.content
        parsed = json.loads(response_content)
    except Exception as e:
        logger.error(f"An error occurred during single-page table parsing: {e}")
        return {"tables": []}

    # The model's answer is untrusted: callers iterate parsed["tables"] and
    # call .get() on each entry, so enforce that shape here.
    if not isinstance(parsed, dict) or not isinstance(parsed.get("tables"), list):
        logger.error("Single-page table parsing returned JSON without a 'tables' list.")
        return {"tables": []}

    parsed["tables"] = [table for table in parsed["tables"] if isinstance(table, dict)]
    return parsed

def is_header_row(row: List[str]) -> bool:
    """
    Guess whether a row of cells is a table header rather than data.

    A cell counts as numeric when its string form, with "," and "$" removed,
    can be converted by ``float``. The row is reported as a header unless
    more than half of its cells are numeric (an empty row is a header).
    """
    def _looks_numeric(cell) -> bool:
        try:
            float(str(cell).replace(",", "").replace("$", ""))
        except (ValueError, TypeError):
            return False
        return True

    numeric_cells = sum(1 for cell in row if _looks_numeric(cell))
    # Header when at most half of the cells are numeric.
    return 2 * numeric_cells <= len(row)

def combine_table_fragments(parsed_tables: List[Tuple[str, pd.DataFrame]]) -> List[Tuple[str, pd.DataFrame]]:
    """
    Combines a list of parsed table DataFrames into consolidated tables.

    Each input DataFrame was built with its first parsed row as the column
    labels. When those labels look like data (see ``is_header_row``), the
    fragment is treated as a continuation of the table before it: the labels
    are restored as the fragment's first data row and all of its rows are
    appended under the running table's columns. Otherwise the fragment starts
    a new table.

    Args:
        parsed_tables: ``(title, DataFrame)`` pairs in document order. The
            input DataFrames are not modified.

    Returns:
        The consolidated ``(title, DataFrame)`` pairs, or an empty list when
        ``parsed_tables`` is empty.
    """
    if not parsed_tables:
        return []

    logger.info("Combining table fragments...")
    combined_tables = []

    # Start with the first table as the base
    current_title, current_df = parsed_tables[0]

    for next_title, next_df in parsed_tables[1:]:
        # A fragment without any columns carries no information at all.
        if len(next_df.columns) == 0:
            continue

        # The column labels ARE the first parsed row of the fragment.
        first_row = list(next_df.columns)

        if is_header_row(first_row):
            # A real header with no rows under it holds no data; skip it.
            if next_df.empty:
                continue
            # A new header means a new table: finalize the current one.
            combined_tables.append((current_title, current_df))
            current_title, current_df = next_title, next_df
        elif len(current_df.columns) == len(first_row):
            # Continuation: the first data row was promoted to column labels
            # upstream, so put it back as data instead of overwriting it.
            # This also keeps single-row continuations, whose DataFrame is
            # "empty" only because its one row became the labels.
            fragment_rows = [first_row] + next_df.values.tolist()
            fragment_df = pd.DataFrame(fragment_rows, columns=current_df.columns)
            current_df = pd.concat([current_df, fragment_df], ignore_index=True)
        else:
            logger.warning(f"Column mismatch for table '{current_title}'. Cannot combine fragment. Starting new table.")
            combined_tables.append((current_title, current_df))
            current_title, current_df = next_title, next_df

    # Add the last processed table
    combined_tables.append((current_title, current_df))

    logger.info(f"Consolidated into {len(combined_tables)} final tables.")
    return combined_tables

# === STEP 1: PDF Parsing and Page Classification ===
def run_classification_step(input_pdf_path: str, output_csv_path: str) -> Tuple[pd.DataFrame, str]:
    """
    Extracts text from a PDF, classifies each page, and saves the results to a CSV.

    Args:
        input_pdf_path: Path of the PDF to process.
        output_csv_path: Path of the CSV file to write. Its parent directory
            is created when it does not exist yet.

    Returns:
        A ``(classification DataFrame, company name)`` tuple. When the OpenAI
        client cannot be created or no text can be extracted, the result is
        ``(empty DataFrame, "Unknown_Company")``. A failure while saving the
        CSV is logged and does not prevent the results from being returned.
    """
    logger.info("--- Starting Step 1: PDF Parsing and Page Classification ---")
    openai_client = get_openai_client(API_KEY_PATH)
    if not openai_client:
        return pd.DataFrame(), "Unknown_Company"

    pdf_dataframe = extract_text_from_pdf(input_pdf_path)
    if pdf_dataframe.empty:
        return pd.DataFrame(), "Unknown_Company"

    company_name = extract_company_name(pdf_dataframe)
    logger.info(f"Extracted Company Name: {company_name}")

    results = []
    total_pages = len(pdf_dataframe)
    for _, row in pdf_dataframe.iterrows():
        logger.info(f"Classifying page {row['page_number']}/{total_pages}...")
        result = classify_page_text(
            client=openai_client,
            page_number=row['page_number'],
            page_text=row['text']
        )
        results.append(result)

    final_df = pd.DataFrame(results)

    try:
        # Make sure the destination folder exists so the save cannot fail
        # merely because the caller did not create it beforehand.
        output_dir = os.path.dirname(output_csv_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # reindex (rather than plain selection) tolerates a missing column.
        column_order = ['page_number', 'bucket', 'confidence', 'explanation', 'text']
        final_df_ordered = final_df.reindex(columns=column_order)
        final_df_ordered.to_csv(output_csv_path, index=False, encoding='utf-8')
        logger.info(f"\nSuccessfully saved classification results to '{output_csv_path}'")
        logger.info("\n--- Sample of Classification Data (text column omitted for brevity) ---")
        print(final_df_ordered.drop(columns=['text']).head().to_string())
    except Exception as e:
        logger.error(f"\nAn error occurred while saving the CSV file: {e}")

    return final_df, company_name

# === STEP 2: Table Extraction and Structuring ===
def run_table_extraction_step(classified_df: pd.DataFrame, company_name: str, output_excel_path: str):
    """
    Processes the classified data to find and extract factor tables into an Excel file.

    Pages whose bucket is "factor_table" or "factor table" (case-insensitive)
    are sent one by one to ``parse_single_page_tables``; the parsed fragments
    are merged with ``combine_table_fragments`` and each resulting table is
    written to its own sheet.

    Args:
        classified_df: Classification results; the "bucket", "page_number"
            and "text" columns are read.
        company_name: Prefix for every sheet title.
        output_excel_path: Path of the Excel workbook to write.

    Returns:
        None. Nothing is written when the input is empty, the OpenAI client
        cannot be created, no page is a factor table, or no table could be
        parsed.
    """
    logger.info("\n--- Starting Step 2: Factor Table Extraction ---")
    if classified_df.empty:
        logger.warning("Classification DataFrame is empty. Skipping table extraction.")
        return

    openai_client = get_openai_client(API_KEY_PATH)
    if not openai_client:
        return

    # astype(str) keeps the .str accessor usable when the column holds
    # missing or non-string values (e.g. after a CSV round trip).
    bucket_labels = classified_df['bucket'].astype(str).str.lower()
    factor_table_pages = classified_df[
        bucket_labels.isin(['factor_table', 'factor table'])
    ].copy()

    if factor_table_pages.empty:
        logger.info("No pages were classified as 'factor_table'. Skipping table extraction.")
        return

    # --- Step 2a: Parse each factor table page individually ---
    all_parsed_tables = []
    for _, row in factor_table_pages.iterrows():
        page_num = row['page_number']
        logger.info(f"Parsing tables on page: {page_num}")
        parsed_data = parse_single_page_tables(openai_client, row['text'])

        # The parsed JSON comes from the model; tolerate a missing/null list.
        page_tables = parsed_data.get("tables") if isinstance(parsed_data, dict) else None
        for table in page_tables or []:
            if not isinstance(table, dict):
                continue
            table_title = table.get("table_title", f"Table_on_Page_{page_num}")
            table_data = table.get("table_data", [])
            if not table_data:
                continue  # Not even a header row to work with.
            try:
                header = table_data[0]
                num_columns = len(header)

                # Make every row exactly as wide as the header: pad short
                # rows with '' and truncate long ones. New lists are built
                # so the parsed data itself is left untouched.
                sanitized_data = [
                    (data_row + [''] * (num_columns - len(data_row)))[:num_columns]
                    for data_row in table_data[1:]
                ]

                table_df = pd.DataFrame(sanitized_data, columns=header)
                all_parsed_tables.append((table_title, table_df))
            except Exception as e:
                logger.error(f"Could not process table '{table_title}' on page {page_num} due to data structure issue: {e}")

    # --- Step 2b: Combine the parsed fragments ---
    final_tables = combine_table_fragments(all_parsed_tables)

    # A workbook needs at least one sheet; saving one without any raises
    # when the writer is closed, so stop here if nothing was parsed.
    if not final_tables:
        logger.warning("No tables could be extracted from the factor table pages. Skipping Excel output.")
        return

    # --- Step 2c: Write the final, combined tables to Excel ---
    with pd.ExcelWriter(output_excel_path, engine='openpyxl') as writer:
        sheet_name_counts = {}
        for title, df in final_tables:
            full_title = f"{company_name} - {title}"
            # Keep only letters, digits, spaces and underscores, and leave
            # room for a numeric suffix within the 31-character limit below.
            safe_sheet_name = "".join(c for c in full_title if c.isalnum() or c in (' ', '_')).rstrip()[:25]

            if safe_sheet_name in sheet_name_counts:
                sheet_name_counts[safe_sheet_name] += 1
                final_sheet_name = f"{safe_sheet_name}_{sheet_name_counts[safe_sheet_name]}"
            else:
                sheet_name_counts[safe_sheet_name] = 0
                final_sheet_name = safe_sheet_name

            final_sheet_name = final_sheet_name[:31]

            df.to_excel(writer, sheet_name=final_sheet_name, index=False)
            logger.info(f"  - Saved table '{full_title}' to sheet '{final_sheet_name}'")

    logger.info(f"Successfully saved extracted tables to '{output_excel_path}'")


In [46]:

# --- Main Script Execution ---
if __name__ == "__main__":
    # --- USER INPUT ---
    input_pdf_path = "./202505 - TX - PRGS-134279210 _ trimmed.pdf"
    # Point this at an existing classification CSV to skip Step 1, or set it
    # to None to always classify the PDF from scratch.
    # Example: pre_classified_csv_path = "./Output/PGR_Ohio_BNIC-134120828/classified_pages.csv"
    pre_classified_csv_path = "/Users/jake/Documents/Python/PDF Parser/Output/202505 - TX - PRGS-134279210 _ trimmed/classified_pages_trimmed.csv" # <--- SET THIS TO A CSV PATH TO SKIP STEP 1

    # --- DYNAMIC OUTPUT PATHS ---
    # Everything is written to ./Output/<pdf name without extension>/.
    base_filename = os.path.basename(input_pdf_path)
    file_name_without_ext = os.path.splitext(base_filename)[0]
    output_dir = os.path.join("./Output", file_name_without_ext)
    os.makedirs(output_dir, exist_ok=True)

    output_csv_path = os.path.join(output_dir, "classified_pages.csv")
    output_excel_path = os.path.join(output_dir, "extracted_factor_tables_pgr_trimmed.xlsx")

    logger.info(f"Input PDF: {input_pdf_path}")
    logger.info(f"Output directory: {output_dir}")

    # --- SCRIPT LOGIC ---
    # Step 1: obtain the classification, either freshly computed or loaded.
    if not (pre_classified_csv_path and os.path.exists(pre_classified_csv_path)):
        if pre_classified_csv_path:
            # A path was given but nothing is there: fall back to a full run.
            logger.warning(f"Pre-classified file not found at '{pre_classified_csv_path}'. Running full classification process.")
        classified_data, company = run_classification_step(input_pdf_path, output_csv_path)
    else:
        logger.info(f"--- Skipping Step 1, Loading pre-classified data from: {pre_classified_csv_path} ---")
        classified_data = pd.read_csv(pre_classified_csv_path)
        # Blank pages come back from the CSV as NaN; restore empty strings.
        classified_data['text'] = classified_data['text'].fillna('')
        company = extract_company_name(classified_data)
        logger.info(f"Extracted Company Name from CSV: {company}")

    # Step 2: extract the factor tables from whichever classification we have.
    if classified_data.empty:
        logger.error("Classification data is empty. Cannot proceed to table extraction.")
    else:
        run_table_extraction_step(classified_data, company, output_excel_path)

    logger.info("\n--- Script finished ---")

2025-08-27 12:16:11,482 | INFO | Input PDF: ./202505 - TX - PRGS-134279210 _ trimmed.pdf
2025-08-27 12:16:11,483 | INFO | Output directory: ./Output/202505 - TX - PRGS-134279210 _ trimmed
2025-08-27 12:16:11,483 | INFO | --- Skipping Step 1, Loading pre-classified data from: /Users/jake/Documents/Python/PDF Parser/Output/202505 - TX - PRGS-134279210 _ trimmed/classified_pages_trimmed.csv ---
2025-08-27 12:16:11,492 | INFO | Extracted Company Name from CSV: Unknown_Company
2025-08-27 12:16:11,493 | INFO | 
--- Starting Step 2: Factor Table Extraction ---
2025-08-27 12:16:11,494 | INFO | Successfully loaded OpenAI API key.
2025-08-27 12:16:11,527 | INFO | Parsing tables on page: 21
2025-08-27 12:16:20,479 | INFO | HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-27 12:16:20,485 | INFO | Parsing tables on page: 22
2025-08-27 12:16:49,815 | INFO | HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-08-27 12:16:49,820 |

    sys_prompt = """
You are an expert insurance regulatory analyst reviewing a state commercial auto insurance rate and rule filing.

Your job is to classify each page into a single best-fitting category ("bucket"). The following buckets are examples of likely categories, but you are allowed to invent and assign a new, appropriate bucket name, if the existing examples do not fit.

BUCKET EXAMPLES (use or invent as needed):

- intro information: Cover letters, summaries, company info, administrative headers.
- table of contents: Tables/indexes listing sections, rules, forms.
- correspondence: Letters, memos, formal or informal communication (including with a regulator).
- rule: Detailed rating rules, eligibility, underwriting guidelines, standard operating instructions.
- factor table: Tabular lists of rating factors—e.g. for zones, territories, drivers, vehicles.
- actuarial support: Mathematical or statistical justification, trend documentation, loss ratios, exhibits.
- form: Complete forms, endorsements, schedules, specimen policy wordings.
- rating example: A worked example showing how premium/rate is calculated.
- exhibit: Graphs, charts, additional annotated attachments or appendices.
- crossed_out: (binary) Use ONLY if the entire page is covered with a line, annotated "withdrawn," or has visible strikrough/crossout. Otherwise, do not use.
- other: Use only if the page fits none of the above and you cannot reasonably propose a more accurate new bucket name.

BUCKET FLEXIBILITY:
- If none of the above buckets are a good fit, make up an appropriate, concise, descriptive bucket name and use it as the "bucket". Do NOT use "llm_new_category" as a category name—use your proposed name directly (e.g. "signature page", "state certification", etc).

CATEGORIZATION INSTRUCTIONS:
- Assign exactly one bucket per page.
- Always provide a 10 word "explanation" of why you selected—or if new, created—this bucket.
- If "crossed_out" is chosen, no substantive explanation is needed—just state "Entire page was striked out or withdrawn."
- Otherwise, explain the dominant content and your reasoning for the bucket chosen in precisely 10 words.

OUTPUT FORMAT (respond with a single valid JSON object only):

{
  "bucket": "<bucket_name>",
  "confidence": <probability 0-1>,
  "explanation": "<10 word explanation of categorization>"
}

If uncertain, favor "other", but prefer to create (with reasoned explanation) a new appropriate bucket when justified.
"""