In [None]:
# # Pypdf extraction and OpenAI classification script
# import pandas as pd
# import pypdf
# import os
# import json
# import time
# import logging
# from openai import OpenAI # Use the updated OpenAI library import
# from typing import Dict, Any

# # === LOGGING SETUP ===
# # Sets up basic configuration for logging messages.
# logging.basicConfig(
#     level=logging.INFO,
#     format="%(asctime)s | %(levelname)s | %(message)s"
# )
# logger = logging.getLogger(__name__)

# # === CONFIGURATION ===
# # It's better to manage paths and model names in a central place.
# # The API key is read from a local file for better security.
# API_KEY_PATH = "/Users/jake/Documents/Key/OPENAI_KEY.txt" # <--- ADJUST IF NEEDED
# MODEL = "gpt-4o" # Using the latest model as specified

# def get_openai_client(api_key_path: str) -> OpenAI:
#     """Reads the OpenAI API key from a file and returns an OpenAI client."""
#     try:
#         with open(api_key_path, 'r') as f:
#             api_key = f.read().strip()
#         if not api_key:
#             raise ValueError("API key file is empty.")
#         logger.info("Successfully loaded OpenAI API key.")
#         return OpenAI(api_key=api_key)
#     except FileNotFoundError:
#         logger.error(f"API key file not found at: {api_key_path}")
#         return None
#     except Exception as e:
#         logger.error(f"An error occurred while reading the API key: {e}")
#         return None

# def extract_text_from_pdf(pdf_path: str) -> pd.DataFrame:
#     """
#     Extracts all text from a given PDF file, page by page.
#     """
#     if not os.path.exists(pdf_path):
#         logger.error(f"Error: File not found at {pdf_path}")
#         return pd.DataFrame()

#     logger.info(f"Starting text extraction from '{os.path.basename(pdf_path)}'...")
#     all_pages_data = []

#     try:
#         with open(pdf_path, 'rb') as file:
#             reader = pypdf.PdfReader(file)
#             num_pages = len(reader.pages)
#             logger.info(f"Found {num_pages} pages in the document.")

#             for i, page in enumerate(reader.pages):
#                 page_number = i + 1
#                 text = page.extract_text() or "" # Ensure text is a string
                
#                 all_pages_data.append({
#                     'page_number': page_number,
#                     'text': text.strip()
#                 })
#                 if not text.strip():
#                     logger.info(f"  - No text found on page {page_number}.")

#     except Exception as e:
#         logger.error(f"An error occurred while processing the PDF: {e}")
#         return pd.DataFrame()

#     if all_pages_data:
#         df = pd.DataFrame(all_pages_data)
#         logger.info("Text extraction complete.")
#         return df
#     else:
#         logger.warning("Warning: No text was extracted from the document.")
#         return pd.DataFrame()

# def classify_page_text(client: OpenAI, page_number: int, page_text: str) -> Dict[str, Any]:
#     """
#     Classifies the text of a single page using the OpenAI API.
#     """
#     # The new system prompt with updated instructions and JSON format.
#     # Note: The model is only asked for classification details.
#     # Page number and text are added back into the final dictionary later.
#     sys_prompt = """
# You are an expert insurance regulatory analyst reviewing a state commercial auto insurance rate and rule filing.

# Your job is to classify each page into a single best-fitting category ("bucket"). The following buckets are examples of likely categories, but you are allowed to invent and assign a new, appropriate bucket name, if the existing examples do not fit.

# BUCKET EXAMPLES (use or invent as needed):

# - intro information: Cover letters, summaries, company info, administrative headers.
# - table of contents: Tables/indexes listing sections, rules, forms.
# - correspondence: Letters, memos, formal or informal communication (including with a regulator).
# - rule: Detailed rating rules, eligibility, underwriting guidelines, standard operating instructions.
# - factor table: Tabular lists of rating factors—e.g. for zones, territories, drivers, vehicles.
# - actuarial support: Mathematical or statistical justification, trend documentation, loss ratios, exhibits.
# - form: Complete forms, endorsements, schedules, specimen policy wordings.
# - rating example: A worked example showing how premium/rate is calculated.
# - exhibit: Graphs, charts, additional annotated attachments or appendices.
# - crossed_out: (binary) Use ONLY if the entire page is covered with a line, annotated "withdrawn," or has visible strikethrough/crossout. Otherwise, do not use.
# - other: Use only if the page fits none of the above and you cannot reasonably propose a more accurate new bucket name.

# BUCKET FLEXIBILITY:
# - If none of the above buckets are a good fit, make up an appropriate, concise, descriptive bucket name and use it as the "bucket". Do NOT use "llm_new_category" as a category name—use your proposed name directly (e.g. "signature page", "state certification", etc).

# CATEGORIZATION INSTRUCTIONS:
# - Assign exactly one bucket per page.
# - Always provide a 10 word "explanation" of why you selected—or if new, created—this bucket.
# - If "crossed_out" is chosen, no substantive explanation is needed—just state "Entire page was striked out or withdrawn."
# - Otherwise, explain the dominant content and your reasoning for the bucket chosen in precisely 10 words.

# OUTPUT FORMAT (respond with a single valid JSON object only):

# {
#   "bucket": "<bucket_name>",
#   "confidence": <probability 0-1>,
#   "explanation": "<10 word explanation of categorization>"
# }

# If uncertain, favor "other", but prefer to create (with reasoned explanation) a new appropriate bucket when justified.
# """
#     # Create the full result dictionary here, starting with known values.
#     # This ensures page_number and text are always present, even on error.
#     result_payload = {
#         "page_number": page_number,
#         "bucket": "processing_error",
#         "confidence": 0.0,
#         "explanation": "An error occurred before the API call.",
#         "text": page_text
#     }

#     if not page_text:
#         result_payload.update({
#             "bucket": "other",
#             "confidence": 1.0,
#             "explanation": "Page is blank or contains no extractable text."
#         })
#         return result_payload

#     for attempt in range(3): # Retry logic for transient API errors
#         try:
#             chat_completion = client.chat.completions.create(
#                 messages=[
#                     {"role": "system", "content": sys_prompt},
#                     {"role": "user", "content": page_text[:16000]}, # Increased token limit for gpt-4o
#                 ],
#                 model=MODEL,
#                 response_format={"type": "json_object"},
#                 temperature=0.0,
#             )
#             response_content = chat_completion.choices[0].message.content
#             # Parse the JSON from the model
#             api_result = json.loads(response_content)
#             # Update the payload with the model's response
#             result_payload.update(api_result)
#             return result_payload
#         except Exception as e:
#             logger.warning(f"API call failed on attempt {attempt + 1} for page {page_number}: {e}. Retrying in {2 ** attempt}s...")
#             time.sleep(2 ** attempt)

#     logger.error(f"API call failed after multiple retries for page {page_number}.")
#     result_payload.update({
#         "bucket": "api_error",
#         "explanation": "API call failed after multiple retries."
#     })
#     return result_payload

# # --- Main Script Execution ---
# if __name__ == "__main__":
#     # --- USER INPUT ---
#     input_pdf_path = "./Inputs/PGR_Ohio_BNIC-134120828_trimmed.pdf"
#     output_csv_path = "./Output/classified_pdf_text_aiv2.csv"

#     # --- SCRIPT LOGIC ---
#     # 1. Initialize OpenAI Client
#     openai_client = get_openai_client(API_KEY_PATH)
    
#     if openai_client:
#         # 2. Extract text from PDF
#         pdf_dataframe = extract_text_from_pdf(input_pdf_path)

#         if not pdf_dataframe.empty:
#             # 3. Classify each page
#             logger.info("Starting page classification process...")
            
#             results = []
#             total_pages = len(pdf_dataframe)
#             for index, row in pdf_dataframe.iterrows():
#                 logger.info(f"Classifying page {row['page_number']}/{total_pages}...")
#                 # Pass page number and text to the classification function
#                 result = classify_page_text(
#                     client=openai_client, 
#                     page_number=row['page_number'], 
#                     page_text=row['text']
#                 )
#                 results.append(result)

#             # 4. Create the final DataFrame from the list of result dictionaries
#             final_df = pd.DataFrame(results)

#             # 5. Save the final results to CSV
#             try:
#                 # Reorder columns for clarity in the output CSV
#                 column_order = ['page_number', 'bucket', 'confidence', 'explanation', 'text']
#                 final_df = final_df[column_order]
                
#                 final_df.to_csv(output_csv_path, index=False, encoding='utf-8')
#                 logger.info(f"\nSuccessfully saved classified text to '{output_csv_path}'")
                
#                 logger.info("\n--- Sample of Final Data (text column omitted for brevity) ---")
#                 print(final_df.drop(columns=['text']).head().to_string())

#             except Exception as e:
#                 logger.error(f"\nAn error occurred while saving the CSV file: {e}")

In [24]:
# Pypdf extraction and OpenAI classification script
import json
import logging
import os
import re
import time
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd
import pypdf
from openai import OpenAI # Use the updated OpenAI library import

# === LOGGING SETUP ===
# Configure logging once for the whole script: INFO level and
# "timestamp | level | message" formatted lines.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)
# Module-level logger used by every function defined below.
logger = logging.getLogger(__name__)

# === CONFIGURATION ===
# Paths, model name and size limits live here so they can be changed in one place.
# The API key is read from a local file instead of being hard-coded in the script.
API_KEY_PATH = "/Users/jake/Documents/Key/OPENAI_KEY.txt" # <--- ADJUST IF NEEDED
MODEL = "gpt-4o" # Chat model used for both page classification and table extraction
# Maximum number of characters of page text combined into one table-extraction request.
# The prompt and the model's response also consume tokens, so this stays well below
# the model's context window. 100,000 chars is a starting point, estimated by the
# original author at roughly 25k tokens.
MAX_CHARS_PER_GROUP = 100000 


def get_openai_client(api_key_path: str) -> Optional[OpenAI]:
    """Build an OpenAI client from an API key stored in a text file.

    Args:
        api_key_path: Path to a file whose content is the API key.
            Surrounding whitespace is stripped.

    Returns:
        A configured ``OpenAI`` client, or ``None`` when the key file is
        missing, empty or unreadable. Failures are logged rather than
        raised, so callers must check the result for ``None``.
    """
    try:
        # 'utf-8-sig' is explicit about the encoding and also drops a leading
        # byte-order mark, which would otherwise become part of the key.
        with open(api_key_path, 'r', encoding='utf-8-sig') as f:
            api_key = f.read().strip()
        if not api_key:
            # Raised on purpose: the generic handler below logs it and returns None.
            raise ValueError("API key file is empty.")
        logger.info("Successfully loaded OpenAI API key.")
        return OpenAI(api_key=api_key)
    except FileNotFoundError:
        logger.error(f"API key file not found at: {api_key_path}")
        return None
    except Exception as e:
        logger.error(f"An error occurred while reading the API key: {e}")
        return None

def extract_text_from_pdf(pdf_path: str) -> pd.DataFrame:
    """
    Read a PDF and return its text, one row per page.

    The result has the columns 'page_number' (1-based) and 'text'
    (whitespace-stripped, empty string when a page yields no text).
    An empty DataFrame is returned when the file does not exist, when
    reading it raises, or when the document has no pages.
    """
    if not os.path.exists(pdf_path):
        logger.error(f"Error: File not found at {pdf_path}")
        return pd.DataFrame()

    logger.info(f"Starting text extraction from '{os.path.basename(pdf_path)}'...")
    page_records = []

    try:
        with open(pdf_path, 'rb') as pdf_file:
            pages = pypdf.PdfReader(pdf_file).pages
            logger.info(f"Found {len(pages)} pages in the document.")

            for page_number, page in enumerate(pages, start=1):
                # extract_text() may return None; normalise to a stripped string.
                cleaned_text = (page.extract_text() or "").strip()
                page_records.append({
                    'page_number': page_number,
                    'text': cleaned_text
                })
                if not cleaned_text:
                    logger.info(f"  - No text found on page {page_number}.")

    except Exception as e:
        # Any failure while reading discards the pages collected so far.
        logger.error(f"An error occurred while processing the PDF: {e}")
        return pd.DataFrame()

    if not page_records:
        logger.warning("Warning: No text was extracted from the document.")
        return pd.DataFrame()

    df = pd.DataFrame(page_records)
    logger.info("Text extraction complete.")
    return df

def extract_company_name(df: pd.DataFrame) -> str:
    """
    Extracts the company name from the first few pages of the document.

    Args:
        df: Page-level DataFrame with a 'text' column.

    Returns:
        The text following "Filing Company:" on the first of the first three
        pages where it appears, trimmed at a run of two or more whitespace
        characters or at "Project Name". Returns "Unknown_Company" when the
        frame is empty or no name is found.
    """
    if df.empty:
        return "Unknown_Company"

    # Check the first 3 pages for the company name for robustness
    for page_text in df['text'].iloc[:3]:
        # Blank pages come back as NaN (a float) when the frame was loaded
        # from CSV; skip anything that is not text instead of crashing.
        if not isinstance(page_text, str):
            continue
        # Find "Filing Company:" and capture the rest of that line
        match = re.search(r"Filing Company:\s*(.*)", page_text, re.IGNORECASE)
        if not match:
            continue
        company_name = match.group(1).strip()
        # Cut off any trailing text such as "Project Name/Number"
        company_name = re.split(r'\s{2,}|Project Name', company_name)[0].strip()
        if company_name:
            return company_name

    logger.warning("Could not find company name. Defaulting to 'Unknown_Company'.")
    return "Unknown_Company"


def classify_page_text(client: OpenAI, page_number: int, page_text: str) -> Dict[str, Any]:
    """
    Classifies the text of a single page using rules and the OpenAI API.

    Two local rules run first and never call the API: an empty page is
    bucketed as "other", and a page containing "redline" (any case) is
    bucketed as "redline". Everything else is sent to the model, with up to
    three attempts and exponential backoff between them.

    Args:
        client: OpenAI client used for the chat completion call.
        page_number: Page number, copied unchanged into the result.
        page_text: Extracted page text; only the first 16000 characters are
            sent to the model.

    Returns:
        A dict with the keys 'page_number', 'bucket', 'confidence',
        'explanation' and 'text'. 'bucket' is "api_error" when every attempt
        failed, and stays "processing_error" if the model omitted it.
    """
    # Create the base payload. This ensures all keys are present, even on error.
    result_payload = {
        "page_number": page_number,
        "bucket": "processing_error",
        "confidence": 0.0,
        "explanation": "An error occurred during processing.",
        "text": page_text
    }

    # RULE #1: Handle blank pages first.
    if not page_text:
        result_payload.update({
            "bucket": "other",
            "confidence": 1.0,
            "explanation": "Page is blank or contains no extractable text."
        })
        return result_payload

    # RULE #2: Handle "redline" pages before calling the API.
    if 'redline' in page_text.lower():
        result_payload.update({
            "bucket": "redline",
            "confidence": 1.0,
            "explanation": "Page contains 'redline' text, indicating document revisions."
        })
        return result_payload

    # If no rules match, proceed with API call.
    # The prompt must contain the word "JSON": the API requires it when
    # response_format is "json_object".
    sys_prompt = """
You are an expert insurance regulatory analyst reviewing a state commercial auto insurance rate and rule filing.

Your job is to classify each page into a single best-fitting category ("bucket"). The following buckets are examples of likely categories, but you are allowed to invent and assign a new, appropriate bucket name, if the existing examples do not fit.

BUCKET EXAMPLES (use or invent as needed):

- intro information: Cover letters, summaries, company info, administrative headers.
- table of contents: Tables/indexes listing sections, rules, forms.
- correspondence: Letters, memos, formal or informal communication (including with a regulator).
- rule: Detailed rating rules, eligibility, underwriting guidelines, standard operating instructions.
- factor table: Tabular lists of rating factors—e.g. for zones, territories, drivers, vehicles.
- actuarial support: Mathematical or statistical justification, trend documentation, loss ratios, exhibits.
- form: Complete forms, endorsements, schedules, specimen policy wordings.
- rating example: A worked example showing how premium/rate is calculated.
- exhibit: Graphs, charts, additional annotated attachments or appendices.
- crossed_out: (binary) Use ONLY if the entire page is covered with a line, annotated "withdrawn," or has visible strikethrough/crossout. Otherwise, do not use.
- other: Use only if the page fits none of the above and you cannot reasonably propose a more accurate new bucket name.

BUCKET FLEXIBILITY:
- If none of the above buckets are a good fit, make up an appropriate, concise, descriptive bucket name and use it as the "bucket". Do NOT use "llm_new_category" as a category name—use your proposed name directly (e.g. "signature page", "state certification", etc).

CATEGORIZATION INSTRUCTIONS:
- Assign exactly one bucket per page.
- Always provide a 10 word "explanation" of why you selected—or if new, created—this bucket.
- If "crossed_out" is chosen, no substantive explanation is needed—just state "Entire page was striked out or withdrawn."
- Otherwise, explain the dominant content and your reasoning for the bucket chosen in precisely 10 words.

OUTPUT FORMAT (respond with a single valid JSON object only):

{
  "bucket": "<bucket_name>",
  "confidence": <probability 0-1>,
  "explanation": "<10 word explanation of categorization>"
}

If uncertain, favor "other", but prefer to create (with reasoned explanation) a new appropriate bucket when justified.
"""
    # Only these keys are accepted from the model, so a response can never
    # overwrite 'page_number' / 'text' or add unexpected columns.
    model_keys = ("bucket", "confidence", "explanation")
    max_attempts = 3

    for attempt in range(max_attempts):
        try:
            chat_completion = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": sys_prompt},
                    {"role": "user", "content": page_text[:16000]},
                ],
                model=MODEL,
                response_format={"type": "json_object"},
                temperature=0.0,
            )
            response_content = chat_completion.choices[0].message.content
            api_result = json.loads(response_content)
            if not isinstance(api_result, dict):
                raise ValueError("Model response is not a JSON object.")
            result_payload.update(
                {key: api_result[key] for key in model_keys if key in api_result}
            )
            return result_payload
        except Exception as e:
            if attempt + 1 < max_attempts:
                delay = 2 ** attempt
                logger.warning(f"API call failed on attempt {attempt + 1} for page {page_number}: {e}. Retrying in {delay}s...")
                time.sleep(delay)
            else:
                # Last attempt: there is nothing left to retry, so do not sleep.
                logger.warning(f"API call failed on attempt {attempt + 1} for page {page_number}: {e}.")

    logger.error(f"API call failed after multiple retries for page {page_number}.")
    result_payload.update({
        "bucket": "api_error",
        "explanation": "API call failed after multiple retries."
    })
    return result_payload

def extract_and_structure_table(client: OpenAI, table_text: str) -> Dict[str, Any]:
    """
    Ask the model to turn raw page text into titled, structured tables.

    The text may hold several tables and may span pages separated by
    "--- NEW PAGE BREAK ---". The parsed JSON response is returned as-is.
    If the request or the JSON parsing fails, the error is logged and a
    dict with one "Extraction Error" table describing the failure is
    returned instead.
    """
    extraction_prompt = """
You are an expert data extraction assistant. Your task is to analyze the provided text, which may contain multiple, separate factor tables from an insurance filing, potentially spanning page breaks.

INSTRUCTIONS:
1.  **Identify ALL Distinct Tables**: Scrutinize the entire text to identify every individual table. A new table is often indicated by a new title or header row (e.g., "Table 2-1...").
2.  **Handle Multi-Page Tables**: A table might start on one page and continue on the next. A page break is indicated by "--- NEW PAGE BREAK ---". If a page starts with data rows and no new header, it is a continuation of the previous table. You must intelligently stitch these parts together into a single table.
3.  **Extract Titles**: For each distinct table you identify, find its title. If a table has no clear title, create a concise, descriptive one based on its content.
4.  **Recreate Each Table**: For each distinct table, parse the text to reconstruct its data, including its specific headers and all corresponding data rows.
5.  **Return a List of JSON Objects**: Your output must be a single, valid JSON object that contains a list of all the tables you found. Each table in the list should have the following structure:
    {
      "table_title": "<The title you identified or created for this specific table>",
      "table_data": [
        ["Header 1", "Header 2", "Header 3"],
        ["Row 1 Col 1", "Row 1 Col 2", "Row 1 Col 3"],
        ["Row 2 Col 1", "Row 2 Col 2", "Row 2 Col 3"]
      ]
    }

EXAMPLE OUTPUT for text containing two separate tables:
{
  "tables": [
    {
      "table_title": "Table 1-2. Average Driver Experience Score",
      "table_data": [
        ["Greater than", "And less than or equal to", "Factor"],
        ["0", "0.986", "0.989"],
        ["0.986", "0.998", "1.000"]
      ]
    },
    {
      "table_title": "Table 2-1. Number of Super & Major Violations",
      "table_data": [
        ["Number of Super & Major Violations", "Factor"],
        ["0", "1.000"],
        ["1", "1.354"]
      ]
    }
  ]
}
"""
    conversation = [
        {"role": "system", "content": extraction_prompt},
        {"role": "user", "content": table_text},
    ]

    try:
        completion = client.chat.completions.create(
            model=MODEL,
            messages=conversation,
            temperature=0.0,
            response_format={"type": "json_object"},
        )
        return json.loads(completion.choices[0].message.content)
    except Exception as e:
        logger.error(f"An error occurred during table extraction: {e}")
        # Same shape as a normal response, so callers need no special case.
        failure_table = {
            "table_title": "Extraction Error",
            "table_data": [[f"An error occurred: {e}"]]
        }
        return {"tables": [failure_table]}

def _normalize_table_rows(table_data: Any):
    """Return ``(header, rows)`` padded to one common width, or ``None`` if unusable."""
    if not isinstance(table_data, list) or len(table_data) < 2:
        return None
    if not all(isinstance(row, (list, tuple)) for row in table_data):
        return None

    width = max(len(row) for row in table_data)
    if width == 0:
        return None

    # Name any columns the header row is missing, and pad short data rows,
    # so the DataFrame constructor never sees a header/data width mismatch.
    header = list(table_data[0])
    header += [f"Column_{k + 1}" for k in range(len(header), width)]
    rows = [list(row) + [None] * (width - len(row)) for row in table_data[1:]]
    return header, rows


def process_table_group(client: OpenAI, df: pd.DataFrame, group_indices: List[int], writer: pd.ExcelWriter, company_name: str, sheet_name_counts: dict):
    """
    Processes a single group of consecutive factor table pages, expecting multiple tables.

    The pages' text is joined with a page-break marker, sent to
    ``extract_and_structure_table``, and every usable table in the response
    is written to its own sheet of ``writer``.

    Args:
        client: OpenAI client passed through to the extraction call.
        df: Classified pages; the 'page_number' and 'text' columns are read.
        group_indices: Index labels of ``df`` that make up this group.
        writer: Open Excel writer that receives one sheet per table.
        company_name: Prefix used when building sheet titles.
        sheet_name_counts: Dict shared across calls and mutated in place; it
            counts how often each base sheet name was used so that repeats
            get a numeric suffix.
    """
    page_numbers = df.loc[group_indices, 'page_number'].tolist()
    logger.info(f"Processing a potential table group spanning pages: {page_numbers}")

    # Coerce to strings so a NaN page (blank page loaded from CSV) cannot break join().
    page_texts = df.loc[group_indices, 'text'].fillna('').astype(str)
    combined_text = "\n--- NEW PAGE BREAK ---\n".join(page_texts)

    structured_data = extract_and_structure_table(client, combined_text)

    # The model's JSON is untrusted: tolerate a missing or malformed "tables" entry.
    tables = structured_data.get("tables", []) if isinstance(structured_data, dict) else []
    if not isinstance(tables, list):
        logger.warning("  - Model response did not contain a list of tables; nothing to save for this group.")
        tables = []

    for i, table in enumerate(tables):
        if not isinstance(table, dict):
            logger.warning(f"  - Skipping malformed table entry #{i + 1} (not a JSON object).")
            continue

        table_title = table.get("table_title", f"Untitled_Table_{i+1}")
        normalized = _normalize_table_rows(table.get("table_data", []))
        if normalized is None:
            logger.warning(f"  - Skipping empty or invalid table data for title: '{table_title}'")
            continue
        header, rows = normalized

        full_title = f"{company_name} - {table_title}"
        # Keep only letters, digits, spaces and underscores, then cut to 25
        # characters, leaving room for a "_<n>" suffix within the final
        # 31-character cut applied below.
        # NOTE(review): the company name comes first, so a company name of 25+
        # characters fills the whole sheet name and sheets are then told apart
        # only by the numeric suffix. Confirm whether that is intended.
        safe_sheet_name = "".join(c for c in full_title if c.isalnum() or c in (' ', '_')).rstrip()[:25]

        if safe_sheet_name in sheet_name_counts:
            sheet_name_counts[safe_sheet_name] += 1
            final_sheet_name = f"{safe_sheet_name}_{sheet_name_counts[safe_sheet_name]}"
        else:
            sheet_name_counts[safe_sheet_name] = 0
            final_sheet_name = safe_sheet_name

        final_sheet_name = final_sheet_name[:31]

        table_df = pd.DataFrame(rows, columns=header)
        table_df.to_excel(writer, sheet_name=final_sheet_name, index=False)
        logger.info(f"  - Saved table '{full_title}' to sheet '{final_sheet_name}'")

# === STEP 1: PDF Parsing and Page Classification ===
def run_classification_step(input_pdf_path: str, output_csv_path: str) -> Tuple[pd.DataFrame, str]:
    """
    Extracts text from a PDF, classifies each page, and saves the results to a CSV.

    Args:
        input_pdf_path: Path of the PDF to process.
        output_csv_path: Where the classification CSV is written. The parent
            folder is created if it does not exist.

    Returns:
        ``(classified_df, company_name)``. The frame has one row per page as
        returned by ``classify_page_text``. When the OpenAI client cannot be
        created or no text can be extracted, an empty frame and
        "Unknown_Company" are returned. A failure while writing the CSV is
        logged and does not affect the returned values.
    """
    logger.info("--- Starting Step 1: PDF Parsing and Page Classification ---")
    openai_client = get_openai_client(API_KEY_PATH)
    if not openai_client:
        return pd.DataFrame(), "Unknown_Company"

    pdf_dataframe = extract_text_from_pdf(input_pdf_path)
    if pdf_dataframe.empty:
        return pd.DataFrame(), "Unknown_Company"

    company_name = extract_company_name(pdf_dataframe)
    logger.info(f"Extracted Company Name: {company_name}")

    results = []
    total_pages = len(pdf_dataframe)
    for _, row in pdf_dataframe.iterrows():
        logger.info(f"Classifying page {row['page_number']}/{total_pages}...")
        result = classify_page_text(
            client=openai_client,
            page_number=row['page_number'],
            page_text=row['text']
        )
        results.append(result)

    final_df = pd.DataFrame(results)

    try:
        # Create the target folder first so results of the API calls above are
        # not lost to a missing directory.
        output_dir = os.path.dirname(output_csv_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        column_order = ['page_number', 'bucket', 'confidence', 'explanation', 'text']
        final_df_ordered = final_df.reindex(columns=column_order)
        final_df_ordered.to_csv(output_csv_path, index=False, encoding='utf-8')
        logger.info(f"\nSuccessfully saved classification results to '{output_csv_path}'")
        logger.info("\n--- Sample of Classification Data (text column omitted for brevity) ---")
        print(final_df_ordered.drop(columns=['text']).head().to_string())
    except Exception as e:
        logger.error(f"\nAn error occurred while saving the CSV file: {e}")

    return final_df, company_name

# === STEP 2: Table Extraction and Structuring ===
def run_table_extraction_step(classified_df: pd.DataFrame, company_name: str, output_excel_path: str) -> None:
    """
    Processes the classified data to find and extract factor tables into an Excel file.

    Pages whose bucket is "factor table" or "factor_table" (any case) are
    grouped into runs of consecutive index labels. A run is also split when
    adding the next page would push its combined text length over
    ``MAX_CHARS_PER_GROUP``. Each group is handed to ``process_table_group``.

    Args:
        classified_df: Classification results with 'bucket', 'page_number'
            and 'text' columns.
        company_name: Prefix for the generated sheet titles.
        output_excel_path: Path of the Excel workbook to create.
    """
    logger.info("\n--- Starting Step 2: Factor Table Extraction ---")
    if classified_df.empty:
        logger.warning("Classification DataFrame is empty. Skipping table extraction.")
        return

    openai_client = get_openai_client(API_KEY_PATH)
    if not openai_client:
        return

    # astype(str) keeps the .str accessor usable even if the column holds
    # non-string values (for example NaN after a CSV round trip).
    bucket_names = classified_df['bucket'].astype(str).str.lower()
    factor_table_pages = classified_df[bucket_names.isin(['factor_table', 'factor table'])]

    if factor_table_pages.empty:
        logger.info("No pages were classified as 'factor_table'. Skipping table extraction.")
        return

    with pd.ExcelWriter(output_excel_path, engine='openpyxl') as writer:
        sheet_name_counts = {}
        current_group = []
        current_chars = 0

        for index in factor_table_pages.index.tolist():
            page_text = classified_df.loc[index, 'text']
            # Non-text cells (NaN) count as empty rather than raising in len().
            page_text_len = len(page_text) if isinstance(page_text, str) else 0

            # Close the current group when this page is not consecutive with
            # the previous one, or when adding it would exceed the char limit.
            not_consecutive = bool(current_group) and index != current_group[-1] + 1
            over_limit = bool(current_group) and (current_chars + page_text_len) > MAX_CHARS_PER_GROUP
            if not_consecutive or over_limit:
                process_table_group(openai_client, classified_df, current_group, writer, company_name, sheet_name_counts)
                current_group = []
                current_chars = 0

            current_group.append(index)
            current_chars += page_text_len

        # Process the final remaining group
        if current_group:
            process_table_group(openai_client, classified_df, current_group, writer, company_name, sheet_name_counts)

        sheets_written = len(writer.sheets)
        if sheets_written == 0:
            # A workbook without any sheet cannot be saved by openpyxl, so the
            # writer would raise on close. Write a placeholder sheet instead.
            pd.DataFrame(
                {"message": ["No factor tables could be extracted."]}
            ).to_excel(writer, sheet_name="No_Tables", index=False)

    if sheets_written:
        logger.info(f"Successfully saved extracted tables to '{output_excel_path}'")
    else:
        logger.warning(f"No tables could be extracted; wrote a placeholder sheet to '{output_excel_path}'")



In [None]:
# --- Main Script Execution ---
if __name__ == "__main__":
    # --- USER INPUT ---
    input_pdf_path = "./202505 - TX - PRGS-134279210 _ trimmed.pdf"
    # Path of an existing classification CSV. When it is set and the file
    # exists, Step 1 is skipped and the CSV is loaded instead. Set to None to
    # always run the classification.
    # Example: pre_classified_csv_path = "./Output/PGR_Ohio_BNIC-134120828/classified_pages.csv"
    pre_classified_csv_path = "/Users/jake/Documents/Python/PDF Parser/Output/202505 - TX - PRGS-134279210 _ trimmed/classified_pages.csv" # <--- SET THIS TO A CSV PATH TO SKIP STEP 1

    # --- DYNAMIC OUTPUT PATHS ---
    # Each input PDF gets its own subfolder under ./Output, named after the
    # PDF file without its extension.
    base_filename = os.path.basename(input_pdf_path)
    file_name_without_ext = os.path.splitext(base_filename)[0]
    output_dir = os.path.join("./Output", file_name_without_ext)
    os.makedirs(output_dir, exist_ok=True)

    output_csv_path = os.path.join(output_dir, "classified_pages.csv")
    output_excel_path = os.path.join(output_dir, "extracted_factor_tables.xlsx")

    logger.info(f"Input PDF: {input_pdf_path}")
    logger.info(f"Output directory: {output_dir}")

    # --- SCRIPT LOGIC ---
    # Step 1: obtain the classified pages, either freshly or from the CSV.
    reuse_classification = bool(pre_classified_csv_path) and os.path.exists(pre_classified_csv_path)

    if not reuse_classification:
        if pre_classified_csv_path:
            # A path was given but nothing is there: fall back to the full run.
            logger.warning(f"Pre-classified file not found at '{pre_classified_csv_path}'. Running full classification process.")
        classified_data, company = run_classification_step(input_pdf_path, output_csv_path)
    else:
        logger.info(f"--- Skipping Step 1, Loading pre-classified data from: {pre_classified_csv_path} ---")
        classified_data = pd.read_csv(pre_classified_csv_path)
        # Blank pages are read back from CSV as NaN; turn them into empty strings.
        classified_data['text'] = classified_data['text'].fillna('')
        company = extract_company_name(classified_data)
        logger.info(f"Extracted Company Name from CSV: {company}")

    # Step 2: extract the factor tables from whichever classification was obtained.
    if classified_data.empty:
        logger.error("Classification data is empty. Cannot proceed to table extraction.")
    else:
        run_table_extraction_step(classified_data, company, output_excel_path)

    logger.info("\n--- Script finished ---")


2025-08-27 11:00:53,175 | INFO | Input PDF: ./202505 - TX - PRGS-134279210 _ trimmed.pdf
2025-08-27 11:00:53,175 | INFO | Output directory: ./Output/202505 - TX - PRGS-134279210 _ trimmed
2025-08-27 11:00:53,176 | INFO | --- Skipping Step 1, Loading pre-classified data from: /Users/jake/Documents/Python/PDF Parser/Output/202505 - TX - PRGS-134279210 _ trimmed/classified_pages.csv ---
2025-08-27 11:00:53,265 | INFO | Extracted Company Name from CSV: Progressive County Mutual Insurance Company
2025-08-27 11:00:53,265 | INFO | 
--- Starting Step 2: Factor Table Extraction ---
2025-08-27 11:00:53,266 | INFO | Successfully loaded OpenAI API key.
2025-08-27 11:00:53,309 | INFO | Processing a potential table group spanning pages: [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38]
