In [10]:
import os
import time
import PyPDF2
import openai
from tqdm import tqdm

In [11]:
# Take the API key from the OPENAI_API_KEY environment variable (None when it is unset),
# so the secret never has to be typed into the notebook itself.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [12]:
# --- User-tunable settings: edit these to your liking ---
INPUT_PDF_PATH = "./Inputs/PGR_Ohio_BNIC-134120828_trimmed.pdf"  # PDF filing that main() indexes
MAX_PAGES = 3000                     # cap on pages read; default for extract_pdf_pages(max_pages=...)
MAX_TOKENS_PER_PAGE = 1200           # NOTE(review): not referenced by any code visible in this notebook; classify_page truncates by characters (text[:4000]) instead
MODEL = "gpt-3.5-turbo"              # default chat model for classify_page (or gpt-4 if you have access)
OUTPUT_INDEX_PATH = "filing_index.csv"  # CSV file written by main()

# --- Buckets to detect ---
# Category labels offered to the model. SYS_PROMPT below interpolates this list
# directly, so it appears in the prompt as a Python list repr (brackets and quotes).
BUCKETS = [
    "intro information", "correspondence", "rule", "factor table", "actuarial support"
]

# --- System message for GPT ---
# In addition to the buckets above, the prompt permits the fallback label 'other',
# so consumers of the labels should expect values outside BUCKETS.
SYS_PROMPT = f"""
You are an expert insurance regulatory analyst. 
Given a single page of an insurance rate or form filing, classify it strictly into one of the following categories: {BUCKETS}.
Return just the most likely bucket as a lowercase label, e.g. 'factor table', 'intro information', or 'other' if it doesn't fit.

If very unsure, return 'other'.
"""


In [13]:

# --- Helper: PDF split & clean ---
def extract_pdf_pages(pdf_path, max_pages=MAX_PAGES):
    """Return the extracted text of the leading pages of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.
        max_pages: Maximum number of pages to extract, counted from the
            first page. Defaults to MAX_PAGES.

    Returns:
        A list of strings in page order, one per extracted page. A page
        whose extraction yields nothing is represented by "".
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf = PyPDF2.PdfReader(pdf_file)
        page_count = min(len(pdf.pages), max_pages)
        # Extract inside the with-block, while the file handle is still open.
        return [pdf.pages[idx].extract_text() or "" for idx in range(page_count)]


In [14]:
# --- Helper: classify with GPT ---
def classify_page(text, model=MODEL, system_prompt=SYS_PROMPT, temperature=0):
    """Classify one page of filing text into a bucket label via the chat API.

    Args:
        text: Extracted text of a single page. Only the first 4000
            characters are sent to the model.
        model: Chat model name passed to the API.
        system_prompt: System message describing the classification task.
        temperature: Sampling temperature passed to the API.

    Returns:
        The model's label, lowercased and with surrounding whitespace,
        quotes, backticks and periods removed. Returns 'other' without
        calling the API when ``text`` is empty, None, or whitespace only.

    Raises:
        Any exception raised by ``openai.ChatCompletion.create`` propagates
        to the caller unchanged.
    """
    # A page with no extractable text gives the model nothing to classify;
    # skip the API call and use the fallback label named in the system prompt.
    if not text or not text.strip():
        return "other"

    prompt = f'''You will be given the contents of one page from a PDF insurance rate filing. Classify as described.

Page content:
"""
{text[:4000]}
"""'''
    resp = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=16,
    )
    raw_label = resp["choices"][0]["message"]["content"] or ""
    # The system prompt shows its example labels in single quotes, so the model
    # may echo them quoted or with a trailing period; strip that decoration so
    # "'factor table'." and "factor table" end up as the same label.
    return raw_label.strip(" \t\r\n'\"`.").lower()


In [15]:

# === MAIN PIPELINE ===
def index_filing(pdf_path, max_attempts=3, retry_delay=5):
    """Classify every page of a PDF filing and return one record per page.

    Each page is classified with ``classify_page``. A failed call is retried
    up to ``max_attempts`` times in total, pausing after every failure; a page
    whose attempts all fail is recorded with the bucket "error" rather than
    aborting the whole run.

    Args:
        pdf_path: Path of the PDF file to index.
        max_attempts: Total classification attempts per page (at least one
            attempt is always made).
        retry_delay: Base pause in seconds after a failed attempt; the pause
            grows linearly with the attempt number.

    Returns:
        A list of dicts with keys "page_number" (1-based), "bucket", and
        "sample_text" (first 80 characters of the page, newlines replaced
        by spaces).
    """
    pages = extract_pdf_pages(pdf_path)
    attempts = max(1, max_attempts)
    classification = []
    for i, page_text in tqdm(enumerate(pages), total=len(pages)):
        label = "error"  # kept only if every attempt fails
        for attempt in range(1, attempts + 1):
            try:
                label = classify_page(page_text)
                break
            except Exception as e:
                print(f"Page {i+1}: error {e}")
                # Pause after every failure (including the last) so the next
                # request, whether a retry or the following page, is throttled.
                time.sleep(retry_delay * attempt)
        classification.append({"page_number": i+1, "bucket": label, "sample_text": page_text[:80].replace('\n',' ')})
    return classification

In [16]:
# === USAGE ===
def main():
    """Index INPUT_PDF_PATH page by page and save the result to OUTPUT_INDEX_PATH as CSV.

    The CSV has a header row followed by one row per page with the columns
    page_number, bucket and sample_text.
    """
    import csv

    rows = index_filing(INPUT_PDF_PATH)
    columns = ["page_number", "bucket", "sample_text"]
    with open(OUTPUT_INDEX_PATH, "w", newline="", encoding="utf-8") as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(rows)
    print(f"Indexed and saved results to {OUTPUT_INDEX_PATH}")

# Entry point: run the full pipeline when this code is executed directly
# (the guard keeps main() from running if the code is imported as a module).
if __name__ == "__main__":
    main()

KeyboardInterrupt: 