In [None]:
import os
# Read the OpenAI key from a local text file and export it as OPENAI_API_KEY
# so the key itself never appears in the notebook.
# NOTE(review): the path is absolute and machine-specific.
with open('/Users/jake/Documents/Key/OPENAI_KEY.txt', 'r') as f:
    # strip() drops surrounding whitespace such as a trailing newline in the key file.
    os.environ["OPENAI_API_KEY"] = f.read().strip()

import PyPDF2
from openai import OpenAI


# Set your API key
#OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # or paste your key as a string


In [None]:
# --- EDIT FOR TESTING ---
TEST_PDF_PATH = "./Inputs/PGR_Ohio_BNIC-134120828_trimmed.pdf"
TEST_PAGE_NUMBERS = [31, 41, 51]  # 1-based
MODEL = "gpt-3.5-turbo"

# "other" is listed explicitly because SYS_PROMPT tells the model to fall back
# to it; without it the prompt contradicts itself ("strictly one of ..." vs
# "choose 'other'").
BUCKETS = ["intro information", "correspondence", "rule", "factor table", "actuarial support", "other"]

# f-string: {BUCKETS} is interpolated as the Python list repr, and the doubled
# braces render as the literal JSON braces of the response format.
SYS_PROMPT = f"""
You are an expert insurance regulatory analyst.
Given a single page of an insurance rate or form filing, classify it strictly into one of the following categories: {BUCKETS}.
Return just the most likely bucket (as a lowercase label) and a confidence score (0 to 1, 1 being highest confidence).
If unsure, choose 'other' as the label.

RESPONSE FORMAT (json, on one line): {{"bucket": "...", "confidence": 0.95}}
"""


In [None]:

def extract_single_page(pdf_path, page_number):
    """Return the extracted text of a single page, counting pages from 1.

    Raises ValueError when ``page_number`` falls outside the document. A page
    with no extractable text yields an empty string.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf = PyPDF2.PdfReader(pdf_file)
        page_count = len(pdf.pages)
        if page_number < 1 or page_number > page_count:
            raise ValueError(f"Page number {page_number} out of bounds. Document has {page_count} pages.")
        # The reader is 0-based, the caller's page number is 1-based.
        extracted = pdf.pages[page_number - 1].extract_text()
        return extracted if extracted else ""

def classify_page_with_confidence(text, client, model=MODEL,
                                  system_prompt=SYS_PROMPT
                                  , temperature=0):
    """Ask the chat model to label one page and parse its verdict.

    Args:
        text: Extracted page text; only the first 4000 characters are sent.
        client: Object exposing ``chat.completions.create`` (an OpenAI client).
        model: Chat model name passed to the API.
        system_prompt: System message describing the buckets and reply format.
        temperature: Sampling temperature passed to the API.

    Returns:
        ``(result, raw_result)``. ``result`` is always a dict with ``bucket``
        and ``confidence`` keys when the fallback parser is used, or the JSON
        object the model returned; ``raw_result`` is the stripped model reply.
        When nothing can be parsed the bucket is ``"parse_error"`` and the
        confidence defaults to 0.5.

    Raises:
        Whatever ``client.chat.completions.create`` raises; API errors are not
        handled here.
    """
    import json
    import re

    prompt = f"""Page content:
\"\"\"
{text[:4000]}
\"\"\"

Label the above page strictly. Remember: RESPONSE FORMAT (json, one line): {{"bucket": "...", "confidence": <float>}}
"""
    chat_response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        temperature=temperature,
        max_tokens=64
    )
    # message.content may be None; treat that as an empty reply instead of crashing.
    raw_result = (chat_response.choices[0].message.content or "").strip()

    try:
        candidate = json.loads(raw_result)
    except ValueError:
        candidate = None

    if isinstance(candidate, dict):
        result = candidate
    else:
        # Either invalid JSON or valid JSON that is not an object (a bare
        # string, list, number): callers use result.get(...), so fall back to
        # scraping the two fields out of the raw text.
        label = re.search(r'"bucket"\s*:\s*"([^"]+)"', raw_result)
        conf = re.search(r'"confidence"\s*:\s*([0-9.]+)', raw_result)
        try:
            confidence = float(conf.group(1)) if conf else 0.5
        except ValueError:
            # The pattern also matches strings such as "." or "1.2.3".
            confidence = 0.5
        result = {"bucket": label.group(1) if label else "parse_error", "confidence": confidence}
    return result, raw_result


In [None]:
def test_main():
    """Classify each page in TEST_PAGE_NUMBERS and print the outcome.

    For every page this prints an 800-character preview, the parsed bucket and
    confidence, and the raw model reply. Errors on one page are printed and do
    not stop the remaining pages.
    """
    # The first cell exports the key as the OPENAI_API_KEY environment
    # variable; no Python variable of that name exists at this point, so the
    # key is read from the environment instead.
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    for page_no in TEST_PAGE_NUMBERS:
        print("\n===============================")
        print(f"Processing page {page_no}...")
        try:
            page_text = extract_single_page(TEST_PDF_PATH, page_no)
            print(f"\n--- Page {page_no}: Preview ---\n")
            print(page_text[:800].replace("\n", " ") + ("..." if len(page_text) > 800 else ""))
            print("\n--- Sending to GPT for classification... ---\n")
            result, raw_model_output = classify_page_with_confidence(page_text, client)
            print("--- Model output ---")
            print(f"Bucket: {result.get('bucket')}")
            print(f"Confidence: {result.get('confidence')}")
            print("Full model reply (for debugging):")
            print(raw_model_output)
        except Exception as e:
            # Best-effort smoke test: report and continue with the next page.
            print(f"Error on page {page_no}: {e}")

if __name__ == "__main__":
    test_main()


In [None]:
import os
import csv
import PyPDF2
import logging
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time

# === LOGGING SETUP ===
# Log to both a file and the notebook output. mode='w' means the log file is
# opened for writing (not appending) when the handler is created.
# NOTE(review): logging.basicConfig is documented to do nothing when the root
# logger already has handlers, so re-running this cell in a live kernel
# presumably leaves the earlier configuration in place - confirm.
LOG_FILENAME = "auto_label_pages.log"
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    handlers=[
        logging.FileHandler(LOG_FILENAME, mode='w', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
# Root logger: every function in this cell logs through it.
logger = logging.getLogger()

# === CONFIGURATION ===
# NOTE(review): API_KEY_PATH is an absolute, machine-specific path.
PDF_PATH = "./Inputs/PGR_Ohio_BNIC-134120828.pdf"
API_KEY_PATH = "/Users/jake/Documents/Key/OPENAI_KEY.txt"
OUTPUT_CSV = "labeled_pages.csv"
MODEL = "gpt-4"
N_PARALLEL = 2           # Lower value to avoid OpenAI rate limits
# Pages whose stripped text is shorter than this are labeled "other" without an API call.
MIN_TEXT_LENGTH = 20
PAGE_TEXT_SLICE = 1500   # Shorter prompt for stability

# NOTE(review): this list is not interpolated into SYS_PROMPT; the prompt
# restates the buckets by hand, so the two have to be kept in sync manually.
BUCKETS = [
    "intro information", "table of contents", "correspondence", "rule", "factor table",
    "actuarial support", "form", "rating example", "exhibit", "crossed_out", "other", "llm_new_category"
]

# System message sent with every classification request. It is an f-string
# only so that the doubled braces render as literal JSON braces; no variable
# is interpolated.
# NOTE(review): the prompt asks for a 10-word description when
# "llm_new_category" is used, but the example reply format has no field for it.
SYS_PROMPT = f"""
You are an expert insurance regulatory analyst reviewing a state commercial auto insurance rate and rule filing.
Your job is to classify each page into one of several specific "buckets" using explicit criteria and examples below.
If the page is blank or contains no meaningful text, select "other". If the page has been fully crossed/striked out, select "crossed_out". If none fit, invent a new label as "llm_new_category" with a 10-word description.
BUCKETS:
1. "intro information": Cover letters, summaries, company info.
2. "table of contents": Index/table of rules or forms.
3. "correspondence": Letters, memos, state DOI communication.
4. "rule": Rating rules/guidelines/policies.
5. "factor table": Rate/rating factor tables.
6. "actuarial support": Math, trends, loss ratios, actuarial exhibits.
7. "form": Forms/endorsements/policy wording.
8. "rating example": Sample calculations, worked examples.
9. "exhibit": Charts, graphs, maps, attachments.
10. "crossed_out": Page is withdrawn, crossed out or all strikethrough.
11. "other": Clearly none of the above.
12. "llm_new_category": If new type, add a 10-word explanation.
If uncertain, choose "other". Only one bucket per page. Reply in JSON: {{"bucket": "rule", "confidence": 0.93}}
"""

# --- Util ---
def extract_pdf_pages(pdf_path):
    """Return the extracted text of every page as a list of strings.

    Pages with no extractable text are represented by an empty string, so the
    list index always equals the 0-based page index.
    """
    page_texts = []
    with open(pdf_path, "rb") as pdf_file:
        for page in PyPDF2.PdfReader(pdf_file).pages:
            page_texts.append(page.extract_text() or "")
    return page_texts

def extract_single_page(pdf_path, page_number):  # 1-based
    """Return the text of page ``page_number`` (1-based), or "" if it has none.

    Raises ValueError when the page number is outside the document.
    """
    with open(pdf_path, "rb") as pdf_file:
        pdf = PyPDF2.PdfReader(pdf_file)
        page_count = len(pdf.pages)
        if page_number < 1 or page_number > page_count:
            raise ValueError(f"Page number {page_number} out of bounds. Document has {page_count} pages.")
        # Shift to the reader's 0-based indexing.
        extracted = pdf.pages[page_number - 1].extract_text()
        if extracted:
            return extracted
        return ""

# Read the API key from the key file (surrounding whitespace stripped) and
# build the module-level client used by classify_page_gpt and list_openai_models.
with open(API_KEY_PATH) as f:
    OPENAI_API_KEY = f.read().strip()
client = OpenAI(api_key=OPENAI_API_KEY)

def _parse_classification_reply(reply):
    """Convert a raw model reply into ``(bucket, confidence, description)``; never raises."""
    import json
    import re

    reply = reply or ""
    try:
        parsed = json.loads(reply)
    except ValueError:
        parsed = None
    if isinstance(parsed, dict):
        bucket = str(parsed.get("bucket") or "")
        confidence = parsed.get("confidence", 0.5)
    else:
        # Invalid JSON, or valid JSON that is not an object: scrape the fields.
        parsed = {}
        m = re.search(r'"bucket"\s*:\s*"([^"]+)"', reply)
        m2 = re.search(r'"confidence"\s*:\s*([0-9.]+)', reply)
        bucket = m.group(1) if m else "parse_error"
        confidence = m2.group(1) if m2 else 0.5
    if not isinstance(confidence, (int, float)):
        try:
            confidence = float(confidence)
        except (TypeError, ValueError):
            # e.g. the pattern matched "." or "1.2.3", or JSON gave a non-number.
            confidence = 0.5
    desc = ""
    if bucket.startswith("llm_new_category"):
        # The prompt asks for a short description but fixes no place for it, so
        # accept it as a JSON field, appended to the label, or after a "|".
        inline = bucket[len("llm_new_category"):].strip(" :|-")
        keyed = parsed.get("description") or parsed.get("explanation")
        if keyed:
            desc = str(keyed).strip()
        elif inline:
            desc = inline
        elif "|" in reply:
            desc = reply.split("|", 1)[1].strip()
    return bucket, confidence, desc


def classify_page_gpt(page_num, text, model=MODEL, max_page_text_len=PAGE_TEXT_SLICE, verbose=False, retries=3):
    """Classify one page with the chat model and return a CSV-ready row dict.

    Args:
        page_num: 0-based page index; reported as ``page_num + 1`` in the row and logs.
        text: Extracted page text (``None`` is treated as empty).
        model: Chat model name.
        max_page_text_len: Maximum number of characters of ``text`` sent in the prompt.
        verbose: When true, print the prompt preview and log the result.
        retries: Maximum number of API attempts; values below 1 are treated as 1.

    Returns:
        Dict with keys ``page_number``, ``page_text`` (first 300 characters),
        ``gpt_bucket_guess``, ``confidence`` and ``llm_new_category_description``.
        Pages shorter than MIN_TEXT_LENGTH after stripping are labeled "other"
        without an API call. If every attempt fails, the bucket is
        ``API_ERROR_<ExceptionName>`` with confidence 0.0.
    """
    text = text or ""
    if len(text.strip()) < MIN_TEXT_LENGTH:
        if verbose:
            logger.info(f"Page {page_num + 1}: Skipped (empty or too short) [other]")
        return {
            "page_number": page_num+1,
            "page_text": text[:300],
            "gpt_bucket_guess": "other",
            "confidence": 1.0,
            "llm_new_category_description": ""
        }
    prompt = f'Page content:\n"""\n{text[:max_page_text_len]}\n"""\nClassify as per instructions.'
    # Guarantee at least one attempt so the function never falls through and returns None.
    retries = max(1, retries)
    for attempt in range(1, retries+1):
        if verbose:
            print("="*40)
            print(f"[GPT TEST] Page {page_num+1} | Attempt {attempt} | Length: {len(text)} | Prompt len: {len(prompt)}")
            print(prompt[:400])
            print("="*40)
        # Only the API call is retried; parsing happens outside the try so a
        # malformed reply is not misreported (and re-requested) as an API error.
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": SYS_PROMPT},
                    {"role": "user", "content": prompt}
                ],
                temperature=0,
                max_tokens=80
            )
            reply = (response.choices[0].message.content or "").strip().replace('\n', ' ')
        except Exception as e:
            logger.warning(f'Page {page_num + 1}: API ERROR: {type(e).__name__}: {e} [attempt {attempt}]')
            if attempt < retries:
                wait = 3 * attempt  # linear back-off: 3s, 6s, ...
                logger.info(f"Retrying page {page_num + 1} after {wait}s...")
                time.sleep(wait)
                continue
            return {
                "page_number": page_num+1,
                "page_text": text[:300],
                "gpt_bucket_guess": f"API_ERROR_{type(e).__name__}",
                "confidence": 0.0,
                "llm_new_category_description": ""
            }
        bucket, confidence, desc = _parse_classification_reply(reply)
        if verbose:
            logger.info(f'Page {page_num + 1:>4}: [{bucket}] (conf: {confidence})')
            if desc:
                logger.info(f"    LLM new category description: {desc}")
        return {
            "page_number": page_num+1,
            "page_text": text[:300],
            "gpt_bucket_guess": bucket,
            "confidence": confidence,
            "llm_new_category_description": desc
        }

def auto_label_pages_serial(pdf_path, output_csv, verbose=False):
    """Classify every page of ``pdf_path`` one after another and write ``output_csv``.

    Each row is written and flushed as soon as its page is classified. Returns
    the row dicts in page order, or an empty list when the PDF cannot be
    extracted or contains no pages (the CSV is not created in that case).
    """
    try:
        pages = extract_pdf_pages(pdf_path)
        total_pages = len(pages)
        logger.info(f"PDF extracted successfully. Number of pages: {total_pages}")
        if not total_pages:
            logger.error(f"No pages found in PDF: {pdf_path}")
            return []
        first_page_preview = pages[0][:100] if pages[0] else '[Empty page]'
        logger.info(f"First page text preview: '{first_page_preview}'")
    except Exception as err:
        logger.error(f"Could not extract PDF pages from {pdf_path}: {err}")
        return []

    logger.info(f"Classifying {total_pages} pages from: {pdf_path}")
    columns = [
        "page_number", "page_text", "gpt_bucket_guess", "confidence", "llm_new_category_description"
    ]
    labeled_rows = []
    with open(output_csv, "w", newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        indexed_pages = list(enumerate(pages))
        for page_idx, page_text in tqdm(indexed_pages, desc="Classifying", total=total_pages):
            labeled = classify_page_gpt(page_idx, page_text, verbose=verbose)
            writer.writerow(labeled)
            labeled_rows.append(labeled)
            # Flush per row so partial results survive an interrupted run.
            csvfile.flush()
            reached_end = page_idx + 1 == total_pages
            if page_idx % 5 == 0 or reached_end:
                logger.info(f"Processed {page_idx+1} of {total_pages} pages")
    logger.info("Classification complete. Review your output CSV and log files.")
    return labeled_rows

def auto_label_pages_parallel(pdf_path, output_csv, n_parallel=2, verbose=False):
    """Classify every page of ``pdf_path`` using a thread pool and write ``output_csv``.

    Rows are written to the CSV in the order pages finish, which is not
    necessarily page order; the returned list is indexed by page and therefore
    in page order. Returns an empty list when the PDF cannot be extracted or
    has no pages. A page whose future raises gets a ``classification_error`` row.
    """
    try:
        pages = extract_pdf_pages(pdf_path)
        total_pages = len(pages)
        logger.info(f"PDF extracted successfully. Number of pages: {total_pages}")
        if not total_pages:
            logger.error(f"No pages found in PDF: {pdf_path}")
            return []
        first_page_preview = pages[0][:100] if pages[0] else '[Empty page]'
        logger.info(f"First page text preview: '{first_page_preview}'")
    except Exception as err:
        logger.error(f"Could not extract PDF pages from {pdf_path}: {err}")
        return []

    logger.info(f"Classifying {total_pages} pages from: {pdf_path}")
    columns = [
        "page_number", "page_text", "gpt_bucket_guess", "confidence", "llm_new_category_description"
    ]
    results = [None for _ in range(total_pages)]
    with open(output_csv, "w", newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        with ThreadPoolExecutor(max_workers=n_parallel) as pool:
            # Map each future back to the page index it was submitted for.
            page_of_future = {}
            for page_idx, page_text in enumerate(pages):
                future = pool.submit(classify_page_gpt, page_idx, page_text, verbose=verbose)
                page_of_future[future] = page_idx
            finished = 0
            for future in tqdm(as_completed(page_of_future), total=len(page_of_future), desc="Classifying"):
                page_idx = page_of_future[future]
                try:
                    row = future.result()
                    if verbose:
                        logger.info(f"Page {row['page_number']}: bucket='{row['gpt_bucket_guess']}', confidence={row['confidence']}, text preview='{row['page_text'][:80]}'")
                        if row.get("llm_new_category_description"):
                            logger.info(f"    New LLM category desc: {row['llm_new_category_description']}")
                except Exception as err:
                    logger.warning(f"Error classifying page {page_idx+1}: {err}")
                    row = {
                        "page_number": page_idx+1,
                        "page_text": "",
                        "gpt_bucket_guess": "classification_error",
                        "confidence": 0.0,
                        "llm_new_category_description": ""
                    }
                results[page_idx] = row
                writer.writerow(row)
                # Flush per row so partial results survive an interrupted run.
                csvfile.flush()
                finished += 1
                if finished == total_pages or finished % 5 == 0:
                    logger.info(f"Processed {finished} of {total_pages} pages")
    logger.info("Classification complete. Review your output CSV and log files.")
    return results

def list_openai_models():
    """Print a heading followed by the id of every model the module-level ``client`` lists."""
    print("Available OpenAI models:")
    for model in client.models.list().data:
        print(model.id)

def pdf_test_only_count_and_preview(path=PDF_PATH, n_preview=3):
    """Open the PDF, print its page count and a 300-character preview of the first pages.

    At most ``n_preview`` pages are previewed. Any error while opening or
    reading is printed rather than raised.
    """
    try:
        with open(path, "rb") as pdf_file:
            pdf = PyPDF2.PdfReader(pdf_file)
            num_pages = len(pdf.pages)
            print(f"PDF opened! Page count: {num_pages}")
            preview_count = min(n_preview, num_pages)
            for page_no in range(1, preview_count + 1):
                extracted = pdf.pages[page_no - 1].extract_text()
                txt = extracted if extracted else "[NO TEXT FOUND]"
                print(f"\nPage {page_no} text preview: {txt[:300]}")
    except Exception as err:
        print(f"PDF open/read error: {err}")

def test_single_page(page_number=1, verbose=True):
    """Extract one page (1-based) from PDF_PATH, classify it, print and return the row."""
    logger.info(f"Testing classification for page {page_number}")
    page_text = extract_single_page(PDF_PATH, page_number)
    preview = repr(page_text[:500])
    print("\n--- Extracted text preview ---\n", preview)
    # classify_page_gpt expects a 0-based index and adds 1 back for reporting.
    result_row = classify_page_gpt(page_number - 1, page_text, verbose=verbose)
    print("\n--- Classification result ---\n", result_row)
    return result_row

if __name__ == "__main__":
    # --- Select a mode: ---
    # Options: "model_list", "pdf_preview", "single_page", "full_run_serial", "full_run_parallel"
    TEST_MODE = "full_run_parallel"
    TEST_PAGE_NUMBER = 31  # for single_page

    if TEST_MODE == "model_list":
        list_openai_models()
    elif TEST_MODE == "pdf_preview":
        pdf_test_only_count_and_preview(PDF_PATH, n_preview=5)
    elif TEST_MODE == "single_page":
        # No CSV is written in this mode, so only the input path is logged.
        logger.info(f"Reading PDF: {PDF_PATH}")
        t0 = time.time()
        test_single_page(TEST_PAGE_NUMBER, verbose=True)
        logger.info(f"Single page test done in {time.time() - t0:.2f}s.")
    elif TEST_MODE == "full_run_serial":
        logger.info(f"Reading PDF: {PDF_PATH}")
        logger.info(f"Writing CSV: {OUTPUT_CSV}")
        auto_label_pages_serial(PDF_PATH, OUTPUT_CSV, verbose=True)
        logger.info("Full PDF labeling (serial) done.")
    elif TEST_MODE == "full_run_parallel":
        logger.info(f"Reading PDF: {PDF_PATH}")
        logger.info(f"Writing CSV: {OUTPUT_CSV}")
        auto_label_pages_parallel(PDF_PATH, OUTPUT_CSV, n_parallel=N_PARALLEL, verbose=True)
        logger.info("Full PDF labeling (parallel) done.")
    else:
        # A mistyped mode used to fall through silently and still log "Done.".
        logger.error(f"Unknown TEST_MODE: {TEST_MODE!r}; nothing was run.")

    logger.info("Done.")

In [None]:
# Smoke test for the OpenAI API key: send one tiny chat request, then list the
# models the key can access.
from openai import OpenAI

# Load the key from the local key file instead of pasting it into the cell: a
# blank key cannot authenticate, and a pasted key leaks when the notebook is shared.
API_KEY_PATH = "/Users/jake/Documents/Key/OPENAI_KEY.txt"
with open(API_KEY_PATH) as f:
    OPENAI_API_KEY = f.read().strip()
client = OpenAI(api_key=OPENAI_API_KEY)

# 1) Minimal round trip: the system message asks for a fixed reply.
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "Say ONLY: test OK"},
        {"role": "user", "content": "Test page contents"}
    ],
    max_tokens=10
)
print(response.choices[0].message.content)

# 2) List all available OpenAI models using the same client.
models = client.models.list()
for m in models.data:
    print(m.id)

In [None]:
import os
import csv
import PyPDF2
import logging
from openai import OpenAI
import time

# === LOGGING SETUP ===
# NOTE(review): logging.basicConfig is documented to do nothing when the root
# logger already has handlers, so if an earlier cell configured logging in
# this kernel, this call presumably has no effect - confirm.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)
# Root logger used by every function in this cell.
logger = logging.getLogger()

# NOTE(review): API_KEY_PATH is an absolute, machine-specific path.
PDF_PATH = "./Inputs/PGR_Ohio_BNIC-134120828.pdf"
API_KEY_PATH = "/Users/jake/Documents/Key/OPENAI_KEY.txt"
OUTPUT_CSV = "labeled_pages.csv"
MODEL = "gpt-4"  # or "gpt-3.5-turbo"
# Pages whose stripped text is shorter than this are labeled "other" without an API call.
MIN_TEXT_LENGTH = 20
# Maximum number of page-text characters placed in the prompt.
PAGE_TEXT_SLICE = 1500

# System message sent with every classification request. The previous value
# was a placeholder ("Truncate for brevity. Use your detailed prompt here.")
# that would have been sent to the model verbatim, with no bucket list and no
# reply format. This is the compact prompt used by the later cell of this notebook.
SYS_PROMPT = """
You are an expert insurance regulatory analyst reviewing a state commercial auto insurance rate and rule filing.
Classify each page into one of several explicit "buckets": intro information, table of contents, correspondence, rule, factor table, actuarial support, form, rating example, exhibit, crossed_out, other, or llm_new_category (with a 10-word description if used). Return a JSON like {"bucket": "...", "confidence": 0.93}. Only one bucket per page. If uncertain, pick "other".
"""

def extract_pdf_pages(pdf_path):
    """Return the text of every page as a list of strings ("" for pages without text).

    The page count is both printed and logged. Any failure is printed, logged
    at error level, and then re-raised unchanged.
    """
    try:
        with open(pdf_path, "rb") as pdf_file:
            page_texts = []
            for page in PyPDF2.PdfReader(pdf_file).pages:
                page_texts.append(page.extract_text() or "")
            loaded_msg = f"extract_pdf_pages: Loaded {len(page_texts)} pages from {pdf_path}"
            print(loaded_msg)
            logger.info(loaded_msg)
            return page_texts
    except Exception as err:
        failure_msg = f"extract_pdf_pages ERROR: {err}"
        print(failure_msg)
        logger.error(failure_msg)
        raise

# Read the API key from the key file (surrounding whitespace stripped) and
# build the module-level client that classify_page_gpt calls.
with open(API_KEY_PATH) as f:
    OPENAI_API_KEY = f.read().strip()
client = OpenAI(api_key=OPENAI_API_KEY)

def _parse_classification_reply(reply):
    """Convert a raw model reply into ``(bucket, confidence, description)``; never raises."""
    import json
    import re

    reply = reply or ""
    try:
        parsed = json.loads(reply)
    except ValueError:
        parsed = None
    if isinstance(parsed, dict):
        bucket = str(parsed.get("bucket") or "")
        confidence = parsed.get("confidence", 0.5)
    else:
        # Invalid JSON, or valid JSON that is not an object: scrape the fields.
        parsed = {}
        m = re.search(r'"bucket"\s*:\s*"([^"]+)"', reply)
        m2 = re.search(r'"confidence"\s*:\s*([0-9.]+)', reply)
        bucket = m.group(1) if m else "parse_error"
        confidence = m2.group(1) if m2 else 0.5
    if not isinstance(confidence, (int, float)):
        try:
            confidence = float(confidence)
        except (TypeError, ValueError):
            # e.g. the pattern matched "." or "1.2.3", or JSON gave a non-number.
            confidence = 0.5
    desc = ""
    if bucket.startswith("llm_new_category"):
        # Accept the description as a JSON field, appended to the label, or
        # after a "|" in the raw reply.
        inline = bucket[len("llm_new_category"):].strip(" :|-")
        keyed = parsed.get("description") or parsed.get("explanation")
        if keyed:
            desc = str(keyed).strip()
        elif inline:
            desc = inline
        elif "|" in reply:
            desc = reply.split("|", 1)[1].strip()
    return bucket, confidence, desc


def classify_page_gpt(page_num, text, model=MODEL, max_page_text_len=PAGE_TEXT_SLICE, retries=3):
    """Classify one page with the chat model and return a CSV-ready row dict.

    Args:
        page_num: 0-based page index; reported as ``page_num + 1``.
        text: Extracted page text (``None`` is treated as empty).
        model: Chat model name.
        max_page_text_len: Maximum number of characters of ``text`` sent in the prompt.
        retries: Maximum number of API attempts; values below 1 are treated as 1.

    Returns:
        Dict with keys ``page_number``, ``page_text`` (first 300 characters),
        ``gpt_bucket_guess``, ``confidence`` and ``llm_new_category_description``.
        Pages shorter than MIN_TEXT_LENGTH after stripping are labeled "other"
        without an API call. If every attempt fails, the bucket is
        ``API_ERROR_<ExceptionName>`` with confidence 0.0.
    """
    text = text or ""
    if len(text.strip()) < MIN_TEXT_LENGTH:
        print(f"Page {page_num + 1}: Skipped (empty or too short) [other]")
        logger.info(f"Page {page_num + 1}: Skipped (empty or too short) [other]")
        return {
            "page_number": page_num+1,
            "page_text": text[:300],
            "gpt_bucket_guess": "other",
            "confidence": 1.0,
            "llm_new_category_description": ""
        }
    prompt = f'Page content:\n"""\n{text[:max_page_text_len]}\n"""\nClassify as per instructions.'
    # Guarantee at least one attempt so the function never returns None.
    retries = max(1, retries)
    for attempt in range(1, retries+1):
        print(f"Classifying page {page_num+1}, attempt {attempt}, text length {len(text)}")
        # Only the API call is retried; parsing happens outside the try so a
        # malformed reply is not misreported (and re-requested) as an API error.
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": SYS_PROMPT},
                    {"role": "user", "content": prompt}
                ],
                temperature=0,
                max_tokens=80
            )
            reply = (response.choices[0].message.content or "").strip().replace('\n', ' ')
        except Exception as e:
            print(f'Page {page_num + 1}: API ERROR: {type(e).__name__}: {e} [attempt {attempt}]')
            logger.warning(f'Page {page_num + 1}: API ERROR: {type(e).__name__}: {e} [attempt {attempt}]')
            if attempt < retries:
                wait = 3 * attempt  # linear back-off: 3s, 6s, ...
                print(f"Retrying page {page_num + 1} after {wait}s...")
                logger.info(f"Retrying page {page_num + 1} after {wait}s...")
                time.sleep(wait)
                continue
            return {
                "page_number": page_num+1,
                "page_text": text[:300],
                "gpt_bucket_guess": f"API_ERROR_{type(e).__name__}",
                "confidence": 0.0,
                "llm_new_category_description": ""
            }
        bucket, confidence, desc = _parse_classification_reply(reply)
        print(f"Result page {page_num+1}: bucket='{bucket}', conf={confidence}")
        logger.info(f"Result page {page_num+1}: bucket='{bucket}', conf={confidence}")
        return {
            "page_number": page_num+1,
            "page_text": text[:300],
            "gpt_bucket_guess": bucket,
            "confidence": confidence,
            "llm_new_category_description": desc
        }


def loop_single_page_all(pdf_path, output_csv, max_pages=3):  # Default for test: just 3 pages!
    """Classify the first ``max_pages`` pages one by one and stream rows to ``output_csv``.

    Progress is both printed and logged. Any exception is caught, reported as
    "SCRIPT ERROR" and swallowed; the function always returns None.
    """
    try:
        pages = extract_pdf_pages(pdf_path)
        total_pages = len(pages)
        print(f"Loaded {total_pages} pages. Will classify up to {max_pages} pages.")
        logger.info(f"Loaded {total_pages} pages. Will classify up to {max_pages} pages.")
        columns = [
            "page_number", "page_text", "gpt_bucket_guess", "confidence", "llm_new_category_description"
        ]
        with open(output_csv, "w", newline='', encoding="utf-8") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=columns)
            writer.writeheader()
            for page_no, page_text in enumerate(pages[:max_pages], start=1):
                started = time.time()
                print(f"\n--- About to classify page {page_no}/{total_pages} ---")
                logger.info(f"\n--- About to classify page {page_no}/{total_pages} ---")
                # classify_page_gpt takes the 0-based index.
                row = classify_page_gpt(page_no - 1, page_text)
                writer.writerow(row)
                # Flush per row so partial results survive an interrupted run.
                csvfile.flush()
                print(f"--- Done with page {page_no}: bucket={row['gpt_bucket_guess']}, confidence={row['confidence']}, elapsed {time.time()-started:.2f}s ---\n")
                logger.info(f"--- Done with page {page_no}: bucket={row['gpt_bucket_guess']}, confidence={row['confidence']}, elapsed {time.time()-started:.2f}s ---")
        print("Loop finished. Check output CSV for results.")
        logger.info("Loop finished. Check output CSV for results.")
    except Exception as err:
        print(f"SCRIPT ERROR: {err}")
        logger.error(f"SCRIPT ERROR: {err}")

# Entry point for this cell: run the classification loop on the first three
# pages only, as a quick end-to-end check.
if __name__ == "__main__":
    loop_single_page_all(PDF_PATH, OUTPUT_CSV, max_pages=3)  # Start with 3 for fast feedback; set to None or remove for all once happy

In [3]:
#  this script is a more robust version of the previous one, with better error handling and logging.
import os
import csv
import PyPDF2
import logging
from openai import OpenAI
import time

# === LOGGING SETUP ===
# NOTE(review): logging.basicConfig is documented to do nothing when the root
# logger already has handlers, so if an earlier cell configured logging in
# this kernel, this call presumably has no effect - confirm.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)
# Root logger used by every function in this cell.
logger = logging.getLogger()

# NOTE(review): API_KEY_PATH is an absolute, machine-specific path.
PDF_PATH = "./Inputs/PGR_Ohio_BNIC-134120828_trimmed.pdf"
API_KEY_PATH = "/Users/jake/Documents/Key/OPENAI_KEY.txt"
OUTPUT_CSV = "labeled_pages.csv"
MODEL = "gpt-4o"
# Pages whose stripped text is shorter than this are labeled "other" without an API call.
MIN_TEXT_LENGTH = 20
# Maximum number of page-text characters placed in the prompt.
PAGE_TEXT_SLICE = 1500
# Pause inserted between consecutive pages by loop_single_page_all.
SECONDS_BETWEEN_PAGES = 5     # <-- adjust as you wish!

# System message sent with every classification request in this cell. It is a
# plain (non-f) string, so the single braces of the JSON example are literal.
# NOTE(review): it asks for a 10-word description with llm_new_category, but
# the JSON example has no field for that description.
SYS_PROMPT = """
You are an expert insurance regulatory analyst reviewing a state commercial auto insurance rate and rule filing.
Classify each page into one of several explicit "buckets": intro information, table of contents, correspondence, rule, factor table, actuarial support, form, rating example, exhibit, crossed_out, other, or llm_new_category (with a 10-word description if used). Return a JSON like {"bucket": "...", "confidence": 0.93}. Only one bucket per page. If uncertain, pick "other".
"""

def extract_pdf_pages(pdf_path):
    """Return the text of every page as a list of strings ("" for pages without text).

    The number of extracted pages is printed and logged before returning.
    """
    with open(pdf_path, "rb") as pdf_file:
        page_texts = []
        for page in PyPDF2.PdfReader(pdf_file).pages:
            page_texts.append(page.extract_text() or "")
        summary = f"Extracted {len(page_texts)} pages from {pdf_path}"
        print(summary)
        logger.info(summary)
        return page_texts

# Read the API key from the key file (surrounding whitespace stripped) and
# build the module-level client that classify_page_gpt calls.
with open(API_KEY_PATH) as f:
    OPENAI_API_KEY = f.read().strip()
client = OpenAI(api_key=OPENAI_API_KEY)

def _parse_classification_reply(reply):
    """Convert a raw model reply into ``(bucket, confidence, description)``; never raises."""
    import json
    import re

    reply = reply or ""
    try:
        parsed = json.loads(reply)
    except ValueError:
        parsed = None
    if isinstance(parsed, dict):
        bucket = str(parsed.get("bucket") or "")
        confidence = parsed.get("confidence", 0.5)
    else:
        # Invalid JSON, or valid JSON that is not an object: scrape the fields.
        parsed = {}
        m = re.search(r'"bucket"\s*:\s*"([^"]+)"', reply)
        m2 = re.search(r'"confidence"\s*:\s*([0-9.]+)', reply)
        bucket = m.group(1) if m else "parse_error"
        confidence = m2.group(1) if m2 else 0.5
    if not isinstance(confidence, (int, float)):
        try:
            confidence = float(confidence)
        except (TypeError, ValueError):
            # e.g. the pattern matched "." or "1.2.3", or JSON gave a non-number.
            confidence = 0.5
    desc = ""
    if bucket.startswith("llm_new_category"):
        # Accept the description as a JSON field, appended to the label, or
        # after a "|" in the raw reply.
        inline = bucket[len("llm_new_category"):].strip(" :|-")
        keyed = parsed.get("description") or parsed.get("explanation")
        if keyed:
            desc = str(keyed).strip()
        elif inline:
            desc = inline
        elif "|" in reply:
            desc = reply.split("|", 1)[1].strip()
    return bucket, confidence, desc


def classify_page_gpt(page_num, text, model=MODEL, max_page_text_len=PAGE_TEXT_SLICE, retries=3):
    """Classify one page with the chat model and return a CSV-ready row dict.

    Args:
        page_num: 0-based page index; reported as ``page_num + 1``.
        text: Extracted page text (``None`` is treated as empty).
        model: Chat model name.
        max_page_text_len: Maximum number of characters of ``text`` sent in the prompt.
        retries: Maximum number of API attempts; values below 1 are treated as 1.

    Returns:
        Dict with keys ``page_number``, ``page_text`` (first 300 characters),
        ``gpt_bucket_guess``, ``confidence`` and ``llm_new_category_description``.
        Pages shorter than MIN_TEXT_LENGTH after stripping are labeled "other"
        without an API call. If every attempt fails, the bucket is
        ``API_ERROR_<ExceptionName>`` with confidence 0.0.
    """
    text = text or ""
    if len(text.strip()) < MIN_TEXT_LENGTH:
        print(f"Page {page_num+1}: Skipped (empty or too short) [other]")
        logger.info(f"Page {page_num+1}: Skipped (empty or too short) [other]")
        return {
            "page_number": page_num+1,
            "page_text": text[:300],
            "gpt_bucket_guess": "other",
            "confidence": 1.0,
            "llm_new_category_description": ""
        }

    prompt = f'Page content:\n"""\n{text[:max_page_text_len]}\n"""\nClassify as per instructions.'
    # Guarantee at least one attempt so the function never returns None.
    retries = max(1, retries)
    for attempt in range(1, retries+1):
        print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] About to classify page {page_num+1}, attempt {attempt}.")
        logger.info(f"About to classify page {page_num+1}, attempt {attempt}.")
        # Only the API call is retried; parsing happens outside the try so a
        # malformed reply is not misreported (and re-requested) as an API error.
        try:
            start_api = time.time()
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": SYS_PROMPT},
                    {"role": "user", "content": prompt}
                ],
                temperature=0,
                max_tokens=80
            )
            elapsed_api = time.time() - start_api
            reply = (response.choices[0].message.content or "").strip().replace('\n', ' ')
        except Exception as e:
            print(f"Page {page_num + 1}: API ERROR: {type(e).__name__}: {e} [attempt {attempt}]")
            logger.warning(f"Page {page_num + 1}: API ERROR: {type(e).__name__}: {e} [attempt {attempt}]")
            if attempt < retries:
                wait = 3 * attempt  # linear back-off: 3s, 6s, ...
                print(f"Retrying page {page_num + 1} after {wait}s...")
                logger.info(f"Retrying page {page_num + 1} after {wait}s...")
                time.sleep(wait)
                continue
            return {
                "page_number": page_num+1,
                "page_text": text[:300],
                "gpt_bucket_guess": f"API_ERROR_{type(e).__name__}",
                "confidence": 0.0,
                "llm_new_category_description": ""
            }
        bucket, confidence, desc = _parse_classification_reply(reply)
        print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Got result for page {page_num+1} after {elapsed_api:.2f}s: bucket='{bucket}', conf={confidence}")
        logger.info(f"Got result for page {page_num+1} after {elapsed_api:.2f}s: bucket='{bucket}', conf={confidence}")
        return {
            "page_number": page_num+1,
            "page_text": text[:300],
            "gpt_bucket_guess": bucket,
            "confidence": confidence,
            "llm_new_category_description": desc
        }

def loop_single_page_all(pdf_path, output_csv, seconds_between_pages=SECONDS_BETWEEN_PAGES):
    """Classify every page sequentially, pausing between pages, and stream rows to ``output_csv``.

    Progress is printed and logged for each page, with an extra checkpoint
    message every fifth page and after the last one. The pause of
    ``seconds_between_pages`` is skipped after the final page. Returns None.
    """
    pages = extract_pdf_pages(pdf_path)
    total_pages = len(pages)
    print(f"Loaded {total_pages} pages.")
    logger.info(f"Loaded {total_pages} pages.")
    columns = [
        "page_number", "page_text", "gpt_bucket_guess", "confidence", "llm_new_category_description"
    ]
    with open(output_csv, "w", newline='', encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        for page_idx, page_text in enumerate(pages):
            page_no = page_idx + 1
            started = time.time()
            percent = int(100*page_no/total_pages)
            print(f"\n--- Progress: {page_no}/{total_pages} pages ({percent}%) ---")
            logger.info(f"Progress: {page_no}/{total_pages} ({percent}%)")
            row = classify_page_gpt(page_idx, page_text)
            writer.writerow(row)
            # Flush per row so partial results survive an interrupted run.
            csvfile.flush()
            elapsed = time.time()-started
            print(f"--- Done page {page_no}: bucket={row['gpt_bucket_guess']}, confidence={row['confidence']}, elapsed {elapsed:.2f}s ---")
            logger.info(f"Done page {page_no}: bucket={row['gpt_bucket_guess']}, confidence={row['confidence']}, elapsed {elapsed:.2f}s")
            is_last_page = page_no == total_pages
            # Emit a progress checkpoint every 5 pages
            if page_no % 5 == 0 or is_last_page:
                print(f"[Checkpoint] Classified {page_no} of {total_pages} pages ({percent}%)")
                logger.info(f"[Checkpoint] Classified {page_no} of {total_pages} pages ({percent}%)")
            if not is_last_page:
                print(f"Sleeping {seconds_between_pages} seconds before next page.")
                time.sleep(seconds_between_pages)
    print("Loop finished. Check output CSV for results.")
    logger.info("Loop finished. Check output CSV for results.")

# Entry point for this cell: extraction-only check. The classifier is not
# called here; each page's first 100 characters (newlines replaced by spaces)
# are written to test_page_texts.csv next to its 1-based page number.
if __name__ == "__main__":
    print("Starting PDF extraction only.")
    pages = extract_pdf_pages(PDF_PATH)
    print(f"Extracted {len(pages)} pages.")
    with open("test_page_texts.csv", "w", newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["page_number", "preview"])
        for i, text in enumerate(pages):
            writer.writerow([i+1, text[:100].replace('\n', ' ')])
    print("Extraction+write done.")

Starting PDF extraction only.


KeyboardInterrupt: 

Error: File not found at your_document.pdf
