In [None]:
!pip install -q chandra-ocr

In [None]:
!pip install flash-attn

In [None]:
!chandra /content/albee_et_al_1973.pdf /content/chandra_out --method hf --page-range 13-14 --no-images

In [None]:
from pathlib import Path
import subprocess
import pandas as pd
import os
import torch

# Source PDF and the directory the chandra CLI was pointed at for its output.
pdf_path = Path("/content/albee_et_al_1973.pdf")
out_dir = Path("/content/chandra_out")
out_dir.mkdir(parents=True, exist_ok=True)

# Find HTML files produced for this PDF. The search is recursive under
# out_dir and sorted so the h<index> part of each CSV name is reproducible.
html_files = sorted(out_dir.rglob(f"{pdf_path.stem}*.html"))
if not html_files:
    raise FileNotFoundError(f"No HTML files found for {pdf_path.stem} in {out_dir}")

# Extract tables and save each one as a traceable CSV under <out_dir>/csv.
csv_dir = out_dir / "csv"
csv_dir.mkdir(exist_ok=True)

# Running total of tables written across all HTML files.
table_counter = 0

for h_idx, html_path in enumerate(html_files, start=1):
    try:
        tables = pd.read_html(html_path)  # list of DataFrames, one per <table>
    except ValueError:
        # pandas raises ValueError when the HTML contains no tables;
        # such files are skipped on purpose rather than treated as errors.
        continue

    for t_idx, df in enumerate(tables, start=1):
        table_counter += 1

        # Traceable filename: <pdf_stem>_h<htmlIndex>_t<tableIndex>.csv
        # (both indices are 1-based).
        out_path = csv_dir / f"{pdf_path.stem}_h{h_idx}_t{t_idx}.csv"

        df.to_csv(out_path, index=False)
        print(f"Saved table {table_counter}: {out_path}")

# The check mark below replaces a mis-decoded byte sequence in the original
# message.
print(f"\n✅ Processing complete. {table_counter} tables saved to: {csv_dir}")
