In [1]:
# Mount Google Drive at /content/drive; the output folders used later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Installs the chandra-ocr package; the `chandra` command run in a later cell presumably comes from it.
# NOTE(review): the version is not pinned, so a re-run may pull a different release.
!pip install chandra-ocr



In [3]:
# Installs flash-attn. NOTE(review): nothing visible in this notebook imports it directly;
# presumably it is an optional accelerator for the `--method hf` run below — confirm it is needed.
!pip install flash-attn



In [6]:
# Runs the chandra CLI on the PDF and writes its results to the Drive folder that the
# extraction cell below scans (OUTPUT_DIR).
# NOTE(review): the input PDF is read from /content, outside the mounted Drive — presumably it
# has to be uploaded again in a fresh session before this cell can run; confirm.
!chandra /content/prinz_science_1973.pdf /content/drive/MyDrive/chandra_out --method hf --page-range 2 --no-images

2025-11-12 03:19:26.696605: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-12 03:19:26.714692: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762917566.736248   13653 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762917566.742739   13653 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1762917566.759490   13653 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [7]:
import os
import json
import glob
import io
import hashlib
from pathlib import Path
from typing import List
import pandas as pd

# Folder passed to the chandra command above as its second argument; main() scans it recursively.
OUTPUT_DIR = Path("/content/drive/MyDrive/chandra_out")
# Extracted tables are written here as CSV files.
SAVE_DIR = OUTPUT_DIR / "tables_csv"
# Created as soon as this cell runs (missing parents included); no error if it already exists.
SAVE_DIR.mkdir(parents=True, exist_ok=True)

def ensure_markdown_lib():
    """Make sure the third-party ``markdown`` package can be imported.

    Tries a plain import first. If that fails, installs the package with pip
    into the interpreter running this notebook and retries the import.

    Returns:
        bool: True when ``markdown`` is importable afterwards; False when the
        install or the retried import failed. A warning is printed in that
        case so the caller can fall back instead of the whole cell crashing.
    """
    import importlib
    import subprocess
    import sys

    try:
        import markdown  # noqa: F401 -- imported only to probe availability
        return True
    except ImportError:
        pass

    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "markdown", "--quiet"])
    except (subprocess.CalledProcessError, OSError) as exc:
        print(f"[WARN] Could not install 'markdown': {exc}")
        return False

    # A package installed while the interpreter is running may stay invisible
    # to the import system until its finder caches are refreshed.
    importlib.invalidate_caches()
    try:
        import markdown  # noqa: F401
    except ImportError as exc:
        print(f"[WARN] 'markdown' was installed but is still not importable: {exc}")
        return False
    return True

# Evaluated once when this cell runs (may trigger a pip install); md_to_html() consults this flag.
MD_AVAILABLE = ensure_markdown_lib()

def md_to_html(md_text: str) -> str:
    """Convert Markdown text to an HTML string.

    Uses the ``markdown`` package with its ``tables`` extension enabled.
    When ``MD_AVAILABLE`` is false, the text is returned wrapped in
    ``<pre>...</pre>`` instead and the ``markdown`` package is not touched.

    Args:
        md_text: Markdown source.

    Returns:
        The rendered HTML, or the ``<pre>``-wrapped input as a fallback.
    """
    if not MD_AVAILABLE:
        return f"<pre>{md_text}</pre>"
    # Imported only on this path so the fallback above stays reachable when
    # the package is missing.
    import markdown
    return markdown.markdown(md_text, extensions=["tables"])

def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, all-string copy of ``df``; the input is not modified.

    Steps, in order:
      1. drop rows that are entirely NaN, then columns that are entirely NaN;
      2. cast every cell to ``str`` and collapse runs of whitespace to a
         single space (leading/trailing whitespace removed);
      3. apply the same whitespace collapsing to the column labels;
      4. reset the row index to 0..n-1.

    Note that NaN cells surviving step 1 become the literal string ``"nan"``
    because of the ``str`` cast.

    Args:
        df: Table to normalize.

    Returns:
        A new DataFrame holding only strings.
    """
    def _squash(value) -> str:
        # str.split() with no argument already discards leading/trailing
        # whitespace, so no separate strip() is needed.
        return " ".join(str(value).split())

    cleaned = df.dropna(axis=0, how="all").dropna(axis=1, how="all").astype(str)
    # DataFrame.applymap is deprecated in favour of DataFrame.map; check on the
    # class so a column that happens to be named "map" cannot confuse the test.
    if hasattr(pd.DataFrame, "map"):
        cleaned = cleaned.map(_squash)
    else:
        cleaned = cleaned.applymap(_squash)
    cleaned.columns = [_squash(label) for label in cleaned.columns]
    return cleaned.reset_index(drop=True)

def df_fingerprint(df: pd.DataFrame) -> str:
    """Return a SHA-256 hex digest identifying the normalized content of ``df``.

    The table is passed through ``normalize_df`` first, so tables that differ
    only in whitespace or in all-NaN rows/columns get the same digest.

    The payload is built from the ordered column labels plus the rows as
    lists rather than per-row dicts: a dict keyed by column label would keep
    only one value per duplicated label, letting different tables with
    repeated headers collide.

    Args:
        df: Table to fingerprint.

    Returns:
        64-character hexadecimal digest.
    """
    ndf = normalize_df(df)
    payload = {"columns": list(ndf.columns), "rows": ndf.values.tolist()}
    raw = json.dumps(payload, sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(raw.encode("utf-8")).hexdigest()

def save_unique_tables(dfs: List[pd.DataFrame], source_name: str, ext: str, seen: set, table_counter: int) -> int:
    """Write each not-yet-seen table in ``dfs`` to a CSV file under ``SAVE_DIR``.

    Files are named ``{source}_{ext}_table{i}.csv`` where ``i`` is the table's
    1-based position in ``dfs`` (skipped tables still consume a position) and
    ``source`` is ``source_name`` with spaces and dots replaced by ``_``.

    Args:
        dfs: Candidate tables; ``None`` and empty entries are skipped.
        source_name: Label of the originating file, used in the file name.
        ext: Source format tag, used in the file name.
        seen: Set of fingerprints already saved; updated in place so
            duplicates are skipped across calls.
        table_counter: Unused. Kept so existing callers keep working.

    Returns:
        Number of CSV files written by this call.
    """
    safe_name = source_name.replace(" ", "_").replace(".", "_")
    saved = 0
    for i, df in enumerate(dfs, start=1):
        if df is None or df.empty:
            continue
        clean = normalize_df(df)
        # A table made only of NaN cells is non-empty before normalization but
        # has nothing left afterwards; do not write an empty CSV for it.
        if clean.empty:
            continue
        fp = df_fingerprint(df)
        if fp in seen:
            continue
        seen.add(fp)
        out_path = SAVE_DIR / f"{safe_name}_{ext}_table{i}.csv"
        clean.to_csv(out_path, index=False)
        print(f"[OK] Saved unique table → {out_path.name}")
        saved += 1
    return saved

def extract_tables_from_html_file(path: Path) -> List[pd.DataFrame]:
    """Read every table that pandas can find in the HTML file at ``path``.

    Args:
        path: Location of the HTML file.

    Returns:
        The parsed tables, or an empty list when pandas raises. A
        ``ValueError`` is handled silently; any other exception is reported
        with a ``[WARN]`` line first.
    """
    tables: List[pd.DataFrame] = []
    try:
        tables = pd.read_html(str(path))
    except ValueError:
        # Handled quietly: nothing to report, just no tables.
        pass
    except Exception as err:
        print(f"[WARN] HTML parse error on {path.name}: {err}")
    return tables

def extract_tables_from_html_str(html_text: str) -> List[pd.DataFrame]:
    """Read every table that pandas can find in an HTML string.

    Args:
        html_text: HTML markup to scan.

    Returns:
        The parsed tables, or an empty list on failure. A ``ValueError`` is
        handled silently; any other exception (for example a missing parser
        backend) is reported with a ``[WARN]`` line so it is not mistaken for
        "no tables", matching the file-based variant.
    """
    try:
        # Wrapped in StringIO so the markup is handed over as a file-like object.
        return pd.read_html(io.StringIO(html_text))
    except ValueError:
        return []
    except Exception as e:
        print(f"[WARN] HTML parse error on inline HTML: {e}")
        return []

def extract_tables_from_markdown_file(path: Path) -> List[pd.DataFrame]:
    """Extract tables from a Markdown file by converting it to HTML first.

    The file is decoded as UTF-8 with undecodable bytes dropped.

    Args:
        path: Location of the Markdown file.

    Returns:
        Tables found in the converted HTML; an empty list when the file
        cannot be read.
    """
    try:
        md_source = path.read_text(encoding="utf-8", errors="ignore")
    except Exception:
        return []
    return extract_tables_from_html_str(md_to_html(md_source))

def looks_like_md_table(text: str) -> bool:
    """Heuristically decide whether ``text`` starts with a Markdown table.

    Only the first ten non-blank lines (whitespace-stripped) are inspected.
    The text qualifies when one of them begins with ``|`` or contains both
    ``|`` and ``---``.
    """
    head = []
    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue
        head.append(stripped)
        if len(head) == 10:
            break

    for candidate in head:
        if candidate.startswith("|"):
            return True
        if "|" in candidate and "---" in candidate:
            return True
    return False

def extract_tables_from_json_obj(obj) -> List[pd.DataFrame]:
    """Walk a decoded JSON value and gather every table-like structure in it.

    For each dict encountered (depth-first, parents before children):
      * an ``html`` / ``table_html`` string containing ``<table`` is parsed
        as HTML;
      * a ``markdown`` / ``md`` string that looks like a Markdown table is
        converted to HTML and parsed;
      * the values under ``cells``, ``rows``, ``data``, ``table`` and
        ``table_data`` are tried as row lists.
    All dict values and list items are then visited recursively.

    Args:
        obj: Any value produced by ``json.loads``.

    Returns:
        The collected DataFrames in discovery order (possibly empty).
    """
    found: List[pd.DataFrame] = []
    scalar_types = (str, int, float, type(None))
    wrapper_keys = ("cells", "values", "row", "data")
    table_keys = ("cells", "rows", "data", "table", "table_data")

    def rows_to_frame(rows):
        # Only a non-empty list can describe a table.
        if not isinstance(rows, list) or not rows:
            return None
        # Rows given as dicts: unwrap them via the first key they all share.
        if all(isinstance(entry, dict) for entry in rows):
            for wrapper in wrapper_keys:
                if all(wrapper in entry for entry in rows):
                    rows = [entry[wrapper] for entry in rows]
                    break
        if any(not isinstance(entry, (list, tuple)) for entry in rows):
            return None
        try:
            first = rows[0]
            if all(isinstance(cell, scalar_types) for cell in first):
                # A first row made only of scalars is used as the header.
                return pd.DataFrame(rows[1:], columns=[str(cell) for cell in first])
            return pd.DataFrame(rows)
        except Exception:
            # Rows that pandas cannot assemble are dropped.
            return None

    def visit(node):
        if isinstance(node, list):
            for child in node:
                visit(child)
            return
        if not isinstance(node, dict):
            return

        embedded_html = node.get("html") or node.get("table_html")
        if isinstance(embedded_html, str) and "<table" in embedded_html.lower():
            found.extend(extract_tables_from_html_str(embedded_html))

        embedded_md = node.get("markdown") or node.get("md")
        if isinstance(embedded_md, str) and looks_like_md_table(embedded_md):
            found.extend(extract_tables_from_html_str(md_to_html(embedded_md)))

        for key in table_keys:
            if key not in node:
                continue
            frame = rows_to_frame(node[key])
            if frame is not None:
                found.append(frame)

        for child in node.values():
            visit(child)

    visit(obj)
    return found

def extract_tables_from_json_file(path: Path) -> List[pd.DataFrame]:
    """Load a JSON file and extract every table-like structure from it.

    The file is decoded as UTF-8 with undecodable bytes dropped.

    Args:
        path: Location of the JSON file.

    Returns:
        Tables found in the decoded document; an empty list (after a
        ``[WARN]`` line) when the file cannot be read or parsed.
    """
    try:
        parsed = json.loads(path.read_text(encoding="utf-8", errors="ignore"))
    except Exception as err:
        print(f"[WARN] JSON parse error on {path.name}: {err}")
        return []
    return extract_tables_from_json_obj(parsed)

def main():
    """Scan ``OUTPUT_DIR`` recursively and save each distinct HTML table as CSV.

    HTML, Markdown and JSON files are all located and counted in the summary
    line, and every table found in the HTML files is written via
    ``save_unique_tables`` (duplicates are skipped by fingerprint).
    """
    def find_sorted(extension):
        # Recursive glob below OUTPUT_DIR, sorted so runs are deterministic.
        return sorted(glob.glob(str(OUTPUT_DIR / "**" / f"*.{extension}"), recursive=True))

    seen_fps = set()
    total_saved = 0

    html_files = find_sorted("html")
    md_files = find_sorted("md")
    json_files = find_sorted("json")

    print(f"Scanning {OUTPUT_DIR} …")
    print(f"Found {len(html_files)} HTML, {len(md_files)} Markdown, {len(json_files)} JSON file(s).")

    # NOTE(review): only the HTML files are processed; the Markdown and JSON
    # files are counted above but their extractors are never called here.
    # Confirm whether that is intentional.
    for html_path in map(Path, html_files):
        tables = extract_tables_from_html_file(html_path)
        total_saved += save_unique_tables(tables, html_path.stem, "html", seen_fps, total_saved)

    print(f"\n✅ Done. Saved {total_saved} unique, traceable tables to: {SAVE_DIR}")

# Run the extraction when this cell executes as the top-level script.
if __name__ == "__main__":
    main()


Scanning /content/drive/MyDrive/chandra_out …
Found 1 HTML, 1 Markdown, 1 JSON file(s).
[OK] Saved unique table → prinz_science_1973_html_table1.csv
[OK] Saved unique table → prinz_science_1973_html_table2.csv

✅ Done. Saved 2 unique, traceable tables to: /content/drive/MyDrive/chandra_out/tables_csv


  df = df.astype(str).applymap(lambda x: " ".join(x.strip().split()))
  df = df.astype(str).applymap(lambda x: " ".join(x.strip().split()))
  df = df.astype(str).applymap(lambda x: " ".join(x.strip().split()))
  df = df.astype(str).applymap(lambda x: " ".join(x.strip().split()))
