In [None]:
import os
import re
import fitz
import camelot
import pandas as pd
import requests
import contextlib
import sys
import cloudscraper

@contextlib.contextmanager
def suppress_output():
    """Silence ``sys.stdout`` and ``sys.stderr`` inside a ``with`` block.

    Both streams are pointed at one handle opened on ``os.devnull``. The
    stream objects that were active on entry are put back when the block
    exits, whether it finishes normally or through an exception.
    """
    with open(os.devnull, "w") as sink, \
            contextlib.redirect_stdout(sink), \
            contextlib.redirect_stderr(sink):
        yield

def read_tables(file_path: str, page: int, flavor: str):
    """Run Camelot on a single page of a PDF, tolerating parser failures.

    Console output produced while Camelot runs is discarded via
    ``suppress_output``.

    Args:
        file_path: Path of the PDF handed to ``camelot.read_pdf``.
        page: Page number; converted to ``str`` for Camelot's ``pages``
            argument.
        flavor: Camelot parsing flavor, passed through unchanged.

    Returns:
        Whatever ``camelot.read_pdf`` returns, or an empty list if it raised.
    """
    with suppress_output():
        try:
            return camelot.read_pdf(file_path, pages=str(page), flavor=flavor)
        except Exception:
            # Best-effort: a page Camelot cannot parse yields no tables.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate so a long run can still be stopped.
            return []

def is_valid_mni_triplet(x, y, z, bound=100):
    """Return True when x, y and z each lie strictly between -bound and bound."""
    for coordinate in (x, y, z):
        if not -bound < coordinate < bound:
            return False
    return True

def extract_from_table(table_df: pd.DataFrame, validity_threshold=0.7):
    """Pull candidate (x, y, z) triplets out of one parsed table.

    Every row is reduced to the cells that ``int()`` can parse. A row with at
    least three such integers is a "candidate"; the first run of three
    consecutive integers that passes ``is_valid_mni_triplet`` is recorded for
    that row. The table's triplets are kept only if the share of candidate
    rows that produced a triplet reaches ``validity_threshold``.

    Args:
        table_df: Table contents, one table row per DataFrame row.
        validity_threshold: Minimum ratio of rows with a valid triplet to
            candidate rows required to accept the table.

    Returns:
        A list of ``(x, y, z)`` integer tuples, or an empty list when the
        table has no candidate rows or falls below the threshold.
    """
    mni_data, valid_rows, candidate_rows = [], 0, 0
    for _, row in table_df.iterrows():
        numeric = []
        for cell in row.dropna().tolist():
            if isinstance(cell, str):
                # int() only accepts the ASCII hyphen as a sign, so a value
                # typeset with U+2212 (MINUS SIGN) would be rejected and the
                # negative coordinate silently lost.
                cell = cell.replace("\u2212", "-")
            try:
                numeric.append(int(cell))
            except (TypeError, ValueError, OverflowError):
                # Non-numeric cell (label, blank, decimal text, inf): skip it.
                continue
        if len(numeric) < 3:
            continue
        candidate_rows += 1
        # Slide a window of three over the row and keep the first valid hit.
        for triplet in zip(numeric, numeric[1:], numeric[2:]):
            if is_valid_mni_triplet(*triplet):
                valid_rows += 1
                mni_data.append(triplet)
                break
    if candidate_rows and valid_rows / candidate_rows >= validity_threshold:
        return mni_data
    return []

def extract_from_paragraphs(doc, keyword_window=80):
    """Find coordinate-like integer triplets in the running text of a document.

    Pages with no text, or with no coordinate keyword anywhere on the page,
    are skipped. On the remaining pages every regex match that passes
    ``is_valid_mni_triplet`` is kept only if a coordinate keyword occurs
    within ``keyword_window`` characters before or after the match.

    Args:
        doc: Object supporting ``len(doc)`` and ``doc[i].get_text("text")``.
        keyword_window: Number of characters on each side of a match that are
            searched for a coordinate keyword.

    Returns:
        A list of dicts with keys ``"Source"`` (always ``"Paragraph"``),
        ``"Page"`` (1-based), ``"x"``, ``"y"`` and ``"z"``.
    """
    # Pattern: [x,y,z], (x,y,z), x, y, z, "x, y and z"
    pattern = re.compile(
        r'(?:(?<=\[)|(?<=\()|(?<![\d\w]))\s*(-?\d{1,3})\s*(?:,|and| )+\s*(-?\d{1,3})\s*(?:,|and| )+\s*(-?\d{1,3})(?=[\]\)]|[^0-9]|$)',
        re.IGNORECASE
    )
    # Keywords
    coord_keywords = re.compile(r'\b(MNI|coordinate|coordinates|activation|cluster|peak|voxel|x, y, z)\b', re.IGNORECASE)

    para_results = []
    for page_num in range(len(doc)):
        text = doc[page_num].get_text("text")
        if not text or not coord_keywords.search(text):
            continue

        # The pattern's "-?" only matches the ASCII hyphen. Without this
        # one-for-one replacement a value typeset with U+2212 (MINUS SIGN)
        # would be captured without its sign, i.e. as a positive number.
        text = text.replace("\u2212", "-")

        for m in pattern.finditer(text):
            # Each group matched -?\d{1,3}, so int() cannot fail here.
            x, y, z = map(int, m.groups())

            if not is_valid_mni_triplet(x, y, z, bound=100):
                continue

            # Require a keyword close to the numbers, not just on the page.
            start = max(0, m.start() - keyword_window)
            end = min(len(text), m.end() + keyword_window)
            nearby = text[start:end]
            if coord_keywords.search(nearby):
                para_results.append({
                    "Source": "Paragraph",
                    "Page": page_num + 1,
                    "x": x, "y": y, "z": z
                })

    return para_results

# MAIN
# Download a PDF, extract coordinate triplets from its tables and running
# text, and write the combined result to a CSV file.

pdf_url = input("Enter PDF URL: ").strip()
doi = input("Enter DOI: ").strip()
file_path = "paper_universal.pdf"

scraper = cloudscraper.create_scraper()
response = scraper.get(pdf_url)
# Stop on an HTTP error instead of saving the error page as a ".pdf" and
# failing later with an unrelated PDF-parsing error.
response.raise_for_status()
with open(file_path, "wb") as f:
    f.write(response.content)

print("\n--- Starting extraction ---\n")

doc = fitz.open(file_path)
try:
    total_pages = len(doc)
    results = []

    # 1) Table extraction: try both flavors on every page (pages are 1-based).
    table_coords_set = set()
    table_results = []

    for flavor in ["stream", "lattice"]:
        for page in range(1, total_pages + 1):
            tables = read_tables(file_path, page, flavor)
            for table in tables:
                coords = extract_from_table(table.df)
                for (x, y, z) in coords:
                    entry = {"Source": f"Table ({flavor})", "Page": page, "x": x, "y": y, "z": z}
                    table_results.append(entry)
                    table_coords_set.add((page, x, y, z))

    results.extend(table_results)

    # 2) Paragraph extraction: skip triplets already found in a table on the
    # same page.
    para_results = extract_from_paragraphs(doc, keyword_window=80)
    for entry in para_results:
        key = (entry["Page"], entry["x"], entry["y"], entry["z"])
        if key in table_coords_set:
            continue
        results.append(entry)
finally:
    # Release the document even if extraction fails part-way through.
    doc.close()

df = pd.DataFrame(results)
if not df.empty:
    # remove duplicates
    df = df.drop_duplicates(subset=["Page", "x", "y", "z"], keep="first")
    df.insert(0, "DOI", doi)
    df = df.sort_values(by=["Page", "Source"])
    print("\nExtraction complete.\n")
    print(df.reset_index(drop=True))
    df.to_csv("mni_coordinates_extracted.csv", index=False)
    print("\n Saved results to 'mni_coordinates_extracted.csv'")
else:
    print("No coordinates found in this document.")



--- Starting extraction ---


✅ Extraction complete.

                          DOI          Source  Page   x   y   z
0   10.1101/2024.11.20.624446       Paragraph     3 -45   0  45
1   10.1101/2024.11.20.624446       Paragraph     3  42   6  57
2   10.1101/2024.11.20.624446       Paragraph     3 -63   0  24
3   10.1101/2024.11.20.624446       Paragraph     3  60  -3  24
4   10.1101/2024.11.20.624446       Paragraph     3 -42  15  -3
..                        ...             ...   ...  ..  ..  ..
91  10.1101/2024.11.20.624446  Table (stream)    18  44  83 -51
92  10.1101/2024.11.20.624446  Table (stream)    18  45  56 -45
93  10.1101/2024.11.20.624446  Table (stream)    18  46  67 -30
94  10.1101/2024.11.20.624446  Table (stream)    18 -39  21  -6
95  10.1101/2024.11.20.624446  Table (stream)    18 -42  15  -3

[96 rows x 6 columns]

📁 Saved results to 'mni_coordinates_extracted.csv'
