In [1]:
!pip install pymupdf
!pip install PyPDF2
!pip install pdfplumber
!pip install "camelot-py[cv]"





In [2]:
# Paper 1 - simply extracting coordinates from a table (this code is pretty paper-specific)

import os
import contextlib
import requests
import camelot
import pandas as pd
import re

def read_tables(file_path: str, page: str | int):
    """Parse one PDF page with camelot's "stream" flavor, discarding console output.

    While camelot runs, ``sys.stdout`` and ``sys.stderr`` are redirected to
    ``os.devnull``.

    Args:
        file_path: Path of the PDF handed to ``camelot.read_pdf``.
        page: Page selector; it is converted with ``str()`` before being
            passed as the ``pages`` argument.

    Returns:
        The object returned by ``camelot.read_pdf``.
    """
    with open(os.devnull, 'w') as sink, \
            contextlib.redirect_stdout(sink), \
            contextlib.redirect_stderr(sink):
        tables = camelot.read_pdf(file_path, pages=str(page), flavor="stream")
    return tables

def extract_mni_coordinates_from_table(table_df: pd.DataFrame):
    """Pull at most one integer (x, y, z) triplet out of every table row.

    Each row is scanned left to right with a sliding window of three
    consecutive non-null cells. The first window whose three cells all parse
    as integers strictly between -100 and 100 is recorded, and the scan then
    moves on to the next row.

    Args:
        table_df: Table contents, one table row per DataFrame row.

    Returns:
        A list of ``(x, y, z)`` integer tuples in row order.
    """
    # PDF text frequently encodes the minus sign as U+2212 (or an en dash /
    # hyphen), which int() rejects; map those to ASCII '-' so negative
    # coordinates are not silently dropped.
    dash_to_minus = str.maketrans({"\u2212": "-", "\u2013": "-", "\u2010": "-"})

    def to_int(cell):
        if isinstance(cell, str):
            cell = cell.translate(dash_to_minus)
        return int(cell)

    mni_data = []
    for _, row in table_df.iterrows():
        cells = row.dropna().tolist()
        # Trying to find the coordinates
        for i in range(len(cells) - 2):
            try:
                x, y, z = to_int(cells[i]), to_int(cells[i + 1]), to_int(cells[i + 2])
            except (ValueError, TypeError):
                # Window contains a non-integer cell; slide one cell to the right.
                continue
            # These are the values within rough brain coordinate bounds
            if all(-100 < val < 100 for val in (x, y, z)):
                mni_data.append((x, y, z))
                break  # To prevent double counting
    return mni_data

# Paper 1
pdf_url = "https://www.biorxiv.org/content/10.1101/2025.02.10.636597v2.full.pdf"
doi = "10.1101/2025.02.10.636597"
file_path = "paper1_fixed.pdf"

# Download PDF. Fail loudly on HTTP errors (and never hang forever) so that an
# error page is not saved under a .pdf name and then reported as "no coordinates".
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()
with open(file_path, "wb") as f:
    f.write(response.content)

# Scan the page range chosen for this paper and build one record per triplet.
results = []
for page in range(1, 31):
    try:
        tables = read_tables(file_path, page)
        for table in tables:
            coords = extract_mni_coordinates_from_table(table.df)
            for x, y, z in coords:
                results.append({
                    "DOI": doi,
                    "Page": page,
                    "x": x,
                    "y": y,
                    "z": z
                })
    except Exception as e:
        # Keep scanning the remaining pages, but report the failure instead of
        # hiding it (same message format as the other paper cells).
        print(f"Page {page} failed: {e}")
        continue

df1 = pd.DataFrame(results)
if not df1.empty:
    print(df1)
else:
    print("No correct MNI coordinates found in this paper.")

                          DOI  Page   x   y   z
0   10.1101/2025.02.10.636597    17  56 -14   6
1   10.1101/2025.02.10.636597    17 -54 -28  10
2   10.1101/2025.02.10.636597    17  -6   8  54
3   10.1101/2025.02.10.636597    17 -40  18  60
4   10.1101/2025.02.10.636597    17   6  10  58
5   10.1101/2025.02.10.636597    17  72 -52 -32
6   10.1101/2025.02.10.636597    17  16 -56   6
7   10.1101/2025.02.10.636597    17  14  52  10
8   10.1101/2025.02.10.636597    17   7 -28  -6
9   10.1101/2025.02.10.636597    17   5  20 -52
10  10.1101/2025.02.10.636597    18  58  30  24
11  10.1101/2025.02.10.636597    18  30  46  40
12  10.1101/2025.02.10.636597    18  43 -32  20
13  10.1101/2025.02.10.636597    18   6 -44   2
14  10.1101/2025.02.10.636597    18  13 -30 -64
15  10.1101/2025.02.10.636597    18  11  34 -62
16  10.1101/2025.02.10.636597    20  44  42   6
17  10.1101/2025.02.10.636597    20 -28 -72 -28
18  10.1101/2025.02.10.636597    20  56 -40  48
19  10.1101/2025.02.10.636597    20  54 

In [3]:
# Paper 2 - simply extracting coordinates from a table (this code is also pretty paper-specific)

import os
import contextlib
import requests
import camelot
import pandas as pd
import re

def read_tables(file_path: str, page: str | int):
    """Read the tables of a single PDF page using camelot's "stream" flavor.

    ``sys.stdout`` and ``sys.stderr`` are pointed at ``os.devnull`` for the
    duration of the camelot call.

    Args:
        file_path: Path of the PDF passed to ``camelot.read_pdf``.
        page: Page selector, stringified before use as ``pages``.

    Returns:
        The result of ``camelot.read_pdf``.
    """
    with contextlib.ExitStack() as stack:
        sink = stack.enter_context(open(os.devnull, 'w'))
        stack.enter_context(contextlib.redirect_stdout(sink))
        stack.enter_context(contextlib.redirect_stderr(sink))
        return camelot.read_pdf(file_path, pages=str(page), flavor="stream")

def extract_mni_coordinates_from_table(table_df: pd.DataFrame):
    """Pull at most one integer (x, y, z) triplet out of every table row.

    Each row is scanned left to right with a sliding window of three
    consecutive non-null cells. The first window whose three cells all parse
    as integers strictly between -100 and 100 is recorded, and the scan then
    moves on to the next row.

    Args:
        table_df: Table contents, one table row per DataFrame row.

    Returns:
        A list of ``(x, y, z)`` integer tuples in row order.
    """
    # PDF text frequently encodes the minus sign as U+2212 (or an en dash /
    # hyphen), which int() rejects; map those to ASCII '-' so negative
    # coordinates are not silently dropped.
    dash_to_minus = str.maketrans({"\u2212": "-", "\u2013": "-", "\u2010": "-"})

    def to_int(cell):
        if isinstance(cell, str):
            cell = cell.translate(dash_to_minus)
        return int(cell)

    mni_data = []
    for _, row in table_df.iterrows():
        cells = row.dropna().tolist()
        # Checking for 3 consecutive numeric values in row (neglecting the voxel count)
        for i in range(len(cells) - 2):
            try:
                x, y, z = to_int(cells[i]), to_int(cells[i + 1]), to_int(cells[i + 2])
            except (ValueError, TypeError):
                # Window contains a non-integer cell; slide one cell to the right.
                continue
            if all(-100 < val < 100 for val in (x, y, z)):
                mni_data.append((x, y, z))
                break  # one triplet per row
    return mni_data

# Paper 2 Info
pdf_url = "https://www.biorxiv.org/content/10.1101/2024.11.20.624446v2.full.pdf"
doi = "10.1101/2024.11.20.624446"
file_path = "paper2.pdf"

# Download PDF. Fail loudly on HTTP errors (and never hang forever) so that an
# error page is not saved under a .pdf name and then reported as "no coordinates".
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()
with open(file_path, "wb") as f:
    f.write(response.content)

# Extracting MNI coordinates from the page range chosen for this paper
results = []
for page in range(1, 21):
    try:
        tables = read_tables(file_path, page)
        if tables:
            for table in tables:
                coords = extract_mni_coordinates_from_table(table.df)
                for x, y, z in coords:
                    results.append({
                        "DOI": doi,
                        "Page": page,
                        "x": x,
                        "y": y,
                        "z": z
                    })
    except Exception as e:
        # Report the failing page and carry on with the remaining pages.
        print(f"Page {page} failed: {e}")
        continue

# Output
df = pd.DataFrame(results)
if not df.empty:
    print(df)
else:
    print("No MNI coordinates found in this paper.")

                          DOI  Page   x   y   z
0   10.1101/2024.11.20.624446     4 -45   0  45
1   10.1101/2024.11.20.624446     4   9  13 -45
2   10.1101/2024.11.20.624446     4  43  36 -63
3   10.1101/2024.11.20.624446     4  44  83 -51
4   10.1101/2024.11.20.624446     4  45  56 -45
5   10.1101/2024.11.20.624446     4 -42  15  -3
6   10.1101/2024.11.20.624446     4   6  94  42
7   10.1101/2024.11.20.624446     4   9   5  45
8   10.1101/2024.11.20.624446     4  43  33  60
9   10.1101/2024.11.20.624446     4  44  90  48
10  10.1101/2024.11.20.624446     4  45  95  45
11  10.1101/2024.11.20.624446     4  39  18   0
12  10.1101/2024.11.20.624446     5 -45   0  45
13  10.1101/2024.11.20.624446     5  42   6  57
14  10.1101/2024.11.20.624446     5 -45   9  45
15  10.1101/2024.11.20.624446     5  45  15  54
16  10.1101/2024.11.20.624446     5 -63   0  24
17  10.1101/2024.11.20.624446     5  60  -3  24
18  10.1101/2024.11.20.624446     5 -51   9  27
19  10.1101/2024.11.20.624446     5  48 

In [4]:
#Paper 3
'''This code tries to extract coordinates from a research paper when the coordinates are not in tabular form
but appear inside paragraphs, in the specific format [x,y,z].'''

import os
import re
import requests
import fitz  # PyMuPDF
import pandas as pd

# Paper 3 Info
pdf_url = "https://www.biorxiv.org/content/10.1101/2024.11.20.624569v2.full.pdf"
doi = "10.1101/2024.11.20.624569"
file_path = "paper3.pdf"

# Download the PDF. Fail loudly on HTTP errors (and never hang forever) so that
# an error page is not saved under a .pdf name and parsed as an empty paper.
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()
with open(file_path, "wb") as f:
    f.write(response.content)

# A regular expression pattern to find the coordinates in the form [x,y,z]
mni_pattern = re.compile(r'\[\s*(-?\d{1,3})\s*,\s*(-?\d{1,3})\s*,\s*(-?\d{1,3})\s*\]')

results = []

doc = fitz.open(file_path)
try:
    for page_num in range(len(doc)):
        # PDF text often encodes the minus sign as U+2212, which the pattern's
        # ASCII '-' would not match; normalise it before searching.
        text = doc[page_num].get_text().replace("\u2212", "-")
        matches = mni_pattern.findall(text)

        for match in matches:
            x, y, z = map(int, match)
            # Valid MNI range and meaningful values
            if all(-100 <= val <= 100 for val in (x, y, z)):
                results.append({
                    "DOI": doi,
                    "Page": page_num + 1,  # report 1-based page numbers
                    "x": x,
                    "y": y,
                    "z": z
                })
finally:
    # Release the file handle even if text extraction fails part-way.
    doc.close()

df = pd.DataFrame(results)
if not df.empty:
    print(df)
else:
    print("No valid MNI coordinates found in the paper.")


                          DOI  Page   x   y   z
0   10.1101/2024.11.20.624569     9 -20   1 -34
1   10.1101/2024.11.20.624569     9 -28   3 -38
2   10.1101/2024.11.20.624569     9 -28  -2 -42
3   10.1101/2024.11.20.624569    10  27 -10 -28
4   10.1101/2024.11.20.624569    12   7  47  23
5   10.1101/2024.11.20.624569    12   5  57  -2
6   10.1101/2024.11.20.624569    12   0  33  11
7   10.1101/2024.11.20.624569    15  19 -10 -32
8   10.1101/2024.11.20.624569    15  -4  21  -6
9   10.1101/2024.11.20.624569    15  15  21 -26
10  10.1101/2024.11.20.624569    15   1 -44  27
11  10.1101/2024.11.20.624569    15  29   0 -42
12  10.1101/2024.11.20.624569    17  -8  55   3
13  10.1101/2024.11.20.624569    17  -4  39  -9
14  10.1101/2024.11.20.624569    17   3  33 -12
15  10.1101/2024.11.20.624569    86 -17   9 -12
16  10.1101/2024.11.20.624569    86  26   0 -16
17  10.1101/2024.11.20.624569    88   9  48  25


In [5]:
# Paper 4: This paper has tabular coordinates, but in a very unusual format, so extracting coordinates from them was hard.
# This code still needs correcting because it does not yet recognize all the tables in the paper, but it works to some extent.

import requests
import pdfplumber
import pandas as pd
import re
import sys
import os
import contextlib

# Suppress CropBox / MediaBox warnings
@contextlib.contextmanager
def suppress_stderr():
    """Context manager that points ``sys.stderr`` at ``os.devnull`` for its body.

    The previous ``sys.stderr`` object is put back when the block exits,
    whether normally or through an exception.
    """
    with open(os.devnull, "w") as sink, contextlib.redirect_stderr(sink):
        yield

# Setup
pdf_url = "https://www.biorxiv.org/content/10.1101/2025.04.04.647211v1.full.pdf"
doi = "10.1101/2025.04.04.647211"
file_path = "paper4_fullscan.pdf"

# Download PDF. Fail loudly on HTTP errors (and never hang forever) so that an
# error page is not saved under a .pdf name and parsed as an empty paper.
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()
with open(file_path, "wb") as f:
    f.write(response.content)

# Pattern: match ONLY integer triplets (no floats!)
int_pattern = re.compile(r"(?<![\d.])(-?\d{1,3})(?!\.)\s+(-?\d{1,3})(?!\.)\s+(-?\d{1,3})(?!\.)")

# Extract paper 4
results = []
with suppress_stderr():
    with pdfplumber.open(file_path) as pdf:
        for i, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            # PDF text often encodes the minus sign as U+2212, which the
            # pattern's ASCII '-' would not match; normalise it first.
            lines = text.replace("\u2212", "-").split("\n")
            for line in lines:
                matches = list(int_pattern.finditer(line))
                if matches:
                    x, y, z = map(int, matches[-1].groups())  # Get the last match on the line
                    if all(-100 < val < 100 for val in (x, y, z)):
                        results.append({
                            "DOI": doi,
                            "Page": i,
                            "x": x,
                            "y": y,
                            "z": z
                        })

# Output
df = pd.DataFrame(results)
if not df.empty:
    print(df)
else:
    print("No MNI coordinates found.")


                         DOI  Page   x   y   z
0  10.1101/2025.04.04.647211    15 -44 -18  54
1  10.1101/2025.04.04.647211    15 -38   8  48
2  10.1101/2025.04.04.647211    15   8 -10  76
3  10.1101/2025.04.04.647211    15 -44 -18  54
4  10.1101/2025.04.04.647211    15 -38   8  48
5  10.1101/2025.04.04.647211    16   6 -10  74
6  10.1101/2025.04.04.647211    16  38 -38  62
7  10.1101/2025.04.04.647211    18 -62 -18  34


In [6]:
'''Paper 5: This is the paper where the code was initially also picking up values from tables
that did not contain coordinates, just because they fit in the -100<n<100 range.
So I added another criterion that looks at the validity of all values in a table to decide whether they are actually coordinates or not.'''

import os
import contextlib
import requests
import camelot
import pandas as pd
import re

def read_tables(file_path: str, page: str | int):
    """Parse one PDF page with camelot's "stream" flavor, discarding console output.

    While camelot runs, ``sys.stdout`` and ``sys.stderr`` are redirected to
    ``os.devnull``.

    Args:
        file_path: Path of the PDF handed to ``camelot.read_pdf``.
        page: Page selector; converted with ``str()`` before being passed as
            the ``pages`` argument.

    Returns:
        The object returned by ``camelot.read_pdf``.
    """
    with open(os.devnull, 'w') as sink, \
            contextlib.redirect_stdout(sink), \
            contextlib.redirect_stderr(sink):
        tables = camelot.read_pdf(file_path, pages=str(page), flavor="stream")
    return tables

def is_valid_mni_triplet(x, y, z):
    """Return True when x, y and z each lie strictly between -100 and 100."""
    return all(-100 < val < 100 for val in (x, y, z))

def extract_valid_mni_table(table_df: pd.DataFrame, validity_threshold=0.8):
    """
    Only return MNI coordinates if a sufficient percentage of rows contain valid triplets.

    For every row, the non-null cells that parse as integers are collected in
    order (cells that do not parse are skipped). A row with at least three
    such integers is a "candidate"; it is "valid" when some window of three
    consecutive collected integers passes ``is_valid_mni_triplet``, and the
    first such window is recorded.

    Args:
        table_df: Table contents, one table row per DataFrame row.
        validity_threshold: Minimum ratio of valid rows to candidate rows
            required to accept the table.

    Returns:
        The list of recorded ``(x, y, z)`` tuples when the ratio reaches the
        threshold; otherwise an empty list (also when there are no candidate
        rows at all).
    """
    # PDF text frequently encodes the minus sign as U+2212 (or an en dash /
    # hyphen), which int() rejects; map those to ASCII '-' so negative values
    # are counted instead of being dropped (which would also skew the ratio).
    dash_to_minus = str.maketrans({"\u2212": "-", "\u2013": "-", "\u2010": "-"})

    mni_data = []
    valid_rows = 0
    candidate_rows = 0

    for _, row in table_df.iterrows():
        numeric = []
        for cell in row.dropna().tolist():
            if isinstance(cell, str):
                cell = cell.translate(dash_to_minus)
            try:
                numeric.append(int(cell))
            except (ValueError, TypeError):
                # Not an integer cell. Deliberately narrower than a bare
                # `except`, which would also swallow KeyboardInterrupt.
                continue

        if len(numeric) < 3:
            continue

        candidate_rows += 1
        for i in range(len(numeric) - 2):
            x, y, z = numeric[i], numeric[i + 1], numeric[i + 2]
            if is_valid_mni_triplet(x, y, z):
                mni_data.append((x, y, z))
                valid_rows += 1
                break  # one triplet per row

    if candidate_rows == 0:
        return []  # Not a numeric table

    if valid_rows / candidate_rows >= validity_threshold:
        return mni_data  # Accept table as valid
    return []  # Reject noisy table

# Paper-specific setup
pdf_url = "https://www.biorxiv.org/content/10.1101/2025.03.24.645080v2.full.pdf"
doi = "10.1101/2025.03.24.645080"
file_path = "paper5.pdf"

# Download PDF paper 5. Fail loudly on HTTP errors (and never hang forever) so
# that an error page is not saved under a .pdf name and parsed as an empty paper.
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()
with open(file_path, "wb") as f:
    f.write(response.content)

# Scan and validate tables over the page range chosen for this paper
results = []
for page in range(1, 48):
    try:
        tables = read_tables(file_path, page)
        for table in tables:
            coords = extract_valid_mni_table(table.df, validity_threshold=0.8)
            for x, y, z in coords:
                results.append({
                    "DOI": doi,
                    "Page": page,
                    "x": x,
                    "y": y,
                    "z": z
                })
    except Exception as e:
        # Report the failing page and carry on with the remaining pages.
        print(f"Error on page {page}: {e}")
        continue

# Output
df = pd.DataFrame(results)
if not df.empty:
    print("MNI coordinates found:")
    print(df)
else:
    print("No valid MNI coordinates found in this paper.")


MNI coordinates found:
                          DOI  Page   x   y   z
0   10.1101/2025.03.24.645080    42 -10 -99   8
1   10.1101/2025.03.24.645080    42   8 -83  -5
2   10.1101/2025.03.24.645080    42 -13 -83 -10
3   10.1101/2025.03.24.645080    42  12 -47 -70
4   10.1101/2025.03.24.645080    42   4  47 -65
5   10.1101/2025.03.24.645080    42 -55 -21  52
6   10.1101/2025.03.24.645080    42 -44 -26  63
7   10.1101/2025.03.24.645080    42 -42 -16  13
8   10.1101/2025.03.24.645080    42  39  -8  11
9   10.1101/2025.03.24.645080    42 -52 -70   8
10  10.1101/2025.03.24.645080    42 -52 -42 -10


In [7]:
#Paper 6. This is the paper that required me to change the flavor from 'stream' to 'lattice'.

import os
import contextlib
import requests
import camelot
import pandas as pd
import re

def read_tables(file_path: str, page: str | int):
    """Read the tables of a single PDF page using camelot's "lattice" flavor.

    ``sys.stdout`` and ``sys.stderr`` are pointed at ``os.devnull`` for the
    duration of the camelot call.

    Args:
        file_path: Path of the PDF passed to ``camelot.read_pdf``.
        page: Page selector, stringified before use as ``pages``.

    Returns:
        The result of ``camelot.read_pdf``.
    """
    with contextlib.ExitStack() as stack:
        sink = stack.enter_context(open(os.devnull, 'w'))
        stack.enter_context(contextlib.redirect_stdout(sink))
        stack.enter_context(contextlib.redirect_stderr(sink))
        return camelot.read_pdf(file_path, pages=str(page), flavor="lattice")

def extract_mni_coordinates_from_table(table_df: pd.DataFrame):
    """Pull at most one integer (x, y, z) triplet out of every table row.

    Each row is scanned left to right with a sliding window of three
    consecutive non-null cells. The first window whose three cells all parse
    as integers strictly between -100 and 100 is recorded, and the scan then
    moves on to the next row.

    Args:
        table_df: Table contents, one table row per DataFrame row.

    Returns:
        A list of ``(x, y, z)`` integer tuples in row order.
    """
    # PDF text frequently encodes the minus sign as U+2212 (or an en dash /
    # hyphen), which int() rejects; map those to ASCII '-' so negative
    # coordinates are not silently dropped.
    dash_to_minus = str.maketrans({"\u2212": "-", "\u2013": "-", "\u2010": "-"})

    def to_int(cell):
        if isinstance(cell, str):
            cell = cell.translate(dash_to_minus)
        return int(cell)

    mni_data = []
    for _, row in table_df.iterrows():
        cells = row.dropna().tolist()
        # Checking for 3 consecutive numeric values in row (neglecting the voxel count)
        for i in range(len(cells) - 2):
            try:
                x, y, z = to_int(cells[i]), to_int(cells[i + 1]), to_int(cells[i + 2])
            except (ValueError, TypeError):
                # Window contains a non-integer cell; slide one cell to the right.
                continue
            if all(-100 < val < 100 for val in (x, y, z)):
                mni_data.append((x, y, z))
                break  # one triplet per row
    return mni_data

# Paper 6 Info
pdf_url = "https://www.biorxiv.org/content/10.1101/2025.02.19.639155v1.full.pdf"
doi = "10.1101/2025.02.19.639155"
file_path = "paper6.pdf"

# Download PDF. Fail loudly on HTTP errors (and never hang forever) so that an
# error page is not saved under a .pdf name and parsed as an empty paper.
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()
with open(file_path, "wb") as f:
    f.write(response.content)

# Extracting MNI coordinates from the page range chosen for this paper
results = []
for page in range(1, 27):
    try:
        tables = read_tables(file_path, page)
        if tables:
            for table in tables:
                coords = extract_mni_coordinates_from_table(table.df)
                for x, y, z in coords:
                    results.append({
                        "DOI": doi,
                        "Page": page,
                        "x": x,
                        "y": y,
                        "z": z
                    })
    except Exception as e:
        # Report the failing page and carry on with the remaining pages.
        print(f"Page {page} failed: {e}")
        continue

# Output
df = pd.DataFrame(results)
if not df.empty:
    print(df)
else:
    print("No MNI coordinates found in this paper.")

                           DOI  Page   x   y   z
0    10.1101/2025.02.19.639155    21 -50  10  22
1    10.1101/2025.02.19.639155    21 -50  10  20
2    10.1101/2025.02.19.639155    21 -48  18  -2
3    10.1101/2025.02.19.639155    21 -50  16  -6
4    10.1101/2025.02.19.639155    21 -48  12  14
..                         ...   ...  ..  ..  ..
119  10.1101/2025.02.19.639155    26  -8  14  -2
120  10.1101/2025.02.19.639155    26  -8  -4   4
121  10.1101/2025.02.19.639155    26  -4 -18   6
122  10.1101/2025.02.19.639155    26 -14 -18  12
123  10.1101/2025.02.19.639155    26 -58 -42  10

[124 rows x 5 columns]


In [8]:
'''Paper 7 (i): Table extraction.
This paper has MNI coordinates in both paragraph and table form.
This code is just for the table. For this paper I experimented with the page range.
I tried to write code that can parse through all pages of a paper without hard-coding its page range.
All my older code uses a specific page range, so this code is useful if we simply want to feed in the paper
and not change the range each time we do that.'''

import os
import contextlib
import requests
import camelot
import pandas as pd
import fitz  # to detect page count

def read_tables(file_path: str, page: str | int):
    """Parse one PDF page with camelot's "stream" flavor, discarding console output.

    While camelot runs, ``sys.stdout`` and ``sys.stderr`` are redirected to
    ``os.devnull``.

    Args:
        file_path: Path of the PDF handed to ``camelot.read_pdf``.
        page: Page selector; converted with ``str()`` before being passed as
            the ``pages`` argument.

    Returns:
        The object returned by ``camelot.read_pdf``.
    """
    with open(os.devnull, 'w') as sink, \
            contextlib.redirect_stdout(sink), \
            contextlib.redirect_stderr(sink):
        tables = camelot.read_pdf(file_path, pages=str(page), flavor="stream")
    return tables

def extract_mni_coordinates_from_table(table_df: pd.DataFrame):
    """Pull at most one integer (x, y, z) triplet out of every table row.

    Each row is scanned left to right with a sliding window of three
    consecutive non-null cells. The first window whose three cells all parse
    as integers strictly between -100 and 100 is recorded, and the scan then
    moves on to the next row.

    Args:
        table_df: Table contents, one table row per DataFrame row.

    Returns:
        A list of ``(x, y, z)`` integer tuples in row order.
    """
    # PDF text frequently encodes the minus sign as U+2212 (or an en dash /
    # hyphen), which int() rejects; map those to ASCII '-' so negative
    # coordinates are not silently dropped.
    dash_to_minus = str.maketrans({"\u2212": "-", "\u2013": "-", "\u2010": "-"})

    def to_int(cell):
        if isinstance(cell, str):
            cell = cell.translate(dash_to_minus)
        return int(cell)

    mni_data = []
    for _, row in table_df.iterrows():
        cells = row.dropna().tolist()
        for i in range(len(cells) - 2):
            try:
                x, y, z = to_int(cells[i]), to_int(cells[i + 1]), to_int(cells[i + 2])
            except (ValueError, TypeError):
                # Window contains a non-integer cell; slide one cell to the right.
                continue
            if all(-100 < val < 100 for val in (x, y, z)):
                mni_data.append((x, y, z))
                break  # one triplet per row
    return mni_data

# ---- Paper Info ---- #
pdf_url = "https://www.biorxiv.org/content/10.1101/2024.11.07.622453v2.full.pdf"
doi = "10.1101/2024.11.07.622453"
file_path = "paper_generic.pdf"

# ---- Download PDF ---- #
# Fail loudly on HTTP errors (and never hang forever) so that an error page is
# not saved under a .pdf name and parsed as an empty paper.
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()
with open(file_path, "wb") as f:
    f.write(response.content)

# ---- Automatically determine number of pages ---- #
doc = fitz.open(file_path)
try:
    total_pages = len(doc)
finally:
    # Release the file handle before camelot opens the same file.
    doc.close()

# ---- Extract MNI coordinates from all tables ---- #
results = []
for page in range(1, total_pages + 1):
    try:
        tables = read_tables(file_path, page)
        for table in tables:
            coords = extract_mni_coordinates_from_table(table.df)
            for x, y, z in coords:
                results.append({
                    "DOI": doi,
                    "Page": page,
                    "x": x,
                    "y": y,
                    "z": z
                })
    except Exception as e:
        # Keep scanning the remaining pages, but report the failure instead of
        # hiding it (same message format as the other paper cells).
        print(f"Page {page} failed: {e}")
        continue

# ---- Output ---- #
df1 = pd.DataFrame(results)
if not df1.empty:
    print(df1)
else:
    print("No correct MNI coordinates found in this paper.")

                         DOI  Page   x   y   z
0  10.1101/2024.11.07.622453    24   3   4   5
1  10.1101/2024.11.07.622453    30  10 -22  69
2  10.1101/2024.11.07.622453    30  61 -36  26
3  10.1101/2024.11.07.622453    30   4 -61  45
4  10.1101/2024.11.07.622453    30  16 -56  19
5  10.1101/2024.11.07.622453    30  27  12  56
6  10.1101/2024.11.07.622453    30  45 -66  30


In [9]:
'''Paper 7 (ii): paragraph coordinates extraction.
In paragraph text, this paper's coordinates are not written in one specific format such as [x,y,z].
So I tried to write code that can catch coordinates in several different formats.'''
# where coordinates can appear in paragraph text in formats like:
# [x, y, z], (x, y, z), or just x, y, z
# But it is still unable to find coordinates written in the form 'x, y and z'. This paper does contain such wording, so I will have to tweak the code for that.
# Since this is one of the first papers I have found that contains coordinates in both table and paragraph format, I wasn't sure whether I should merge the two codes or not.
# So for now, I have done them separately.
# As you said, we might have to send each paper through several different codes, so it might work that way.

import os
import re
import requests
import fitz  # PyMuPDF
import pandas as pd

# Paper Info
pdf_url = "https://www.biorxiv.org/content/10.1101/2024.11.07.622453v2.full.pdf"
doi = "10.1101/2024.11.07.622453"
file_path = "paper7.pdf"

# Download the PDF. Fail loudly on HTTP errors (and never hang forever) so that
# an error page is not saved under a .pdf name and parsed as an empty paper.
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()
with open(file_path, "wb") as f:
    f.write(response.content)

# Regex to match coordinates in any of these forms:
# [x, y, z], (x, y, z), or x, y, z
# The lookbehind/lookahead keep the triplet from starting or ending in the
# middle of a longer number or a decimal (e.g. "2012, 5, 6" must not yield
# 12, 5, 6), while still allowing a sentence-ending period after z.
mni_pattern = re.compile(
    r'[\[\(]?\s*(?<![\d.])(-?\d{1,3})\s*,\s*(-?\d{1,3})\s*,\s*(-?\d{1,3})(?!\d|\.\d)\s*[\]\)]?'
)

results = []

doc = fitz.open(file_path)
try:
    for page_num in range(len(doc)):
        # PDF text often encodes the minus sign as U+2212, which the pattern's
        # ASCII '-' would not match; normalise it before searching.
        text = doc[page_num].get_text().replace("\u2212", "-")
        matches = mni_pattern.findall(text)

        for match in matches:
            x, y, z = map(int, match)
            if all(-100 <= val <= 100 for val in (x, y, z)):
                results.append({
                    "DOI": doi,
                    "Page": page_num + 1,  # report 1-based page numbers
                    "x": x,
                    "y": y,
                    "z": z
                })
finally:
    # Release the file handle even if text extraction fails part-way.
    doc.close()

df = pd.DataFrame(results)
if not df.empty:
    print(df)
else:
    print("No valid MNI coordinates found in the paper.")


                         DOI  Page   x   y   z
0  10.1101/2024.11.07.622453     1   1   3   4
1  10.1101/2024.11.07.622453    12  38 -69  35
2  10.1101/2024.11.07.622453    13  25 -11 -15
3  10.1101/2024.11.07.622453    14  39 -24 -12
4  10.1101/2024.11.07.622453    16  42 -70  40
5  10.1101/2024.11.07.622453    16  39 -70  35
