In [15]:
!pip install pymupdf
!pip install PyPDF2
!pip install pdfplumber
!pip install "camelot-py[cv]"





In [18]:
import fitz  # PyMuPDF
import re
import pandas as pd

# --- Paper Info ---
# Input PDF plus the metadata echoed in the report header.
file_path = "lab trial code for caption plus tables.pdf"
paper_title = "Trial for Caption Identification"
doi = "11.222.333.444"

# --- Regex patterns ---
# A table mention such as "Table 3", "table 1.2" or "TABLE4". The leading \b
# keeps the pattern from firing inside longer words ("stable 3", "vegetable 2").
table_pattern = re.compile(r"\bTable\s*\d+(\.\d+)?", re.IGNORECASE)
# Same mention followed by an optional ":", "." or "-" separator; group(3) is
# whatever text follows on the same line (the caption candidate).
caption_pattern = re.compile(r"(\bTable\s*\d+(\.\d+)?[:.\-]?\s*(.*))", re.IGNORECASE)
# Three integers of 1-3 digits separated by whitespace and/or commas.
# The lookbehind/lookahead reject fragments of longer numbers or decimals,
# e.g. "1012 56 78", "0.12 34 56" and "12 34 56.7" no longer yield a triple.
coord_pattern = re.compile(
    r"(?<![\d.])(-?\d{1,3})[\s,]+(-?\d{1,3})[\s,]+(-?\d{1,3})(?!\d|\.\d)"
)

# --- Open PDF ---
# Scan every page for table mentions and build
#   results[table_id] = {"Caption": str, "Pages": set of 1-based page numbers,
#                        "Coordinates": list of (x, y, z) int tuples}

# Lines inspected on each side of a table mention when looking for coordinates.
COORD_WINDOW = 10
# Maximum number of following lines merged into a caption when the mention
# line itself carries no caption text.
MAX_CAPTION_LINES = 3
# Inclusive range every component of a coordinate triple must fall in.
COORD_MIN, COORD_MAX = -120, 120

results = {}

# The context manager closes the document once all pages have been read.
with fitz.open(file_path) as doc:
    for page_num, page in enumerate(doc, start=1):
        text = page.get_text("text")
        lines = text.split("\n")

        for i, line in enumerate(lines):
            # Detect table mentions (search once and reuse the match)
            table_match = table_pattern.search(line)
            if not table_match:
                continue

            # Normalise the key so "Table1", "TABLE  1" and "table 1" all
            # merge into the same "Table 1" entry.
            table_number = table_match.group(0)[len("Table"):].strip()
            table_id = f"Table {table_number}"

            # --- Find caption ---
            caption = ""
            cap_match = caption_pattern.search(line)
            if cap_match and cap_match.group(3):
                caption = cap_match.group(3).strip()

            if not caption:
                # Combine the following lines if there is no caption on the
                # same line; stop at another table mention or a very short line.
                caption_lines = []
                for next_line in lines[i + 1 : i + 1 + MAX_CAPTION_LINES]:
                    next_line = next_line.strip()
                    if table_pattern.search(next_line) or len(next_line) < 5:
                        break
                    caption_lines.append(next_line)
                caption = " ".join(caption_lines).strip()

            # --- Extract nearby coordinates ---
            # The window is symmetric: COORD_WINDOW lines before and after
            # the mention (the upper bound is exclusive, hence the +1).
            # NOTE(review): triples are matched within a single line only; if
            # the text extraction puts each table cell on its own line,
            # nothing will match here -- confirm against the PDF layout.
            coords = []
            window_start = max(0, i - COORD_WINDOW)
            window_end = min(len(lines), i + COORD_WINDOW + 1)
            for nearby_line in lines[window_start:window_end]:
                for match in coord_pattern.findall(nearby_line):
                    x, y, z = map(int, match)
                    if all(COORD_MIN <= val <= COORD_MAX for val in (x, y, z)):
                        coords.append((x, y, z))

            # --- Merge if table already seen ---
            if table_id not in results:
                results[table_id] = {
                    "Caption": caption,
                    "Pages": {page_num},
                    "Coordinates": coords
                }
            else:
                # Add the new page and coordinates to the existing entry
                results[table_id]["Pages"].add(page_num)
                results[table_id]["Coordinates"].extend(coords)
                # Prefer non-empty caption if previous was blank
                if not results[table_id]["Caption"] and caption:
                    results[table_id]["Caption"] = caption

# --- Output formatting ---
# Header: paper metadata, followed by a blank line.
print(f"Title: {paper_title}")
print(f"DOI: {doi}\n")

# One section per entry of `results`, in the order the entries were stored.
for table_id, data in results.items():
    print(f"{table_id}:")

    caption_text = data["Caption"] or "(No caption found)"
    print(f"Caption: {caption_text}")

    page_list = ", ".join(str(page_no) for page_no in sorted(data["Pages"]))
    print(f"Page Numbers: {page_list}")
    print("Coordinates:\n")

    if not data["Coordinates"]:
        print("(No coordinates found)")
    else:
        # Tabulate the triples and drop repeated rows before printing.
        coord_frame = pd.DataFrame(data["Coordinates"], columns=["x", "y", "z"])
        df = coord_frame.drop_duplicates()
        print(df.to_string(index=False))

    # Visual separator between tables.
    print(f"\n{'-' * 60}\n")


Title: Trial for Caption Identification
DOI: 11.222.333.444

Table 1.1:
Caption: This is to test if code can capture this as caption.
Page Numbers: 3, 4
Coordinates:

(No coordinates found)

------------------------------------------------------------

Table 1.2:
Caption: Another caption for this table. A different format.
Page Numbers: 4
Coordinates:

(No coordinates found)

------------------------------------------------------------



In [19]:
import fitz  # PyMuPDF
import camelot
import re
import pandas as pd
import os

# --- PDF Info ---
# Input PDF and the DOI echoed in the report header.
file_path = "lab trial code for caption plus tables.pdf"
doi = "11.222.333.444"

# --- Regex patterns ---
# A table mention such as "Table 3", "table 1.2" or "TABLE4". The leading \b
# keeps the pattern from firing inside longer words ("stable 3", "vegetable 2").
table_pattern = re.compile(r"\bTable\s*\d+(\.\d+)?", re.IGNORECASE)
# Same mention followed by an optional ":", "." or "-" separator; group(3) is
# whatever text follows on the same line (the caption candidate).
caption_pattern = re.compile(r"(\bTable\s*\d+(\.\d+)?[:.\-]?\s*(.*))", re.IGNORECASE)

# --- Extract Abstract as Title ---
# Reads the report title from the first page, then scans every page for table
# mentions and builds
#   results[table_id] = {"Caption": str, "Pages": set of 1-based page numbers,
#                        "Coordinates": [] (left empty by this step)}

# Maximum number of following lines merged into a caption when the mention
# line itself carries no caption text.
MAX_CAPTION_LINES = 2

results = {}

# The context manager closes the document once all pages have been read.
with fitz.open(file_path) as doc:
    # A document without pages has no doc[0]; use empty text instead of raising.
    first_page_text = doc[0].get_text("text") if len(doc) else ""

    # The text following the word "abstract" is used as the title.
    abstract_match = re.search(r'(?i)abstract[:\-]?\s*(.*)', first_page_text)
    title = abstract_match.group(1).strip() if abstract_match else ""
    if not title:
        # fallback: first line of the first page, capped at 80 characters
        title = first_page_text.split("\n")[0][:80]

    for page_num, page in enumerate(doc, start=1):
        text = page.get_text("text")
        lines = text.split("\n")

        for i, line in enumerate(lines):
            # Detect table references (search once and reuse the match)
            table_match = table_pattern.search(line)
            if not table_match:
                continue

            # Normalise the key so "Table1", "TABLE  1" and "table 1" all
            # merge into the same "Table 1" entry.
            table_number = table_match.group(0)[len("Table"):].strip()
            table_id = f"Table {table_number}"

            # Find caption: first on the mention line itself ...
            caption = ""
            cap_match = caption_pattern.search(line)
            if cap_match and cap_match.group(3):
                caption = cap_match.group(3).strip()

            # ... otherwise from the following lines, stopping at another
            # table mention or a very short line.
            if not caption:
                caption_lines = []
                for next_line in lines[i + 1 : i + 1 + MAX_CAPTION_LINES]:
                    next_line = next_line.strip()
                    if table_pattern.search(next_line) or len(next_line) < 5:
                        break
                    caption_lines.append(next_line)
                caption = " ".join(caption_lines).strip()

            # Merge results if same table reappears
            if table_id not in results:
                results[table_id] = {
                    "Caption": caption,
                    "Pages": {page_num},
                    "Coordinates": []
                }
            else:
                results[table_id]["Pages"].add(page_num)
                # Keep the first non-empty caption seen for this table
                if not results[table_id]["Caption"] and caption:
                    results[table_id]["Caption"] = caption

# --- Extract tables using Camelot ---
def _coordinate_triples(table_df):
    """Return every run of three consecutive integer cells found in a table.

    Each row of ``table_df`` is scanned with a sliding window of width three.
    A window is kept when all three cells convert with ``int()`` and each
    value lies in the inclusive range -120..120.

    Returns a list of ``(x, y, z)`` int tuples in row order.
    """
    triples = []
    for _, row in table_df.iterrows():
        row_vals = row.dropna().tolist()
        for start in range(len(row_vals) - 2):
            try:
                x, y, z = (int(val) for val in row_vals[start:start + 3])
            except (ValueError, TypeError):
                # Non-numeric cell inside this window; try the next one.
                continue
            if all(-120 <= val <= 120 for val in (x, y, z)):
                triples.append((x, y, z))
    return triples


# Several table ids can mention the same page, so each page is parsed by
# Camelot only once and the result is reused.
page_triples = {}

# NOTE(review): every table Camelot finds on a page is credited to each table
# id mentioned on that page, so ids sharing a page receive the same triples.
for table_id, data in results.items():
    for page in data["Pages"]:
        if page not in page_triples:
            found = []
            try:
                tables = camelot.read_pdf(file_path, pages=str(page), flavor="stream")
                for table in tables:
                    found.extend(_coordinate_triples(table.df))
            except Exception as exc:
                # Best effort: report the failing page and carry on with the rest.
                print(f"Warning: table extraction failed on page {page}: {exc}")
            page_triples[page] = found
        data["Coordinates"].extend(page_triples[page])

# --- Output formatted ---
# Header: extracted title and DOI, followed by a blank line.
print(f"Title: {title}")
print(f"DOI: {doi}\n")

# One section per entry of `results`, in the order the entries were stored.
for table_id, data in results.items():
    print(f"{table_id}:")

    caption_text = data["Caption"] or "(No caption found)"
    print(f"Caption: {caption_text}")

    page_list = ", ".join(str(page_no) for page_no in sorted(data["Pages"]))
    print(f"Page Numbers: {page_list}")
    print("Coordinates:\n")

    if not data["Coordinates"]:
        print("(No coordinates found)")
    else:
        # Tabulate the triples and drop repeated rows before printing.
        coord_frame = pd.DataFrame(data["Coordinates"], columns=["x", "y", "z"])
        df_coords = coord_frame.drop_duplicates()
        print(df_coords.to_string(index=False))

    # Visual separator between tables.
    print(f"\n{'-' * 60}\n")


CropBox missing from /Page, defaulting to MediaBox
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
CropBox missing from /Page, defaulting to MediaBox
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)
CropBox missing from /Page, defaulting to MediaBox
  cols, rows, v_s, h_s = self._generate_columns_and_rows(bbox, user_cols)


Title: trial for caption identification
DOI: 11.222.333.444

Table 1.1:
Caption: This is to test if code can capture this as caption.
Page Numbers: 3, 4
Coordinates:

 x  y  z
15 78 54
20 73 92
25 84 94
30 36 46
34 82 92
12 56 78
 5 45 68
57 76 34
35 35  8
76 74 44
66 65 55
77 87 66
44 46 65

------------------------------------------------------------

Table 1.2:
Caption: Another caption for this table. A different format.
Page Numbers: 4
Coordinates:

 x  y  z
12 56 78
 5 45 68
57 76 34
35 35  8
76 74 44
66 65 55
77 87 66
44 46 65

------------------------------------------------------------

