In [1]:
import os
from docsearch.figure_extraction import DocumentPageAnalyzer
from IPython.display import display
from rich.console import Console
from rich.markdown import Markdown
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

console = Console()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ROOT_DIR = Path(".")
DATA_DIR = ROOT_DIR / "data"
MODEL_WEIGHTS = DATA_DIR / "doclayout_yolo_docstructbench_imgsz1024.pt"
SAMPLES_DIR = DATA_DIR / "samples"
sample_filepaths = list(SAMPLES_DIR.glob("*.png"))

print(f"ROOT_DIR: {ROOT_DIR}")
print(f"DATA_DIR: {DATA_DIR}")
print(f"SAMPLES_DIR: {SAMPLES_DIR}")
for filepath in sample_filepaths:
    print(f"sample_filepath: {filepath}")
    
    
table_filepaths = []
table_filepaths.append(sample_filepaths[0].with_suffix("") / "sample_1" / "table" / "table_000.png")
table_filepaths.append(sample_filepaths[0].with_suffix("") / "sample_1" / "table" / "table_001.png")

table_captions = []
table_captions.append(sample_filepaths[0].with_suffix("") / "sample_1" / "table" / "table_000_caption.png")
table_captions.append(sample_filepaths[0].with_suffix("") / "sample_1" / "table" / "table_001_caption.png")
print(f"table_filepaths: \n{table_filepaths}")


figure_filepaths = []
figure_filepaths.append(sample_filepaths[0].with_suffix("") / "sample_1" / "figure" / "figure_000.png")

print(f"figure_filepaths: \n{figure_filepaths}")

figure_captions = []
figure_captions.append(sample_filepaths[0].with_suffix("") / "sample_1" / "figure" / "figure_000_caption.png")

print(f"figure_captions: \n{figure_captions}")


ROOT_DIR: .
DATA_DIR: data
SAMPLES_DIR: data\samples
sample_filepath: data\samples\sample_1.png
sample_filepath: data\samples\sample_2.png
sample_filepath: data\samples\sample_3.png
sample_filepath: data\samples\sample_4.png
sample_filepath: data\samples\sample_5.png
table_filepaths: 
[WindowsPath('data/samples/sample_1/sample_1/table/table_000.png'), WindowsPath('data/samples/sample_1/sample_1/table/table_001.png')]
figure_filepaths: 
[WindowsPath('data/samples/sample_1/sample_1/figure/figure_000.png')]
figure_captions: 
[WindowsPath('data/samples/sample_1/sample_1/figure/figure_000_caption.png')]


## Table Extraction

In [11]:
from docsearch.core import Table

table= Table.from_image(table_filepaths[0],caption=table_captions[0])
print(table)
md = Markdown(table._md)
console.print(md, crop=False)

Processing async: <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1571x361 at 0x218F86DDF90>
Making API call (Model: gemini-2.0-flash)...
Processing async: <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1010x48 at 0x218F86DE800>
Making API call (Model: gemini-2.0-flash)...
Table(
df=
  Task/Dataset   BBBP  Tox21  ToxCast  SIDER  ClinTox   BACE   ESOL  FreeSolv
0     KCL(GIN)  0.954  0.854    0.748  0.660    0.945  0.932  0.580     0.856
1     KCL(GAT)  0.956  0.857    0.750  0.663    0.942  0.930  0.588     0.860
2     KCL(GCN)  0.956  0.856    0.757  0.666    0.945  0.934  0.582     0.854
3   KCL(R-GCN)  0.936  0.830    0.735  0.637    0.948  0.898  0.780     1.236
4    KCL(MPNN)  0.940  0.835    0.738  0.640    0.950  0.895  0.743     1.111
5   KCL(KMPNN)  0.961  0.859    0.740  0.671    0.958  0.924  0.732     0.795, 
summary=This table presents the performance of different KCL models (GIN, GAT, GCN, R-GCN, MPNN, and KMPNN) on various classification and regression tasks. 

### Asynchronous Table Extraction

In [4]:
# import asyncio
# tasks = [asyncio.create_task(Table.from_image_async(image)) for image in table_filepaths]
# results = await asyncio.gather(*tasks)

# for result in results:
#     print(result)
#     md = Markdown(result._md)
#     console.print(md, crop=False)

### Testing Table Class

In [5]:
table.df

Unnamed: 0,Task/Dataset,BBBP,Tox21,ToxCast,SIDER,ClinTox,BACE,ESOL,FreeSolv
0,KCL(GIN),0.954,0.854,0.748,0.66,0.945,0.932,0.58,0.856
1,KCL(GAT),0.956,0.857,0.75,0.663,0.942,0.93,0.588,0.86
2,KCL(GCN),0.956,0.856,0.757,0.666,0.945,0.934,0.582,0.854
3,KCL(R-GCN),0.936,0.83,0.735,0.637,0.948,0.898,0.78,1.236
4,KCL(MPNN),0.94,0.835,0.738,0.64,0.95,0.895,0.743,1.111
5,KCL(KMPNN),0.961,0.859,0.74,0.671,0.958,0.924,0.732,0.795


In [6]:
table.to_json(filepath=table_filepaths[0].with_suffix(".json"))

'{\n  "df": [\n    {\n      "Task/Dataset": "KCL(GIN)",\n      "BBBP": 0.954,\n      "Tox21": 0.854,\n      "ToxCast": 0.748,\n      "SIDER": 0.66,\n      "ClinTox": 0.945,\n      "BACE": 0.932,\n      "ESOL": 0.58,\n      "FreeSolv": 0.856\n    },\n    {\n      "Task/Dataset": "KCL(GAT)",\n      "BBBP": 0.956,\n      "Tox21": 0.857,\n      "ToxCast": 0.75,\n      "SIDER": 0.663,\n      "ClinTox": 0.942,\n      "BACE": 0.93,\n      "ESOL": 0.588,\n      "FreeSolv": 0.86\n    },\n    {\n      "Task/Dataset": "KCL(GCN)",\n      "BBBP": 0.956,\n      "Tox21": 0.856,\n      "ToxCast": 0.757,\n      "SIDER": 0.666,\n      "ClinTox": 0.945,\n      "BACE": 0.934,\n      "ESOL": 0.582,\n      "FreeSolv": 0.854\n    },\n    {\n      "Task/Dataset": "KCL(R-GCN)",\n      "BBBP": 0.936,\n      "Tox21": 0.83,\n      "ToxCast": 0.735,\n      "SIDER": 0.637,\n      "ClinTox": 0.948,\n      "BACE": 0.898,\n      "ESOL": 0.78,\n      "FreeSolv": 1.236\n    },\n    {\n      "Task/Dataset": "KCL(MPNN)",\

In [7]:
table.caption

Caption(
md=Table 11: Results comparison with different graph encoders., 
image=<PIL.PngImagePlugin.PngImageFile image mode=RGB size=1010x48 at 0x218F8376A70>)

In [8]:
table.description
md = Markdown(table.description)
console.print(md, crop=False)

## Figure Extraction

In [12]:
from docsearch.core import Figure

figure = Figure.from_image(figure_filepaths[0], caption=figure_captions[0])
print(figure)
md = Markdown(figure._md)

Processing async: <PIL.PngImagePlugin.PngImageFile image mode=RGB size=957x386 at 0x218F83D2F80>
Making API call (Model: gemini-2.0-flash)...
Processing async: <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1010x48 at 0x218F86DE050>
Making API call (Model: gemini-2.0-flash)...
Figure(
md=| Feature          | BACE | ToxCast |
|------------------|------|---------|
| Ionization       | Yes  | Yes     |
| Abundance        | No   | No      |
| Electronegativity | Yes  | No      |
| Electron Affinity  | No   | No      |
| Boiling Point    | No   | Yes     |
| Periodic         | No   | Yes     |
| State            | Yes  | Yes     |
| Heat             | Yes  | Yes     |
| Family           | No   | Yes     |
| Weight           | No   | No      |
| Conductivity     | Yes  | No      |
| Density          | Yes  | No      |
| Metallicity      | Yes  | Yes     |
| Radius           | No   | No      |
| Melting Point    | No   | No      |, 
summary=The table compares the features used in BACE a

In [14]:
print(figure.description)


| Feature          | BACE | ToxCast |
|------------------|------|---------|
| Ionization       | Yes  | Yes     |
| Abundance        | No   | No      |
| Electronegativity | Yes  | No      |
| Electron Affinity  | No   | No      |
| Boiling Point    | No   | Yes     |
| Periodic         | No   | Yes     |
| State            | Yes  | Yes     |
| Heat             | Yes  | Yes     |
| Family           | No   | Yes     |
| Weight           | No   | No      |
| Conductivity     | Yes  | No      |
| Density          | Yes  | No      |
| Metallicity      | Yes  | Yes     |
| Radius           | No   | No      |
| Melting Point    | No   | No      |

Table 11: Results comparison with different graph encoders.

*Summary: The table compares the features used in BACE and ToxCast models. It indicates whether each feature is used in each model with a 'Yes' or 'No'.*


## Page Extraction

In [None]:
from docsearch.core import Page

page = Page.from_image(sample_filepaths[0])
print(page)
md = Markdown(page._md)
print(page.description)
