In [None]:
import pandas as pd
import pymupdf4llm
import docling
import json
import time

**Using Marker-PDF   **

In [None]:
from langchain_docling.loader import DoclingLoader

In [None]:
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode


In [None]:
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser

In [None]:
import os

In [None]:
import psutil
import os

# Get the memory usage of the current process (your kernel)
process = psutil.Process(os.getpid())
memory_gb = process.memory_info().rss / (1024 ** 3)  # Convert bytes to GB

print(f"ðŸ”¹ Current Kernel Memory: {memory_gb:.2f} GB")
print(f"ðŸ”¹ Total System RAM Used: {psutil.virtual_memory().percent}%")

In [None]:
!ollama pull qwen2.5:1.5b

In [None]:
config = {
    "output_format": "markdown",
    "use_llm" : True,
    "llm_service": "marker.services.ollama.OllamaService",
    "llm_model" : "qwen2.5:1.5b",
    "ollama_base_url": "http://localhost:11434"
    }

In [None]:
converter = PdfConverter(
    config = config,
    artifact_dict = create_model_dict(),
    processor_list = ConfigParser(config).get_processors(),
    renderer = ConfigParser(config).get_renderer(),
    llm_service = ConfigParser(config).get_llm_service()
)

In [None]:
from pathlib import Path

In [None]:
path = "p1.pdf"

rendered = converter(path)

In [None]:
text_1 = "I want to learn NLP!"

In [None]:
list(text.split())

In [None]:
import spacy

In [None]:
nlp = spacy.load("en_core_web_sm")

In [None]:
from spacy.lang.en.examples import sentences

In [None]:
doc = nlp(text_1)

In [None]:
print(doc.text)

In [None]:
for token in doc:
    print(token.text,token.pos)

In [None]:
import nltk 
from nltk.tokenize import word_tokenize

In [None]:
nltk.download('punkt_tab')

In [None]:
tokens = word_tokenize(text_1.lower())

print(tokens)

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
filtered_tokens = [word for word in tokens if word not in stop_words]

In [None]:
print(filtered_tokens)

In [None]:
from transformers import pipeline
import pymupdf
from pathlib import Path

In [None]:

def summarizer_fn(pdf: Path):
    document = Path(pdf)
    filename = document.stem
    summarizer = pipeline("summarization", model ="facebook/bart-large-cnn")
    summary = {}
    with pymupdf.open(document) as doc:
        for page in doc:
            text = page.get_text()
            text_list = text.split(".")
            
            Path(f"{filename}_{page}"+".txt").write_bytes(text.encode())
            for c in range(0,len(text_list),200):
                chunk = text_list[c:c+200]
                
                chunk_text = "".join(chunk)
                if len(chunk_text.strip()) < 50:
                    continue
                result = summarizer(chunk_text, max_length = 120, min_length = 30)

                summary[(page, c)] = result[0]['summary_text']


    
    return summary


In [None]:
import ollama

In [None]:
def summarizer_function(pdf:Path):
    document = Path(pdf)
    filename = document.stem

    summarizer = pipeline("summarization", model = "facebook/bart-large-cnn")

    full_text = ""

    with pymupdf.open(document) as doc:
        for page in doc:
            full_text += page.get_text() + " "

    Path(f"{filename}_full_text.txt").write_bytes(full_text.encode())
    print("file printed!")

    sentences = full_text.split(".")


    all_summaries = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence.split())

        if current_length + sentence_length>300:
            chunk_text = ". ".join(current_chunk) + ". "

            result = summarizer(chunk_text,max_length = 100, min_length = 30)

            all_summaries.append(result[0]['summary_text'])

            current_chunk = []
            current_length = 0

        current_chunk.append(sentence)
        current_length += sentence_length


    if current_chunk:
        chunk_text = " . ".join(current_chunk)

        if len(chunk_text.split()) > 30:
            result = summarizer(chunk_text,max_length = 100, min_length = 30)
            all_summaries.append(result[0]['summary_text'])


    final_summary = "\n\n".join(all_summaries)


    return final_summary




In [30]:
images = []
with pymupdf.open("p1.pdf") as doc:
    for i, page in enumerate(doc):
        images.append((i,page.get_images()))

In [31]:
print(images[1][1])

[(217, 0, 1601, 814, 8, 'DeviceRGB', '', 'x39', 'FlateDecode')]


In [32]:
import requests

In [66]:
def parse_pdf(pdf_path:Path):
    url = "http://localhost:8070/api/processFulltextDocument"


    with open(pdf_path, 'rb') as file:
        files = {'input' : (pdf_path.name, file,'application/pdf')}

        print(f"sending {pdf_path} to local GORBID server!!")

        response = requests.post(url, files=files)

    if response.status_code == 200:
        print("PDF parsed.")
        return response.text

    else:
        print("something went wrong!")
        return None

In [67]:
xml_data = parse_pdf(Path("p1.pdf"))

sending p1.pdf to local GORBID server!!
PDF parsed.


In [68]:
if xml_data:
    print(xml_data)

<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Disorder information from conductance: a quantum inverse problem</title>
				<funder ref="#_zcv5fjK">
					<orgName type="full">FAPERJ</orgName>
				</funder>
				<funder ref="#_5bRrFex">
					<orgName type="full">FAPESP</orgName>
				</funder>
				<funder ref="#_Xjcr4bT #_G6GtHSa">
					<orgName type="full">unknown</orgName>
				</funder>
				<funder ref="#_jpNeVCd">
					<orgName type="full">CNPq</orgName>
				</funder>
				<funder ref="#_PxZ2udk">
					<orgName type="full">ICTP-Simons Foundation Associate Scheme</orgName>
				</funder>
			</titleStmt>
			<publ

In [69]:
from bs4 import BeautifulSoup

In [70]:
soup = BeautifulSoup(xml_data,"xml")

In [73]:
equations = []

for formula in soup.find_all("formula"):
    equations.append(formula.get_text(strip = True))

paragraphs = []

for p in soup.find_all("p"):
    paragraphs.append(p.get_text(strip= True))


print("Extraction complete... ")

Extraction complete... 


In [74]:
if equations:
    print(len(equations))

11


In [75]:
if paragraphs:
    print(len(paragraphs))

49


In [80]:
figures = []

for fig in soup.find_all("graphic"):
    cap = fig.get_text(strip=True)

    coords = fig.get("coords")


    if coords:
        figures.append({"caption": cap, "coords": coords})
        print(f"found figure {cap} with coords {coords}")
        


found figure  with coords 10,148.97,76.04,107.60,55.88


In [72]:
figures

[]

In [None]:
for fig in soup.find_all("figure"):
    cap = fig.get_text(strip=True)
    graphic_tag = fig.find("graphic")
    if graphic_tag:
        coords = graphic_tag.get("coords")

        print(f"coords: {coords}")
    

coords: 10,148.97,76.04,107.60,55.88


In [105]:
figures_data = []
captions_data = []
coords_data = []

for fig in soup.find_all('figure'):
    figures_data.append(fig)
    cap = fig.get_text(strip=True)
    
    captions_data.append(cap)
    print(cap)


    graphic_tag = fig.find("graphic")
    print(graphic_tag)


Fig. 2 (FIG. 2 .22FIG. 2. (a) Absolute value of the conductance deviation |âˆ†Î“(E, n)| as a function of the impurity concentration n (in percentage) for a fixed energy (E = 0.42t). (b) Misfit function Ï‡ (in arbitrary units) as a function of n. The vertical (red) dashed line on the lower part of the panel indicates the real number of impurities used to generate the sample conductance, which coincides with the minimum of Ï‡(n). The (black) dotted line in the upper part of the panel is the approximate concentration n * . Integration limits were E-= 0.5t and E+ = 1.5t and the solid (blue) line is simply a guide to the eyes.
None
FIG. 3 .3FIG. 3. 2D contour plots of the logarithm of the misfit function. Dashed lines in the plots indicate the values of the respective quantities used in the underlying Hamiltonians. (a): Ï‡ as a function of n and . (b): Ï‡ as a function of na and n b .
None
FIG. 4 .4FIG. 4. (a) The (red) solid line is the conductance spectrum Î“(E) obtained from a specific d

In [106]:
figures_data

[<figure xml:id="fig_0" xmlns="http://www.tei-c.org/ns/1.0"><head>Fig. 2 (FIG. 2 .</head><label>22</label><figDesc>FIG. 2. (a) Absolute value of the conductance deviation |âˆ†Î“(E, n)| as a function of the impurity concentration n (in percentage) for a fixed energy (E = 0.42t). (b) Misfit function Ï‡ (in arbitrary units) as a function of n. The vertical (red) dashed line on the lower part of the panel indicates the real number of impurities used to generate the sample conductance, which coincides with the minimum of Ï‡(n). The (black) dotted line in the upper part of the panel is the approximate concentration n * . Integration limits were E-= 0.5t and E+ = 1.5t and the solid (blue) line is simply a guide to the eyes.</figDesc></figure>,
 <figure xml:id="fig_1" xmlns="http://www.tei-c.org/ns/1.0"><head>FIG. 3 .</head><label>3</label><figDesc>FIG. 3. 2D contour plots of the logarithm of the misfit function. Dashed lines in the plots indicate the values of the respective quantities used i

In [104]:
for tag in soup.find_all(["graphic", "inline-graphic", "media", "img", "figures"]):
    print("Tag:", tag.name)
    print("Attributes:", tag.attrs)
    print("-" * 50)

Tag: graphic
Attributes: {'coords': '10,148.97,76.04,107.60,55.88', 'type': 'bitmap'}
--------------------------------------------------


In [107]:
import fitz # PyMuPDF

doc = fitz.open("p1.pdf")
image_count = 0

for page_index in range(len(doc)):
    page = doc[page_index]
    image_list = page.get_images(full=True)
    image_count += len(image_list)
    
print(f"Total image objects found by PyMuPDF: {image_count}")

Total image objects found by PyMuPDF: 7


In [108]:
doc = fitz.open("p1.pdf")

for page_index in range(len(doc)):
    page = doc[page_index]
    # We search for "Fig" to catch both "Figure" and "Fig."
    text_instances = page.search_for("Fig")
    
    if text_instances:
        print(f"Page {page_index + 1}: Found {len(text_instances)} figure references.")
        for inst in text_instances:
            print(f"  Location: {inst}")

Page 1: Found 1 figure references.
  Location: Rect(413.6850891113281, 425.0692443847656, 427.9482727050781, 437.60809326171875)
Page 2: Found 7 figure references.
  Location: Rect(53.88138198852539, 261.74224853515625, 70.45907592773438, 273.0362548828125)
  Location: Rect(171.7843780517578, 411.5881042480469, 186.0475616455078, 424.126953125)
  Location: Rect(104.94381713867188, 579.924072265625, 119.20700073242188, 592.4629516601562)
  Location: Rect(164.65286254882812, 627.767578125, 178.9160614013672, 640.3064575195312)
  Location: Rect(426.96148681640625, 461.8426818847656, 441.22467041015625, 474.38153076171875)
  Location: Rect(439.71624755859375, 604.7307739257812, 453.97943115234375, 617.2696533203125)
  Location: Rect(394.1658020019531, 688.4561157226562, 408.4289855957031, 700.9949951171875)
Page 3: Found 5 figure references.
  Location: Rect(53.881385803222656, 261.47515869140625, 70.4590835571289, 272.7691650390625)
  Location: Rect(66.27066040039062, 604.7307739257812, 8