In [None]:
import pymupdf
import langextract as lx
import textwrap
from dotenv import load_dotenv
import os


In [None]:
# Set up the Google Gemini API key.
# load_dotenv() is called first so the lookup below can also see values it loads.
load_dotenv()

# None when LANGEXTRACT_API_KEY is not set.
api_key = os.getenv('LANGEXTRACT_API_KEY')

In [None]:
# Run this cell once

!curl https://arxiv.org/pdf/1704.05842 -o jet_substructure_paper.pdf

In [None]:
#Run this cell once
!curl https://arxiv.org/pdf/2107.11405 -o stable_diffusion_paper.pdf

In [None]:
# Paper 1: open the downloaded PDF with PyMuPDF.
doc = pymupdf.open(filename='jet_substructure_paper.pdf')

# Bare expression so the document's repr is shown as the cell output.
doc

In [None]:
# Paper 2: open the downloaded PDF with PyMuPDF.
doc2 = pymupdf.open(filename='stable_diffusion_paper.pdf')

# Bare expression so the document's repr is shown as the cell output.
doc2

In [None]:
# Paper 1: concatenate the text of every page, each followed by a newline.
alltext = "".join(page.get_text() + "\n" for page in doc)

print(alltext)

In [None]:
# Paper 2: concatenate the text of every page, each followed by a newline.
# NOTE: this rebinds `alltext`, so after this cell it holds paper 2's text.
alltext = "".join(page.get_text() + "\n" for page in doc2)

print(alltext)

In [None]:
# 1. Define the extraction prompt: the fields wanted per dataset mention,
#    followed by the normalization rules (numbers, byte sizes, approximations).
#    The unit rule spells out the binary prefixes (KiB/MiB/GiB) explicitly.
prompt = textwrap.dedent("""\

You are an expert extractor for experimental particle physics papers. Your job is to find and return all mentions of datasets and any numeric/size information about them. For each dataset mentioned in the input text produce one extraction object.

Extraction goals (for each dataset mention):

dataset_name: the dataset identifier or human name exactly as written in the text (examples: '/JetHT/Run2016B-03Feb2017-v1/AOD', 'Jet Primary Dataset', 'Dataset_X (DX-2020-v2)', 'Sample-Small').
file_count: the number of files (normalized to integer). If the text says 'about 420 files' set file_count to 420 and approximate=true.
number_of_events: number of events (normalized to integer). Parse commas, scientific notation, 'million', 'k', etc.
disk_size_bytes: disk size normalized to an integer number of bytes when a size is given or can be inferred. Prefer bytes; also provide human_readable_disk_size exactly as written.
compressed_uncompressed: text describing compressed vs uncompressed sizes if both are given (string or null).
inferred: true if the value was computed/inferred from other numeric parameters in the text (e.g., computed from luminosity and cross-section when both are given); false if directly stated.
inference_reason: short text explaining any inference or calculation performed (null if none).
extraction_text: the exact short span from the input that supports this extraction (copy verbatim).
confidence: a float 0.0 to 1.0 expressing your confidence (0.9+ for explicit exact numbers, 0.6 to 0.8 for approximate phrasing, 0.3–0.5 for inferred / weak evidence).
notes: any clarifying notes (null if none).

Rules:

Return one object per dataset mention (if the paragraph mentions multiple datasets, return multiple objects).
Always include the exact extraction_text (a short substring from the input that shows the dataset or numbers).
Normalize numeric strings:
'1.2 million' -> 1200000
'2.5 x 10^5' or '2.5e5' -> 250000
'1,234' -> 1234
Normalize sizes to bytes using: KB=10^3, MB=10^6, GB=10^9, TB=10^12 (and KiB/MiB/GiB as 2^10/2^20/2^30; state the conversion in inference_reason). If only a human unit is present, compute bytes and set disk_size_bytes accordingly.
If a disk size is not explicit but can be reliably inferred from other numbers present in the text, compute it, set inferred=true, and explain the calculation in inference_reason.
If text uses approximate words ('about', '~', 'approximately', 'roughly'), set approximate=true in notes or attributes and lower confidence.
If no dataset-specific numbers are present, return an empty list (i.e., []). Do not hallucinate.
Keep extraction_text short (one sentence or clause) and exact.
When multiple different numeric claims appear (e.g., 'raw size 3.6 TB; archived 900 GB'), provide them in compressed_uncompressed or notes, and include disk_size_bytes as the primary size if one is clearly primary (or null if ambiguous). Provide both human_readable and bytes when possible.

""")

# 2. Provide high-quality examples to guide the model.
#    Every extraction_text is a verbatim substring of its example text, and the
#    extractions of each example are listed in the order they appear in that text.
examples = [
    lx.data.ExampleData(
        text=(
            "Our jet substructure study is based on the Jet Primary Dataset [76], "
            "which is a subset of the full open data release with events that pass a predefined set "
            "of single-jet and multi-jet triggers. There are 1664 AOD files in the Jet Primary Dataset, corresponding to 20,022,826 events "
            "and 2.0 Terabytes of disk space."
        ),
        extractions=[
            lx.data.Extraction(
                extraction_class="data set",
                extraction_text="Jet Primary Dataset",
                attributes={"type": "data set"},
            ),
            lx.data.Extraction(
                extraction_class="file count",
                extraction_text="1664 AOD files",
                attributes={"type": "count"},
            ),
            lx.data.Extraction(
                extraction_class="number of events",
                extraction_text="20,022,826 events",
                attributes={"type": "count"},
            ),
            lx.data.Extraction(
                extraction_class="disk space",
                extraction_text="2.0 Terabytes of disk space",
                attributes={"type": "disk space"},
            ),
        ],
    ),

    # Additional diverse examples to improve robustness
    lx.data.ExampleData(
        text=(
            "The analysis uses an integrated sample from /MuonEG/Run2017C-31Mar2018-v1/AOD with roughly 4.5e5 events distributed over 320 files, "
            "with an on-disk footprint of approximately 210 GiB (compressed)."
        ),
        extractions=[
            lx.data.Extraction(
                extraction_class="data set",
                extraction_text="/MuonEG/Run2017C-31Mar2018-v1/AOD",
                attributes={"type": "data set"},
            ),
            lx.data.Extraction(
                extraction_class="number of events",
                extraction_text="roughly 4.5e5 events",
                attributes={"type": "count", "approx": True},
            ),
            lx.data.Extraction(
                extraction_class="file count",
                extraction_text="320 files",
                attributes={"type": "count"},
            ),
            lx.data.Extraction(
                extraction_class="disk space",
                extraction_text="approximately 210 GiB (compressed)",
                attributes={"type": "disk space", "units": "GiB"},
            ),
        ],
    ),

    lx.data.ExampleData(
        text=(
            "We processed 1.25 million events from Dataset_Z_v3 (internal name: DZ_v3) across 1,800 AOD files. The raw size was ~2.8 TB, reduced to ~700 GB after compression and archiving."
        ),
        extractions=[
            # The event count precedes the dataset name in this sentence.
            lx.data.Extraction(
                extraction_class="number of events",
                extraction_text="1.25 million events",
                attributes={"type": "count"},
            ),
            lx.data.Extraction(
                extraction_class="data set",
                extraction_text="Dataset_Z_v3 (internal name: DZ_v3)",
                attributes={"type": "data set"},
            ),
            lx.data.Extraction(
                extraction_class="file count",
                extraction_text="1,800 AOD files",
                attributes={"type": "count"},
            ),
            lx.data.Extraction(
                extraction_class="disk space",
                extraction_text="raw size was ~2.8 TB, reduced to ~700 GB after compression and archiving",
                attributes={"type": "disk space"},
            ),
        ],
    ),

    lx.data.ExampleData(
        text=(
            "The study analyzes a small specialized subset (Sample-Small) containing 2,340 events in 12 files; total disk usage is negligible (~15 MB) because the files are skimmed and only selected branches are kept."
        ),
        extractions=[
            lx.data.Extraction(
                extraction_class="data set",
                extraction_text="Sample-Small",
                attributes={"type": "data set"},
            ),
            lx.data.Extraction(
                extraction_class="number of events",
                extraction_text="2,340 events",
                attributes={"type": "count"},
            ),
            lx.data.Extraction(
                extraction_class="file count",
                extraction_text="12 files",
                attributes={"type": "count"},
            ),
            lx.data.Extraction(
                extraction_class="disk space",
                extraction_text="~15 MB",
                attributes={"type": "disk space", "units": "MB"},
            ),
        ],
    ),
]

# 3. Run the extraction on paper 1 (jet substructure).
# The text is rebuilt from `doc` instead of read from `alltext`: on a
# top-to-bottom run the paper 2 cell has already reassigned `alltext`, so
# using it here would send paper 2's text through this run.
input_text = "".join(page.get_text() + "\n" for page in doc)


result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash",
    api_key=api_key,
)


In [None]:
# Save the extraction result as JSONL, then render it to an HTML file.
lx.io.save_annotated_documents([result], output_name="jetsubstructureTest_extraction.jsonl", output_dir=".")

# Generate the interactive visualization
html_content = lx.visualize("jetsubstructureTest_extraction.jsonl")
# Single source of truth for the output path, so the message below cannot
# drift from the file that is actually written.
html_path = "jetsubstructureTest_extraction.html"
with open(html_path, "w", encoding='utf-8') as f:
    # The returned object may carry the markup on `.data`; otherwise it is written as-is.
    f.write(html_content.data if hasattr(html_content, 'data') else html_content)

print(f"Interactive visualization saved to {html_path}")

In [None]:
# Trying on a different paper
# 1. Define a concise prompt: what to extract, and that spans must be copied verbatim.
prompt = textwrap.dedent(
    "\n"
    "Extract mentions of names of datasets, size in file counts, number of events and disk space size in bytes from scientific texts.\n"
    "Use exact text for extractions. Do not paraphrase. \n"
    "\n"
)

# 2. Provide a high-quality example to guide the model.
#    Every extraction_text is a verbatim substring of the example text, and the
#    extractions are listed in the order they appear in that text.
examples = [
    lx.data.ExampleData(
        text=(
            "Our jet substructure study is based on the Jet Primary Dataset [76], "
            "which is a subset of the full open data release with events that pass a predefined set "
            "of single-jet and multi-jet triggers. There are 1664 AOD files in the Jet Primary Dataset, corresponding to 20,022,826 events "
            "and 2.0 Terabytes of disk space."
        ),
        extractions=[
            lx.data.Extraction(
                extraction_class="data set",
                extraction_text="Jet Primary Dataset",
                attributes={"type": "data set"},
            ),
            lx.data.Extraction(
                extraction_class="file count",
                extraction_text="1664 AOD files",
                attributes={"type": "count"},
            ),
            lx.data.Extraction(
                extraction_class="number of events",
                extraction_text="20,022,826 events",
                attributes={"type": "count"},
            ),
            lx.data.Extraction(
                extraction_class="disk space",
                extraction_text="2.0 Terabytes of disk space",
                attributes={"type": "disk space"},
            ),
        ],
    )
]

# 3. Run the extraction on paper 2.
# The text is built directly from `doc2` so this run does not depend on which
# cell most recently assigned the shared `alltext` variable.
input_text = "".join(page.get_text() + "\n" for page in doc2)


result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash",
    api_key=api_key,
)

In [None]:
# Save the extraction result as JSONL, then render it to an HTML file.
lx.io.save_annotated_documents([result], output_name="stableDiffusionTest_extraction.jsonl", output_dir=".")

# Generate the interactive visualization
html_content = lx.visualize("stableDiffusionTest_extraction.jsonl")
# Single source of truth for the output path, so the message below cannot
# drift from the file that is actually written.
html_path = "stableDiffusionTest_extraction.html"
with open(html_path, "w", encoding='utf-8') as f:
    # The returned object may carry the markup on `.data`; otherwise it is written as-is.
    f.write(html_content.data if hasattr(html_content, 'data') else html_content)

print(f"Interactive visualization saved to {html_path}")

In [None]:
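# Draft of a possibly better, more detailed prompt. Scratch note only: the string below is not assigned to any variable.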
'''
You are an expert extractor for experimental particle physics papers. Your job is to find and return all mentions of datasets and any numeric/size information about them. For each dataset mentioned in the input text produce one extraction object.

Extraction goals (for each dataset mention):

dataset_name: the dataset identifier or human name exactly as written in the text (examples: '/JetHT/Run2016B-03Feb2017-v1/AOD', 'Jet Primary Dataset', 'Dataset_X (DX-2020-v2)', 'Sample-Small').
file_count: the number of files (normalized to integer). If the text says 'about 420 files' set file_count to 420 and approximate=true.
number_of_events: number of events (normalized to integer). Parse commas, scientific notation, 'million', 'k', etc.
disk_size_bytes: disk size normalized to an integer number of bytes when a size is given or can be inferred. Prefer bytes; also provide human_readable_disk_size exactly as written.
compressed_uncompressed: text describing compressed vs uncompressed sizes if both are given (string or null).
inferred: true if the value was computed/inferred from other numeric parameters in the text (e.g., computed from luminosity and cross-section when both are given); false if directly stated.
inference_reason: short text explaining any inference or calculation performed (null if none).
extraction_text: the exact short span from the input that supports this extraction (copy verbatim).
confidence: a float 0.0 to 1.0 expressing your confidence (0.9+ for explicit exact numbers, 0.6 to 0.8 for approximate phrasing, 0.3–0.5 for inferred / weak evidence).
notes: any clarifying notes (null if none).

Rules:

Return one object per dataset mention (if the paragraph mentions multiple datasets, return multiple objects).
Always include the exact extraction_text (a short substring from the input that shows the dataset or numbers).
Normalize numeric strings:
'1.2 million' -> 1200000
'2.5 x 10^5' or '2.5e5' -> 250000
'1,234' -> 1234
Normalize sizes to bytes using: KB=10^3, MB=10^6, GB=10^9, TB=10^12 (and GiB/GiB as 2^30 if you prefer; state the conversion in inference_reason). If only a human unit is present, compute bytes and set disk_size_bytes accordingly.
If a disk size is not explicit but can be reliably inferred from other numbers present in the text, compute it, set inferred=true, and explain the calculation in inference_reason.
If text uses approximate words ('about', '~', 'approximately', 'roughly'), set approximate=true in notes or attributes and lower confidence.
If no dataset-specific numbers are present, return an empty list (i.e., []). Do not hallucinate.
Keep extraction_text short (one sentence or clause) and exact.
When multiple different numeric claims appear (e.g., 'raw size 3.6 TB; archived 900 GB'), provide them in compressed_uncompressed or notes, and include disk_size_bytes as the primary size if one is clearly primary (or null if ambiguous). Provide both human_readable and bytes when possible.
'''
