In [1]:
import pymupdf
import langextract as lx
import textwrap
from dotenv import load_dotenv
import os


In [2]:
# Load variables from a local .env file into the process environment, then
# read the key that is later passed to lx.extract(). The value is None when
# LANGEXTRACT_API_KEY is not set.
load_dotenv()
api_key = os.getenv('LANGEXTRACT_API_KEY')

In [None]:
# Run this cell once

!curl https://arxiv.org/pdf/1704.05842 -o jet_substructure_paper.pdf

In [3]:
# Open the PDF saved by the download cell; the bare `doc` on the last line
# makes the notebook display the document's repr.
pdf_path = 'jet_substructure_paper.pdf'
doc = pymupdf.open(pdf_path)

doc

Document('jet_substructure_paper.pdf')

In [4]:
# Pull the text of every page and concatenate it into one string, with a
# newline appended after each page (same content as the original loop).
# Joining once avoids rebuilding the string on every iteration.
page_texts = [page.get_text() for page in doc]
alltext = "".join(page_text + "\n" for page_text in page_texts)

# Show only the beginning of the paper: enough to sanity-check the extraction
# without flooding the notebook output with the full document.
PREVIEW_CHARS = 2000
print(alltext[:PREVIEW_CHARS])

MIT-CTP 4890
Jet Substructure Studies with CMS Open Data
Aashish Tripathee,1, ∗Wei Xue,1, † Andrew Larkoski,2, ‡ Simone Marzani,3, § and Jesse Thaler1, ¶
1Center for Theoretical Physics, Massachusetts Institute of Technology, Cambridge, MA 02139, USA
2Physics Department, Reed College, Portland, OR 97202, USA
3University at Buﬀalo, The State University of New York, Buﬀalo, NY 14260-1500, USA
We use public data from the CMS experiment to study the 2-prong substructure of jets. The
CMS Open Data is based on 31.8 pb−1 of 7 TeV proton-proton collisions recorded at the Large
Hadron Collider in 2010, yielding a sample of 768,687 events containing a high-quality central jet
with transverse momentum larger than 85 GeV. Using CMS’s particle ﬂow reconstruction algorithm
to obtain jet constituents, we extract the 2-prong substructure of the leading jet using soft drop
declustering.
We ﬁnd good agreement between results obtained from the CMS Open Data and
those obtained from parton shower generator

In [None]:
# 1. Define a concise prompt describing what to extract.
prompt = textwrap.dedent("""\
You are an expert at high energy particle physics and you understand jargon like "events" and datasets. 
I'm looking for information on the dataset the authors used.
Extract mentions of names of datasets, size in file counts, number of events and disk space size in bytes from scientific texts.
Use exact text for extractions. Do not paraphrase. 

""")

# 2. Provide a high-quality example to guide the model.
# Every extraction_text below is a verbatim substring of example_text, and the
# extractions are listed in the order they first appear in that text
# (dataset name -> file count -> event count -> disk space), which is the
# ordering LangExtract's guidance recommends for few-shot examples.
example_text = (
    "Our jet substructure study is based on the Jet Primary Dataset [76], "
    "which is a subset of the full open data release with events that pass a predefined set "
    "of single-jet and multi-jet triggers. There are 1664 AOD files in the Jet Primary Dataset, corresponding to 20,022,826 events "
    "and 2.0 Terabytes of disk space."
)

examples = [
    lx.data.ExampleData(
        text=example_text,
        extractions=[
            lx.data.Extraction(
                extraction_class="data set",
                extraction_text="Jet Primary Dataset",
                attributes={"type": "data set"},
            ),
            lx.data.Extraction(
                extraction_class="file count",
                extraction_text="1664 AOD files",
                attributes={"type": "count"},
            ),
            lx.data.Extraction(
                extraction_class="number of events",
                extraction_text="20,022,826 events",
                attributes={"type": "count"},
            ),
            lx.data.Extraction(
                extraction_class="disk space",
                extraction_text="2.0 Terabytes of disk space",
                attributes={"type": "disk space"},
            ),
        ],
    )
]

# 3. Run the extraction on the full text of the paper.
# `alltext` comes from the PDF text-extraction cell and `api_key` from the
# API-key cell; both must have been run before this one.
input_text = alltext

result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash",
    api_key=api_key,
)




In [7]:
# Save the annotated document as JSONL, then render it to an interactive
# HTML page. The file names are defined once so the final status message
# always reports the file that was actually written.
jsonl_name = "jetsubstructureTest_extraction.jsonl"
html_name = "jetsubstructureTest_extraction.html"

lx.io.save_annotated_documents([result], output_name=jsonl_name, output_dir=".")

# Generate the interactive visualization
html_content = lx.visualize(jsonl_name)

# The returned object is written via its `.data` attribute when it has one;
# otherwise it is treated as the HTML string itself.
html_text = html_content.data if hasattr(html_content, 'data') else html_content
with open(html_name, "w", encoding='utf-8') as f:
    f.write(html_text)

print(f"Interactive visualization saved to {html_name}")

[94m[1mLangExtract[0m: Saving to [92mjetsubstructureTest_extraction.jsonl[0m: 1 docs [00:00, 91.58 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mjetsubstructureTest_extraction.jsonl[0m



[94m[1mLangExtract[0m: Loading [92mjetsubstructureTest_extraction.jsonl[0m: 100%|█████████▉| 210k/210k [00:00<00:00, 15.0MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mjetsubstructureTest_extraction.jsonl[0m
Interactive visualization saved to jetsubstructureTest_extraction_visualization.html



