In [None]:
import pymupdf
import langextract as lx
import textwrap
from dotenv import load_dotenv
import os


In [None]:
# Google Gemini API key setup.
# load_dotenv() is called first so that a key kept in a local .env file is
# available in the process environment before it is looked up.
load_dotenv()

# None when LANGEXTRACT_API_KEY is not set; no error is raised here.
api_key = os.getenv('LANGEXTRACT_API_KEY')

In [None]:
# Run this cell once

!curl https://arxiv.org/pdf/1704.05842 -o jet_substructure_paper.pdf

In [None]:
# Open the PDF written by the download cell (same file name as its -o target).
doc = pymupdf.open("jet_substructure_paper.pdf")

# Bare last expression so the notebook shows the document object's repr.
doc

In [None]:
# Extract the text of every page, print it for inspection, and keep it.
# Previously `text` was overwritten on each iteration, so only the last page
# survived the loop and the full paper text was not available to later cells.
page_texts = []
for page in doc:
    text = page.get_text()
    page_texts.append(text)
    print(text)

# Whole-paper text as one string, stored under the name that the lx.extract
# draft at the end of the notebook passes as `text_or_documents`.
input_text = "\n".join(page_texts)

In [None]:
# Instruction text for the extraction (presumably what gets passed to
# lx.extract as prompt_description -- confirm when the call is enabled).
# Written as explicit pieces so the exact whitespace, including the single
# line break, is visible in the source.
prompt = (
    "Look for mentions of  the specific dataset, mentions of number of events \n"
    "used and/or disk/file size."
)

In [None]:
# Example 1: a passage that states file count, dataset name, event count and
# disk size in one sentence. Built from named pieces, then wrapped in a
# single-element list.
_ex1_text = (
    "There are 1664 AOD files in the Jet Primary Dataset, corresponding to 20,022,826 events "
    "and 2.0 Terabytes of disk space. Within CMSSW, it is possible to access the AOD files remotely through the "
    "XRootD interface [218]. We found it more convenient to first download the AOD files and then process them locally, "
    "being careful to maintain the same directory structure as on the Open Data servers in order to ensure consistency of the workflow."
)

# Structured values read off the extracted span.
_ex1_attributes = {
    "Dataset": "Jet Primary Dataset",
    "Number of files": "1664",
    "Number of events": "20,022,826",
    "Disk size": "2.0 Terabytes",
}

# The extraction span is copied word-for-word from _ex1_text.
_ex1_extraction = lx.data.Extraction(
    extraction_class="Amount of data used, the specific dataset, and number of events",
    extraction_text="1664 AOD files in the Jet Primary Dataset, corresponding to 20,022,826 events and 2.0 Terabytes of disk space",
    attributes=_ex1_attributes,
)

example1 = [lx.data.ExampleData(text=_ex1_text, extractions=[_ex1_extraction])]

# Example 2: a passage that gives an approximate event count and disk size but
# no dataset identifier; the attributes record the experiment and year instead.
_ex2_text = (
    "The data used in this analysis corresponds to an integrated luminosity of 2.3 fb-1 of proton-proton collisions at √s = 13 TeV recorded by the CMS experiment "
    "in 2015. The data were collected using a trigger that requires at least one jet with pT > 450 GeV and |η| < 3.0. After applying data quality criteria, "
    "the total number of events passing the trigger is approximately 1.2 million, corresponding to a total disk size of about 150 GB."
)

# Structured values for this example.
_ex2_attributes = {
    "Number of events": "approximately 1.2 million",
    "Experiment": "CMS",
    "Disk size": "150 GB",
    "Date": "2015",
}

# The extraction span is copied word-for-word from the last sentence of _ex2_text.
_ex2_extraction = lx.data.Extraction(
    extraction_class="Amount of data used, the specific dataset, and number of events",
    extraction_text="the total number of events passing the trigger is approximately 1.2 million, corresponding to a total disk size of about 150 GB",
    attributes=_ex2_attributes,
)

example2 = [lx.data.ExampleData(text=_ex2_text, extractions=[_ex2_extraction])]

# Example 3: event count, file count, and both compressed and uncompressed
# disk sizes, with the full dataset identifier given in a later sentence.
_ex3_text = (
    "We analyzed 350,000 events from the /JetHT/Run2016B dataset across 420 compressed AOD files, "
    "totaling approximately 320 GB on disk (compressed) and about 1.2 TB uncompressed. "
    "The dataset identifier used was /JetHT/Run2016B-03Feb2017-v1/AOD."
)

# The span must be an exact substring of the example text. The previous value
# ended in "(compressed)." -- a period that does not occur at that position in
# the text -- and stopped before the uncompressed size that the "Disk size"
# attribute reports. The span now runs through "1.2 TB uncompressed".
_ex3_extraction_text = (
    "We analyzed 350,000 events from the /JetHT/Run2016B dataset across 420 compressed AOD files, "
    "totaling approximately 320 GB on disk (compressed) and about 1.2 TB uncompressed"
)
assert _ex3_extraction_text in _ex3_text, "extraction_text must be copied verbatim from the example text"

example3 = [
    lx.data.ExampleData(
        text=_ex3_text,
        extractions=[
            lx.data.Extraction(
                extraction_class="Amount of data used, the specific dataset, and number of events",
                extraction_text=_ex3_extraction_text,
                attributes={
                    "Dataset": "/JetHT/Run2016B-03Feb2017-v1/AOD",
                    "Number of files": "420",
                    "Number of events": "350,000",
                    "Disk size": "approximately 320 GB (compressed); about 1.2 TB uncompressed",
                },
            )
        ],
    )
]


# Example 4: the dataset identifier and integrated luminosity appear in the
# first sentence; event count, file count and storage size in the second.
_ex4_text = (
    "Using data from the /SingleMuon/Run2018A-17Sep2018-v1/AOD dataset, we selected a high-purity sample corresponding to 4.18 fb^{-1}. "
    "After event cleaning and selection, the final sample contains 98,742 events distributed across 210 AOD files, occupying roughly 45 GiB of compressed storage."
)

# Structured values for this example; "Dataset" and "Integrated luminosity"
# come from the first sentence, outside the extracted span.
_ex4_attributes = {
    "Dataset": "/SingleMuon/Run2018A-17Sep2018-v1/AOD",
    "Number of files": "210",
    "Number of events": "98,742",
    "Disk size": "roughly 45 GiB (compressed)",
    "Integrated luminosity": "4.18 fb^{-1}",
}

# The extraction span is copied word-for-word from the second sentence of _ex4_text.
_ex4_extraction = lx.data.Extraction(
    extraction_class="Amount of data used, the specific dataset, and number of events",
    extraction_text="the final sample contains 98,742 events distributed across 210 AOD files, occupying roughly 45 GiB of compressed storage",
    attributes=_ex4_attributes,
)

example4 = [lx.data.ExampleData(text=_ex4_text, extractions=[_ex4_extraction])]

# Example 5: event count in scientific notation, a dataset with an internal
# tag, and both uncompressed and archived sizes.
_ex5_text = (
    "The study uses approximately 2.5 x 10^5 events from the proprietary dataset labeled Dataset_X (internal tag: DX-2020-v2). "
    "Raw files total ~3.6 TB (uncompressed) across 6,450 files; when archived the footprint reduced to ~900 GB."
)

# Structured values for this example; file count and disk sizes come from the
# second sentence, outside the extracted span.
_ex5_attributes = {
    "Dataset": "Dataset_X (DX-2020-v2)",
    "Number of files": "6,450",
    "Number of events": "approximately 2.5 x 10^5",
    "Disk size": "~3.6 TB uncompressed; ~900 GB archived",
}

# The extraction span is copied word-for-word from the first sentence of
# _ex5_text, including its closing period.
_ex5_extraction = lx.data.Extraction(
    extraction_class="Amount of data used, the specific dataset, and number of events",
    extraction_text="uses approximately 2.5 x 10^5 events from the proprietary dataset labeled Dataset_X (internal tag: DX-2020-v2).",
    attributes=_ex5_attributes,
)

example5 = [lx.data.ExampleData(text=_ex5_text, extractions=[_ex5_extraction])]


In [None]:
# Draft of the lx.extract call. Every line is commented out, so this cell
# currently does nothing when run. The prompt_description and examples values
# below are placeholders.
#result = lx.extract(
    #text_or_documents=input_text,
    #prompt_description="Extract information...",
    #examples=[...],
    #model_id="gemini-2.5-flash"
#)