In [1]:
#import modules 

import pymupdf
import langextract as lx
import textwrap
from dotenv import load_dotenv
import os

In [2]:
# Gemini API key setup.
# load_dotenv() (python-dotenv) is called first so that variables from a .env
# file are available; the key is then looked up under LANGEXTRACT_API_KEY.
# api_key is None when that variable is not set.
load_dotenv()

api_key = os.environ.get('LANGEXTRACT_API_KEY')

In [None]:
# Run this cell once

!curl https://arxiv.org/pdf/1704.05842 -o jet_substructure_paper.pdf

In [None]:
# Open the downloaded paper with PyMuPDF. The bare name on the last line makes
# the cell display the document object.
pdf_path = "jet_substructure_paper.pdf"
doc = pymupdf.open(pdf_path)

doc

In [None]:
# Gather the text of every page into a single string. Re-assigning `text`
# inside a plain loop would keep only the final page of the document.
text = "\n".join(page.get_text() for page in doc)

In [3]:
# Task description sent to the model.
prompt = "\n".join(
    [
        "Extract characters, emotions, and relationships in order of appearance.",
        "Use exact text for extractions. Do not paraphrase or overlap entities.",
        "Provide meaningful attributes for each entity to add context.",
    ]
)

# A single worked example showing the expected classes, spans and attributes.
examples = [
    lx.data.ExampleData(
        text=(
            "ROMEO. But soft! What light through yonder window breaks? "
            "It is the east, and Juliet is the sun."
        ),
        extractions=[
            lx.data.Extraction(
                extraction_class=label,
                extraction_text=span,
                attributes=attrs,
            )
            for label, span, attrs in (
                ("character", "ROMEO", {"emotional_state": "wonder"}),
                ("emotion", "But soft!", {"feeling": "gentle awe"}),
                ("relationship", "Juliet is the sun", {"type": "metaphor"}),
            )
        ],
    )
]

# Sentence to annotate.
input_text = "Lady Juliet gazed longingly at the stars, her heart aching for Romeo"

# Run the extraction; `result` is consumed by the save/visualize cell that follows.
result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash",
    api_key=api_key,
)
   

In [None]:
# Persist the annotated result as JSONL in the current directory.
lx.io.save_annotated_documents([result], output_name="romeo_juliet_extraction.jsonl", output_dir=".")

# Build the interactive visualization from the saved JSONL and write it to disk.
html_content = lx.visualize("romeo_juliet_extraction.jsonl")
html_path = "romeo_juliet_extraction.html"
with open(html_path, "w", encoding='utf-8') as f:
    # The returned value is either an object exposing its markup via `.data`
    # or a plain string; write whichever form is present.
    f.write(getattr(html_content, 'data', html_content))

# Report the path that was actually written.
print(f"Interactive visualization saved to {html_path}")


In [None]:
# Task description: which kinds of mentions to pull out of the passage.
prompt = (
    "Extract mentions of datasets, file counts, and disk space from scientific texts. "
    "Use exact text for extractions. Do not paraphrase or overlap entities.\n"
)

# Passage to annotate. The same passage also serves as the worked example's text.
input_text = (
    "Our jet substructure study is based on the Jet Primary Dataset [76], which is a subset "
    "of the full open data release with events that pass a predefined set of single-jet and "
    "multi-jet triggers. There are 1664 AOD files in the Jet Primary Dataset, "
    "corresponding to 20,022,826 events and 2.0 Terabytes of disk space."
)

# One worked example: (class, exact span, attributes) for each expected extraction.
examples = [
    lx.data.ExampleData(
        text=input_text,
        extractions=[
            lx.data.Extraction(
                extraction_class=label,
                extraction_text=span,
                attributes=attrs,
            )
            for label, span, attrs in (
                ("file count", "1664 AOD files", {"type": "count"}),
                ("data set", "Jet Primary Dataset", {"type": "data set"}),
                ("disk space", "2.0 Terabytes of disk space", {"type": "disk space"}),
            )
        ],
    )
]

# Run the extraction; `result` is consumed by the save/visualize cell that follows.
result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash",
    api_key=api_key,
)


In [None]:
# Persist the annotated result as JSONL in the current directory.
lx.io.save_annotated_documents([result], output_name="simple_extraction.jsonl", output_dir=".")

# Build the interactive visualization from the saved JSONL and write it to disk.
html_content = lx.visualize("simple_extraction.jsonl")
html_path = "simple_extraction.html"
with open(html_path, "w", encoding='utf-8') as f:
    # The returned value is either an object exposing its markup via `.data`
    # or a plain string; write whichever form is present.
    f.write(getattr(html_content, 'data', html_content))

# Report the path that was actually written.
print(f"Interactive visualization saved to {html_path}")

In [None]:
# Run the dataset / file-count / disk-space extraction on text read from the PDF via pymupdf.

# Task description sent to the model (kept character-for-character, including
# the trailing space and blank line).
prompt = (
    "Extract mentions of datasets, file counts, and disk space from scientific texts. "
    "Use exact text for extractions. Do not paraphrase or overlap entities.\n"
    "Provide the output in a structured JSON format. \n"
    "\n"
    "Important: Use exact text from the input for extraction_text. Do not paraphrase.\n"
    "Extract entities in order of appearance with no overlapping text spans.\n"
)

# One worked example to guide the model. The text is a plain concatenated
# string, so no dedent step is needed.
examples = [
    lx.data.ExampleData(
        text=(
            "Our jet substructure study is based on the Jet Primary Dataset [76], "
            "which is a subset of the full open data release with events that pass a predefined set "
            "of single-jet and multi-jet triggers. There are 1664 AOD files in the Jet Primary Dataset, corresponding to 20,022,826 events "
            "and 2.0 Terabytes of disk space."
        ),
        extractions=[
            lx.data.Extraction(
                extraction_class="file count",
                extraction_text="1664 AOD files",
                attributes={"type": "count"},
            ),
            lx.data.Extraction(
                extraction_class="data set",
                extraction_text="Jet Primary Dataset",
                attributes={"type": "data set"},
            ),
            lx.data.Extraction(
                extraction_class="disk space",
                extraction_text="2.0 Terabytes of disk space",
                attributes={"type": "disk space"},
            ),
        ],
    )
]

# `text` is produced by the PDF-reading cell earlier in the notebook.
input_text = text

# Pass input_text (not `text` directly) so this call matches the other extraction cells.
# NOTE(review): extraction_passes / max_workers / max_char_buffer are kept at their
# original values; see the langextract docs for their exact semantics.
result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash",
    api_key=api_key,
    extraction_passes=3,
    max_workers=10,
    max_char_buffer=1000,
)

print(f"Extracted {len(result.extractions)} entities from {len(result.text):,} characters")


In [None]:
# Persist the annotated result as JSONL in the current directory.
lx.io.save_annotated_documents([result], output_name="jet_substructure_test_extraction.jsonl", output_dir=".")

# Build the interactive visualization from the saved JSONL and write it to disk.
html_content = lx.visualize("jet_substructure_test_extraction.jsonl")
html_path = "jet_substructure_test_extraction.html"
with open(html_path, "w", encoding='utf-8') as f:
    # The returned value is either an object exposing its markup via `.data`
    # or a plain string; write whichever form is present.
    f.write(getattr(html_content, 'data', html_content))

# Report the path that was actually written.
print(f"Interactive visualization saved to {html_path}")