In [None]:
import os
from openai import OpenAI
import pymupdf
import textwrap
from langchain.text_splitter import CharacterTextSplitter
from dotenv import load_dotenv
import openai
import json


In [None]:
# Load variables from a .env file (if any) into the environment, then hand the
# OpenAI API key to the openai module. The key is None when the variable is unset.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The document is opened with ``pymupdf.open`` as a context manager, each
    page's ``get_text()`` result is collected in page order, and the pieces
    are joined with no separator. A document with no pages yields ``""``.
    """
    with pymupdf.open(pdf_path) as pdf_document:
        page_texts = [page.get_text() for page in pdf_document]
    return "".join(page_texts)

In [None]:
# Function to split text into smaller chunks so each one is small enough to send to the model
def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into a list of chunks with ``CharacterTextSplitter``.

    The splitter is configured to split on newlines and to measure length
    with ``len``. ``chunk_size`` and ``chunk_overlap`` are forwarded to it
    unchanged.
    """
    splitter_options = {
        "separator": "\n",
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
    }
    return CharacterTextSplitter(**splitter_options).split_text(text)

In [None]:
# Function to send chunks and extract data using GPT
def extract_data_with_gpt(text_chunks, prompt_instructions, model="gpt-4o-mini"):
    """Send each text chunk to the OpenAI chat API and collect the replies.

    Args:
        text_chunks: Iterable of document text chunks; one request is made
            per chunk.
        prompt_instructions: Instructions placed ahead of each chunk in the
            user message.
        model: Chat model name passed to the API. Defaults to "gpt-4o-mini".

    Returns:
        List of reply strings in chunk order. Chunks whose request raised,
        or whose reply carried no text content, are skipped, so the list can
        be shorter than ``text_chunks``.
    """
    system_message = {
        "role": "system",
        "content": "You are a data extraction assistant. Follow the user's instructions.",
    }
    extracted_data = []
    for chunk_index, chunk in enumerate(text_chunks):
        try:
            response = openai.chat.completions.create(
                model=model,
                messages=[
                    system_message,
                    {"role": "user", "content": f"{prompt_instructions}\n\nDocument text:\n{chunk}"},
                ],
            )
            content = response.choices[0].message.content
        except Exception as e:
            # Best effort: report the failure and keep going with the next chunk.
            print(f"Error with OpenAI API call: {e}")
            continue
        if content is None:
            # A reply can come back without text; never hand None to callers
            # that expect a list of strings.
            print(f"No text content returned for chunk {chunk_index}; skipping.")
            continue
        extracted_data.append(content)
    return extracted_data

In [None]:
# NOTE: this workflow does not run end to end yet because of a paywall.
# TODO: figure out how to write all of the extracted data into a well-organized file.

# Define Extraction Logic
pdf_file_path = "jet_substructure_paper.pdf"
# The closing paragraph pins the reply format: the parsing loop below can only
# use replies that are a JSON object with a "datasets" list.
extraction_prompt = """
You are an expert at high energy particle physics and you understand jargon like "events" and datasets. You are also very, very careful.
Extract the following information from the provided document text:
1. Names of the datasets used (e.g., "CMS Open Data", "ATLAS Open Data").
2. Size in the number of events of each dataset.
3. The size in bytes of each dataset.
4. The size in number of files of each dataset.

If the paper gives a table of datasets, extract the information from the table. 
If the paper gives certain parameters for the datasets, use those to infer the size of the datasets.

Respond with ONLY a JSON object (no commentary and no Markdown code fences) in this form:
{"datasets": [{"name": "<dataset name>", "num_events": <number or null>, "size_bytes": <number or null>, "num_files": <number or null>}]}
Use null for any value the text does not give. If the text mentions no datasets, respond with {"datasets": []}.
"""

# Execute the Workflow
document_text = extract_text_from_pdf(pdf_file_path)

if document_text:
    chunks = chunk_text(document_text)
    extracted_json_strings = extract_data_with_gpt(chunks, extraction_prompt)

    # Combine the per-chunk results into one list of dataset entries.
    all_extracted_items = []
    seen_items = set()  # canonical JSON of items already kept
    for json_str in extracted_json_strings:
        if not isinstance(json_str, str):
            # A reply without text (e.g. None) cannot be parsed; skip it.
            print("Failed to parse JSON: response has no text content")
            continue

        # Replies are sometimes wrapped in Markdown fences or surrounded by
        # prose despite the instructions; keep only the outermost {...}.
        start, end = json_str.find("{"), json_str.rfind("}")
        if start == -1 or end < start:
            print("Failed to parse JSON: no JSON object found in response")
            continue

        try:
            data = json.loads(json_str[start:end + 1])
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON: {e}")
            continue

        if not isinstance(data, dict) or not isinstance(data.get("datasets"), list):
            continue

        for item in data["datasets"]:
            # Chunks overlap, so the same entry can be reported by two
            # neighbouring chunks; keep the first exact occurrence only.
            fingerprint = json.dumps(item, sort_keys=True)
            if fingerprint not in seen_items:
                seen_items.add(fingerprint)
                all_extracted_items.append(item)

    # Output
    if all_extracted_items:
        print("\nSuccessfully extracted data:")
        print(json.dumps(all_extracted_items, indent=2))
    else:
        print("\nCould not extract the requested data.")
else:
    # Without this branch, a PDF that yields no text ends the run silently.
    print(f"\nNo text could be extracted from {pdf_file_path}.")