In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import time
import csv
import pandas as pd
import pymupdf
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader



In [None]:
# Pull settings from a local .env file when one exists, then read the OpenAI
# credential from the process environment (None when the variable is not set).
load_dotenv()

api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# Test with the original paper - jet substructure
client = OpenAI(api_key=api_key)

# Upload inside a `with` block so the local PDF handle is closed as soon as the
# upload call returns instead of staying open for the life of the kernel.
with open("jet_substructure_paper.pdf", "rb") as pdf_handle:
    file = client.files.create(
        file=pdf_handle,
        purpose="user_data"
    )

# Every fragment ends in a space or newline so that consecutive sentences do
# not run together once the pieces are concatenated.
myprompt = 'You are an expert at high energy particle physics and you understand jargon like "events" and datasets. '
myprompt += 'You are also very, very careful and a good explainer. '
myprompt += 'I need your help reading some documents and extracting some information. '
myprompt += "I'm looking for information on the dataset the authors used. So things like \n"
myprompt += "* Title of the paper \n"
myprompt += "* Authors of the paper \n"
myprompt += "* Name of the dataset (collision or MC) \n"
myprompt += "* Size in number of events \n"
myprompt += "* Size in number of files \n"
myprompt += "* Size in bytes \n"
myprompt += "* Dataformat (AOD, miniAOD, nanoAOD, etc) \n"
myprompt += "* Doi of datasets used \n"
myprompt += "I just uploaded to you a pdf of one of these papers. Can you try to extract that information? "
# "does not specify": an approximation is only wanted when the exact number is missing.
myprompt += "Note that if the paper does not specify the exact size in number of events, approximation is fine, just indicate that it's an approximation. "
myprompt += "look up the exact DOIs and sizes from the CMS Open Data records you cite if they are not included in the paper. "
myprompt += "Do not use em dashes (—) in the csv, use regular hyphens (-) instead. "
myprompt += "Can you also create a csv file with that information, with columns for each of the items above?"

start = time.time()

# Single user turn carrying two content parts: the uploaded file and the instructions.
response = client.responses.create(
    model="gpt-5",
    input=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_file",
                    "file_id": file.id,
                },
                {
                    "type": "input_text",
                    "text": myprompt,
                },
            ]
        }
    ]
)

print(response.output_text)
print()
print(f"Time to process: {time.time() - start:.2f} seconds")

In [None]:
#Run this cell ONCE
!curl https://arxiv.org/pdf/2312.06909v1 -o pretraining_strat.pdf

In [None]:
# Try the same extraction on the second paper (pre-training strategy)
client = OpenAI(api_key=api_key)

# Upload inside a `with` block so the local PDF handle is closed as soon as the
# upload call returns instead of staying open for the life of the kernel.
with open("pretraining_strat.pdf", "rb") as pdf_handle:
    file = client.files.create(
        file=pdf_handle,
        purpose="user_data"
    )

# Every fragment ends in a space or newline so that consecutive sentences do
# not run together once the pieces are concatenated.
myprompt = 'You are an expert at high energy particle physics and you understand jargon like "events" and datasets. '
myprompt += 'You are also very, very careful and a good explainer. '
myprompt += 'I need your help reading some documents and extracting some information. '
myprompt += "I'm looking for information on the dataset the authors used. So things like \n"
myprompt += "* Title of the paper \n"
myprompt += "* Authors of the paper \n"
myprompt += "* Name of the dataset (collision or MC) \n"
myprompt += "* Size in number of events \n"
myprompt += "* Size in number of files \n"
myprompt += "* Size in bytes \n"
myprompt += "* Dataformat (AOD, miniAOD, nanoAOD, etc) \n"
myprompt += "* Doi of datasets used \n"
myprompt += "I just uploaded to you a pdf of one of these papers. Can you try to extract that information? "
# "does not specify": an approximation is only wanted when the exact number is missing.
myprompt += "Note that if the paper does not specify the exact size in number of events, approximation is fine, just indicate that it's an approximation. "
myprompt += "look up the exact DOIs and sizes from the CMS Open Data records you cite if they are not included in the paper. "
myprompt += "Do not use em dashes (—) in the csv, use regular hyphens (-) instead. "
myprompt += "Can you also create a csv file with that information, with columns for each of the items above?"

start = time.time()

# Single user turn carrying two content parts: the uploaded file and the instructions.
response = client.responses.create(
    model="gpt-5",
    input=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_file",
                    "file_id": file.id,
                },
                {
                    "type": "input_text",
                    "text": myprompt,
                },
            ]
        }
    ]
)

print(response.output_text)
print()
print(f"Time to process: {time.time() - start:.2f} seconds")

In [None]:
# Build a sample CSV from the model output for the pretraining paper and the
# jet substructure paper.
# TODO: generate these rows from the chat response instead of copying them by hand.
data = [
    ["Title","Authors","Dataset name (collision or MC)","Size (events)","Size (files)","Size (bytes)","Data format","Dataset DOI"],
    ["Pre-training strategy using real particle collision data for event classification in collider physics","Tomoe Kishimoto; Masahiro Morinaga; Masahiko Saito; Junichi Tanaka","CMS SingleElectron primary dataset Run2015D-08Jun2016-v1 AOD (collision)","","","","AOD","http://opendata.cern.ch/record/24103"],
    ["Pre-training strategy using real particle collision data for event classification in collider physics","Tomoe Kishimoto; Masahiro Morinaga; Masahiko Saito; Junichi Tanaka","CMS SingleMuon primary dataset Run2015D-16Dec2015-v1 AOD (collision)","","","","AOD","http://opendata.cern.ch/record/24102"],
    ["Pre-training strategy using real particle collision data for event classification in collider physics","Tomoe Kishimoto; Masahiro Morinaga; Masahiko Saito; Junichi Tanaka","Private MC: 2HDM signal plus SM ttbar background (MC)","~1200000 (total across train, val, test)","N/A","N/A","Delphes fast-sim ROOT files","N/A"],
    ["Jet Substructure Studies with CMS Open Data","Aashish Tripathee; Wei Xue; Andrew Larkoski; Simone Marzani; Jesse Thaler","CMS Open Data - Jet Primary Dataset (/Jet/Run2010B-Apr21ReReco-v1/AOD), pp collision data at 7 TeV","20022826","1664","2000000000000","AOD","10.7483/OPENDATA.CMS.3S7F.2E9W"],
]

# Write as UTF-8 explicitly: pandas.read_csv decodes as UTF-8 by default, while
# open() without an encoding uses the platform locale (e.g. cp1252 on Windows),
# so a non-ASCII character in a future row would break the write/read round trip.
# The handle is named csv_file so it does not overwrite the `file` variable that
# holds the uploaded-file object from the earlier cells.
with open('output.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(data)



In [None]:
# Read the generated CSV back into a DataFrame; the bare name on the last line
# makes the notebook render it as a table.
df = pd.read_csv(filepath_or_buffer='output.csv')

df


In [None]:
# Extract several PDFs at once (no RAG - just pull the raw text out of every PDF in a folder).
# Probably too slow for many papers, so RAG may be the better route.
# Step 1: extract text from the PDFs in a folder

def extract_text_from_pdfs(pdf_folder_path):
    """Extract the plain text of every PDF directly inside a folder.

    Files are visited in sorted name order so repeated runs process (and
    print) documents in the same sequence; os.listdir on its own returns
    entries in arbitrary order.

    Args:
        pdf_folder_path: Directory to scan. Only entries whose name ends in
            ".pdf" (case-insensitive) are read; subfolders are not searched.

    Returns:
        dict mapping each PDF filename to its page texts joined with newlines.
        A file that cannot be opened is reported and left out. If extraction
        fails part-way through a file, the pages read so far are kept.
    """
    all_text = {}  # filename -> extracted text
    for filename in sorted(os.listdir(pdf_folder_path)):
        if not filename.lower().endswith('.pdf'):  # only process pdf files
            continue

        pdf_path = os.path.join(pdf_folder_path, filename)

        try:
            doc = pymupdf.open(pdf_path)  # open by path
        except Exception as e:
            print(f"Failed to open {pdf_path}: {e}")
            continue

        text_parts = []  # one entry per page of this file
        try:
            for page in doc:
                text_parts.append(page.get_text() or "")
        except Exception as e:
            # Best effort: report the failure but keep whatever pages were read.
            print(f"Failed to extract text from {pdf_path}: {e}")
        finally:
            # Always release the document; a close failure is reported rather
            # than silently ignored, but it must not abort the whole batch.
            try:
                doc.close()
            except Exception as e:
                print(f"Failed to close {pdf_path}: {e}")

        all_text[filename] = "\n".join(text_parts)

    return all_text

# Folder containing the PDFs. Set the DPOA_PDF_FOLDER environment variable (for
# example in the .env file loaded at the top of the notebook) to run this on
# another machine; when it is not set, the original local path is used.
pdf_folder = os.getenv("DPOA_PDF_FOLDER", r"C:/Users/ejren/OneDrive/DPOA_papers")
# Run extraction
document_texts = extract_text_from_pdfs(pdf_folder)
print(f"Extracted text from {len(document_texts)} PDF(s)")


In [None]:
# Step 2: Split long PDF text into smaller pieces so each request to the model stays small

def chunk_text(text, chunk_size=1500, chunk_overlap=200):
    """Split `text` into overlapping pieces with RecursiveCharacterTextSplitter.

    Args:
        text: The full document text to split.
        chunk_size: Maximum size of one piece, measured with len().
        chunk_overlap: Amount of overlap between neighbouring pieces,
            measured the same way.

    Returns:
        The pieces produced by the splitter's split_text call.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.split_text(text)
    return pieces

In [None]:
#Step 3: Generate prompt from chunks
def generate_prompt(text_chunk):
    """Build the dataset-extraction prompt for one chunk of paper text.

    Args:
        text_chunk: Text taken from a paper. It is inserted verbatim, wrapped
            in double quotes, under the "Document Chunk:" heading.

    Returns:
        The complete prompt string, ending with a "Summary:" cue.
    """
    # The text is inlined below rather than attached as a file, so the prompt
    # says so instead of claiming that a PDF was uploaded. "does not specify":
    # an approximation is only wanted when the exact event count is missing.
    return f"""
    You are an expert at high energy particle physics and you understand jargon like "events" and datasets. You are also very, very careful and a good explainer. 
    I need your help reading some documents and extracting some information. I'm looking for information on the dataset the authors used. So things like: \n
    * Title of the paper \n
    * Authors of the paper \n
    * Name of the dataset (collision or MC) \n
    * Size in number of events \n
    * Size in number of files \n
    * Size in bytes \n
    * Dataformat (AOD, miniAOD, nanoAOD, etc) \n
    * Doi of datasets used \n
    Below is a chunk of text extracted from one of these papers. Can you try to extract that information? Note that if the paper does not specify the exact size in number of events, approximation is fine, just indicate that it's an approximation.
    Look up the exact DOIs and sizes from the CMS Open Data records you cite if they are not included in the paper. Do not use em dashes (—) in the csv, use regular hyphens (-) instead.
    Can you also create a csv file with that information, with columns for each of the items above?
    
    Document Chunk:
    "{text_chunk}"

    Summary:
    """

In [None]:
#Step 4: GPT extraction - helpers that call the model and turn its response into plain text

def _get_field(obj, name):
    """Read `name` from a dict key or an object attribute; None when absent."""
    if isinstance(obj, dict):
        return obj.get(name)
    return getattr(obj, name, None)


def response_to_text(resp):
    """Return the text carried by a model response, whatever its shape.

    Lookup order:
      1. A non-empty `output_text` attribute, returned as-is.
      2. Every item of `output` (list or tuple): each entry of the item's
         `content` contributes its string `text` (or, failing that, a string
         `content`). All pieces found are concatenated. Items and parts may be
         dicts or attribute-style objects, and items without usable content
         are skipped, so a text-less first item no longer hides the text of a
         later one.
      3. If `output` holds items but no text at all, `str()` of its first item.
      4. `str(resp.to_dict())`, and finally `str(resp)`.

    Args:
        resp: Response object (or any value) to extract text from.

    Returns:
        The extracted text, or a string rendering of `resp` as a last resort.
    """
    # Prefer the convenience attribute when it is present and non-empty.
    direct_text = getattr(resp, "output_text", None)
    if direct_text:
        return direct_text

    try:
        items = _get_field(resp, "output")
        if isinstance(items, (list, tuple)) and len(items) > 0:
            texts = []
            for item in items:
                parts = _get_field(item, "content")
                if not isinstance(parts, (list, tuple)):
                    continue  # this item carries no content parts
                for part in parts:
                    text = _get_field(part, "text")
                    if not isinstance(text, str):
                        # alternate shape: the text lives under "content"
                        text = _get_field(part, "content")
                    if isinstance(text, str):
                        texts.append(text)
            if texts:
                return "".join(texts)
            # No text anywhere: fall back to a string cast of the first item.
            return str(items[0])
    except Exception:
        # Deliberate best effort: an unexpected shape must not abort the run,
        # so fall through to the generic conversions below.
        pass

    # Last resort: dictionary conversion, then a plain string cast.
    try:
        return str(resp.to_dict())
    except Exception:
        return str(resp)


def process_with_gpt(prompt_text, client=None, model="gpt-5"):
    """Send one prompt to the model and return the reply as stripped text.

    Args:
        prompt_text: The prompt to send as the request input.
        client: OpenAI client to use. When None, a client is built from the
            notebook-level `api_key` at call time. Building it in the
            signature instead would construct it once, when the function is
            defined, whether or not it is ever used.
        model: Model name passed to the Responses API.

    Returns:
        The reply text with surrounding whitespace removed ("" when there is
        no text).

    Raises:
        RuntimeError: No client was passed and no API key is available.
        Exception: Any error raised by the API call is printed and re-raised.
    """
    if client is None:
        if not api_key:
            raise RuntimeError(
                "OpenAI client is not configured (no client passed and OPENAI_API_KEY is not set)"
            )
        client = OpenAI(api_key=api_key)

    try:
        resp = client.responses.create(
            model=model,
            input=prompt_text,
        )
    except Exception as e:
        # Print some context so the notebook shows a helpful message, then
        # let the original error propagate unchanged.
        print(f"API call failed: {e}")
        raise

    text = response_to_text(resp)
    if text is None:
        return ""
    # str() guards against a non-string payload, which has no .strip().
    return str(text).strip()


def extract_data(document_texts, client=None):
    """Process multiple documents: chunk, call model per chunk, and combine results.

    Args:
        document_texts: dict mapping filename -> extracted document text.
        client: Optional OpenAI client shared by every request. When None, a
            single client is created the first time a non-empty document is
            reached and then reused, instead of building a new client for
            every chunk.

    Returns:
        dict mapping filename -> extraction text. Empty or whitespace-only
        documents map to "" and trigger no API call.

    Raises:
        Whatever process_with_gpt raises; a failed API call aborts the run.
    """
    results = {}
    for filename, text in document_texts.items():
        print(f"Processing {filename}...")
        if not text or not text.strip():
            print(" - Empty document, skipping")
            results[filename] = ""
            continue

        if client is None:
            client = OpenAI(api_key=api_key)

        # NOTE: this threshold counts whitespace-separated words, while
        # chunk_text sizes its pieces with len() (characters).
        if len(text.split()) > 1500:
            chunks = chunk_text(text)
            data_extracted_chunks = []
            for i, chunk in enumerate(chunks):
                chunk_prompt = generate_prompt(chunk)
                data_extracted_chunks.append(process_with_gpt(chunk_prompt, client=client))
                print(f" - Data extracted chunk {i+1}/{len(chunks)}")

            # Combine chunk outputs and ask the model to consolidate into CSV-like output
            combined_data_extraction_prompt = (
                "Combine the following data extractions into a single, cohesive CSV-format extraction:\n\n"
                + "\n\n".join(data_extracted_chunks)
            )
            results[filename] = process_with_gpt(combined_data_extraction_prompt, client=client)
        else:
            # Short document: one request covers the whole text.
            results[filename] = process_with_gpt(generate_prompt(text), client=client)

    return results


def summary_filename_for(filename):
    """Return the output text-file name for one processed PDF filename.

    A trailing ".pdf" extension is removed in any letter case (the folder scan
    accepts ".PDF" too), ".pdf" appearing elsewhere in the name is left alone,
    and spaces become underscores.
    """
    stem, ext = os.path.splitext(filename)
    if ext.lower() != ".pdf":
        stem = filename  # not a .pdf extension: keep the whole name
    return f"summary_{stem.replace(' ', '_')}.txt"


# Process every document and save one text file per paper. The guard is also
# true inside a notebook kernel, where __name__ is "__main__".
if __name__ == "__main__":
    data_extractions = extract_data(document_texts)

    # Print and save the results. The loop variable is not called `data`, so
    # the CSV rows list defined earlier in the notebook is not overwritten.
    for filename, extraction in data_extractions.items():
        print(f"\n--- Summary for {filename} ---\n{extraction}\n")
        # Ensure we write a string
        extraction_text = "" if extraction is None else str(extraction)
        with open(summary_filename_for(filename), "w", encoding="utf-8") as f:
            f.write(extraction_text)



In [None]:
#Use RAG to extract data from multiple pdfs at once
