In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import time
import csv
import pandas as pd
import pymupdf
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
import base64
from pathlib import Path
import PyPDF2



In [None]:
# Load environment variables from .env (if present)
load_dotenv()

# Read the OpenAI key from the process environment. os.getenv returns None
# when the variable is not set, so api_key may be None at this point.
api_key = os.getenv("OPENAI_API_KEY")

In [None]:
# Test with the original paper - jet substructure.
# Uploads the PDF, asks the model for the dataset metadata, prints the answer and the elapsed time.
client = OpenAI(api_key=api_key)

# Upload inside a context manager so the local PDF handle is closed as soon
# as the upload call returns (the original bare open() was never closed).
with open("jet_substructure_paper.pdf", "rb") as pdf_handle:
    file = client.files.create(
        file=pdf_handle,
        purpose="user_data"
    )

# Every sentence fragment ends with a space or newline so that consecutive
# sentences do not run together once the pieces are concatenated.
myprompt = 'You are an expert at high energy particle physics and you understand jargon like "events" and datasets. '
myprompt += 'You are also very, very careful and a good explainer. '
myprompt += 'I need your help reading some documents and extracting some information. '
myprompt += "I'm looking for information on the dataset the authors used. So things like \n"
myprompt += "* Title of the paper \n"
myprompt += "* Authors of the paper \n"
myprompt += "* Name of the dataset (collision or MC) \n"
myprompt += "* Size in number of events \n"
myprompt += "* Size in number of files \n"
myprompt += "* Size in bytes \n"
myprompt += "* Dataformat (AOD, miniAOD, nanoAOD, etc) \n"
myprompt += "* Doi of datasets used \n"
myprompt += "I just uploaded to you a pdf of one of these papers. Can you try to extract that information? "
# "does not specify": approximations are only wanted when the exact number is missing.
myprompt += "Note that if the paper does not specify the exact size in number of events, approximation is fine, just indicate that it's an approximation. "
myprompt += "Look up the exact DOIs and sizes from the CMS Open Data records you cite if they are not included in the paper. "
myprompt += "Do not use em dashes (—) in the csv, use regular hyphens (-) instead. "
myprompt += "Can you also create a csv file with that information, with columns for each of the items above?"

# Time only the model request, not the upload above.
start = time.time()

response = client.responses.create(
    model="gpt-5",
    input=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_file",
                    "file_id": file.id,
                },
                {
                    "type": "input_text",
                    "text": myprompt,
                },
            ]
        }
    ]
)

print(response.output_text)
print()
print(f"Time to process: {time.time() - start:.2f} seconds")

In [None]:
#Run this cell ONCE
!curl https://arxiv.org/pdf/2312.06909v1 -o pretraining_strat.pdf

In [None]:
# Trying the other paper (pre-training strategy).
# Uploads the PDF, asks the model for the dataset metadata plus its reasoning, prints the answer and the elapsed time.
client = OpenAI(api_key=api_key)

# Upload inside a context manager so the local PDF handle is closed as soon
# as the upload call returns (the original bare open() was never closed).
with open("pretraining_strat.pdf", "rb") as pdf_handle:
    file = client.files.create(
        file=pdf_handle,
        purpose="user_data"
    )

# Every sentence fragment ends with a space or newline so that consecutive
# sentences do not run together once the pieces are concatenated.
myprompt = 'You are an expert at high energy particle physics and you understand jargon like "events" and datasets. '
myprompt += 'You are also very, very careful and a good explainer. '
myprompt += 'I need your help reading some documents and extracting some information. '
myprompt += "I'm looking for information on the dataset the authors used. So things like \n"
myprompt += "* Title of the paper \n"
myprompt += "* Authors of the paper \n"
myprompt += "* Name of the dataset (collision or MC) \n"
myprompt += "* Size in number of events \n"
myprompt += "* Size in number of files \n"
myprompt += "* Size in bytes \n"
myprompt += "* Dataformat (AOD, miniAOD, nanoAOD, etc) \n"
myprompt += "* Doi of datasets used \n"
myprompt += "I just uploaded to you a pdf of one of these papers. Can you try to extract that information? "
# "does not specify": approximations are only wanted when the exact number is missing.
myprompt += "Note that if the paper does not specify the exact size in number of events, approximation is fine, just indicate that it's an approximation. "
myprompt += "Look up the exact DOIs and sizes from the CMS Open Data records you cite if they are not included in the paper. "
myprompt += "Do not use em dashes (—) in the csv, use regular hyphens (-) instead. "
myprompt += "Can you also create a csv file with that information, with columns for each of the items above? "
myprompt += "Also let me know how you found/calculated/inferred the information in the paper for each of the items."

# Time only the model request, not the upload above.
start = time.time()

response = client.responses.create(
    model="gpt-5",
    input=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_file",
                    "file_id": file.id,
                },
                {
                    "type": "input_text",
                    "text": myprompt,
                },
            ]
        }
    ]
)

print(response.output_text)
print()
print(f"Time to process: {time.time() - start:.2f} seconds")

**GPT response when asked to clarify where the data was found/inferred**
Here is what I could extract from the paper you provided and (where possible) infer carefully. I also indicate where the paper is silent and what would need to be looked up on the CMS Open Data portal to fill in the gaps.

CSV
title,authors,dataset_name,size_events,size_files,size_bytes,dataformat,dataset_doi
Pre-training strategy using real particle collision data for event classification in collider physics,"Tomoe Kishimoto; Masahiro Morinaga; Masahiko Saito; Junichi Tanaka","CMS Open Data - SingleElectron/Run2015D-08Jun2016-v1/AOD + SingleMuon/Run2015D-16Dec2015-v1/AOD (collision)",approximately 1.2e6 events used after selection across both datasets (1.0e6 train, 0.1e6 val, 0.1e6 test),not stated in paper,not stated in paper,AOD,TBD (DOIs on CMS Open Data records 24103 and 24102)
Pre-training strategy using real particle collision data for event classification in collider physics,"Tomoe Kishimoto; Masahiro Morinaga; Masahiko Saito; Junichi Tanaka","Generated MC - Two-Higgs-Doublet-Model signal vs SM ttbar background with MadGraph5_aMC@NLO + Pythia8 + Delphes (MC)",approximately 1.2e6 total across signal and background (1.0e6 train, 0.1e6 val, 0.1e6 test),not stated in paper,not stated in paper,Delphes ROOT (fast-sim),none

How I found or inferred each item
- Title of the paper: Front page of the PDF.
- Authors of the paper: Front page of the PDF.
- Name of the dataset (collision or MC):
  - Collision data: Section 3 “Datasets,” Pre-training dataset, and references [12] and [13]. The paper explicitly uses the CMS open data SingleElectron and SingleMuon primary datasets from Run 2015D in AOD format.
  - MC data: Section 3 “Event classification dataset” states events were generated with MadGraph5_aMC@NLO at sqrt(s)=13 TeV, showered with Pythia8, detector simulated by Delphes. Signal is a Two-Higgs-Doublet Model; background is SM ttbar.
- Size in number of events:
  - Collision data used in pre-training: Section 3 states selected events were split into approximately 10^6 for training, 10^5 for validation, and 10^5 for testing, i.e., about 1.2×10^6 events used in total across the two real-data datasets after their selections.
  - MC data for event classification: Section 3 states approximately 5×10^5 train, 5×10^4 val, and 5×10^4 test events per process (signal and background). Summing across both processes gives about 1.0×10^6 train + 1.0×10^5 val + 1.0×10^5 test = 1.2×10^6 events total used.
- Size in number of files: Not stated in the paper. This is typically listed on each CMS Open Data record page.
- Size in bytes: Not stated in the paper. Also typically listed on the CMS Open Data record page.
- Dataformat:
  - Collision data: AOD, explicitly mentioned in refs [12] and [13] (“in AOD format”).
  - MC: Delphes is explicitly mentioned; Delphes outputs are ROOT files (fast simulation).
- DOI of datasets used:
  - The paper cites the CMS Open Data record pages for SingleElectron Run2015D (record 24103) and SingleMuon Run2015D (record 24102) but does not print the DOI strings. Those record pages include the official DOIs (format 10.7483/OPENDATA.CMS....).
  - The generated MC has no DOI.

What I could not fill from the paper and how to complete it
- Exact DOIs, number of files, and total bytes for the two CMS Run2015D AOD datasets are not printed in the paper. They can be retrieved from:
  - SingleElectron Run2015D-08Jun2016-v1/AOD: http://opendata.cern.ch/record/24103
  - SingleMuon   Run2015D-16Dec2015-v1/AOD: http://opendata.cern.ch/record/24102
If you’d like, I can look up those records and add the exact DOIs and sizes to the CSV.

Time to process: 119.66 seconds

In [None]:
#Create a sample csv file from the output from pretraining paper and jet substructure paper (find a way to automate just using chat response instead of manually copying)
# First row is the header; every following row is one dataset used by one paper.
data = [
    ["Title","Authors","Dataset name (collision or MC)","Size (events)","Size (files)","Size (bytes)","Data format","Dataset DOI"],
    ["Pre-training strategy using real particle collision data for event classification in collider physics","Tomoe Kishimoto; Masahiro Morinaga; Masahiko Saito; Junichi Tanaka","CMS SingleElectron primary dataset Run2015D-08Jun2016-v1 AOD (collision)","","","","AOD","http://opendata.cern.ch/record/24103"],
    ["Pre-training strategy using real particle collision data for event classification in collider physics","Tomoe Kishimoto; Masahiro Morinaga; Masahiko Saito; Junichi Tanaka","CMS SingleMuon primary dataset Run2015D-16Dec2015-v1 AOD (collision)","","","","AOD","http://opendata.cern.ch/record/24102"],
    ["Pre-training strategy using real particle collision data for event classification in collider physics","Tomoe Kishimoto; Masahiro Morinaga; Masahiko Saito; Junichi Tanaka","Private MC: 2HDM signal plus SM ttbar background (MC)","~1200000 (total across train, val, test)","N/A","N/A","Delphes fast-sim ROOT files","N/A"],
    ["Jet Substructure Studies with CMS Open Data","Aashish Tripathee; Wei Xue; Andrew Larkoski; Simone Marzani; Jesse Thaler","CMS Open Data - Jet Primary Dataset (/Jet/Run2010B-Apr21ReReco-v1/AOD), pp collision data at 7 TeV","20022826","1664","2000000000000","AOD","10.7483/OPENDATA.CMS.3S7F.2E9W"],
]

# A hand-edited row with a missing or extra field would silently shift
# columns in the CSV, so verify each row matches the header width first.
assert all(len(row) == len(data[0]) for row in data), "every row must have one value per header column"

# Write as UTF-8 explicitly: open() otherwise uses the platform locale
# encoding, while pandas.read_csv decodes as UTF-8 by default.
with open('output.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(data)



In [None]:
# Read output.csv back into a DataFrame to check that it parses.
df = pd.read_csv('output.csv')

# Bare last expression so the notebook renders the table.
df


In [None]:
# Client used by process_with_gpt below; the key is read straight from the environment.
# NOTE(review): presumably relies on load_dotenv() having been run earlier in the session when the key lives in .env - confirm.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def extract_text_from_pdf(pdf_path): #extract text from single pdf file 
    """Return the text of every readable page of one PDF, newline-separated.

    Args:
        pdf_path: Path (str or path-like) of the PDF file to read.

    Returns:
        The page texts joined with newlines and stripped of leading/trailing
        whitespace. An empty string is returned when the file cannot be
        opened or parsed, or when no page yields any text.

    Errors are reported with print() and never raised. A page whose
    extraction fails is skipped so that the remaining pages are still
    returned, instead of discarding everything after the bad page.
    """
    page_texts = []
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page_number, page in enumerate(pdf_reader.pages, start=1):
                try:
                    # Guard against a None/empty result so the join below
                    # never sees a non-string value.
                    page_texts.append(page.extract_text() or "")
                except Exception as page_error:
                    print(f"Error extracting page {page_number} of {pdf_path}: {page_error}")
    except Exception as e:
        # File-level failure (missing file, unparsable PDF, ...): keep
        # whatever pages were collected so far.
        print(f"Error extracting text from {pdf_path}: {e}")
    return "\n".join(page_texts).strip()

def chunk_text(text, max_words=1500):
    """Split text into consecutive chunks of at most ``max_words`` words.

    The text is tokenised on whitespace; each chunk is its words re-joined
    with single spaces. Text with no words gives an empty list.
    """
    tokens = text.split()
    return [
        ' '.join(tokens[start:start + max_words])
        for start in range(0, len(tokens), max_words)
    ]

def process_with_gpt(prompt, model="gpt-5"):
    """Send ``prompt`` as a single user message and return the reply text.

    Uses the module-level ``client``. Any exception raised while making the
    request or reading the reply is not propagated; the function returns
    the string "API Error: <message>" instead.
    """
    user_message = {"role": "user", "content": prompt}
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=[user_message],
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as err:
        return "API Error: " + str(err)

def extract_data_from_pdf_text(pdf_text, prompt):
    """Build the full prompt for one piece of document text and send it.

    When ``prompt`` contains the literal placeholder ``{text_chunk}``, every
    occurrence is replaced by ``pdf_text``; otherwise the text is appended
    after the prompt under a "Document:" heading. The result of
    ``process_with_gpt`` for that prompt is returned.
    """
    placeholder = "{text_chunk}"
    full_prompt = (
        prompt.replace(placeholder, pdf_text)
        if placeholder in prompt
        else f"{prompt}\n\nDocument:\n{pdf_text}"
    )
    return process_with_gpt(full_prompt)

def process_multiple_pdfs(pdf_directory, prompt, output_file=None, chunk_threshold=1500): #process the pdfs
    """Run the extraction prompt over every ``*.pdf`` file in a directory.

    Args:
        pdf_directory: Directory searched (non-recursively) for ``*.pdf`` files.
        prompt: Extraction prompt. If it contains the literal placeholder
            ``{text_chunk}`` the document text is substituted there;
            otherwise the text is appended after the prompt.
        output_file: Optional path of a UTF-8 text report to write, with one
            delimited section per PDF.
        chunk_threshold: Documents with more words than this are split into
            chunks of at most this many words; each chunk is processed
            separately and the partial answers are merged by one more
            model call.

    Returns:
        Dict mapping each PDF file name to its extraction result, to
        "No text found" when no text could be extracted, or to
        "Error: <message>" when processing that file raised.

    Files are processed in sorted path order so that the returned dict and
    the saved report come out in the same order on every run.
    """
    results = {}
    # Path.glob yields entries in arbitrary filesystem order; sort for reproducibility.
    pdf_files = sorted(Path(pdf_directory).glob("*.pdf"))
    print(f"Found {len(pdf_files)} PDF files in {pdf_directory}")

    # The prompt does not change between chunks, so test for the placeholder once.
    has_placeholder = "{text_chunk}" in prompt

    for i, pdf_path in enumerate(pdf_files, 1):
        print(f"Processing {i}/{len(pdf_files)}: {pdf_path.name}")
        try:
            # Extract text from current PDF only
            pdf_text = extract_text_from_pdf(pdf_path)

            if not pdf_text:
                print(f" - No text extracted from {pdf_path.name}")
                results[pdf_path.name] = "No text found"
                continue

            word_count = len(pdf_text.split())
            print(f"  Extracted {word_count} words")

            # Process with chunking if needed
            if word_count > chunk_threshold:
                print(f"  Chunking document (>{chunk_threshold} words)")
                chunks = chunk_text(pdf_text, max_words=chunk_threshold)
                chunk_outputs = []

                for idx, chunk in enumerate(chunks, 1):
                    print(f"    Processing chunk {idx}/{len(chunks)}")
                    if has_placeholder:
                        chunk_prompt = prompt.replace("{text_chunk}", chunk)
                    else:
                        chunk_prompt = f"{prompt}\n\nDocument chunk:\n{chunk}"

                    chunk_outputs.append(process_with_gpt(chunk_prompt))

                # Combine chunk results with one more model call
                combined_prompt = (
                    "Combine the following chunked extractions into a single "
                    "coherent extraction. Remove duplicates and consolidate "
                    "the data:\n\n" + "\n\n---\n\n".join(chunk_outputs)
                )
                final = process_with_gpt(combined_prompt)
            else:
                # Process entire document at once
                final = extract_data_from_pdf_text(pdf_text, prompt)

            results[pdf_path.name] = final
            print(f"✓ Successfully processed {pdf_path.name}")

        except Exception as e:
            # Keep going with the remaining files; record the failure in the results.
            print(f"✗ Error processing {pdf_path.name}: {str(e)}")
            results[pdf_path.name] = f"Error: {str(e)}"

    # Save results to file if specified
    if output_file:
        with open(output_file, 'w', encoding='utf-8') as f:
            for filename, data in results.items():
                f.write(f"\n{'='*60}\n")
                f.write(f"File: {filename}\n")
                f.write(f"{'='*60}\n")
                f.write(f"{data}\n")
        print(f"\nResults saved to {output_file}")

    return results


# Example usage
if __name__ == "__main__":
    # Define your extraction prompt.
    # The {text_chunk} placeholder must appear exactly once: the pipeline
    # fills it with str.replace, which substitutes every occurrence, so a
    # second mention would paste the whole document into the prompt twice.
    PROMPT_TEMPLATE = """
    You are an expert at high energy particle physics and you understand jargon like "events" and datasets. You are also very, very careful and a good explainer.
    I need your help reading some documents and extracting some information. I'm looking for information on the dataset the authors used. So things like:
    * Title of the paper
    * Authors of the paper
    * Year and month of publication (make sure this is included in a format that will work as a float in a csv)
    * Name of the dataset (collision or MC)
    * Size in number of events
    * The same size in number of events in a single number as a float for csv
    * Size in number of files
    * The same size in number of files in a single number as a float for csv
    * Size in bytes
    * The same size in bytes as a float in a single number for csv
    * Dataformat (AOD, miniAOD, nanoAOD, etc)
    * Doi of datasets used
    If the paper does not give exact numbers, provide an approximation and tag it as an approximation.
    Look up the exact DOIs and sizes from the CMS Open Data records you cite if they are not included in the paper. Do not use em dashes (—) in the csv, use regular hyphens (-) instead.
    Produce a single CSV-style row (or a CSV with one header row + one data row) for each document with columns: Title, Authors, Dataset name (collision or MC), Size (events), Size (files), Size (bytes), Data format, Dataset DOI.
    The document text (or one chunk of a longer document) is provided below under "Document Text".
    

    Document Text:
    "{text_chunk}"

    
    """

    # Directory holding the papers. Set the DPOA_PDF_DIR environment variable
    # to run on another machine; the default is the original location.
    pdf_directory = os.environ.get("DPOA_PDF_DIR", r"C:/Users/ejren/OneDrive/DPOA_papers")

    # Process PDFs
    results = process_multiple_pdfs(
        pdf_directory=pdf_directory,
        prompt=PROMPT_TEMPLATE,
        output_file="extracted_results.txt",
        chunk_threshold=1500
    )

    # Print summary
    print("\n" + "="*60)
    print("PROCESSING COMPLETE")
    print("="*60)
    for filename in results:
        print(f"- {filename}")


In [None]:
#Create csv file with result data (automate this later)
#Should be 8 items in each row
#ChatGPT doesn't output this in a proper CSV format, so it has to be fixed manually for now

# First row is the header; every following row is one dataset used by one paper.
initial_data = [
    ["Title","Authors","Dataset name (collision or MC)","Size (events)","Size (files)","Size (bytes)","Data format","Dataset DOI"],
    ["Unveiling Time-Varying Signals of Ultralight Bosonic Dark Matter at Collider and Beam Dump Experiments","Jinhui Guo; Yuxuan He; Jia Liu; Xiao-Ping Wang; Ke-Pan Xie","CMS Run 2012 DoubleMuParked dimuon sample (8 TeV, collision) - AOD; likely Runs 2012B/2012C/2012D","~1e8-1e9 (approximation)","~1e3-1.5e4 (approximation)","~1e13-1e14 (approximation)","AOD","Lookup required (not specified in paper)"],
    ["Unveiling Time-Varying Signals of Ultralight Bosonic Dark Matter at Collider and Beam Dump Experiments","Jinhui Guo; Yuxuan He; Jia Liu; Xiao-Ping Wang; Ke-Pan Xie","Published spectra from BaBar; LHCb; NA48/2; APEX; HADES; KLOE; PHENIX; WASA; E774; E141; NA64 (phenomenology recast; no CMS Open Data)","N/A","N/A","N/A","N/A","N/A"],
    ["Exploring Uncharted Soft Displaced Vertices in Open Data","Haipeng An; Zhen Hu; Zhen Liu; Daneng Yang","CMS 2012 8 TeV MET primary datasets (Run2012A, Run2012B, Run2012C) with HLT_PFMET150 (collision)","approximate: B+C ~4.3e7 total; overall O(1e7-1e8)","approximate: O(1e3-1e4)","approximate: O(1-10 TB)","AOD","requires lookup - per-dataset DOIs for /MET/Run2012A-22Jan2013-v1/AOD; /MET/Run2012B-22Jan2013-v1/AOD; /MET/Run2012C-22Jan2013-v1/AOD"],
    ["Exploring Uncharted Soft Displaced Vertices in Open Data","Haipeng An; Zhen Hu; Zhen Liu; Daneng Yang","TTJets_HadronicMGDecays_8TeV-madgraph (MC - full detector simulation)","approximate: unknown","unknown","unknown","AODSIM","requires lookup - CMS Open Data record"],
    ["End-to-End Jet Classification of Quarks and Gluons with the CMS Open Data","M. Andrews; J. Alison; S. An; B. Burkle; S. Gleyzer; M. Narain; M. Paulini; B. Poczos; E. Usai","CMS 2012 Open Data simulated QCD dijet (Pythia6 Z2*, pThat 80-120 and 120-170) (MC)","~933,206 used (subset)","not specified","not specified","GEN-SIM-RECO (RECO)","not specified"],
    ["Pre-training strategy using real particle collision data for event classification in collider physics","Tomoe Kishimoto; Masahiro Morinaga; Masahiko Saito; Junichi Tanaka","collision - /SingleElectron/Run2015D-08Jun2016-v1/AOD","approx-90,000,000 (full dataset); subset used across study: ~1.2e6 unlabeled (pre-training) and ~1e4 labeled (classification)","approx-4,500","approx-5e12","AOD (source); MiniAOD used in analysis","http://opendata.cern.ch/record/24103"],
    ["Pre-training strategy using real particle collision data for event classification in collider physics","Tomoe Kishimoto; Masahiro Morinaga; Masahiko Saito; Junichi Tanaka","collision - /SingleMuon/Run2015D-16Dec2015-v1/AOD","approx-120,000,000 (full dataset); subset used across study: ~1.2e6 unlabeled (pre-training) and ~1e4 labeled (classification)","approx-6,000","approx-7e12","AOD (source); MiniAOD used in analysis","http://opendata.cern.ch/record/24102"],
    ["Searching in CMS Open Data for Dimuon Resonances with Substantial Transverse Momentum","Cari Cesarotti; Yotam Soreq; Matthew J. Strassler; Jesse Thaler; Wei Xue","/DoubleMu/Run2011A-12Oct2013-v1/AOD (collision)","approx 2.0e7 total; 6,241,576 analyzed; 2,155,900 after baseline","approx 2000","approx 6.0e12","AOD","unknown - please verify on CERN Open Data"],
    ["Searching in CMS Open Data for Dimuon Resonances with Substantial Transverse Momentum","Cari Cesarotti; Yotam Soreq; Matthew J. Strassler; Jesse Thaler; Wei Xue","DYJetsToLL_M-50_TuneZ2_7TeV-madgraph-tauola (MC)","unknown","unknown","unknown","AODSIM","unknown"],
    ["Searching in CMS Open Data for Dimuon Resonances with Substantial Transverse Momentum","Cari Cesarotti; Yotam Soreq; Matthew J. Strassler; Jesse Thaler; Wei Xue","DYToMuMu_M-10To50_TuneZ2_7TeV-pythia6 (MC)","unknown","unknown","unknown","AODSIM","unknown"]
]

# A hand-edited row with a missing or extra field would silently shift
# columns in the CSV, so verify each row matches the header width first.
assert all(len(row) == len(initial_data[0]) for row in initial_data), "every row must have one value per header column"

# Write as UTF-8 explicitly: open() otherwise uses the platform locale
# encoding, while pandas.read_csv decodes as UTF-8 by default.
with open('outputTest.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(initial_data)

In [None]:
# Read outputTest.csv back into a DataFrame to check that it parses.
# Note: this rebinds the name df.
df = pd.read_csv('outputTest.csv')

# Bare last expression so the notebook renders the table.
df

In [None]:
# Save the current df to an HDF5 file, stored under the key 'mainData'.
df.to_hdf('outputTest.h5', key='mainData')

# Confirmation message for the notebook output.
print("DataFrame saved to outputTest.h5")