In [None]:
import os
from openai import OpenAI
from dotenv import load_dotenv
import time
import csv
import pandas as pd


In [None]:
# Load environment variables from a local .env file (if present).
load_dotenv()

# Read the OpenAI key once here and fail fast with an actionable message,
# rather than passing None along to the client constructor in a later cell.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it in the "
        "environment before running the cells that call the OpenAI API."
    )

In [None]:
# Test with the original paper - jet substructure.
# Uploads the PDF, asks the model to extract dataset metadata from it, then
# prints the model's text answer and the elapsed wall-clock time.
client = OpenAI(api_key=api_key)

# Upload inside a context manager so the local file handle is closed as soon
# as the upload call returns instead of staying open for the kernel's lifetime.
with open("jet_substructure_paper.pdf", "rb") as pdf_handle:
    file = client.files.create(
        file=pdf_handle,
        purpose="user_data",
    )

# Adjacent string literals are concatenated verbatim, so each sentence carries
# its own trailing space (or newline) to keep sentences from running together.
myprompt = (
    'You are an expert at high energy particle physics and you understand jargon like "events" and datasets. '
    'You are also very, very careful and a good explainer. '
    'I need your help reading some documents and extracting some information. '
    "I'm looking for information on the dataset the authors used. So things like \n"
    "* Title of the paper \n"
    "* Authors of the paper \n"
    "* Name of the dataset (collision or MC) \n"
    "* Size in number of events \n"
    "* Size in number of files \n"
    "* Size in bytes \n"
    "* Dataformat (AOD, miniAOD, nanoAOD, etc) \n"
    "* Doi of datasets used \n"
    "I just uploaded to you a pdf of one of these papers. Can you try to extract that information? "
    # Approximations are only acceptable when the paper does NOT give an exact count.
    "Note that if the paper does not specify the exact size in number of events, approximation is fine, just indicate that it's an approximation. "
    # NOTE(review): the request below passes no tools, so whether the model can
    # actually look records up (rather than recall them) is unverified - confirm.
    "Look up the exact DOIs and sizes from the CMS Open Data records you cite if they are not included in the paper. "
    "Do not use em dashes (—) in the csv, use regular hyphens (-) instead. "
    "Can you also create a csv file with that information, with columns for each of the items above?"
)

start = time.time()

# Single user turn containing the uploaded PDF (by id) followed by the prompt.
response = client.responses.create(
    model="gpt-5",
    input=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_file",
                    "file_id": file.id,
                },
                {
                    "type": "input_text",
                    "text": myprompt,
                },
            ],
        }
    ],
)

print(response.output_text)
print()
print(f"Time to process: {time.time() - start:.2f} seconds")

In [None]:
# Download arXiv paper 2312.06909v1 and save it as pretraining_strat.pdf in the
# current working directory (the file opened by the next extraction cell).
# Run this cell ONCE: re-running it downloads the PDF again and overwrites the file.
!curl https://arxiv.org/pdf/2312.06909v1 -o pretraining_strat.pdf

In [None]:
# Trying the other paper (pretraining_strat.pdf).
# Uploads the PDF, asks the model to extract dataset metadata from it, then
# prints the model's text answer and the elapsed wall-clock time.
# NOTE(review): this cell repeats the prompt/request logic of the jet
# substructure cell; consider factoring both into one shared helper function.
client = OpenAI(api_key=api_key)

# Upload inside a context manager so the local file handle is closed as soon
# as the upload call returns instead of staying open for the kernel's lifetime.
with open("pretraining_strat.pdf", "rb") as pdf_handle:
    file = client.files.create(
        file=pdf_handle,
        purpose="user_data",
    )

# Adjacent string literals are concatenated verbatim, so each sentence carries
# its own trailing space (or newline) to keep sentences from running together.
myprompt = (
    'You are an expert at high energy particle physics and you understand jargon like "events" and datasets. '
    'You are also very, very careful and a good explainer. '
    'I need your help reading some documents and extracting some information. '
    "I'm looking for information on the dataset the authors used. So things like \n"
    "* Title of the paper \n"
    "* Authors of the paper \n"
    "* Name of the dataset (collision or MC) \n"
    "* Size in number of events \n"
    "* Size in number of files \n"
    "* Size in bytes \n"
    "* Dataformat (AOD, miniAOD, nanoAOD, etc) \n"
    "* Doi of datasets used \n"
    "I just uploaded to you a pdf of one of these papers. Can you try to extract that information? "
    # Approximations are only acceptable when the paper does NOT give an exact count.
    "Note that if the paper does not specify the exact size in number of events, approximation is fine, just indicate that it's an approximation. "
    # NOTE(review): the request below passes no tools, so whether the model can
    # actually look records up (rather than recall them) is unverified - confirm.
    "Look up the exact DOIs and sizes from the CMS Open Data records you cite if they are not included in the paper. "
    "Do not use em dashes (—) in the csv, use regular hyphens (-) instead. "
    "Can you also create a csv file with that information, with columns for each of the items above?"
)

start = time.time()

# Single user turn containing the uploaded PDF (by id) followed by the prompt.
response = client.responses.create(
    model="gpt-5",
    input=[
        {
            "role": "user",
            "content": [
                {
                    "type": "input_file",
                    "file_id": file.id,
                },
                {
                    "type": "input_text",
                    "text": myprompt,
                },
            ],
        }
    ],
)

print(response.output_text)
print()
print(f"Time to process: {time.time() - start:.2f} seconds")

In [None]:
# Sample CSV assembled by hand from the model output for the pretraining paper
# and the jet substructure paper. The first row is the header; each following
# row describes one dataset.
# TODO: build these rows from the chat response instead of copying them manually.
data = [
    ["Title","Authors","Dataset name (collision or MC)","Size (events)","Size (files)","Size (bytes)","Data format","Dataset DOI"],
    ["Pre-training strategy using real particle collision data for event classification in collider physics","Tomoe Kishimoto; Masahiro Morinaga; Masahiko Saito; Junichi Tanaka","CMS SingleElectron primary dataset Run2015D-08Jun2016-v1 AOD (collision)","","","","AOD","http://opendata.cern.ch/record/24103"],
    ["Pre-training strategy using real particle collision data for event classification in collider physics","Tomoe Kishimoto; Masahiro Morinaga; Masahiko Saito; Junichi Tanaka","CMS SingleMuon primary dataset Run2015D-16Dec2015-v1 AOD (collision)","","","","AOD","http://opendata.cern.ch/record/24102"],
    ["Pre-training strategy using real particle collision data for event classification in collider physics","Tomoe Kishimoto; Masahiro Morinaga; Masahiko Saito; Junichi Tanaka","Private MC: 2HDM signal plus SM ttbar background (MC)","~1200000 (total across train, val, test)","N/A","N/A","Delphes fast-sim ROOT files","N/A"],
    ["Jet Substructure Studies with CMS Open Data","Aashish Tripathee; Wei Xue; Andrew Larkoski; Simone Marzani; Jesse Thaler","CMS Open Data - Jet Primary Dataset (/Jet/Run2010B-Apr21ReReco-v1/AOD), pp collision data at 7 TeV","20022826","1664","2000000000000","AOD","10.7483/OPENDATA.CMS.3S7F.2E9W"],
]

# The handle gets its own name so the notebook-level `file` variable (used by
# the extraction cells for the uploaded PDF) is not overwritten. The encoding
# is pinned so the output does not depend on the platform default; newline=''
# lets the csv module control line endings itself.
with open('output.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(data)



In [None]:
# Load the CSV back for display. Every column is read as text and pandas'
# default NA conversion is turned off, so cells such as "N/A" and "" stay as
# written (instead of becoming NaN) and counts like 1664 are not shown as 1664.0.
df = pd.read_csv('output.csv', dtype=str, keep_default_na=False)

df
