In [None]:
import openai
import json
import math
from openai.lib._parsing._completions import type_to_response_format_param
import pandas as pd

import functions.prompts as prompts

# Shared OpenAI client used by the file-upload, batch-retrieval and file-download cells.
# NOTE(review): no credentials are passed here — presumably they come from the environment; confirm.
client = openai.Client()

In [None]:
# Load the paper table; the request-building cell reads its "id" and "name" columns.
df = pd.read_csv("papers.csv")
df.head()

In [None]:
def wrap(body, id):
    """Wrap a request body into one batch-input record.

    Args:
        body: The request payload, stored unchanged under the "body" key.
        id: Identifier stored under "custom_id".

    Returns:
        A dict with the keys "custom_id", "method", "url" and "body", in that
        order. "method" is always "POST" and "url" is always
        "/v1/chat/completions".
    """
    # Insertion order is kept deliberately: it determines the key order when
    # the record is serialized.
    request = dict(custom_id=id, method="POST", url="/v1/chat/completions")
    request["body"] = body
    return request
    
def gen_body(text, top5=True):
    """Build the chat-completions request body for one paper.

    Args:
        text: Content of the user message.
        top5: When truthy, use the system prompt from ``prompts.top5()`` and
            the ``prompts.Top5Model`` response format; otherwise use
            ``prompts.analysis()`` and ``prompts.AnalysisModel``.

    Returns:
        A dict with the keys "model", "messages" and "response_format".
    """
    # Pick the prompt and the response model together so they cannot drift apart.
    if top5:
        system_prompt = prompts.top5()
        response_model = prompts.Top5Model
    else:
        system_prompt = prompts.analysis()
        response_model = prompts.AnalysisModel

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    return {
        "model": "gpt-4o-mini",
        "messages": messages,
        "response_format": type_to_response_format_param(response_model),
    }


In [None]:
import numpy as np

divider = 5

# Start every batch file empty. The loop below appends to these files, so
# without this step re-running the cell would duplicate every request (and
# every custom_id) in the JSONL inputs.
for file_index in range(divider):
    with open(f"input/batch-{file_index}.jsonl", 'w'):
        pass

# Use the row position rather than the index label, so the split does not
# depend on the frame having a default 0..n-1 index.
for position, (_, row) in enumerate(df.iterrows()):
    paper_id = row["id"]

    with open(f'output/{paper_id}.txt', 'r') as paper_file:
        text = paper_file.read()

    paper = f"PAPER TITLE: {row['name']}\n\nPAPER TEXT: {text}"

    # Three "top5" requests and three "analysis" requests per paper.
    # custom_id has the form "<paper id>|<run number>+<request kind>".
    requests = [
        *[wrap(gen_body(paper), f"{paper_id}|{i}+top5") for i in range(3)],
        *[wrap(gen_body(paper, top5=False), f"{paper_id}|{i}+analysis") for i in range(3)],
    ]

    # Spread the papers evenly over `divider` files. Integer arithmetic avoids
    # float rounding at the boundaries between files.
    file_index = position * divider // len(df)

    with open(f"input/batch-{file_index}.jsonl", 'a') as batch_file:
        for request in requests:
            batch_file.write(json.dumps(request) + "\n")

In [None]:
# Upload each batch input file and keep the returned file objects.
batch_input_files = []

for i in range(divider):
    print(f"Sending batch {i}")
    # Open the file in a context manager so the handle is closed once the
    # upload call returns.
    with open(f"input/batch-{i}.jsonl", "rb") as batch_file:
        batch_input_file = client.files.create(
            file=batch_file,
            purpose="batch"
        )
    print(f"{i} {batch_input_file.id}")
    batch_input_files.append(batch_input_file)

print(batch_input_files)

In [None]:
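# KINDA DANGEROUS: the commented-out code below would submit one new batch job per uploaded input file.
# It is left disabled so the jobs are not re-submitted by accident when the notebook is re-run.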
# batch_objects = []
# for i, b in enumerate(batch_input_files):
#     batch_input_file_id = b.id
#     batch_object = client.batches.create(
#         input_file_id=batch_input_file_id,
#         endpoint="/v1/chat/completions",
#         completion_window="24h",
#         metadata={
#             "description": f"[AI REVIEWER] Set {i}"
#         }
#     )
#     print(f"{i} {batch_object.id}")
#     batch_objects.append(batch_object)

In [6]:
# Read the batch ids, one per line. Blank lines (for example the one produced
# by a trailing newline) and surrounding whitespace are dropped so that no
# empty id ends up in the list.
with open("dump/openai-batches.txt", 'r') as f:
    batches = [line.strip() for line in f if line.strip()]

In [10]:
# Look up every batch and collect the id of its output file.
file_ids = []
for batch_id in batches:
    # Skip blank entries rather than sending an empty id to the API.
    if not batch_id.strip():
        continue

    batch_info = client.batches.retrieve(batch_id)
    print(batch_info)

    # A batch without an output file has nothing to download yet.
    if batch_info.output_file_id is None:
        print(f"Batch {batch_info.id} has no output file (status: {batch_info.status}); skipping")
        continue

    file_ids.append(batch_info.output_file_id)

Batch(id='batch_67debc8d4f788190b1fb3f0654ae9e71', completion_window='24h', created_at=1742650509, endpoint='/v1/chat/completions', input_file_id='file-3JZPEFBedFWR5PTCRF3dBq', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1742651652, error_file_id=None, errors=None, expired_at=None, expires_at=1742736909, failed_at=None, finalizing_at=1742651562, in_progress_at=1742650512, metadata={'description': '[AI REVIEWER] Set 0'}, output_file_id='file-MK6mgmciFdGAPbRmbAhR61', request_counts=BatchRequestCounts(completed=1320, failed=0, total=1320))
Batch(id='batch_67debc8dad3c819080c0754b3610d6f0', completion_window='24h', created_at=1742650509, endpoint='/v1/chat/completions', input_file_id='file-N3VozYe9vaQFTz92434ZzR', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1742651662, error_file_id=None, errors=None, expired_at=None, expires_at=1742736909, failed_at=None, finalizing_at=1742651551, in_progress_at=174265

In [None]:
# Show the current contents of file_ids.
file_ids

In [None]:
# Request the content behind each id in file_ids. file_response is rebound on
# every pass, so after the loop it refers to the response for the last id only.
for output_file_id in file_ids:
    file_response = client.files.content(output_file_id)

In [None]:
import pandas as pd
import json

# Reload the paper table so re-running this cell starts from a clean frame.
df = pd.read_csv("papers.csv")

# Fail loudly instead of silently producing a table without any scores.
if not file_ids:
    raise RuntimeError("file_ids is empty; collect the batch output file ids before parsing results")

# custom_id carries the paper id as text, so rows are looked up by the string
# form of the "id" column. setdefault keeps the first row when an id repeats.
row_label_by_paper_id = {}
for row_label, paper_id in df["id"].astype(str).items():
    row_label_by_paper_id.setdefault(paper_id, row_label)

analysis_metrics = ['originality', 'rigor', 'scope', 'impact', 'written_by_ai']

# Fetch and parse every output file, so results from all batches land in df
# rather than only those of a single file.
for output_file_id in file_ids:
    output_text = client.files.content(output_file_id).text

    # Split on "\n" only (str.splitlines would also break on Unicode line
    # separators inside a JSON string) and skip blank lines, so a missing or
    # extra trailing newline neither drops a record nor fails to parse.
    for raw_line in output_text.split("\n"):
        if not raw_line.strip():
            continue

        record = json.loads(raw_line)

        # custom_id has the form "<paper id>|<run number>+<top5|analysis>".
        custom_id_parts = record['custom_id'].split("|")
        paper_id = custom_id_parts[0]
        run_number, request_kind = custom_id_parts[1].split("+")

        row_label = row_label_by_paper_id[paper_id]

        content = record['response']['body']['choices'][0]['message']['content']

        if request_kind == "top5":
            metrics, response_model = ['score'], prompts.Top5Model
        else:
            metrics, response_model = analysis_metrics, prompts.AnalysisModel

        # Validate the response once, not once per metric.
        parsed = response_model.model_validate_json(content)

        for metric in metrics:
            # Run numbers are zero-based in custom_id and one-based in column names.
            column_name = f"openai-{metric}-{int(run_number)+1}"

            if column_name not in df.columns:
                df[column_name] = None

            df.loc[row_label, column_name] = getattr(parsed, metric)

In [None]:
# Display the paper table, including any "openai-..." columns added by the parsing cell.
df

In [None]:
# Write the table to a separate CSV (index=False leaves out the row index); papers.csv itself is not overwritten.
df.to_csv("papers_openai.csv", index=False)