In [4]:
import openai
import json
import math
from openai.lib._parsing._completions import type_to_response_format_param
import pandas as pd

import functions.prompts as prompts

# Shared API client used by the upload, batch-status and download cells below.
# NOTE(review): no api_key is passed here, so credentials presumably come from
# the environment — confirm before running on a new machine.
client = openai.Client()

In [34]:
# Load the paper index and keep only the papers whose id prefix is 1, 50 or 100.
df = pd.read_csv("dump/csv/papers.csv")
# "rank" is whatever precedes the first underscore of the id; it stays a string.
df = df.assign(rank=df["id"].map(lambda paper_id: paper_id.split("_")[0]))
df = df[df["rank"].isin(["1", "50", "100"])].reset_index(drop=True)

In [5]:
def wrap(body, id):
    """Wrap a request body into one line of a batch input file.

    Args:
        body: The request payload, stored unchanged under "body".
        id: Value stored as "custom_id" so the result can be matched back later.

    Returns:
        A dict with the keys "custom_id", "method", "url" and "body", in that order.
    """
    request = {
        "custom_id": id,
        "method": "POST",
        "url": "/v1/chat/completions",
    }
    request["body"] = body
    return request
    
def gen_body(text, top5=True):
    """Build the chat-completions request body for one prompt.

    Args:
        text: Content of the user message.
        top5: When truthy, use the top5 system prompt with the Top5Model response
            schema; otherwise use the analysis prompt with the AnalysisModel schema.

    Returns:
        A dict with the keys "model", "messages" and "response_format".
    """
    if top5:
        system_prompt, schema = prompts.top5(), prompts.Top5Model
    else:
        system_prompt, schema = prompts.analysis(), prompts.AnalysisModel

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    return {
        "model": "gpt-4o-mini",
        "messages": messages,
        "response_format": type_to_response_format_param(schema),
    }


In [6]:
# Affiliation strings inserted into the prompt's AFFILIATION field.
# Each entry ends with ';' and is sent exactly as written.
institutions = [
    "Massachusetts Institute of Technology;",
    "Harvard University;",
    "University of Warwick;",
    "London School of Economics and Political Science;",
    "University of Tokyo;",
    "University of Cape Town;",
    "Nanyang Technological University;",
    "Chulalongkorn University;",
    "Universiti Malaya;",
]

# Author names inserted into the prompt's AUTHOR field. The batch-building cell
# iterates the three groups in the order top / bottom / random and tags them
# "top", "bot" and "ran" in the custom_id.
top_names = [
    "Andrei Shleifer",
    "Daron Acemoglu",
    "James J. Heckman",
    "Joseph E. Stiglitz",
    "John List",
    "Carmen M. Reinhart",
    "Janet Currie",
    "Esther Duflo",
    "Asli Demirguc-Kunt",
    "Marianne Bertrand",
]

bottom_names = [
    "Harold Huibing Zhang",
    "Lin Zhou",
    "Andrei Zlate",
    "Ulf Zoelitz",
    "Asaf Zussman",
    "Lu Yang",
    "Anzelika Zaiceva",
    "Aleksandra Zdzienicka",
    "Qiankun Zhou",
    "Vera Zipperer",
]

random_names = [
    "Bruce S. Green",
    "Alejandro L. James",
    "Billie J. Abels",
    "Paul A. Jenkins",
    "Gary L. Bodie",
    "Gail J. Doan",
    "Shirley S. Hodgkins",
    "Pattie K. Reinhardt",
    "Tara R. Weber",
    "Tabitha J. Cox",
]

In [15]:
import numpy as np

# Number of batch files the papers are spread across (also used by the upload cell).
divider = 5
# Short tags for the three author-name groups, in the order top / bottom / random.
t_name = ["top", "bot", "ran"]


def _bias_requests(paper_id, title, text, field, value, tag, repeats=3):
    """Build the batch requests for one paper under one bias variant.

    The prompt consists of the paper title, one extra header line
    (`<field>: <value>`) and the paper text. `repeats` requests are produced for
    the "top5" prompt first, followed by `repeats` for the "analysis" prompt.

    The custom_id layout `<paper_id>Q<tag>|<repeat>+<type>` is what the
    result-parsing cell splits on, so it must stay unchanged.
    """
    paper = f"PAPER TITLE: {title}\n\n{field}: {value}\n\nPAPER TEXT: {text}"
    return [
        wrap(gen_body(paper, top5=(kind == "top5")), f"{paper_id}Q{tag}|{i}+{kind}")
        for kind in ("top5", "analysis")
        for i in range(repeats)
    ]


# Batch files already written to during this run. The first write to a file
# truncates it, so re-running the cell no longer appends a second copy of every
# request (which would duplicate custom_ids inside one batch file).
started_batch_files = set()

for index, row in df.iterrows():
    # Read the paper text and release the handle before building requests.
    with open(f"output/{row['id']}.txt", 'r') as paper_file:
        text = paper_file.read()

    # (header field, header value, custom_id tag) for every bias variant:
    # all institutions first, then every author name of each group.
    variants = [
        ("AFFILIATION", ins, f"ins{ind}")
        for ind, ins in enumerate(institutions)
    ]
    variants += [
        ("AUTHOR", name, f"{group}{ind2}")
        for group, names in zip(t_name, [top_names, bottom_names, random_names])
        for ind2, name in enumerate(names)
    ]

    requests = [
        request
        for field, value, tag in variants
        for request in _bias_requests(row['id'], row['name'], text, field, value, tag)
    ]

    # Consecutive papers share a file: index 0..len(df)-1 maps onto 0..divider-1.
    file_index = math.floor(index / (len(df) / divider))
    batch_path = f"dump/openai-batch/batch-{file_index}-bias.jsonl"

    mode = 'a' if batch_path in started_batch_files else 'w'
    started_batch_files.add(batch_path)
    with open(batch_path, mode) as batch_file:
        batch_file.writelines(json.dumps(request) + "\n" for request in requests)

In [None]:
# Upload every batch-<i>-bias.jsonl file and keep the returned file objects,
# whose ids are needed to create the batch jobs.
batch_input_files = []

for i in range(divider):
    print(f"Sending batch {i}")
    # The context manager closes the handle once the upload call returns;
    # previously the file object was never closed.
    with open(f"dump/openai-batch/batch-{i}-bias.jsonl", "rb") as batch_fh:
        batch_input_file = client.files.create(
            file=batch_fh,
            purpose="batch"
        )
    print(f"{i} {batch_input_file.id}")
    batch_input_files.append(batch_input_file)

print(batch_input_files)

In [None]:
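# # KINDA DANGEROUS: running this cell submits one new batch job per uploaded
# # input file (client.batches.create). It is presumably kept commented out to
# # avoid accidental re-submission.
# # NOTE: the next cell reads batch ids, one per line, from
# # dump/openai-batch/openai-batches-bias.txt; nothing in this notebook writes
# # that file, so the ids printed here appear to have been copied there by hand.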
# batch_objects = []
# for i, b in enumerate(batch_input_files):
#     batch_input_file_id = b.id
#     batch_object = client.batches.create(
#         input_file_id=batch_input_file_id,
#         endpoint="/v1/chat/completions",
#         completion_window="24h",
#         metadata={
#             "description": f"[AI REVIEWER] Set {i}"
#         }
#     )
#     print(f"{i} {batch_object.id}")
#     batch_objects.append(batch_object)

0 batch_67e3d7b805488190ae75e6cfd5e4e26d
1 batch_67e3d7b9e21481909e29f6d06d1d7071
2 batch_67e3d7ba596481908d0a0bec7fb8236e
3 batch_67e3d7bac2588190b746c158afa36055
4 batch_67e3d7bb21048190a182517317d05338


In [2]:
# Read the submitted batch ids, one per line.
with open("dump/openai-batch/openai-batches-bias.txt", 'r') as f:
    # Strip surrounding whitespace and drop blank lines so that a trailing
    # newline in the file does not turn into an empty batch id for the
    # retrieve call in the next cell.
    batches = [line.strip() for line in f if line.strip()]

In [30]:
# Look up every batch, print its status object and remember its output file id.
file_ids = []
for batch_id in batches:
    batch_info = client.batches.retrieve(batch_id)
    print(batch_info)
    file_ids.append(batch_info.output_file_id)

Batch(id='batch_67e3d7b805488190ae75e6cfd5e4e26d', completion_window='24h', created_at=1742985144, endpoint='/v1/chat/completions', input_file_id='file-2YGzV1UEC9BECvtJ7Ffdfo', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1742985769, error_file_id=None, errors=None, expired_at=None, expires_at=1743071544, failed_at=None, finalizing_at=1742985684, in_progress_at=1742985147, metadata={'description': '[AI REVIEWER] Set 0'}, output_file_id='file-Df1U3qu8XSBzD7TxrZnXQQ', request_counts=BatchRequestCounts(completed=1404, failed=0, total=1404))
Batch(id='batch_67e3d7b9e21481909e29f6d06d1d7071', completion_window='24h', created_at=1742985145, endpoint='/v1/chat/completions', input_file_id='file-Q6uehFifeRCk3v3kFT7q9X', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1742985525, error_file_id=None, errors=None, expired_at=None, expires_at=1743071545, failed_at=None, finalizing_at=1742985437, in_progress_at=174298

In [31]:
# Display the output file ids collected from the batch status objects.
file_ids

['file-Df1U3qu8XSBzD7TxrZnXQQ',
 'file-Daa42L59Bk2yrQmPHWNNPb',
 'file-XbJjMx7Z8Qr6YrxcVoC35x',
 'file-UvQrm9oUWizbCUVwgmNhDF',
 'file-WJDBmBqmDeHHWcG7fwVjX2']

In [32]:
# Download all batch output files first, so a failed download cannot leave a
# half-written combined file behind.
output_texts = [client.files.content(file_id).text for file_id in file_ids]

# Write the combined JSONL once, in write mode: re-running the cell replaces the
# file instead of appending a second copy of every result.
with open("dump/eval-output/openai-output-bias.jsonl", 'w') as f:
    for output_text in output_texts:
        f.write(output_text)
        # Keep one record per line even if a downloaded file lacks a trailing newline.
        if output_text and not output_text.endswith("\n"):
            f.write("\n")

In [None]:
import pandas as pd
import json

# df = pd.read_csv("dump/csv/papers.csv")

def _parse_custom_id(custom_id):
    """Split a custom_id of the form '<paper id>Q<bias>|<repeat>+<type>'.

    The bias tag and the '<repeat>+<type>' suffix never contain 'Q' or '|', so
    splitting from the right keeps paper ids that contain those characters intact.

    Returns:
        A tuple (paper_id, bias, repeat, type) of strings.
    """
    head, tail = custom_id.rsplit("|", 1)
    paper_id, bias = head.rsplit("Q", 1)
    repeat, typ = tail.split("+")
    return paper_id, bias, repeat, typ


# Read the combined batch output; the context manager closes the file on error too.
with open("dump/eval-output/openai-output-bias.jsonl", "r") as f:
    file_response = f.read()

# Split on "\n" only and skip blank lines, so the last record is kept whether or
# not the file ends with a newline.
for line in file_response.split("\n"):
    if not line.strip():
        continue

    record = json.loads(line)
    paper_id, bias, no, typ = _parse_custom_id(record['custom_id'])

    # Row of `df` (from the loading cell) that this result belongs to.
    matches = df.index[df['id'] == paper_id].tolist()
    if not matches:
        raise IndexError(
            f"custom_id {record['custom_id']!r} refers to a paper id that is not in df"
        )
    idx = matches[0]

    content = record['response']['body']['choices'][0]['message']['content']

    if typ == "top5":
        metrics = ['score']
        validateModel = prompts.Top5Model
    else:
        metrics = ['originality', 'rigor', 'scope', 'impact', 'written_by_ai']
        validateModel = prompts.AnalysisModel

    # Validate the JSON answer once per record rather than once per metric.
    parsed = validateModel.model_validate_json(content)

    for metric in metrics:
        # One column per bias variant, metric and 1-based repeat number.
        column_name = f"openai-{bias}-{metric}-{int(no)+1}"

        if column_name not in df.columns:
            df[column_name] = None

        df.loc[idx, column_name] = getattr(parsed, metric)

In [None]:
# Display the paper table including the openai-* columns filled in by the parsing cell.
df

In [42]:
# Save the paper table with the added openai-* columns; the row index is not written.
df.to_csv("dump/csv/bias_o.csv", index=False)