In [None]:
import openai
import json
import math
from openai.lib._parsing._completions import type_to_response_format_param
import pandas as pd

import functions.prompts as prompts

# Shared OpenAI client; the upload, batch-creation and retrieval cells below all go through it.
client = openai.Client()

In [None]:
# Load the paper catalogue and tag every paper with its rank, i.e. the part
# of the id that precedes the first underscore.
df = pd.read_csv("dump/csv/papers.csv")
df['rank'] = [paper_id.split("_")[0] for paper_id in df['id']]

# Draw three papers per rank with a fixed seed so the sample is reproducible,
# then renumber the rows from zero.
df = (
    df.groupby('rank')
    .sample(n=3, random_state=42)
    .reset_index(drop=True)
)

In [None]:
def wrap(body, id):
    """Wrap a chat-completions request body into one batch request record.

    Args:
        body: The request payload placed under the ``"body"`` key.
        id: Value stored as ``"custom_id"``.

    Returns:
        A dict with the keys ``custom_id``, ``method``, ``url`` and ``body``
        (in that order).
    """
    request = dict(custom_id=id, method="POST", url="/v1/chat/completions")
    request["body"] = body
    return request
    
def gen_body(text, top5=True):
    """Build the chat-completions payload for one paper.

    Args:
        text: Content of the user message.
        top5: When truthy, use ``prompts.top5()`` as the system prompt and
            ``prompts.Top5Model`` for the response format; otherwise use
            ``prompts.analysis()`` and ``prompts.AnalysisModel``.

    Returns:
        A dict with ``model``, ``messages`` and ``response_format`` entries.
    """
    if top5:
        system_prompt = prompts.top5()
        schema_model = prompts.Top5Model
    else:
        system_prompt = prompts.analysis()
        schema_model = prompts.AnalysisModel

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]

    return {
        "model": "gpt-4o-mini",
        "messages": messages,
        "response_format": type_to_response_format_param(schema_model),
    }


In [None]:
# Affiliations inserted into the "AFFILIATION:" field of the prompt.
institutions = [
    "Massachusetts Institute of Technology",
    "Harvard University",
    "London School of Economics and Political Science",
    "University of Cape Town",
    "Nanyang Technological University",
    "Chulalongkorn University",
]

# Author names for the "top" variant of the "AUTHOR:" field.
top_names = [
    "Andrei Shleifer",
    "Daron Acemoglu",
    "James J. Heckman",
    "Joseph E. Stiglitz",
    "John List",
    "Carmen M. Reinhart",
    "Janet Currie",
    "Esther Duflo",
    "Asli Demirguc-Kunt",
    "Marianne Bertrand",
]

# Author names for the "ran" variant of the "AUTHOR:" field.
# NOTE(review): presumably generated placeholder names - confirm.
random_names = [
    "Bruce S. Green",
    "Alejandro L. James",
    "Billie J. Abels",
    "Paul A. Jenkins",
    "Gary L. Bodie",
    "Gail J. Doan",
    "Shirley S. Hodgkins",
    "Pattie K. Reinhardt",
    "Tara R. Weber",
    "Tabitha J. Cox",
]

In [None]:
import numpy as np

# Number of JSONL shard files the requests are spread over; the upload cell
# iterates over range(divider), so this name must stay defined.
divider = 5
# Tags written into custom_id for the two author lists, in the same order as
# [top_names, random_names] below.
t_name = ["top", "ran"]

# Shards already written to during this execution of the cell. The first
# write to a shard truncates it, later writes append, so re-running the cell
# regenerates the files instead of duplicating every custom_id.
_started_shards = set()


def _write_request(shard, prompt_text, custom_id):
    """Serialise one batch request and write it as a line of the shard file."""
    mode = 'a' if shard in _started_shards else 'w'
    _started_shards.add(shard)
    with open(f"dump/openai-batch/batch-{shard}-bias.jsonl", mode) as out:
        out.write(json.dumps(wrap(gen_body(prompt_text), custom_id)) + "\n")


for index, row in df.iterrows():
    paper_id = row["id"]

    # Read the paper text and close the file before generating requests.
    with open(f'output/{paper_id}.txt', 'r') as src:
        text = src.read()

    # All requests of one paper go to the same shard; relies on df having a
    # 0..len(df)-1 index.
    file_index = math.floor(index / (len(df) / divider))

    # One request per institution, with the affiliation injected.
    for ind, ins in enumerate(institutions):
        paper = f"PAPER TITLE: {row['name']}\n\nAFFILIATION: {ins}\n\nPAPER TEXT: {text}"
        _write_request(file_index, paper, f"{paper_id}Qins{ind}|0+top5")

    # One request per author name, for both name lists.
    for ind, names in enumerate([top_names, random_names]):
        for ind2, name in enumerate(names):
            paper = f"PAPER TITLE: {row['name']}\n\nAUTHOR: {name}\n\nPAPER TEXT: {text}"
            _write_request(file_index, paper, f"{paper_id}Q{t_name[ind]}{ind2}|0+top5")

In [None]:
# Upload every shard file and keep the returned file objects for the
# batch-creation cell.
batch_input_files = []

for i in range(divider):
    print(f"Sending batch {i}")
    # Context manager so the local file handle is closed once the upload
    # call returns (the original left it open).
    with open(f"dump/openai-batch/batch-{i}-bias.jsonl", "rb") as shard_file:
        batch_input_file = client.files.create(
            file=shard_file,
            purpose="batch"
        )
    print(f"{i} {batch_input_file.id}")
    batch_input_files.append(batch_input_file)

print(batch_input_files)

In [None]:
# DANGEROUS: each run of this cell submits one new batch job per uploaded file.
# NOTE(review): the ids are only printed here, while a later cell reads ids
# from dump/openai-batch/openai-batches-bias.txt - presumably they are copied
# there by hand; confirm.
batch_objects = []
for set_no, uploaded in enumerate(batch_input_files):
    description = f"[AI REVIEWER] Set {set_no}"
    batch_object = client.batches.create(
        input_file_id=uploaded.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"description": description},
    )
    print(f"{set_no} {batch_object.id}")
    batch_objects.append(batch_object)

In [None]:
# Batch ids, one per line. Blank lines (including the empty string that a
# trailing newline produces) are dropped so that no empty id is retrieved
# in the next cell.
with open("dump/openai-batch/openai-batches-bias.txt", 'r') as f:
    batches = [entry.strip() for entry in f.read().split("\n") if entry.strip()]

In [None]:
# Look up every batch and remember the id of its output file.
file_ids = []
for batch_id in batches:
    batch_info = client.batches.retrieve(batch_id)
    print(batch_info)
    file_ids.append(batch_info.output_file_id)

In [None]:
# Show the collected output file ids as the cell result.
file_ids

In [None]:
# Append the content of every output file to one combined JSONL file.
# The target is opened once instead of once per file.
with open("dump/eval-output/openai-output-bias.jsonl", 'a') as f:
    for file_id in file_ids:
        file_response = client.files.content(file_id)
        chunk = file_response.text
        # Make sure each payload ends with a newline; otherwise its last
        # record and the first record of the next payload would end up on
        # one line and break the line-by-line JSON parsing later on.
        if chunk and not chunk.endswith("\n"):
            chunk += "\n"
        f.write(chunk)

In [None]:
import pandas as pd
import json

# Read the combined batch output (one JSON record per line); the context
# manager closes the file even if reading fails.
with open("dump/eval-output/openai-output-bias.jsonl", "r") as f:
    file_response = f.read()

for line in file_response.split("\n"):
    # Skip blank lines rather than blindly dropping the last element, so a
    # file without a trailing newline does not lose its final record.
    if not line.strip():
        continue

    record = json.loads(line)

    # custom_id layout: "<paper id>Q<bias tag>|<run no>+<prompt type>".
    # Split from the right so a paper id that itself contains "Q" or "|"
    # still parses.
    head, tail = record['custom_id'].rsplit("|", 1)
    paper_id, bias = head.rsplit("Q", 1)
    no, typ = tail.split("+")

    # Row of df that belongs to this paper (raises IndexError if the id is
    # not part of the current sample).
    idx = df.index[df['id'] == paper_id].tolist()[0]

    content = record['response']['body']['choices'][0]['message']['content']

    if typ == "top5":
        metrics = ['score']
        validateModel = prompts.Top5Model
    else:
        metrics = ['originality', 'rigor', 'scope', 'impact', 'written_by_ai']
        validateModel = prompts.AnalysisModel

    # Validate the model output once per record, not once per metric.
    o = validateModel.model_validate_json(content)

    for metric in metrics:
        column_name = f"openai-{bias}-{metric}-{int(no)+1}"

        # Create the result column lazily the first time it is needed.
        if column_name not in df.columns:
            df[column_name] = None

        df.loc[idx, column_name] = getattr(o, metric)

In [None]:
# Write df (including any result columns added by the parsing cell) to CSV, without the index column.
df.to_csv("dump/csv/bias_o.csv", index=False)