In [1]:
import anthropic
from anthropic.types.message_create_params import MessageCreateParamsNonStreaming
from anthropic.types.messages.batch_create_params import Request

import json
import math
import pandas as pd

import functions.prompts as prompts

# Shared Anthropic API client used by every batch call below.
# NOTE(review): no credentials are passed here; presumably the SDK picks the
# API key up from the environment — confirm before running on a new machine.
client = anthropic.Anthropic()

In [3]:
import numpy as np

# Load the paper table and keep only the rows whose id prefix (the text before
# the first "_") is one of the selected ranks.
df = pd.read_csv("dump/csv/papers.csv")
df['rank'] = df['id'].map(lambda paper_id: paper_id.split("_", 1)[0])
in_selected_rank = df['rank'].isin(["1", "25", "50", "75", "100"])
df = df[in_selected_rank].reset_index(drop=True)

len(df)

# random -> only real paper / is not in 1/25/50/75/100 -> sample n=10
# rand = df[:-100].loc[~df['rank'].isin(["1", "25", "50", "75", "100"])].sample(n=10, random_state=42)

# sample = pd.concat([ranks, rand]).reset_index(drop=True)
# sample

50

In [4]:
def req(id, text, top5=True):
    """Build one Message Batches request entry for a single paper excerpt.

    Args:
        id: Value used as the request's ``custom_id``.
        text: Content of the user turn (the paper excerpt).
        top5: When true, use the ``top5`` prompt and ``Top5Model`` schema;
            otherwise use the ``analysis`` prompt and ``AnalysisModel`` schema.

    Returns:
        A ``Request`` whose system prompt embeds the chosen instructions and
        JSON schema, and whose assistant turn is prefilled with "{" so the
        reply continues a JSON object.
    """
    if top5:
        instructions = prompts.top5()
        schema = prompts.Top5Model.model_json_schema()
    else:
        instructions = prompts.analysis()
        schema = prompts.AnalysisModel.model_json_schema()

    system_prompt = (
        f"{instructions}\n"
        f"Please respond in valid JSON format that matches this schema: {str(schema)}. "
        "**IMPORTANT**: ONLY RESPOND WITH AN JSON OBJECT CONTAINING SCORES ACCORDING TO THE ABOVE SCHEMA. "
        "THE RESPONSE MUST END WITH A CURLY BRACKET. DO NOT ADD ANALYSIS OR EXPLANATION."
    )

    user_turn = {"role": "user", "content": text}
    # Prefilled opening brace: the model's reply picks up right after it.
    assistant_prefill = {"role": "assistant", "content": "{"}

    params = MessageCreateParamsNonStreaming(
        model="claude-3-5-haiku-20241022",
        max_tokens=1024,
        system=system_prompt,
        messages=[user_turn, assistant_prefill],
    )
    return Request(custom_id=id, params=params)
    
def batch(text, id):
    """Return six requests for one excerpt: three top5 runs, then three analysis runs.

    Each request's custom id is ``{id}Z{run}Q{kind}`` with ``run`` in 0..2 and
    ``kind`` either ``top5`` or ``analysis``.
    """
    top5_requests = [req(f"{id}Z{run}Qtop5", text) for run in range(3)]
    analysis_requests = [req(f"{id}Z{run}Qanalysis", text, top5=False) for run in range(3)]
    return top5_requests + analysis_requests

In [25]:
# Number of separate batches the requests are spread over; each batch gets its
# own (initially empty) request list keyed "batch-<n>".
divider = 1
full_req = {}
for i in range(divider):
    full_req.setdefault(f'batch-{i}', [])

def partial(paper, no):
    """Return the leading ``no`` percent of ``paper``'s words.

    The text is split on whitespace, the word count to keep is rounded up, and
    the kept words are re-joined with single spaces.
    """
    words = paper.split()
    keep = math.ceil(len(words) * no / 100)
    return " ".join(words[:keep])

# For every paper, read its text file and queue requests for each excerpt size
# (percent of words kept) into the batch the paper's row index maps to.
papers_per_batch = len(df) / divider
for index, row in df.iterrows():
    file_index = math.floor(index / papers_per_batch)
    with open(f'output/{row["id"]}.txt', 'r') as f:
        text = f.read()
    paper = f"PAPER TITLE: {row['name']}\n\nPAPER TEXT: {text}"

    for percent in [0.1, 1, 10, 50, 100]:
        # Only the part before the decimal point goes into the id, so 0.1 is tagged "0".
        percent_tag = str(percent).split(".")[0]
        request_prefix = row["id"] + "P" + percent_tag
        full_req[f"batch-{file_index}"].extend(batch(partial(paper, percent), request_prefix))

In [26]:
# Report how many requests were queued in each batch.
for i in range(divider):
    queued = full_req[f'batch-{i}']
    print(len(queued))

1500


In [27]:
# Spot-check one queued request from the last batch. Indexing with `divider`
# instead of a loop variable left over from an earlier cell keeps this cell
# runnable on its own.
full_req[f'batch-{divider - 1}'][10]

{'custom_id': '1_0P1Z1Qanalysis',
 'params': {'model': 'claude-3-5-haiku-20241022',
  'max_tokens': 1024,
  'system': 'Please evaluate the attached research according to the following criteria.\n\nORIGINALITY\n"In your capacity as an editorial board/reviewer for this paper, please rate this paper’s originality. Note that papers with high originality typically address questions of broad, foundational importance or propose groundbreaking methodologies. They often set new standards in the field or open new research avenues. \n (0 = Completely unoriginal, …, 10 = Completely original)”\n\nRIGOR\n"In your capacity as an editorial board/reviewer for this paper, please rate this paper’s rigor. Note that papers that are rigorous are those in which the data handling and analysis process is highly transparent, with all steps carefully documented (0 = Not at all rigorous, …, 10 = Extremely rigorous)”\n\nSCOPE\n"In your capacity as an editorial board/reviewer for this paper, please rate this paper’

In [28]:
# KINDA DANGEROUS: every run of this cell submits all queued requests to the
# Message Batches API again.
batches = []
for i in range(divider):
    print(f"Sending Batch {i}")
    pending_requests = full_req[f'batch-{i}']
    message_batch = client.messages.batches.create(requests=pending_requests)
    print(f"{i} {message_batch.id}")
    batches.append(message_batch)

print(batches)

Sending Batch 0
0 msgbatch_01MWaRLXaNoCnT1QTMtTx1Ec
[MessageBatch(id='msgbatch_01MWaRLXaNoCnT1QTMtTx1Ec', archived_at=None, cancel_initiated_at=None, created_at=datetime.datetime(2025, 3, 25, 18, 8, 49, 649326, tzinfo=datetime.timezone.utc), ended_at=None, expires_at=datetime.datetime(2025, 3, 26, 18, 8, 49, 649326, tzinfo=datetime.timezone.utc), processing_status='in_progress', request_counts=MessageBatchRequestCounts(canceled=0, errored=0, expired=0, processing=1500, succeeded=0), results_url=None, type='message_batch')]


In [31]:
import time

# Hard-coded id of the batch to poll (matches the id printed by the submission
# cell's recorded output). Note: this module-level name shadows the built-in `id`.
id = "msgbatch_01MWaRLXaNoCnT1QTMtTx1Ec"
def wait(id, poll_interval=5):
    """Block until the given message batch has finished processing.

    Polls the batch once per iteration, prints its request counts, and returns
    as soon as the status is "ended". Unlike a check against "in_progress"
    only, this keeps waiting while a batch is "canceling", since results are
    not final until the batch has ended.

    Args:
        id: Batch id to poll (parameter name kept for compatibility; it
            shadows the built-in ``id`` inside this function).
        poll_interval: Seconds to sleep between polls.

    Returns:
        None.
    """
    while True:
        stat = client.messages.batches.retrieve(id)
        print(stat.request_counts)
        if stat.processing_status == "ended":
            return
        # Sleep only when another poll is actually needed.
        time.sleep(poll_interval)

# Blocks this cell until polling of the batch above stops.
wait(id)

In [32]:
# Read the batch ids to fetch, one per line. Blank lines (including the empty
# entry a trailing newline would otherwise produce) are dropped so that no
# empty id is later passed to the API.
# NOTE(review): nothing in this notebook writes this file — presumably it is
# maintained by hand from the ids printed at submission time; confirm.
with open("dump/anthropic-batch/anthropic-batch-partial.txt", 'r') as f:
    batches = [line.strip() for line in f.read().split("\n") if line.strip()]

In [34]:
def parse_r(r):
    """Turn one succeeded batch result into ``{"id": ..., "scores": ...}``.

    The reply text is prefixed with "{" (the requests prefill the assistant
    turn with an opening brace) and the first JSON object in it is parsed and
    validated against ``Top5Model`` when the custom id contains "top5",
    otherwise against ``AnalysisModel``.

    Args:
        r: A batch result exposing ``custom_id`` and
            ``result.message.content[0].text``.

    Returns:
        A dict with the custom id and the validated scores as a plain dict.
        ``scores`` is ``None`` when the reply is missing, is not valid JSON, or
        fails validation; the error is printed rather than raised.
    """
    id = r.custom_id
    validateModel = prompts.Top5Model if "top5" in id else prompts.AnalysisModel
    raw = None  # stays None if the reply text itself cannot be read
    try:
        raw = "{" + r.result.message.content[0].text
        try:
            # Parses the first complete JSON value and ignores anything after
            # it, so trailing commentary and "}" inside strings are harmless.
            payload, _ = json.JSONDecoder().raw_decode(raw)
        except json.JSONDecodeError:
            # Fallback for replies that never close the object: cut at the
            # first "}" (if any) and close it ourselves.
            payload = json.loads(raw.split("}")[0] + "}")
        return {
            "id": id,
            "scores": validateModel.model_validate(payload).model_dump()
        }
    except Exception as e:
        # Uses the already-captured text so this handler can never raise.
        print(f"Error {e} - {raw}")
        return {
            "id": id,
            "scores": None
        }

# Download the results of every listed batch and append one JSON line per
# succeeded request to the output file.
# The file is opened once, in append mode: re-running this cell adds the same
# records again, so clear the file first if a fresh export is wanted.
with open('dump/eval-output/anthropic-result-partial.jsonl', 'a') as f:
    for b in batches:
        if not b:
            # Skip empty ids (e.g. from a trailing newline in the id file).
            continue
        results = client.messages.batches.results(b)
        for r in results:
            if not r:
                continue
            if r.result.type != 'succeeded':
                # Make dropped requests visible instead of skipping them silently.
                print(f"Skipping {r.custom_id}: {r.result.type}")
                continue
            try:
                f.write(json.dumps(parse_r(r)) + "\n")
            except Exception as e:
                print("ERROR! - " + str(e))

In [39]:
import pandas as pd

# Merge the exported scores into the per-paper table, one column per
# (excerpt tag, metric, run): "anthropic-<tag>-<metric>-<run+1>".
# NOTE(review): the frame displayed further down has an "Unnamed: 0" column,
# which suggests segments.csv was saved with its index — consider index_col=0.
df = pd.read_csv("dump/csv/segments.csv")

with open('dump/eval-output/anthropic-result-partial.jsonl', 'r') as f:
    file_response = f.read()

for line in file_response.split("\n"):
    if not line.strip():
        # Tolerates blank lines and a missing or present trailing newline.
        continue
    l = json.loads(line)
    if l['scores'] is None:
        # Replies that could not be parsed are exported with null scores;
        # leave their cells empty instead of failing validation.
        continue

    # Custom ids look like "<paper id>P<excerpt tag>Z<run>Q<type>".
    prefix, suffix = l['id'].split("Z")[0], l['id'].split("Z")[1]
    paper_id, s = prefix.split("P")
    no, typ = suffix.split("Q")

    idx = df.index[df['id'] == paper_id].tolist()[0]

    metrics = ['score'] if typ == "top5" else ['originality', 'rigor', 'scope', 'impact', 'written_by_ai']
    validateModel = prompts.Top5Model if typ == "top5" else prompts.AnalysisModel

    # Validate once per record rather than once per metric.
    o = validateModel.model_validate(l['scores'])

    for metric in metrics:
        column_name = f"anthropic-{s}-{metric}-{int(no)+1}"

        if column_name not in df.columns:
            df[column_name] = None

        df.loc[idx, column_name] = getattr(o, metric)

In [41]:
# Derive each paper's rank from the part of its id before the first "_".
df['rank'] = df['id'].map(lambda paper_id: paper_id.split("_", 1)[0])
df.head()

Unnamed: 0,id,file,name,journal,authors,affiliations,len-original,len-anond,rank,anthropic-0-score-1,...,anthropic-100-originality-2,anthropic-100-rigor-2,anthropic-100-scope-2,anthropic-100-impact-2,anthropic-100-written_by_ai-2,anthropic-100-originality-3,anthropic-100-rigor-3,anthropic-100-scope-3,anthropic-100-impact-3,anthropic-100-written_by_ai-3
0,1_0,1. Econometrica/ecta200736.pdf,The Political Economy of Zero-Sum Thinking,Econometrica,S. Nageeb Ali; Maximilian Mihm; Lucas Siga,"Department of Economics, Pennsylvania State Un...",16496.0,15956.0,1,2,...,8,9,9,9,1,8,9,9,10,1
1,1_1,1. Econometrica/ecta200731.pdf,Social Media and Collective Action in China,Econometrica,Bei Qin; David Strömberg; Yanhui Wu,"Bei Qin: Department of Accountancy, Economics ...",18206.0,17419.0,1,2,...,9,10,9,10,1,9,10,9,10,1
2,1_2,1. Econometrica/ecta200725.pdf,Ambiguous Contracts,Econometrica,Paul Dütting; Michal Feldman; Daniel Peretz; L...,"Google Research; School of Computer Science, T...",11830.0,11448.0,1,2,...,8,9,9,8,1,9,10,9,8,1
3,1_3,1. Econometrica/ecta200741.pdf,PERSUASION MEETS DELEGATION,Econometrica,Anton Kolotilin; Andriy Zapechelnyuk,"Anton Kolotilin: School of Economics, UNSW Bus...",15756.0,15075.0,1,3,...,8,9,9,8,1,8,9,9,8,1
4,1_4,1. Econometrica/Econometrica - 2025 - Berger -...,"Minimum Wages, Efficiency, and Welfare",Econometrica,David Berger; Kyle Herkenhoff; Simon Mongey,"Economics Department, Duke University; Departm...",18607.0,16735.0,1,2,...,9,10,9,8,1,9,10,9,9,0


In [42]:
# Persist the merged table (paper metadata plus all score columns) without the
# DataFrame index.
df.to_csv("dump/csv/result_partial.csv", index=False)