In [11]:
import json
import math
import os
import threading
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
from dotenv import dotenv_values
from openai import OpenAI

import functions.prompts as prompts

# Settings are read from the local .env file as a plain mapping; the API token
# is looked up from it instead of being hard-coded in the notebook.
config = dotenv_values(".env")

# The stock OpenAI client is pointed at DeepInfra's /v1/openai endpoint.
# A missing DEEPINFRA_TOKEN entry raises KeyError here.
client = OpenAI(
    base_url="https://api.deepinfra.com/v1/openai",
    api_key=config["DEEPINFRA_TOKEN"],
)

In [12]:
# Paper metadata table; the cells below add one column per
# model / percentage / metric / run to this frame in place.
df = pd.read_csv("dump/csv/papers.csv")
# Alternative preview showing the first and last four rows:
# df.iloc[np.r_[0:4, -4:0]]
df.head()

Unnamed: 0,id,file,name,journal,authors,affiliations,len-original,len-anond
0,1_0,1. Econometrica/ecta200736.pdf,The Political Economy of Zero-Sum Thinking,Econometrica,S. Nageeb Ali; Maximilian Mihm; Lucas Siga,"Department of Economics, Pennsylvania State Un...",16496.0,15956.0
1,1_1,1. Econometrica/ecta200731.pdf,Social Media and Collective Action in China,Econometrica,Bei Qin; David Strömberg; Yanhui Wu,"Bei Qin: Department of Accountancy, Economics ...",18206.0,17419.0
2,1_2,1. Econometrica/ecta200725.pdf,Ambiguous Contracts,Econometrica,Paul Dütting; Michal Feldman; Daniel Peretz; L...,"Google Research; School of Computer Science, T...",11830.0,11448.0
3,1_3,1. Econometrica/ecta200741.pdf,PERSUASION MEETS DELEGATION,Econometrica,Anton Kolotilin; Andriy Zapechelnyuk,"Anton Kolotilin: School of Economics, UNSW Bus...",15756.0,15075.0
4,1_4,1. Econometrica/Econometrica - 2025 - Berger -...,"Minimum Wages, Efficiency, and Welfare",Econometrica,David Berger; Kyle Herkenhoff; Simon Mongey,"Economics Department, Duke University; Departm...",18607.0,16735.0


In [13]:
def gen_body(text, top5=True, model="meta-llama/Llama-3.3-70B-Instruct-Turbo"):
    """Build the keyword arguments for one chat-completion request.

    Args:
        text: User message content (the paper title and text).
        top5: When true, use the `prompts.top5()` instructions with the
            `prompts.Top5Model` schema; otherwise use `prompts.analysis()`
            with the `prompts.AnalysisModel` schema.
        model: Model identifier placed in the request.

    Returns:
        A dict with "model" and "messages" (system, user, and an assistant
        message pre-filled with "{").
    """
    if top5:
        instructions = prompts.top5()
        schema = prompts.Top5Model.model_json_schema()
    else:
        instructions = prompts.analysis()
        schema = prompts.AnalysisModel.model_json_schema()

    # NOTE(review): str() embeds the schema as a Python dict repr (single
    # quotes), not as JSON text - confirm this is intended.
    system_content = (
        f"{instructions}\nPlease respond in valid JSON format that matches this schema: {str(schema)}. "
        "**IMPORTANT**: ONLY RESPOND WITH AN JSON OBJECT CONTAINING SCORES ACCORDING TO THE ABOVE SCHEMA. "
        "THE RESPONSE MUST END WITH A CURLY BRACKET. DO NOT ADD ANALYSIS OR EXPLANATION."
    )

    system_message = {"role": "system", "content": system_content}
    user_message = {"role": "user", "content": text}
    # The trailing assistant message holds only the opening brace.
    assistant_prefill = {"role": "assistant", "content": "{"}

    return {
        "model": model,
        "messages": [system_message, user_message, assistant_prefill],
    }

In [19]:
def partial(paper, no):
    """Return the leading `no` percent of `paper`, counted in words.

    The text is split on whitespace, the word count to keep is rounded up,
    and the kept words are re-joined with single spaces (so the original
    whitespace and line breaks are not preserved).

    Args:
        paper: Full paper text.
        no: Percentage of words to keep (e.g. 0.1, 1, 10, 50). Values above
            100 return the whole text; zero or negative values return "".

    Returns:
        The truncated text as a single space-separated string.
    """
    words = paper.split()  # tokenise once instead of once per use
    # Clamp at zero: a negative bound would slice from the end of the list
    # and return most of the paper instead of nothing.
    keep = max(0, math.ceil((len(words) * no) / 100))
    return " ".join(words[:keep])

def llm_paper(client, i, df, per, model="google/gemma-3-27b-it", n_samples=3):
    """Query the model about the first `per` percent of one paper.

    Reads the paper text from `output/<id>.txt`, truncates it with
    `partial`, and sends `n_samples` top-5 requests followed by `n_samples`
    analysis requests, all built by `gen_body`.

    Args:
        client: Client exposing `chat.completions.create(**body)`.
        i: Positional row index of the paper in `df`.
        df: Frame with at least the `id` and `name` columns.
        per: Percentage of the paper's words to include.
        model: Model identifier forwarded to `gen_body`
            (an alternative is "meta-llama/Llama-3.3-70B-Instruct-Turbo").
        n_samples: Number of repeated requests per prompt type.

    Returns:
        {"scores": [...], "analysis": [...]} holding the raw responses.

    Raises:
        OSError: If the paper's text file cannot be opened or read.
    """
    row = df.iloc[i]

    # `with` closes the handle even when read() raises.
    with open(f"output/{row['id']}.txt", "r") as f:
        text = f.read()

    paper = f"PAPER TITLE: {row['name']}\n\nPAPER TEXT: {partial(text, per)}"

    scores = [
        client.chat.completions.create(**gen_body(paper, model=model))
        for _ in range(n_samples)
    ]
    analysis = [
        client.chat.completions.create(**gen_body(paper, top5=False, model=model))
        for _ in range(n_samples)
    ]

    return {
        "scores": scores,
        "analysis": analysis
    }
    
def _extract_json_object(raw):
    """Rebuild and decode the flat JSON object contained in a model reply.

    The reply may or may not repeat the pre-filled opening brace and may be
    wrapped in a Markdown code fence. Everything after the first closing
    brace is discarded, so only flat (non-nested) objects are supported.
    """
    body = raw.strip()

    # Drop a leading Markdown fence, either ```json or a bare ```.
    if body.startswith("```"):
        body = body[3:]
        if body.startswith("json"):
            body = body[4:]

    # Keep only what precedes the first closing brace, then normalise the
    # start so exactly one opening brace is present.
    body = body.split("}")[0].lstrip("{ \t\r\n")
    candidate = "{" + body + "}"

    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        # Fall back to treating single quotes as JSON quotes. Doing this
        # only on failure keeps apostrophes inside valid strings intact.
        return json.loads(candidate.replace("'", '"'))


def parse_r(r, id, typ):
    """Parse and validate one chat-completion response.

    Args:
        r: Response object; the text is read from
            `r.choices[0].message.content`.
        id: Identifier echoed back in the result.
        typ: "top5" validates against `prompts.Top5Model`; any other value
            validates against `prompts.AnalysisModel`.

    Returns:
        {"id": id, "scores": <validated dict>} on success, or
        {"id": id, "scores": None} when the reply cannot be read, decoded
        or validated (the error is printed, not raised).
    """
    validateModel = prompts.Top5Model if typ == "top5" else prompts.AnalysisModel
    raw = None  # stays None if the response has no readable content
    try:
        raw = r.choices[0].message.content
        payload = _extract_json_object(raw)
        return {
            "id": id,
            "scores": validateModel.model_validate(payload).model_dump()
        }
    except Exception as e:
        # Report from the already-captured text: re-reading the response
        # here could raise a second time and escape this handler.
        shown = "{" + raw if isinstance(raw, str) else repr(raw)
        print(f"Error {e} - {shown}")
        return {
            "id": id,
            "scores": None
        }

def parse_paper(rs):
    """Parse every raw response collected for one paper.

    Args:
        rs: Dict with "scores" and "analysis" lists of raw responses.

    Returns:
        Dict with the same two keys, each holding the `parse_r` results.
        Each result's id is the response's position within its list.
    """
    parsed = {}
    # "scores" responses are validated as "top5", "analysis" as "analysis".
    for key, typ in (("scores", "top5"), ("analysis", "analysis")):
        parsed[key] = [parse_r(response, pos, typ) for pos, response in enumerate(rs[key])]
    return parsed

In [15]:
# Short model label used as the prefix of every result column and as the
# checkpoint directory name.
model_name = "gemma"

# update_df runs inside ThreadPoolExecutor workers that all share one
# DataFrame; this lock serialises the column creation and cell writes.
_DF_WRITE_LOCK = threading.Lock()

def update_df(df, i, no, score, typ, per):
    """Write one validated result into `df`, creating columns as needed.

    Columns are named `<model_name>-<per>-<metric>-<no + 1>`.

    Args:
        df: Frame updated in place.
        i: Row label passed to `df.loc`. Callers pass positional indices,
            which match the labels only for a default RangeIndex.
        no: Zero-based repetition number; the column suffix is `no + 1`.
        score: Parsed scores (or None when parsing failed upstream).
        typ: "top5" writes the single `score` metric; anything else writes
            the five analysis metrics.
        per: Percentage label embedded in the column name.

    Validation or write failures are printed and skipped. The columns are
    still created, so failed cells stay empty.
    """
    metrics = ['score'] if typ == "top5" else ['originality', 'rigor', 'scope', 'impact', 'written_by_ai']
    validateModel = prompts.Top5Model if typ == "top5" else prompts.AnalysisModel

    # Validate once per call rather than once per metric.
    try:
        validated = validateModel.model_validate(score)
        failure = None
    except Exception as exc:
        validated, failure = None, exc

    with _DF_WRITE_LOCK:
        for metric in metrics:
            column_name = f"{model_name}-{per}-{metric}-{int(no)+1}"

            if column_name not in df.columns:
                df[column_name] = None

            try:
                if failure is not None:
                    raise failure
                df.loc[i, column_name] = getattr(validated, metric)
            except Exception as exc:
                # Best effort: report the cause and leave the cell empty.
                print(f"ERROR | Can't update the model in {column_name}, skipping... ({exc})")

In [20]:
def evaluate_paper(client, i, df, percentages=(0.1, 1, 10, 50)):
    """Score one paper at several truncation levels and store the results.

    For each percentage the paper is sent to the model (`llm_paper`), the
    replies are parsed (`parse_paper`), and every parsed result is written
    into `df` through `update_df`.

    Args:
        client: API client forwarded to `llm_paper`.
        i: Positional row index of the paper in `df`.
        df: Frame that is read for metadata and updated in place.
        percentages: Percentages of the paper text to evaluate.

    Returns:
        None. Results are stored in `df`.
    """
    title = df.iloc[i]['name']
    print(f"Evaluating {i} -> {title}")

    for per in percentages:
        parsed = parse_paper(llm_paper(client, i, df, per))

        # The column label keeps only the integer part, so 0.1 becomes "0".
        # NOTE(review): distinct sub-1 percentages would share that label.
        per_str = str(per).split(".")[0]

        # Top-5 scores are written first, then the analysis metrics.
        for typ, key in (("top5", "scores"), ("analysis", "analysis")):
            for j, item in enumerate(parsed[key]):
                update_df(df, i, j, item['scores'], typ, per_str)

    print(f"Finish Evaluating {i} -> {title}")

# Single-paper smoke test:
# evaluate_paper(client, 0, df)

In [None]:
# from concurrent.futures import ThreadPoolExecutor

# def evaluate_paper_per(client, i, df, per):
#     print(f"Evaluating {i} -> {df.iloc[i]['name']} with percentage {per}")
    
#     x = llm_paper(client, i, df, per)
#     y = parse_paper(x)

#     per_str = str(per).split(".")[0]
    
#     for j, s in enumerate(y['scores']):
#         update_df(df, i, j, s['scores'], "top5", per_str)

#     for j, a in enumerate(y['analysis']):
#         update_df(df, i, j, a['scores'], "analysis", per_str)
    
#     return per

# def evaluate_paper(client, i, df):
#     print(f"Starting evaluation for {i} -> {df.iloc[i]['name']}")
    
#     percentages = [0.1, 1, 10, 50]
    
#     with futures.ThreadPoolExecutor(max_workers=len(percentages)) as executor:
#         futures = {executor.submit(evaluate_paper_per, client, i, df, per): per for per in percentages}
        
#         for future in futures.as_completed(futures):
#             per = futures[future]
#             try:
#                 result = future.result()
#                 print(f"Completed {i} with percentage {result}")
#             except Exception as exc:
#                 print(f"Generated an exception for {i} with percentage {per}: {exc}")
    
#     print(f"Completed all evaluations for {i} -> {df.iloc[i]['name']}")

In [22]:
# Evaluate every paper in fixed-size chunks, writing a CSV checkpoint of the
# whole frame after each chunk.
chunk = 100
total_chunks = math.ceil(len(df) / chunk)

# Create the checkpoint directory before any API work, so the first
# to_csv cannot fail on a missing folder after a chunk has been evaluated.
os.makedirs(model_name, exist_ok=True)

for i in range(0, len(df), chunk):
    chunk_no = (i // chunk) + 1
    print(f"PROCESSING CHUNK {chunk_no} of {total_chunks}")

    # Row positions in this chunk; the last chunk may be shorter.
    indices = range(i, min(i + chunk, len(df)))

    with ThreadPoolExecutor(max_workers=200) as executor:
        # list() drains the iterator so a worker exception is re-raised here.
        results = list(executor.map(
            lambda j: evaluate_paper(client, j, df),
            indices,
        ))

    df.to_csv(f"{model_name}/{chunk_no}.csv", index=False)

PROCESSING CHUNK 1 of 11
Evaluating 0 -> The Political Economy of Zero-Sum Thinking
Evaluating 1 -> Social Media and Collective Action in China
Evaluating 2 -> Ambiguous Contracts
Evaluating 3 -> PERSUASION MEETS DELEGATION
Evaluating 4 -> Minimum Wages, Efficiency, and Welfare
Evaluating 5 -> The Impacts of Managerial Autonomy on Firm Outcomes
Evaluating 6 -> Sparse Network Asymptotics for Logistic Regression Under Possible Misspecification
Evaluating 7 -> Seeding a Simple Contagion
Evaluating 8 -> CAUTION AND REFERENCE EFFECTS
Evaluating 9 -> HISTORY’S MASTERS: THE EFFECT OF EUROPEAN MONARCHS ON STATE PERFORMANCE
Evaluating 10 -> Stories, Statistics, and Memory
Evaluating 11 -> A Welfare Analysis of Tax Audits across the Income Distribution
Evaluating 12 -> War Reparations, Structural Change, and Intergenerational Mobility
Evaluating 13 -> THE DYNAMICS OF ABUSIVE RELATIONSHIPS
Evaluating 14 -> Present Bias Amplifies the Household Balance-Sheet Channels of Macroeconomic Policy
Evaluat

In [None]:
# # Fallback
# fallback = df[df.isna().any(axis=1)].index

# with ThreadPoolExecutor(max_workers=200) as executor:
#         results = list(executor.map(
#             evaluate_paper, 
#             [client] * len(fallback),
#             fallback,
#             [df] * len(fallback),
#         ))
        
# df.to_csv(f"{model_name}/fallback.csv", index=False)

In [47]:
# Earlier results table; the next cell copies its `gemma-*` columns into
# `df` as the 100% columns.
full = pd.read_csv("dump/csv/result_oalg.csv")

In [48]:
# Columns to copy from `full`: the three score runs first, then the five
# analysis metrics for run 1, run 2 and run 3.
analysis_metrics = ['originality', 'rigor', 'scope', 'impact', 'written_by_ai']
run_numbers = (1, 2, 3)

cols = [f"gemma-score-{run}" for run in run_numbers]
cols += [f"gemma-{metric}-{run}" for run in run_numbers for metric in analysis_metrics]

for col in cols:
    # Insert "100" after the model prefix, e.g. gemma-score-1 -> gemma-100-score-1.
    # NOTE(review): the assignment aligns `full` to `df` by row index, not by
    # paper id - confirm both frames list the papers in the same order.
    prefix, rest = col.split("-", 1)
    df[f"{prefix}-100-{rest}"] = full[col]

In [50]:
# Save the combined table (partial-text columns plus the copied 100% columns).
# NOTE(review): unlike the per-chunk checkpoints, index=False is not passed
# here, so the row index is written as an extra unnamed column - confirm
# this is intended.
df.to_csv('gemma-partial.csv')