In [None]:
import openai
import pandas as pd

import functions.prompts as prompts
import functions.dupes as dupes
import functions.llm as llm
import functions.anonymize as anonymize
import functions.process as process
import importlib

# Re-import the local helper modules so that edits made to them on disk are
# picked up without restarting the kernel.
for _module in (prompts, dupes, llm, anonymize, process):
    importlib.reload(_module)

# Shared API client handed to every helper call below; it is constructed with
# no explicit arguments, so all configuration comes from the library defaults.
client = openai.Client()

In [1]:
def anonymize_paper(path, client, id, df, danger_dupe=False):
    """Anonymize one PDF, record its stats in ``df`` and write the text files.

    Steps, in order: extract the texts from the PDF, ask the model for the
    paper metadata, strip the segments flagged as duplicates, anonymize what
    is left, update the matching row of ``df`` in place, and write
    ``./output/<id>.txt`` (anonymized) and ``./output/<id>-original.txt``.

    Args:
        path: Path of the PDF to process.
        client: API client passed through to the metadata, duplicate-detection
            and anonymization helpers.
        id: Value of the ``id`` column identifying the row of ``df`` to update;
            also used as the stem of the output file names.
        df: DataFrame with at least the columns ``id``, ``name``, ``authors``,
            ``affiliations``, ``len-original`` and ``len-anond``. Mutated in
            place (first row whose ``id`` matches).
        danger_dupe: When true, use ``dupes.get_flagged_duplicates_danger``
            instead of ``dupes.get_flagged_duplicates``.

    Returns:
        A dict with the keys ``metadata`` (validated ``MetadataModel``),
        ``original`` (extracted texts), ``anond`` (anonymized texts with page
        numbers removed) and ``id``; or ``None`` if any step raised, in which
        case the error is printed rather than propagated.
    """
    # Function-scope import so this cell does not depend on the import cell.
    import os

    try:
        print(f"Processing {path}")

        # Resolve the target row before doing any expensive work, so a wrong
        # id fails immediately and with a readable message.
        matches = df.index[df['id'] == id]
        if len(matches) == 0:
            raise ValueError(f"no row with id {id!r} in df")
        idx = matches[0]

        texts = process.extract_texts_from_pdf(path)
        metadata = process.get_paper_metadata(client, "\n".join(texts))
        print(f"- {path} | Metadata: {metadata}")

        find_dupes = (
            dupes.get_flagged_duplicates_danger
            if danger_dupe
            else dupes.get_flagged_duplicates
        )
        flagged_dupes = find_dupes(texts, metadata, client)
        formatted_texts = process.remove_segments_from_texts(texts, flagged_dupes)

        print(f"- {path} | Anonymizing Paper")
        anond = anonymize.anonymized_texts(formatted_texts, metadata, client, path)

        print(f"- {path} | Done Processing!")

        result = {
            "metadata": prompts.MetadataModel.model_validate_json(metadata),
            "original": texts,
            "anond": process.remove_page_numbers(anond),
            "id": id
        }

        # Label-based scalar writes: only the five derived columns are touched.
        df.at[idx, 'name'] = result["metadata"].title
        df.at[idx, 'authors'] = result["metadata"].authors
        df.at[idx, 'affiliations'] = result["metadata"].affiliations
        # Lengths are whitespace-separated word counts over all texts.
        df.at[idx, 'len-original'] = len(" ".join(result["original"]).split())
        df.at[idx, 'len-anond'] = len(" ".join(result["anond"]).split())

        # Without this, a missing directory would discard the whole run at the
        # very last step.
        os.makedirs('./output', exist_ok=True)

        # Explicit UTF-8: the texts contain non-ASCII characters, and the
        # platform default encoding is not UTF-8 everywhere.
        with open(f'./output/{result["id"]}.txt', 'w', encoding='utf-8') as f:
            f.write("\n".join(result["anond"]))

        with open(f'./output/{result["id"]}-original.txt', 'w', encoding='utf-8') as f:
            f.write("\n".join(result["original"]))

        return result

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller move on
        # to the next paper.
        print(f"- {path} | ERROR: {e}")
        return None

In [44]:
# Load the paper inventory that the cells below inspect and update in place.
df = pd.read_csv(filepath_or_buffer="dump/papers.csv")

In [52]:
# Papers to swap out, encoded as  "<journal key>": "<n>|<pdf stem>".
# The loop that consumes this dict keeps only the part of the key before "_"
# as the journal prefix, addresses the row whose id is "<prefix>_<n>", and
# points that row's file at "<prefix>. <journal>/<pdf stem>.pdf".
# A "_<suffix>" on the key (e.g. "P7_2") is therefore ignored by the loop;
# presumably it only keeps keys unique when one journal has several
# replacements -- confirm.
# NOTE(review): the commented-out entries look like replacements handled in
# earlier runs of this cell -- confirm before deleting them.
replacements = {
    # "56": "2|s11127-025-01270-9",
    # "76": "0|ejaf002",
    # "98": "9|economic-evaluation-under-ambiguity-and-structural-uncertainties",
    # "100": "4|s11156-024-01381-2",
    # "P4": "5|socsci-14-00189",
    # "P7": "7|commodities-03-00024-v2",
    "P7_2": "0|commodities-04-00003",
}

In [46]:
# Show the rows addressed by the replacement entries (active and commented-out).
df.loc[df['id'].isin(['56_2', '76_0', '98_9', '100_4', 'P4_5', 'P7_0', 'P7_7'])]

Unnamed: 0,id,file,name,journal,authors,affiliations,len-original,len-anond
552,56_2,56. Public Choice/s11127-024-01209-6.pdf,Correction to: Ruled by robots: preference for...,Public Choice,Marina Chugunova; Wolfgang J. Luhan,Max Planck Institute for Innovation and Compet...,408.0,319.0
750,76_0,76. Journal of African Economics/ejae024.pdf,Preface: African Economies: Recovery from Mult...,Journal of African Economics,Victor Murinde,African Economic Research Consortium,790.0,575.0
979,98_9,98. Journal of Benefit-Cost Analysis/introduct...,Introduction to the Special Issue,Journal of Benefit-Cost Analysis,Susan E. Dudley,"GW Regulatory Studies Center, George Washingto...",582.0,112.0
994,100_4,100. Review of Quant Finance and Accounting/s1...,Correction to: Social media disclosure and rep...,Review of Quant Finance and Accounting,Xing Huan; Antonio Parbonetti; Giulia Redigolo...,"EDHEC Business School, 393 Promenade Des Angla...",484.0,269.0
1035,P4_5,P4. Social Sciences/socsci-14-00050-v2.pdf,Digital Intervention for Advancing Social Work...,Social Sciences,Erica Russ; Elizabeth Claire Reimer,"Erica Russ, Social Work, Faculty of Health, So...",1167.0,904.0
1060,P7_0,P7. Commodities/commodities-03-00025.pdf,Expanding the Scope of Commodities to Reflect ...,Commodities,Jungho Baek,"Jungho Baek; Department of Economics, College ...",731.0,589.0
1067,P7_7,P7. Commodities/commodities-04-00001.pdf,Commodities: The Year 2024 in Retrospect,Commodities,Julien Chevallier,"Julien Chevallier; Economics Department (LED),...",1907.0,1753.0


In [53]:
# For every replacement entry: point the matching row of `df` at the new PDF,
# then re-run the anonymization pipeline on that file (which refreshes the
# row's name/authors/affiliations/length columns and rewrites the output files).
for journal_key, spec in replacements.items():
    # Only the part of the key before "_" identifies the journal.
    journal_prefix = journal_key.split("_")[0]
    paper_no, file_stem = spec.split("|")
    paper_id = f'{journal_prefix}_{paper_no}'

    matches = df.index[df['id'] == paper_id]
    if len(matches) == 0:
        raise LookupError(f"replacement {journal_key!r}: no row with id {paper_id!r} in df")
    # `row_label` is an index *label*, so it must be used with .loc, never
    # .iloc; the two only agree while the frame has a default RangeIndex.
    row_label = matches[0]

    new_file = f"{journal_prefix}. {df.loc[row_label, 'journal']}/{file_stem}.pdf"
    df.loc[row_label, 'file'] = new_file

    # anonymize_paper reports its own failures and returns None on error.
    anonymize_paper("./Journals/" + new_file, client, paper_id, df)

Processing ./Journals/P7. Commodities/commodities-04-00003.pdf
- ./Journals/P7. Commodities/commodities-04-00003.pdf | Metadata: {"title":"Secondhand Clothing in Global Commerce: Trade Patterns and Impact","authors":"Debanjan Das","affiliations":"Department of Fashion, Design & Merchandising, School of Art and Design, West Virginia University, 702C Allen Hall, 355 Oakland Street, Morgantown, WV 26506, USA; debanjan.das@mail.wvu.edu","publication":"Das, D. Secondhand Clothing in Global Commerce: Trade Patterns and Impact. Commodities 2025, 4, 3. https://doi.org/10.3390/commodities4010003","funding":"This research received no external funding."}
- ./Journals/P7. Commodities/commodities-04-00003.pdf | Anonymizing Paper
- ./Journals/P7. Commodities/commodities-04-00003.pdf | Anonymizing 1/20 [5 Attempt(s)]
- ./Journals/P7. Commodities/commodities-04-00003.pdf | Anonymizing 2/20 [5 Attempt(s)]
- ./Journals/P7. Commodities/commodities-04-00003.pdf | Anonymizing 3/20 [1 Attempt(s)]
- ./Journa

In [54]:
# Show the rows addressed by the replacement entries (active and commented-out).
df.loc[df['id'].isin(['56_2', '76_0', '98_9', '100_4', 'P4_5', 'P7_0', 'P7_7'])]

Unnamed: 0,id,file,name,journal,authors,affiliations,len-original,len-anond
552,56_2,56. Public Choice/s11127-025-01270-9.pdf,When efficient help is perceived as greed: exp...,Public Choice,Andrej Angelovski; Werner Güth; Simón Lodato; ...,"Andrej Angelovski, Department of Economics, Mi...",8678.0,7864.0
750,76_0,76. Journal of African Economics/ejaf002.pdf,Does Linkages with MNEs Improve Firms’ Technol...,Journal of African Economics,Yohannes Ayelea; Habtamu Edjigub; Admasu Marutac,"Yohannes Ayelea, Overseas Development Institut...",9663.0,9109.0
979,98_9,98. Journal of Benefit-Cost Analysis/economic-...,Economic Evaluation Under Ambiguity and Struct...,Journal of Benefit-Cost Analysis,Brendon P. Andrews,"Brendon P. Andrews; Department of Economics, U...",11212.0,10715.0
994,100_4,100. Review of Quant Finance and Accounting/s1...,Capital and liquidity creation: does the capit...,Review of Quant Finance and Accounting,Meng‑Wen Wu; Chung‑Hua Shen; Kuo‑Jui Huang; Yi...,"Meng‑Wen Wu, Department of Business Administra...",22154.0,20877.0
1035,P4_5,P4. Social Sciences/socsci-14-00189.pdf,Not All Migrants Are the Same: Decent Work and...,Social Sciences,Maha Yomn Sbaa; Simone Donati; Salvatore Zappalà,"Maha Yomn Sbaa, Department of Psychology “Renz...",13425.0,12987.0
1060,P7_0,P7. Commodities/commodities-04-00003.pdf,Secondhand Clothing in Global Commerce: Trade ...,Commodities,Debanjan Das,"Department of Fashion, Design & Merchandising,...",10072.0,9658.0
1067,P7_7,P7. Commodities/commodities-03-00024-v2.pdf,An Econometric and Time Series Analysis of the...,Commodities,Papa Ousseynou Diop,Papa Ousseynou Diop; Laboratoire d’Économie Di...,11708.0,11436.0


In [56]:
# Persist the updated inventory; the row index is not written to the file.
df.to_csv(path_or_buf='paper-cleanup.csv', index=False)