In [23]:
import openai
import os
import pandas as pd

import functions.prompts as prompts
import functions.dupes as dupes
import functions.llm as llm
import functions.anonymize as anonymize
import functions.process as process
import importlib

importlib.reload(prompts)
importlib.reload(dupes)
importlib.reload(llm)
importlib.reload(anonymize)
importlib.reload(process)

client = openai.Client()

In [24]:
def anonymize_paper(path, client, id):
    try:
        print(f"Processing {path}")

        texts = process.extract_texts_from_pdf(path)
        metadata = process.get_paper_metadata(client, "\n".join(texts))
        print(f"- {path} | Metadata: {metadata}")

        flagged_dupes = dupes.get_flagged_duplicates(texts, metadata, client)
        formatted_texts = process.remove_segments_from_texts(texts, flagged_dupes)

        print(f"- {path} | Anonymizing Paper")
        anond = anonymize.anonymized_texts(formatted_texts, metadata, client, path)

        print(f"- {path} | Done Processing!")

        return {
            "metadata": prompts.MetadataModel.model_validate_json(metadata),
            "original": texts,
            "anond": process.remove_page_numbers(anond),
            "id": id
        }

    except Exception as e:
        print(f"- {path} | ERROR: {e}")
        return None

In [25]:
def format_paper_names(name):
    return {
        "no": name.split(".")[0],
        "name": " ".join(name.split(" ")[1:]),
        "folder": name
    }

journals = [ format_paper_names(name) for name in os.listdir('./Journals') ]
journals = [ name for name in journals if len(name["no"]) > 0 and (name['no'].isnumeric() or name['no'][0] == 'P') ]
journals = sorted(journals, key=lambda k: int(k['no']) if k['no'].isnumeric() else 1000 + int(k['no'][1:]))

In [26]:
df = pd.DataFrame(columns=['id', 'file', 'name', 'journal', 'authors', 'affiliations', 'len-original', 'len-anond'])

for journal in journals:
    papers = [{
        "id": f"{journal['no']}_{i}",
        "name": paper,
        "path": f"{journal['folder']}/{paper}"
    } for i, paper in enumerate(os.listdir(f'./Journals/{journal["folder"]}'))]
    
    if(len(papers) != 10):
        print(f"Journal {journal['no']} {journal['name']} has {len(papers)} papers")
    
    for paper in papers:
        df = pd.concat([df, pd.DataFrame([[paper['id'], paper['path'], journal['name'], None, None, None, None, None]], columns=df.columns)], ignore_index=True)

In [27]:
df

Unnamed: 0,id,file,name,journal,authors,affiliations,len-original,len-anond
0,1_0,1. Econometrica/ecta200736.pdf,Econometrica,,,,,
1,1_1,1. Econometrica/ecta200731.pdf,Econometrica,,,,,
2,1_2,1. Econometrica/ecta200725.pdf,Econometrica,,,,,
3,1_3,1. Econometrica/ecta200741.pdf,Econometrica,,,,,
4,1_4,1. Econometrica/Econometrica - 2025 - Berger -...,Econometrica,,,,,
...,...,...,...,...,...,...,...,...
1095,P10_5,P10. Business and Econ Journal/the-impact-of-a...,Business and Econ Journal,,,,,
1096,P10_6,P10. Business and Econ Journal/effect-of-workl...,Business and Econ Journal,,,,,
1097,P10_7,P10. Business and Econ Journal/an-exploratory-...,Business and Econ Journal,,,,,
1098,P10_8,P10. Business and Econ Journal/centre-peripher...,Business and Econ Journal,,,,,


In [15]:
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=10) as executor:
    results = list(executor.map(anonymize_paper, "./Journals/" + df["file"], [client] * len(df["file"]), df["id"]))
    
    for i, result in enumerate(results):
        try:
            if result is not None:
                print(result)

                df.at[i, "name"] = result["metadata"].title
                df.at[i, "authors"] = result["metadata"].authors
                df.at[i, "affiliations"] = result["metadata"].affiliations

                df.at[i, "len-original"] = len(" ".join(result["original"]).split())
                df.at[i, "len-anond"] = len(" ".join(result["anond"]).split())

                with open(f'./output/{result["id"]}.txt', 'w') as f:
                    f.write("\n".join(result["anond"]))

                with open(f'./output/{result["id"]}-original.txt', 'w') as f:
                    f.write("\n".join(result["original"]))

        except Exception as e:
            print(f"ERROR!! Failed to process {df.at[i, 'file']} -> {e}")

Processing ./Journals/1. Econometrica/ecta200736.pdf
- ./Journals/1. Econometrica/ecta200736.pdf | Metadata: {"title":"The Political Economy of Zero-Sum Thinking","authors":"S. Nageeb Ali; Maximilian Mihm; Lucas Siga","affiliations":"Department of Economics, Pennsylvania State University; Division of Social Science, New York University Abu Dhabi; Department of Economics, University of Essex","publication":"Econometrica, Vol. 93, No. 1 (January, 2025), 41–70. DOI: Not specified. ISSN: Not specified.","funding":"Not specified"}
- ./Journals/1. Econometrica/ecta200736.pdf | Anonymizing Paper
- ./Journals/1. Econometrica/ecta200736.pdf | Anonymizing 1/31 [4 Attempt(s)]
- ./Journals/1. Econometrica/ecta200736.pdf | Anonymizing 2/31 [1 Attempt(s)]
- ./Journals/1. Econometrica/ecta200736.pdf | Anonymizing 3/31 [1 Attempt(s)]
- ./Journals/1. Econometrica/ecta200736.pdf | Anonymizing 4/31 [1 Attempt(s)]
- ./Journals/1. Econometrica/ecta200736.pdf | Anonymizing 5/31 [2 Attempt(s)]
- ./Journals/1

In [16]:
df

Unnamed: 0,id,file,name,journal,authors,affiliations,len-original,len-anond
0,1_0,1. Econometrica/ecta200736.pdf,The Political Economy of Zero-Sum Thinking,,S. Nageeb Ali; Maximilian Mihm; Lucas Siga,"Department of Economics, Pennsylvania State Un...",16496,16008
