In [10]:
import openai
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

import functions.prompts as prompts
import functions.dupes as dupes
import functions.llm as llm
import functions.anonymize as anonymize
import functions.process as process
import importlib

# Re-import the local helper modules so edits made on disk are picked up
# without restarting the kernel.
for _module in (prompts, dupes, llm, anonymize, process):
    importlib.reload(_module)

# Shared OpenAI client handed to every processing call below.
client = openai.Client()

In [None]:
def anonymize_paper(path, client, id, df, danger_dupe=False):
    """Anonymize one paper, write its text outputs, and record it in ``df``.

    Steps: extract page texts from the PDF, fetch and validate the paper
    metadata, strip flagged duplicate segments, anonymize the remaining
    text, write the anonymized and original texts under ``./output``, and
    finally fill in the paper's row in ``df``.

    Args:
        path: Path to the PDF to process.
        client: Client object passed through to the ``process``, ``dupes``
            and ``anonymize`` helpers.
        id: Value of the ``id`` column identifying this paper's row in ``df``;
            also used as the output file stem.
        df: DataFrame that is updated in place (columns ``name``, ``authors``,
            ``affiliations``, ``len-original``, ``len-anond``).
        danger_dupe: When True, use ``dupes.get_flagged_duplicates_danger``
            instead of ``dupes.get_flagged_duplicates``.

    Returns:
        A dict with keys ``metadata`` (validated ``prompts.MetadataModel``),
        ``original``, ``anond`` and ``id``; or ``None`` if any step raised
        (the error is printed, not re-raised, so one bad paper does not
        abort a batch run).
    """
    try:
        print(f"Processing {path}")

        # Resolve the target row first so an unknown id fails with a clear
        # message before any expensive work is done.
        matching_rows = df.index[df['id'] == id].tolist()
        if not matching_rows:
            raise LookupError(f"no row with id {id!r} in df")
        idx = matching_rows[0]

        texts = process.extract_texts_from_pdf(path)
        metadata = process.get_paper_metadata(client, "\n".join(texts))
        print(f"- {path} | Metadata: {metadata}")

        # Validate right away: malformed metadata would otherwise only be
        # detected after the dedup and anonymization calls have already run.
        parsed_metadata = prompts.MetadataModel.model_validate_json(metadata)

        find_duplicates = (
            dupes.get_flagged_duplicates_danger
            if danger_dupe
            else dupes.get_flagged_duplicates
        )
        flagged_dupes = find_duplicates(texts, metadata, client)
        formatted_texts = process.remove_segments_from_texts(texts, flagged_dupes)

        print(f"- {path} | Anonymizing Paper")
        anond = anonymize.anonymized_texts(formatted_texts, metadata, client, path)

        result = {
            "metadata": parsed_metadata,
            "original": texts,
            "anond": process.remove_page_numbers(anond),
            "id": id
        }

        # Write the outputs BEFORE touching df: the fallback pass retries rows
        # whose 'name' is still null, so a row must only be filled in once its
        # files are actually on disk.
        # NOTE: files are written with the platform default text encoding.
        os.makedirs('./output', exist_ok=True)

        with open(f'./output/{result["id"]}.txt', 'w') as f:
            f.write("\n".join(result["anond"]))

        with open(f'./output/{result["id"]}-original.txt', 'w') as f:
            f.write("\n".join(result["original"]))

        updated_row = df.loc[idx].copy()

        updated_row['name'] = result["metadata"].title
        updated_row['authors'] = result["metadata"].authors
        updated_row['affiliations'] = result["metadata"].affiliations
        # Lengths are whitespace-delimited word counts over all pages.
        updated_row['len-original'] = len(" ".join(result["original"]).split())
        updated_row['len-anond'] = len(" ".join(result["anond"]).split())

        df.loc[idx] = updated_row

        print(f"- {path} | Done Processing!")

        return result

    except Exception as e:
        # Deliberate best-effort: report and return None so a batch keeps going.
        print(f"- {path} | ERROR: {e}")
        return None

In [5]:
def format_paper_names(name):
    """Split a journal folder name into its number, title and raw name.

    ``no`` is everything before the first ".", ``name`` is everything after
    the first space, and ``folder`` is the unmodified input.
    """
    number = name.partition(".")[0]
    title = name.partition(" ")[2]
    return dict(no=number, name=title, folder=name)

def _journal_rank(no):
    """Return the sort rank for a journal folder number, or None if malformed.

    Plain decimal numbers rank as themselves; "P<number>" folders rank as
    1000 + number so they sort after the plain ones. Anything else (hidden
    files, a bare "P", names like "Papers") yields None instead of making
    int() raise.
    """
    if no.isdecimal():
        return int(no)
    if no[:1] == "P" and no[1:].isdecimal():
        return 1000 + int(no[1:])
    return None


# Keep only entries of ./Journals whose prefix is a valid journal number,
# ordered by that number.
journals = [format_paper_names(name) for name in os.listdir('./Journals')]
journals = [journal for journal in journals if _journal_rank(journal["no"]) is not None]
journals = sorted(journals, key=lambda journal: _journal_rank(journal["no"]))

In [4]:
# Build the paper index: one row per file found in each journal folder.
# Rows are collected in a list and turned into a DataFrame once, instead of
# growing the frame with pd.concat on every paper.
paper_rows = []

for journal in journals:
    # os.listdir returns entries in arbitrary order; sort them so the
    # "<journal no>_<i>" ids are the same on every run and machine.
    papers = [{
        "id": f"{journal['no']}_{i}",
        "name": paper,
        "path": f"{journal['folder']}/{paper}"
    } for i, paper in enumerate(sorted(os.listdir(f'./Journals/{journal["folder"]}')))]

    # Report any folder that does not hold exactly 10 entries.
    if len(papers) != 10:
        print(f"Journal {journal['no']} {journal['name']} has {len(papers)} papers")

    # name / authors / affiliations / lengths stay None until the paper is processed.
    paper_rows.extend(
        [paper['id'], paper['path'], None, journal['name'], None, None, None, None]
        for paper in papers
    )

df = pd.DataFrame(
    paper_rows,
    columns=['id', 'file', 'name', 'journal', 'authors', 'affiliations', 'len-original', 'len-anond'],
)

In [None]:
# Main process: run anonymize_paper over every row of df on a thread pool.
# `client` and `df` are shared by all calls; results come back in row order.

with ThreadPoolExecutor(max_workers=1000) as executor:
    results = list(executor.map(
        lambda path, paper_id: anonymize_paper(path, client, paper_id, df),
        "./Journals/" + df["file"],
        df["id"],
    ))

In [7]:
# Reload the saved table and pick out the rows that still have no name.
df = pd.read_csv("checkpoint.csv")
fallback = df.loc[df['name'].isna()].copy()

In [None]:
# Fallback pass: retry every paper whose 'name' is still empty.
# Re-run this cell by hand until len(fallback) == 0 (a manual re-run was
# preferred over a loop here).

fallback = df.loc[df['name'].isna()].copy()

with ThreadPoolExecutor(max_workers=300) as executor:
    results = list(executor.map(
        lambda path, paper_id: anonymize_paper(path, client, paper_id, df),
        "./Journals/" + fallback["file"],
        fallback["id"],
    ))

# For some leftovers, change danger_dupe to True in the call above.

Processing ./Journals/8. Review of Financial Studies/hhae071.pdfProcessing ./Journals/31. Experimental Econ/s10683-024-09830-4.pdf

Processing ./Journals/81. Education Economics/Improving the teacher feedback process in primary education  evidence from randomized controlled trials in schools in rural China.pdf
Processing ./Journals/81. Education Economics/Explaining educational achievement among Indigenous individuals  how important are culture and language .pdf
Processing ./Journals/P9. Journal of Applied Econ and Business/110303.pdf
- ./Journals/P9. Journal of Applied Econ and Business/110303.pdf | Metadata: {"title":"The Determinants of Interest Rates in the Kingdom of Saudi Arabia: An ARDL Approach (1985-2020)","authors":"Arwa Althobaiti","affiliations":"Arwa Althobaiti; Department of Economics, School of Analytics and Finance, Southern Illinois University, Carbondale IL","publication":"Journal of Applied Economics and Business, 11(3), 46-58, September 2023, DOI: Not specified","fu

In [20]:
# Write the whole paper table to papers.csv, leaving out the index column.
df.to_csv(path_or_buf="papers.csv", index=False)

In [None]:
# Single paper clean up: set paper_id to the id of the paper to re-process.
# (Named paper_id rather than id so the builtin id() is not shadowed.)
paper_id = ''

matching_files = df.loc[df['id'] == paper_id, 'file']
if matching_files.empty:
    # Covers the empty placeholder as well as a mistyped id.
    print(f"No paper with id {paper_id!r} in df; nothing to re-process")
else:
    anonymize_paper("./Journals/" + matching_files.iloc[0], client, paper_id, df)

In [38]:
# Remove publisher-specific artifacts from the anonymized output files.
# Files whose name contains '-original' are left untouched.

ARTIFACTS = (
    "JEL Classification:",
    "A R T I C L E I N F O",  # Elsevier
    "A B S T R A C T",  # Elsevier
    'O R I G I N A L A R T I C L E',  # Wiley
    'K E Y W O R D S',  # Wiley
)

papers = [entry for entry in os.listdir('./output') if entry.endswith('.txt')]
anond = [entry for entry in papers if '-original' not in entry]

for paper in anond:
    target = f'./output/{paper}'

    with open(target, 'r') as f:
        content = f.read()

    # Apply the removals one after another, in the order listed above.
    for artifact in ARTIFACTS:
        content = content.replace(artifact, "")

    with open(target, 'w') as f:
        f.write(content)