In [None]:
import importlib
import os

import openai
import pandas as pd

import functions.prompts as prompts
import functions.dupes as dupes
import functions.llm as llm
import functions.anonymize as anonymize
import functions.process as process

# Reload every local helper module so that edits made to the files on disk
# take effect in the running kernel without a restart.
for _module in (prompts, dupes, llm, anonymize, process):
    importlib.reload(_module)

# Single OpenAI client, built with the library defaults (no explicit
# arguments), that the cells below pass into the helper functions.
client = openai.Client()


In [None]:
def anonymize_paper(path, client, id, df, danger_dupe=False):
    """Anonymize one paper PDF, record its stats in ``df`` and save the texts.

    Steps, in order: verify that ``df`` has a row for ``id``; extract the
    texts from the PDF; obtain metadata via ``process.get_paper_metadata``;
    remove the segments flagged by the ``dupes`` helper; run
    ``anonymize.anonymized_texts`` on what is left; update the row of ``df``;
    write both the anonymized and the original text to ``./output``.

    Args:
        path: Location of the PDF to process.
        client: Client object handed through to the ``process``, ``dupes``
            and ``anonymize`` helpers.
        id: Value of the ``id`` column identifying the row of ``df`` to
            update; also the stem of the output file names. (The name shadows
            the builtin but is part of the call interface, so it is kept.)
        df: DataFrame that is updated **in place**. The first row whose
            ``id`` matches gets ``name``, ``authors``, ``affiliations``,
            ``len-original`` and ``len-anond`` (whitespace-separated word
            counts of the original and anonymized text).
        danger_dupe: When true, ``dupes.get_flagged_duplicates_danger`` is
            used instead of ``dupes.get_flagged_duplicates``.

    Returns:
        A dict with keys ``metadata`` (a validated ``prompts.MetadataModel``),
        ``original`` (the extracted texts), ``anond`` (the anonymized texts
        after ``process.remove_page_numbers``) and ``id``. ``None`` is
        returned if any step raised; the error is printed, not propagated.

    Side effects:
        Creates ``./output`` if needed and writes ``./output/<id>.txt`` and
        ``./output/<id>-original.txt`` as UTF-8.
    """
    try:
        print(f"Processing {path}")

        # Fail fast: resolve the target row before any expensive extraction
        # or LLM call, so a wrong id does not waste a full processing run.
        matching_rows = df.index[df['id'] == id]
        if len(matching_rows) == 0:
            raise LookupError(f"no row with id {id!r} in df")
        idx = matching_rows[0]

        # Make sure the results can actually be saved once they exist.
        os.makedirs("./output", exist_ok=True)

        texts = process.extract_texts_from_pdf(path)
        metadata = process.get_paper_metadata(client, "\n".join(texts))
        print(f"- {path} | Metadata: {metadata}")

        if danger_dupe:
            flagged_dupes = dupes.get_flagged_duplicates_danger(texts, metadata, client)
        else:
            flagged_dupes = dupes.get_flagged_duplicates(texts, metadata, client)
        formatted_texts = process.remove_segments_from_texts(texts, flagged_dupes)

        print(f"- {path} | Anonymizing Paper")
        anond = anonymize.anonymized_texts(formatted_texts, metadata, client, path)

        print(f"- {path} | Done Processing!")

        # `metadata` is fed to model_validate_json, so it is presumably a
        # JSON string produced by get_paper_metadata -- TODO confirm.
        parsed_metadata = prompts.MetadataModel.model_validate_json(metadata)
        result = {
            "metadata": parsed_metadata,
            "original": texts,
            "anond": process.remove_page_numbers(anond),
            "id": id
        }

        # The row is copied, edited and written back as a whole, as before.
        # NOTE(review): authors/affiliations may be list-valued (MetadataModel
        # is not visible here), which a scalar `.loc[idx, col] = ...` write
        # would reject -- confirm before simplifying this.
        updated_row = df.loc[idx].copy()
        updated_row['name'] = parsed_metadata.title
        updated_row['authors'] = parsed_metadata.authors
        updated_row['affiliations'] = parsed_metadata.affiliations
        updated_row['len-original'] = len(" ".join(result["original"]).split())
        updated_row['len-anond'] = len(" ".join(result["anond"]).split())
        df.loc[idx] = updated_row

        # Explicit UTF-8: paper text routinely contains non-ASCII characters
        # and the platform default encoding is not guaranteed to handle them.
        with open(f'./output/{result["id"]}.txt', 'w', encoding='utf-8') as f:
            f.write("\n".join(result["anond"]))

        with open(f'./output/{result["id"]}-original.txt', 'w', encoding='utf-8') as f:
            f.write("\n".join(result["original"]))

        return result

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller
        # carry on with the next paper.
        print(f"- {path} | ERROR: {e}")
        return None


In [None]:
# Load the working table of papers. Later cells look rows up by the `id`
# column, read `journal` and `file`, and update the frame in place.
df = pd.read_csv(filepath_or_buffer="papers-temp.csv")

In [None]:
# PDF replacements to apply, consumed by the loop cell further down.
#
#   key   : only the text before the first "_" is used; it is the leading part
#           of the row id and of the journal folder name ("<key>. <journal>").
#           Anything after the "_" is ignored by the loop.
#   value : "<id suffix>|<PDF file stem>" -- the row id is "<key>_<id suffix>"
#           and the new file is "<key>. <journal>/<PDF file stem>.pdf".
#
# Entries that are commented out, i.e. not applied on this run:
#   "56":  "2|s11127-025-01270-9"
#   "76":  "0|ejaf002"
#   "98":  "9|economic-evaluation-under-ambiguity-and-structural-uncertainties"
#   "100": "4|s11156-024-01381-2"
#   "P4":  "5|socsci-14-00189"
#   "P7":  "7|commodities-03-00024-v2"
replacements = {"P7_2": "0|commodities-04-00003"}

In [None]:
# df[df['id'].isin(['56_2', '76_0', '98_9', '100_4', 'P4_5', 'P7_0', 'P7_7'])]

In [None]:
# Apply every pending replacement: point the matching row at its new PDF,
# then re-run the anonymization pipeline for that paper.
for key, spec in replacements.items():
    journal_key = key.split("_")[0]        # text after the first "_" is ignored
    id_suffix, pdf_stem = spec.split("|")  # "<id suffix>|<PDF file stem>"
    paper_id = f'{journal_key}_{id_suffix}'

    matches = df.index[df['id'] == paper_id]
    if len(matches) == 0:
        raise LookupError(f"No row with id {paper_id!r} in df")
    # `matches` holds index *labels*, so every access below goes through .loc
    # (not .iloc); this stays correct even if df is not on a default RangeIndex.
    idx = matches[0]

    new_file = f"{journal_key}. {df.loc[idx, 'journal']}/{pdf_stem}.pdf"
    df.loc[idx, 'file'] = new_file

    # anonymize_paper reports failures itself (prints and returns None), so
    # one bad paper does not stop the remaining replacements.
    anonymize_paper("./Journals/" + new_file, client, paper_id, df)

In [None]:
# df[df['id'].isin(['56_2', '76_0', '98_9', '100_4', 'P4_5', 'P7_0', 'P7_7'])]

In [None]:
# Save the updated table without the index column. The target (paper.csv) is
# a different file from the one that was loaded (papers-temp.csv).
df.to_csv(path_or_buf='paper.csv', index=False)