In [None]:
from spacy import load
import pandas as pd
from enum import Enum

# Load the spaCy pipeline once at notebook level; resolve_entities() reuses
# this shared `nlp` object for every text it parses.
# NOTE(review): presumably the "en_core_web_trf" package must already be
# installed in the kernel's environment — confirm before a fresh run.
nlp = load("en_core_web_trf")


In [None]:
import re

def process_text(text: str) -> str:
    """Normalise an entity string by stripping common surface noise.

    The clean-up steps run in this order, each case-insensitively:

    1. drop a leading "the " or "an ";
    2. drop one leading and/or one trailing hyphen;
    3. drop a trailing possessive ("'s" or "’s");
    4. remove every period.

    Args:
        text: Raw entity text.

    Returns:
        The cleaned entity text (may be empty).
    """
    # Order matters: the possessive is checked before periods are removed,
    # so "Smith's." keeps its "'s" (it does not end in "'s" at that point).
    removal_patterns = (
        r"^(the|an) ",
        r"^-|-$",
        r"[’']s$",
        r"\.",
    )
    cleaned = text
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned, flags=re.IGNORECASE)
    return cleaned

def filter_entities(doc, entity_type: "str | list[str]") -> list[str]:
    """Collect the cleaned, de-duplicated texts of entities with given labels.

    Args:
        doc: Object exposing an ``ents`` iterable whose items have ``text``
            and ``label_`` attributes (here, the result of calling ``nlp``).
        entity_type: A single label (e.g. ``"ORG"``) or an iterable of labels
            (e.g. ``["GPE", "LOC"]``) to keep.

    Returns:
        Cleaned entity texts (via ``process_text``) with duplicates removed,
        in order of first appearance in ``doc.ents``.
    """
    # A bare string must be treated as ONE label. Using ``in`` directly on a
    # string is a substring test, so labels such as "" or "OR" would wrongly
    # match "ORG".
    if isinstance(entity_type, str):
        wanted_labels = {entity_type}
    else:
        wanted_labels = set(entity_type)

    cleaned = (
        process_text(ent.text) for ent in doc.ents if ent.label_ in wanted_labels
    )
    # dict.fromkeys de-duplicates while keeping first-seen order, which makes
    # the output reproducible across runs (set iteration order is not).
    return list(dict.fromkeys(cleaned))

def resolve_entities(text: str, prefix: str) -> dict:
    """Run the NLP pipeline on ``text`` and group its entities by kind.

    Args:
        text: Text to parse with the notebook-level ``nlp`` pipeline.
        prefix: String prepended to each output key.

    Returns:
        A dict with three keys, each mapping to the list returned by
        ``filter_entities``:
        ``<prefix>_geo`` (labels GPE and LOC), ``<prefix>_org`` (label ORG)
        and ``<prefix>_ppl`` (label PERSON).
    """
    parsed = nlp(text)  # parse once, then filter the same doc three times

    grouped = {}
    grouped[f"{prefix}_geo"] = filter_entities(parsed, ["GPE", "LOC"])
    grouped[f"{prefix}_org"] = filter_entities(parsed, "ORG")
    grouped[f"{prefix}_ppl"] = filter_entities(parsed, "PERSON")
    return grouped

def resolve_entities_multi(df: pd.DataFrame, cols: list[str]) -> pd.DataFrame:
    """Append entity columns for several text columns of ``df``.

    For every column name in ``cols``, each cell is passed to
    ``resolve_entities`` with the column name as ``prefix``; the keys of the
    returned dicts become new columns placed to the right of the original
    ones, in the order given by ``cols``.

    Args:
        df: Input frame; it is read, not modified.
        cols: Names of the text columns to process.

    Returns:
        A new frame holding the columns of ``df`` followed by the entity
        columns of each processed column.

    Raises:
        KeyError: If a name in ``cols`` is not a column of ``df``.
    """
    # Read from the ``df`` argument rather than a notebook-level global, so
    # the function works on whichever frame the caller passes in.
    entity_frames = [
        pd.DataFrame(
            [resolve_entities(text, prefix=col) for text in df[col]],
            index=df.index,  # keep rows aligned with ``df`` for the concat
        )
        for col in cols
    ]
    # Concatenate once instead of re-copying the growing frame per column.
    return pd.concat([df, *entity_frames], axis=1)
    

# Load the input table. keep_default_na=False stops pandas from turning
# empty cells into NaN, so blank fields are read as empty strings.
test = pd.read_csv("output.csv", keep_default_na=False)

# Append entity columns derived from the "title" and "abstract" columns.
results = resolve_entities_multi(test, cols=["title", "abstract"])

In [105]:
# Plain-text dump of the enriched frame (original columns plus entity columns).
print(results)

                              oa_id  \
0  https://openalex.org/W1520259670   
1  https://openalex.org/W3117619809   
2  https://openalex.org/W1487766560   
3  https://openalex.org/W4213049131   
4   https://openalex.org/W578223960   
5  https://openalex.org/W2483500851   
6  https://openalex.org/W2295723959   
7  https://openalex.org/W1989753893   
8  https://openalex.org/W3175380181   
9  https://openalex.org/W2941382029   

                                                 doi  \
0        https://doi.org/10.1093/0199291926.001.0001   
1                                                      
2             https://doi.org/10.5860/choice.52-0562   
3  https://doi.org/10.1093/oso/9780197508893.001....   
4                                                      
5        https://doi.org/10.1093/0199291926.003.0013   
6      https://doi.org/10.1080/01419870.2016.1155722   
7      https://doi.org/10.1016/s1090-9524(01)00023-7   
8          https://doi.org/10.1136/bmjgh-2021-006504   
9         