In [170]:
import re
import pandas as pd
import json

def retrieve_json(path):
    """
    Load and return the JSON document stored at ``path``.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    The deserialized JSON content (whatever ``json.load`` yields for the file).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # The context manager guarantees the handle is closed, even when parsing fails.
    with open(path) as json_file:
        return json.load(json_file)

def get_content_by_person(knowledge_graph, category, file_name):
    """
    Return the knowledge-graph bindings of the person a biography file is about.

    Parameters
    ----------
    knowledge_graph : dict
        Maps a category name to a list of entries; each entry is read through
        ``entry['head']['person']`` and ``entry['results']['bindings']``.
    category : str
        Key of ``knowledge_graph`` to search in.
    file_name : str
        Path of the form ``Biographies_<Category><sep><Person>_<...>.txt`` where
        ``<sep>`` is a backslash or a forward slash.

    Returns
    -------
    The ``['results']['bindings']`` value of the first entry whose
    ``head.person`` equals the person name taken from ``file_name``, or ``None``
    when the file name does not follow the expected pattern, the category is
    absent, or no entry matches.
    """
    # Capture the shortest text between the directory separator and the next
    # underscore. Both '\' and '/' are accepted so paths built on any OS work.
    name_match = re.match(r"Biographies_[A-Za-z]+[\\/](.+?)_.*\.txt", file_name)
    if name_match is None:
        # A file name we cannot parse identifies no person, hence no graph.
        return None
    person_name = name_match.group(1)

    for item in knowledge_graph.get(category, []):
        if item.get('head', {}).get('person') == person_name:
            return item['results']['bindings']
    return None

def concatenate_s_o_values(graph_person):
    """
    Flatten a person's knowledge graph into one searchable string.

    For every fact, the subject value (``fact['s']['value']``) and then the
    object value (``fact['o']['value']``) are converted to ``str`` and joined
    with single spaces, in graph order. Falsy values are left out.
    """
    # Properties are skipped on purpose: named entities are assumed to show up
    # only as subjects or objects.
    pieces = [
        str(value)
        for triple in graph_person
        for value in (triple['s']['value'], triple['o']['value'])
        if value
    ]
    return ' '.join(pieces)

def ne_against_kg_graph(df_docrow, graph_person, print_all = True):
    """
    Compare the named entities predicted for one biography with the person's knowledge graph.

    (a) For each package (Stanza and Spacy), optionally print how many predicted
        NEs are mentioned in the KG graph collected for the person, along with
        the lists of NEs found and not found.
    (b) Return, for each package, the ratio of predicted NEs found in the graph.

    An entity counts as found when its text occurs as a substring of the
    space-joined subject/object values of the graph.

    Parameters
    ----------
    df_docrow : mapping (e.g. a dataframe row)
        Must provide ``'file_name'``, ``'ents_stanza'`` and ``'ents_spacy'``.
        The first element of every item in the two entity sequences is used as
        the entity text.
    graph_person : list
        Facts of the person's knowledge graph, as consumed by
        ``concatenate_s_o_values``.
    print_all : bool, default True
        Print the detailed report. Pass False when calling in a loop.

    Returns
    -------
    tuple of float
        ``(ratio_stanza, ratio_spacy)``, each between 0 and 1.

    Raises
    ------
    ValueError
        If the file name does not follow the expected pattern, if
        ``graph_person`` is None (no graph for that person), or if one of the
        packages predicted no entity (the ratio would be undefined).
    """
    # Both '\' and '/' are accepted as directory separator.
    name_match = re.match(r"Biographies_[A-Za-z]+[\\/](.+?)_.*\.txt", df_docrow["file_name"])
    if name_match is None:
        raise ValueError(f"cannot extract a person name from file name {df_docrow['file_name']!r}")
    person_name = name_match.group(1)

    if graph_person is None:
        raise ValueError(f"no knowledge graph available for {person_name}")

    ents_stanza = [ent[0] for ent in df_docrow['ents_stanza']]
    ents_spacy = [ent[0] for ent in df_docrow['ents_spacy']]
    # Fail early with a clear message rather than with a ZeroDivisionError at the very end.
    if not ents_stanza or not ents_spacy:
        raise ValueError(f"no named entities predicted by Stanza and/or Spacy for {person_name}")

    #convert the subjects and objects to a single string in which we will be able to search for the Named Entities easily
    graph_string = concatenate_s_o_values(graph_person)

    found_stanza, not_found_stanza = _split_entities_by_presence(ents_stanza, graph_string)
    found_spacy, not_found_spacy = _split_entities_by_presence(ents_spacy, graph_string)

    if print_all:
        #print out the number of NE and the list of NE predicted by Spacy and Stanza which are mentioned in the kg graph, only if print_all = True to avoid printing all these information when the function is run in a loop
        print(f'For {person_name} :\n')
        print(f'Using Stanza:\n\nThere are {len(found_stanza)} predicted Named Entities FOUND in the knowledge graph:\n{found_stanza}\n\nThere are {len(not_found_stanza)} predicted Named Entities NOT FOUND in the knowledge graph:\n{not_found_stanza}.\n\n')
        print('--------------------------------------------------------------------------------------------------------------------------------')
        print(f'Using Spacy:\n\nThere are {len(found_spacy)} predicted Named Entities FOUND in the knowledge graph:\n{found_spacy}\n\nThere are {len(not_found_spacy)} predicted Named Entities NOT FOUND in the knowledge graph:\n{not_found_spacy}.')

    #return the ratio of predicted NE for each package
    return len(found_stanza)/ len(ents_stanza), len(found_spacy)/len(ents_spacy)


def _split_entities_by_presence(entities, graph_string):
    """Split ``entities`` into (found, not_found) lists, keeping order and duplicates."""
    found, not_found = [], []
    for entity in entities:
        # Literal substring test: same outcome as re.search(re.escape(entity), graph_string).
        if entity in graph_string:
            found.append(entity)
        else:
            not_found.append(entity)
    return found, not_found

In [171]:
#retrieve dataframes with Named entities and json graphs
# NOTE(review): unpickling can execute arbitrary code; only load .pkl files produced by this project.
df_journalists = pd.read_pickle('ents_journalists.pkl')
df_sculptors = pd.read_pickle('ents_sculptors.pkl')
# Forward slash on purpose: '\k' is an invalid escape sequence in a normal string literal,
# and a backslash-separated path only resolves on Windows.
knowledge_graph = retrieve_json('Biographies/knowledge_graph.json')


In [172]:
# Spot check on a single journalist: 4th row of the dataframe (positional index 3)
journalist_row = df_journalists.iloc[3]
journalist_graph = get_content_by_person(knowledge_graph, 'Journalists', journalist_row['file_name'])
# print_all is left at its default, so the detailed found / not-found report is printed
ratio_stanza, ratio_spacy = ne_against_kg_graph(journalist_row, journalist_graph)
print(f'\n{format(ratio_stanza*100, ".2f")}% of the named entities predicted by Stanza were found in the knowledge graph')
print(f'\n{format(ratio_spacy*100, ".2f")}% of the named entities predicted by Spacy were found in the knowledge graph')

For AlbertoCañasEscalante :

Using Stanza:

There are 48 predicted Named Entities FOUND in the knowledge graph:
['Alberto Cañas Escalante', '16 March 1920', '14 June 2014', 'San José', 'Costa Rica', 'Costa Rica', 'the latter half of the twentieth century', 'The National Library System', 'Costa Rica', 'Cañas', 'more than 4,773', '2005', '1962', '1966', '1994', '1998', 'The Nation', 'Cañas', 'San José', 'San José', 'Cañas', 'Cañas', 'Cañas', 'San José', 'Cañas', 'Cañas', 'Cañas', 'San José', '1962', '1966', 'Cañas', 'Cañas', 'Cañas', 'Cañas', 'Cañas', 'Cañas', 'one', '14 June 2014', 'Cañas', '94', 'Cañas', 'Cañas', 'Costa Rica', 'San José', 'Costa Rica', 'Costa Rica', 'Costa Rica', 'Alberto Cañas']

There are 146 predicted Named Entities NOT FOUND in the knowledge graph:
['International Relations', '1955', '1956', '1956', '1958', 'two', 'first', 'Sports', '1970', 'Diario de Costa Rica', 'Costa Rica Daily', 'La República', 'The Republic', 'Excelsior', 'Excelsior', 'La Nación', 'La Prensa 

In [173]:
# Spot check on a single sculptor: 125th row of the dataframe (positional index 124)
sculptor_row = df_sculptors.iloc[124]
sculptor_graph = get_content_by_person(knowledge_graph, 'Sculptors', sculptor_row['file_name'])
# print_all is left at its default, so the detailed found / not-found report is printed
ratio_stanza, ratio_spacy = ne_against_kg_graph(sculptor_row, sculptor_graph)
print(f'\n{format(ratio_stanza*100, ".2f")}% of the named entities predicted by Stanza were found in the knowledge graph')
print(f'\n{format(ratio_spacy*100, ".2f")}% of the named entities predicted by Spacy were found in the knowledge graph')

For VincentasJakševičius :

Using Stanza:

There are 36 predicted Named Entities FOUND in the knowledge graph:
['Vincentas Jakševičius', '1873', 'Naujamiestis', 'Russian Empire', 'July 19, 1936', 'Kaišiadorys, Lithuania', 'Lithuanian', 'Kaišiadorys', 'Lithuania', 'Panevėžys', 'Jakševičius', 'Russia', 'Vilnius', 'Jakševičius', 'Švėkšna Manor', 'Jakševičius', 'Benjaminas', 'Silvanas', 'Adomas', '1908', 'Jakševičius', 'Veiviržėnai', 'Ablinga', 'Gardamas', 'Dembava', 'Inkakliai', 'Kaišiadorys Cathedral', 'Jakševičius', 'July 19, 1936', 'Kaišiadorys Cemetery', 'Panevėžys', 'Švėkšna Manor Park', 'Naujamiestis', '1928', 'Kaišiadorys Cathedral', '1936']

There are 43 predicted Named Entities NOT FOUND in the knowledge graph:
['Biography\nVincentas', 'Aleksandras Jakševičius', 'Marija Guzėnaitė-Jakševičienė', 'three', "Jakševičius'", '1887', 'three', "Vincentas'", '1897', 'seven', 'three', "Jakševičius'", 'Tsar Alexander III', 'Saint Petersburg', '1920', 'Plater', 'first', '1900', '1933', '1967

In [174]:
from statistics import mean

#mean ratio by category by package

def _collect_ratios(df, category):
    """
    Compute the per-person (Stanza, Spacy) ratios for every row of ``df``.

    Rows for which the ratios cannot be computed are skipped and counted
    instead of aborting the whole run.

    Returns (stanza_ratios, spacy_ratios, skipped_count).
    """
    stanza_ratios, spacy_ratios, skipped = [], [], 0
    for _, row in df.iterrows():
        try:
            graph_person = get_content_by_person(knowledge_graph, category, row['file_name'])
            ratio_stanza, ratio_spacy = ne_against_kg_graph(row, graph_person, print_all = False)
        except Exception:
            # Best effort: one bad row must not stop the aggregation. Unlike a bare
            # `except:`, KeyboardInterrupt and SystemExit still propagate.
            skipped += 1
            continue
        stanza_ratios.append(ratio_stanza)
        spacy_ratios.append(ratio_spacy)
    return stanza_ratios, spacy_ratios, skipped


def _print_mean_ratio(ratios, package):
    """Print the mean of ``ratios`` as a percentage for ``package``, or a notice when there is none."""
    if not ratios:
        # statistics.mean raises StatisticsError on empty data.
        print(f'\nNo ratio could be computed for {package}')
        return
    print(f'\nOn  average, {format(mean(ratios)*100, ".2f")}% of the named entities predicted by {package} for a person were found in the person\'s knowledge graph')


ratio_stanza_j, ratio_spacy_j, skipped_j = _collect_ratios(df_journalists, 'Journalists')

print('For the Journalists :\n')
if skipped_j:
    print(f'({skipped_j} biographies skipped because their ratios could not be computed)')
_print_mean_ratio(ratio_stanza_j, 'Stanza')
_print_mean_ratio(ratio_spacy_j, 'Spacy')

ratio_stanza_s, ratio_spacy_s, skipped_s = _collect_ratios(df_sculptors, 'Sculptors')

print('\n\nFor the Sculptors :')
if skipped_s:
    print(f'({skipped_s} biographies skipped because their ratios could not be computed)')
_print_mean_ratio(ratio_stanza_s, 'Stanza')
_print_mean_ratio(ratio_spacy_s, 'Spacy')

For the Journalists :


On  average, 49.74% of the named entities predicted by Stanza for a person were found in the person's knowledge graph

On  average, 49.60% of the named entities predicted by Stanza for a person were found in the person's knowledge graph


For the Sculptors :

On  average, 45.19% of the named entities predicted by Stanza for a person were found in the person's knowledge graph

On  average, 46.23% of the named entities predicted by Stanza for a person were found in the person's knowledge graph
