In [10]:
### all together ########################
import glob
import os

import pandas as pd
import spacy
import stanza.pipeline

# Download the English Stanza models, then build both NER pipelines once.
stanza.download('en')
# Only tokenisation and NER are requested from Stanza; no other processors are loaded.
nlp_stanza = stanza.Pipeline('en', processors='tokenize,ner')
nlp_spacy = spacy.load('en_core_web_sm')

path = './Biographies/*'
# Every entry matched by `path` is searched for .txt files; the results are
# flattened into a single list of file paths.
files = [txt_path
         for category_dir in glob.glob(path)
         for txt_path in glob.glob(category_dir + '/*.txt')]
files = files[:3]  # comment this line out to process every biography

data = []
for file in files:
    # basename() copes with whatever separator glob returned on this platform.
    filename = os.path.basename(file)
    print(file)
    # Explicit encoding: do not depend on the platform's default codec.
    with open(file, 'r', encoding='utf-8') as f:
        content = f.read()

    # The file is already closed here; the (slow) NLP passes do not need the handle.
    doc_stanza = nlp_stanza(content)
    ents_stanza = [(ent.text, ent.type) for ent in doc_stanza.ents]

    doc_spacy = nlp_spacy(content)
    ents_spacy = [(ent.text, ent.label_) for ent in doc_spacy.ents]

    # One record per biography: file name plus the (text, type) pairs of each package.
    data.append({'file_name': filename,
                 'ents_stanza': ents_stanza,
                 'ents_spacy': ents_spacy})

df = pd.DataFrame(data)
df.head()

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 5.05MB/s]
2024-06-12 11:23:02 INFO: Downloaded file to /Users/abigail.berthe/stanza_resources/resources.json
2024-06-12 11:23:02 INFO: Downloading default packages for language: en (English) ...
2024-06-12 11:23:04 INFO: File exists: /Users/abigail.berthe/stanza_resources/en/default.zip
2024-06-12 11:23:17 INFO: Finished downloading models and saved to /Users/abigail.berthe/stanza_resources
2024-06-12 11:23:17 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 23.9MB/s]
2024-06-12 11:23:17 INFO: Downloaded file to /Users/abigail.berthe/stanza_resources/resources.json
2024-06-12 11:23:18 INFO: Loading these models for 

./Biographies/Biographies_Journalists/SérgioPereiraCouto_Journalists.txt
./Biographies/Biographies_Journalists/Gamalal-Ghitani_Journalists.txt
./Biographies/Biographies_Journalists/J.R.RalphCasimir_Journalists.txt


AttributeError: module 'pandas' has no attribute 'loc'

In [124]:
import numpy as np
"""
• Write a function that checks one document (i.e. a biography) for
the following:
(a) the number of spans (i.e. token(s)) where both packages agree and predict is an NE (i.e. complete overlap in span predicted).
(b) the number of spans where there is a partial agreement between both packages (i.e. partial overlap in spans predicted).
(c) for each package, the number of spans that a package predicted as an NE, but the other package did not predict as an NE.
(d) for the spans with full and partial agreement, was there an agreement in the NE type (e.g. Person, Location, Organisation etc)
(e) Use visualisation to compare the above statistics, per category per package (i.e. Spacy vs Stanza)
"""

def complete_overlap(df_docrow):
    """
    (a) Spans on which both packages agree exactly (complete overlap in the predicted span).

    Spans are compared by their text, because the frame only stores
    (text, type) tuples and no character offsets.

    Parameters
    ----------
    df_docrow : pandas.DataFrame
        Frame whose first row is the document to analyse, with the columns
        'ents_stanza' and 'ents_spacy', each holding a list of (text, type) tuples.

    Returns
    -------
    list of str
        Every Stanza entity text that spaCy predicted as well, in Stanza order.
        A text predicted several times by Stanza appears several times.
    """
    # Positional access (.iloc): a row filtered out of a larger frame keeps its
    # original index label, so the label-based `[0]` only worked for the document
    # that happened to sit at index 0.
    stanza_texts = [ent[0] for ent in df_docrow['ents_stanza'].iloc[0]]
    # A set gives constant-time membership tests.
    spacy_texts = {ent[0] for ent in df_docrow['ents_spacy'].iloc[0]}
    return [text for text in stanza_texts if text in spacy_texts]

def partial_overlap(df_docrow):
    """
    (b) Spans on which the two packages agree only partially.

    Strictly partial: texts predicted identically by both packages are removed.
    A span counts as a partial match when its text occurs inside the
    space-joined entity texts of the other package. The comparison is purely
    textual because the frame stores no character offsets.

    Parameters
    ----------
    df_docrow : pandas.DataFrame
        Frame whose first row is the document to analyse, with the columns
        'ents_stanza' and 'ents_spacy', each holding a list of (text, type) tuples.

    Returns
    -------
    list of str
        The partially matching Stanza texts followed by the partially matching
        spaCy texts.
    """
    # Positional access (.iloc): a filtered row keeps its original index label,
    # so the label-based `[0]` failed for every document but the first.
    stanza_texts = [ent[0] for ent in df_docrow['ents_stanza'].iloc[0]]
    spacy_texts = [ent[0] for ent in df_docrow['ents_spacy'].iloc[0]]

    # Texts predicted by both packages are complete overlaps and must be excluded.
    complete = set(stanza_texts) & set(spacy_texts)

    # Loop-invariant: build each joined string once instead of once per entity.
    joined_spacy = " ".join(spacy_texts)
    joined_stanza = " ".join(stanza_texts)

    ents_both_partial = [text for text in stanza_texts
                         if text in joined_spacy and text not in complete]
    # A spaCy text can never already be in the Stanza part of the result (it would
    # be a complete overlap), so no extra de-duplication check is required here.
    ents_both_partial.extend(text for text in spacy_texts
                             if text in joined_stanza and text not in complete)
    return ents_both_partial

def one_but_not_the_other(df_docrow):
    """
    (c) Spans that one package predicted as an NE but the other one did not.

    Parameters
    ----------
    df_docrow : pandas.DataFrame
        Frame whose first row is the document to analyse, with the columns
        'ents_stanza' and 'ents_spacy', each holding a list of (text, type) tuples.

    Returns
    -------
    tuple of (list of str, list of str)
        ``only_in_one_complete``: all entity texts (Stanza first, then spaCy)
        that are not a complete overlap.
        ``only_in_one_partial``: the same list with the partial overlaps removed
        as well, i.e. texts with no counterpart at all in the other package.
    """
    # Positional access (.iloc): a filtered row keeps its original index label,
    # so the label-based `[0]` failed for every document but the first.
    stanza_texts = [ent[0] for ent in df_docrow['ents_stanza'].iloc[0]]
    spacy_texts = [ent[0] for ent in df_docrow['ents_spacy'].iloc[0]]

    # Sets make the exclusion tests below constant-time.
    complete_matches = set(complete_overlap(df_docrow))
    partial_matches = set(partial_overlap(df_docrow))

    # Concatenate into a new list rather than extending the Stanza list in place.
    merged_lists = stanza_texts + spacy_texts

    only_in_one_complete = [text for text in merged_lists if text not in complete_matches]
    only_in_one_partial = [text for text in only_in_one_complete if text not in partial_matches]
    return only_in_one_complete, only_in_one_partial

def find_tuple_with_first_value(tuples_list, specific_value):
    """Look up the first tuple whose leading element equals ``specific_value``; None when there is none."""
    # Lazy generator: scanning stops at the first hit, exactly like an early return.
    matches = (candidate for candidate in tuples_list if candidate[0] == specific_value)
    return next(matches, None)

def find_tuple_containing_value(tuples_list, specific_value):
    """Look up the first tuple whose leading element contains ``specific_value``; None when there is none."""
    # Containment uses the `in` operator on the leading element (substring test for strings).
    matches = (candidate for candidate in tuples_list if specific_value in candidate[0])
    return next(matches, None)

def agreement_ne_type_complete(df_docrow):
    """
    (d) For the spans with full agreement, count how often the NE type agrees too.

    Parameters
    ----------
    df_docrow : pandas.DataFrame
        Frame whose first row is the document to analyse, with the columns
        'ents_stanza' and 'ents_spacy', each holding a list of (text, type) tuples.

    Returns
    -------
    tuple of (int, int)
        ``(agree, disagree)``: number of fully overlapping spans for which both
        packages predicted the same type, and the number for which they did not.
        When a text occurs several times in one package, the type of its first
        occurrence is the one compared.
    """
    spans_agreement = complete_overlap(df_docrow)

    # Positional access (.iloc): a filtered row keeps its original index label,
    # so the label-based `[0]` failed for every document but the first.
    stanza_entities = list(df_docrow['ents_stanza'].iloc[0])
    spacy_entities = list(df_docrow['ents_spacy'].iloc[0])

    agree = 0
    disagree = 0
    for span in spans_agreement:
        # Every span of a complete overlap exists in both lists, so both lookups succeed.
        stanza_type = find_tuple_with_first_value(stanza_entities, span)[1]
        spacy_type = find_tuple_with_first_value(spacy_entities, span)[1]
        if stanza_type == spacy_type:
            agree += 1
        else:
            disagree += 1

    return agree, disagree

def agreement_ne_type_partial(df_docrow):
    """
    (d) For the spans with partial agreement, count how often the NE type agrees too.

    A partially matching span is compared with the first entity of the other
    package whose text contains it.

    Parameters
    ----------
    df_docrow : pandas.DataFrame
        Frame whose first row is the document to analyse, with the columns
        'ents_stanza' and 'ents_spacy', each holding a list of (text, type) tuples.

    Returns
    -------
    tuple of (int, int)
        ``(agree, disagree)``. A span for which no single entity of the other
        package contains its text is counted as a disagreement, so that
        ``agree + disagree`` always equals the number of partial overlaps.
    """
    spans_agreement = partial_overlap(df_docrow)

    # Positional access (.iloc): a filtered row keeps its original index label,
    # so the label-based `[0]` failed for every document but the first.
    stanza_entities = list(df_docrow['ents_stanza'].iloc[0])
    spacy_entities = list(df_docrow['ents_spacy'].iloc[0])

    def same_type(own_entities, other_entities, span):
        """True when `span` is an entity of `own_entities` and the entity of
        `other_entities` that contains it carries the same type."""
        own = find_tuple_with_first_value(own_entities, span)
        if own is None:
            return False
        container = find_tuple_containing_value(other_entities, span)
        # partial_overlap matches against the space-joined texts, so a span may
        # match without being inside any single entity: guard against None
        # instead of crashing on `None[1]`.
        return container is not None and own[1] == container[1]

    agree = 0
    disagree = 0
    for span in spans_agreement:
        # Try the span as a Stanza entity first, then as a spaCy entity.
        if same_type(stanza_entities, spacy_entities, span) or same_type(spacy_entities, stanza_entities, span):
            agree += 1
        else:
            disagree += 1

    return agree, disagree

In [125]:
def comparison_stanza_spacy(pd_df, doc):
    """
    Print the Stanza-vs-spaCy NER comparison statistics for one document.

    Parameters
    ----------
    pd_df : pandas.DataFrame
        Frame with the columns 'file_name', 'ents_stanza' and 'ents_spacy'.
    doc : str
        Value of 'file_name' identifying the document to report on. When several
        rows carry that name, the helper functions decide which one is used.

    Raises
    ------
    KeyError
        If no row of ``pd_df`` has ``doc`` as its 'file_name'.
    """
    row_doc = pd_df.loc[pd_df['file_name'] == doc]
    if row_doc.empty:
        # Fail with an explicit message instead of an opaque lookup error further down.
        raise KeyError(f"no document named {doc!r} in column 'file_name'")

    print(f"Number of where both packages agree and predict is a NE : {len(complete_overlap(row_doc))}")
    print('\n\n')
    print(f"Number of spans where there is a partial agreement between both packages: {len(partial_overlap(row_doc))}")
    print('\n\n')
    complete_one, partial_one = one_but_not_the_other(row_doc)
    print(f"Number of spans that a package predicted as an NE, but the other package did not predict as a NE (only exact matches): {len(complete_one)}")
    print(f"Number of spans that a package predicted as an NE, but the other package did not predict as a NE (with partial matches): {len(partial_one)}")
    # Use the selected row, not the whole frame: passing `pd_df` made these two
    # statistics describe the first document regardless of `doc`.
    total_agree, total_disagree = agreement_ne_type_complete(row_doc)
    partial_agree, partial_disagree = agreement_ne_type_partial(row_doc)
    print("\n\nTotal agreement for NE: ")
    print(f"Number of span agreeing on the NE type : {total_agree}")
    print(f"Number of span disagreeing on the NE type : {total_disagree}")
    print("\nPartial agreement for NE: ")
    print(f"Number of span agreeing on the NE type : {partial_agree}")
    print(f"Number of span disagreeing on the NE type : {partial_disagree}")
                        
    
    
# Report the comparison statistics for a single biography of the frame built above.
comparison_stanza_spacy(
    pd_df=df,
    doc='SérgioPereiraCouto_Journalists.txt',
)

Number of where both packages agree and predict is a NE : 5



Number of spans where there is a partial agreement between both packages: 42



Number of spans that a package predicted as an NE, but the other package did not predict as a NE (only exact matches): 59
Number of spans that a package predicted as an NE, but the other package did not predict as a NE (with partial matches): 17


Total agreement for NE: 
Number of span agreeing on the NE type : 3
Number of span disagreeing on the NE type : 2

Partial agreement for NE: 
Number of span agreeing on the NE type : 5
Number of span disagreeing on the NE type : 37


In [117]:
# Scratch cell. NOTE(review): the saved output below (17 / 52) does not correspond
# to this statement, so it looks stale — re-run or delete the cell.
print('hello')

17
52
