In [28]:
### all together ########################
import stanza
import spacy
import glob
import pandas as pd

import stanza.pipeline
stanza.download('en')
nlp_stanza = stanza.Pipeline('en', processors='tokenize,ner') 
nlp_spacy = spacy.load('en_core_web_sm')

path_journalists = './Biographies/Biographies_Journalists/*.txt'
files_journalists = glob.glob(path_journalists)
files_journalists = files_journalists[:3]

path_sculptors = './Biographies/Biographies_Sculptors/*.txt'
files_sculptors = glob.glob(path_sculptors)
files_sculpors = files_sculptors[:3]

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 7.44MB/s]         
2024-06-13 17:56:57 INFO: Downloaded file to /home/marina/stanza_resources/resources.json
2024-06-13 17:56:57 INFO: Downloading default packages for language: en (English) ...
2024-06-13 17:56:59 INFO: File exists: /home/marina/stanza_resources/en/default.zip
2024-06-13 17:57:04 INFO: Finished downloading models and saved to /home/marina/stanza_resources
2024-06-13 17:57:04 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 379kB [00:00, 12.1MB/s]         
2024-06-13 17:57:05 INFO: Downloaded file to /home/marina/stanza_resources/resources.json
2024-06-13 17:57:05 INFO: Loading these models for language: en (Engl

In [29]:
def extract_ents_stanza(file):
    with open(file, 'r') as f:
        content = f.read()
        doc = nlp_stanza(content)
        ents = [(ent.text, ent.type) for ent in doc.ents]
    return ents

def extract_ents_spacy(file):
    with open(file, 'r') as f:
        content = f.read()
        doc = nlp_spacy(content)
        ents = [(ent.text, ent.label_) for ent in doc.ents]
    return ents

def store_ents(files):
    data = []
    for file in files:
        file_name = file.split('/')[-1]
        data.append({'file_name': file_name, 
                     'ents_stanza': extract_ents_stanza(file), 
                     'ents_spacy': extract_ents_spacy(file)})
    return data

data_journalists = store_ents(files_journalists)
df_journalists = pd.DataFrame(data_journalists)
df_journalists.head()

data_sculptors = store_ents(files_journalists)
df_sculptors = pd.DataFrame(data_sculptors)
df_sculptors.head()

Unnamed: 0,file_name,ents_stanza,ents_spacy
0,AquileoJ.Echeverría_Journalists.txt,"[(Aquileo J. Echeverría, PERSON), (May 22, 186...","[(Aquileo J. Echeverría, PERSON), (May 22, 186..."
1,ThomasHenryShadwellClerke_Journalists.txt,"[(Thomas Henry Shadwell Clerke, PERSON), (KH, ...","[(Thomas Henry Shadwell Clerke, PERSON), (KH, ..."
2,Journalist_Journalists.txt,"[(Matthew C. Nisbet, PERSON), (Walter Lippmann...","[(Matthew C. Nisbet, PERSON), (Walter Lippmann..."


In [31]:
import numpy as np

nb_ents_journalists_stanza = [len(ents) for ents in df_journalists['ents_stanza']]
nb_ents_journalists_spacy = [len(ents) for ents in df_journalists['ents_spacy']]
nb_ents_sculptors_stanza = [len(ents) for ents in df_sculptors['ents_stanza']]
nb_ents_sculptors_spacy = [len(ents) for ents in df_sculptors['ents_spacy']]

nb_ents_list = [nb_ents_journalists_stanza,nb_ents_journalists_spacy,nb_ents_sculptors_stanza,nb_ents_sculptors_spacy]

avg_nb_ents = [np.mean(list) for list in nb_ents_list]
min_nb_ents = [np.min(list) for list in nb_ents_list]
max_nb_ents = [np.max(list) for list in nb_ents_list]


nb_words_journalists_spacy = [len(ent[0]) for ent in [ents for ents in df_journalists['ents_stanza']]]
nb_words_journalists_spacy = [len(ent[0]) for ent in [ents for ents in df_journalists['ents_spacy']]]
nb_words_sculptors_stanza = [len(ent[0]) for ent in [ents for ents in df_sculptors['ents_stanza']]]
nb_words_sculptors_spacy = [len(ent[0]) for ent in [ents for ents in df_sculptors['ents_spacy']]]

nb_words_list = [nb_words_journalists_spacy,nb_words_journalists_spacy,nb_words_sculptors_stanza,nb_words_sculptors_spacy]

avg_nb_words = [np.mean(list) for list in nb_words_list]
min_nb_words = [np.min(list) for list in nb_words_list]
max_nb_words = [np.max(list) for list in nb_words_list]

data = {"category" : ["Journalists","Journalists","Sculptors","Sculptors"],
                "package" : ["stanza","spacy","stanza","spacy"],
                "avg_nb_ents" : avg_nb_ents,
                "min_nb_ents" : min_nb_ents,
                "max_nb_ents" : max_nb_ents,
                "avg_nb_words" : avg_nb_words,
                "min_nb_words" : min_nb_words,
                "max_nb_words" : max_nb_words}
                

df_statistics = pd.DataFrame(data)
df_statistics.head()

Unnamed: 0,category,package,avg_nb_ents,min_nb_ents,max_nb_ents,avg_nb_words,min_nb_words,max_nb_words
0,Journalists,stanza,113.333333,46,235,2.0,2,2
1,Journalists,spacy,115.333333,47,242,2.0,2,2
2,Sculptors,stanza,113.333333,46,235,2.0,2,2
3,Sculptors,spacy,115.333333,47,242,2.0,2,2
