### This is Owen Monroe's Notebook for IS 567 Text Mining for the Project Progress Report
This notebook evaluates NER models and tagging, comparing the spacy NER model to my own hand-annotated data. The text data comes from downloaded files from the Hathi-Trust Digital Library. These texts are 19th Century Magazines from the publishers, The Society for the Diffusion of Useful Knowledge, the William and Robert Chambers, and the Society for the Diffusion of General and Christian Knowledge among the Chinese. 

In [21]:
# Loading the Spacy Model

import spacy



In [22]:
nlp = spacy.load('en_core_web_sm')

In [23]:
with open("Useful_Knowledge_Texts/Clipped_Texts/Chambers_InfoForPeople_France_1835.txt", "r", encoding="utf-8") as text_file:
    text = text_file.read()

In [24]:
doc = nlp(text)

In [25]:
hand_annotated_data = []
with open("Useful_Knowledge_Annotations_iob_3/DOCUMENT-omonroe2_illinois_edu/Chambers_InfoForPeople_France_1835.txt.tsv", "r", encoding="utf-8") as iob_file:
    for line in iob_file:
        line = line.strip()
        if line:
            token, label = line.split("\t")
           
            hand_annotated_data.append((token, label))



In [26]:
hand_annotated_data


[('he', 'O'),
 ('principal', 'O'),
 ('mountains', 'O'),
 ('of', 'O'),
 ('France', 'B-GPE'),
 ('are', 'O'),
 (',', 'O'),
 ('1', 'B-CARDINAL'),
 ('.', 'O'),
 ('The', 'O'),
 ('Toeget', 'O'),
 ('on', 'O'),
 ('the', 'O'),
 ('north', 'O'),
 ('-', 'O'),
 ('east', 'O'),
 ('.', 'O'),
 ('They', 'O'),
 ('are', 'O'),
 ('of', 'O'),
 ('a', 'O'),
 ('rounded', 'O'),
 ('oat', 'O'),
 ('-', 'O'),
 ('Bne', 'O'),
 (',', 'O'),
 ('with', 'O'),
 ('gentle', 'O'),
 ('slopes', 'O'),
 (',', 'O'),
 ('and', 'O'),
 ('afford', 'O'),
 ('much', 'O'),
 ('open', 'O'),
 ('paa', 'O'),
 ('-', 'O'),
 ('turage', 'O'),
 ('.', 'O'),
 ('The', 'O'),
 ('highest', 'O'),
 ('summit', 'O'),
 ('is', 'O'),
 ('not', 'O'),
 ('more', 'O'),
 ('than', 'O'),
 ('4500', 'B-QUANTITY'),
 ('ftet', 'O'),
 ('nigh', 'O'),
 ('.', 'O'),
 ('2', 'O'),
 ('.', 'O'),
 ('The', 'O'),
 ('Jura', 'B-LOC'),
 ('mountains', 'I-LOC'),
 ('He', 'O'),
 ('to', 'O'),
 ('the', 'O'),
 ('south', 'O'),
 ('of', 'O'),
 ('these', 'O'),
 (',', 'O'),
 ('and', 'O'),
 ('their', 'O'

In [27]:
hand_annotated_tagged = [(token, label) for token, label in hand_annotated_data if label != 'O']

In [28]:
hand_annotated_tagged

[('France', 'B-GPE'),
 ('1', 'B-CARDINAL'),
 ('4500', 'B-QUANTITY'),
 ('Jura', 'B-LOC'),
 ('mountains', 'I-LOC'),
 ('0000', 'B-QUANTITY'),
 ('Daaphlny', 'B-GPE'),
 ('Provence', 'B-GPE'),
 ('mountains', 'B-LOC'),
 ('of', 'I-LOC'),
 ('Auvergne', 'I-LOC'),
 ('Puy', 'B-LOC'),
 ('de', 'I-LOC'),
 ('Dome', 'I-LOC'),
 ('Moots', 'B-LOC'),
 ("d'Or", 'I-LOC'),
 ('Mont', 'B-LOC'),
 ('Lozcre', 'I-LOC'),
 ('510', 'B-QUANTITY'),
 ('Pyrenees', 'B-LOC'),
 ('France', 'B-GPE'),
 ('Spain', 'B-GPE'),
 ('Rhine', 'B-LOC'),
 ('Seine', 'B-LOC'),
 ('Loire', 'B-LOC'),
 ('Garonne', 'B-LOC'),
 ('Adour', 'B-LOC'),
 ('Pyrenees', 'B-LOC'),
 ('Bayonne', 'B-LOC'),
 ('Marne', 'B-LOC'),
 ('Oise', 'B-LOC'),
 ('Seine', 'B-LOC'),
 ('Loire', 'B-LOC'),
 ('Sarthe', 'B-LOC'),
 ('Maycnne', 'B-LOC'),
 ('Loire', 'B-LOC'),
 ('Rhone', 'B-LOC'),
 ('Saoue', 'B-LOC'),
 ('Isire', 'B-LOC'),
 ('Durance', 'B-LOC'),
 ('Ain', 'B-LOC'),
 ('Sorgue', 'B-LOC'),
 ('Tarn', 'B-LOC'),
 ('Dordi', 'B-LOC'),
 ('Garonne', 'B-LOC'),
 ('canals', 'B-FAC'),

In [40]:
for i in range(len(hand_annotated_tagged)):
    token, label = hand_annotated_tagged[i]
    if label.startswith('B-'):
        label = label[2:]  # Remove "B-"
    elif label.startswith('I-'):
        label = label[2:]  # Remove "I-"
    hand_annotated_tagged[i] = (token, label)
    hand_annotated_clean = hand_annotated_tagged

In [42]:
print(f'Hand Annotations List number = {len(hand_annotated_clean)}')
hand_annotated_clean

Hand Annotations List number = 645


[('France', 'GPE'),
 ('1', 'CARDINAL'),
 ('4500', 'QUANTITY'),
 ('Jura', 'LOC'),
 ('mountains', 'LOC'),
 ('0000', 'QUANTITY'),
 ('Daaphlny', 'GPE'),
 ('Provence', 'GPE'),
 ('mountains', 'LOC'),
 ('of', 'LOC'),
 ('Auvergne', 'LOC'),
 ('Puy', 'LOC'),
 ('de', 'LOC'),
 ('Dome', 'LOC'),
 ('Moots', 'LOC'),
 ("d'Or", 'LOC'),
 ('Mont', 'LOC'),
 ('Lozcre', 'LOC'),
 ('510', 'QUANTITY'),
 ('Pyrenees', 'LOC'),
 ('France', 'GPE'),
 ('Spain', 'GPE'),
 ('Rhine', 'LOC'),
 ('Seine', 'LOC'),
 ('Loire', 'LOC'),
 ('Garonne', 'LOC'),
 ('Adour', 'LOC'),
 ('Pyrenees', 'LOC'),
 ('Bayonne', 'LOC'),
 ('Marne', 'LOC'),
 ('Oise', 'LOC'),
 ('Seine', 'LOC'),
 ('Loire', 'LOC'),
 ('Sarthe', 'LOC'),
 ('Maycnne', 'LOC'),
 ('Loire', 'LOC'),
 ('Rhone', 'LOC'),
 ('Saoue', 'LOC'),
 ('Isire', 'LOC'),
 ('Durance', 'LOC'),
 ('Ain', 'LOC'),
 ('Sorgue', 'LOC'),
 ('Tarn', 'LOC'),
 ('Dordi', 'LOC'),
 ('Garonne', 'LOC'),
 ('canals', 'FAC'),
 ('France', 'GPE'),
 ('Flanders', 'GPE'),
 ('Artuis', 'GPE'),
 ('Picardy', 'GPE'),
 ('Norma

In [31]:
spacy_annotations = [(ent.text, ent.label_) for ent in doc.ents]

In [39]:
print(f'Spacy Annotations List number = {len(spacy_annotations)}')
spacy_annotations


Spacy Annotations List number = 294


[('France', 'GPE'),
 ('1', 'CARDINAL'),
 ('The \n Toeget', 'ORG'),
 ('2', 'CARDINAL'),
 ('0000 \n feet', 'QUANTITY'),
 ('3', 'CARDINAL'),
 ('Alpine', 'ORG'),
 ('Daaphlny \n ', 'ORG'),
 ('Provence', 'GPE'),
 ('4', 'CARDINAL'),
 ('Auvergne', 'GPE'),
 ('the Puy de Dome', 'FAC'),
 ('the \n Cental', 'ORG'),
 ('5', 'CARDINAL'),
 ('0', 'CARDINAL'),
 ('France', 'GPE'),
 ('Spain', 'GPE'),
 ('four', 'CARDINAL'),
 ('Rhine', 'GPE'),
 ('Seine', 'PERSON'),
 ('Loire', 'ORG'),
 ('the \n Garonne', 'LOC'),
 ('Marne', 'PERSON'),
 ('Oise', 'PERSON'),
 ('Seine', 'PERSON'),
 ('Loire', 'ORG'),
 ('Maycnne', 'PERSON'),
 ('Loire', 'ORG'),
 ('Rhone', 'ORG'),
 ('Saoue', 'ORG'),
 ('Durance', 'ORG'),
 ('Ain', 'PERSON'),
 ('Garonne', 'LOC'),
 ('France', 'GPE'),
 ('Normandy', 'PERSON'),
 ('the Isle \n of', 'ORG'),
 ('France', 'GPE'),
 ('about \n 18,I70,51>0 acres', 'QUANTITY'),
 ('7)054,501 acres', 'QUANTITY'),
 ('Languedoc', 'ORG'),
 ('Limarne', 'PERSON'),
 ('Auvergne', 'GPE'),
 ('one', 'CARDINAL'),
 ('twenty feet',

In [51]:
from sklearn.metrics import precision_score, recall_score, f1_score

y_true = [label for token, label in hand_annotated_clean]
y_pred = [label for token, label in spacy_annotations]



# Calculate precision, recall, and F1-score for each label

precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')

print(f"Label: {label}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")
print()

ValueError: Found input variables with inconsistent numbers of samples: [645, 294]