In [None]:

import spacy
from spacy import displacy
import random

# Load spaCy's small English pipeline once; it is reused for every passage.
nlp = spacy.load('en_core_web_sm')

# Entity labels that are kept during extraction; entities carrying any other
# label are ignored. PRODUCT is used as the stand-in label for vehicles.
entity_types = [
    'PERSON',
    'LOC',
    'ORG',
    'GPE',
    'FAC',
    'PRODUCT',
]

# Function to extract entities from text
def extract_entities(text):
    """Run the spaCy pipeline on ``text`` and return the entities of interest.

    Only entities whose label appears in the module-level ``entity_types``
    list are kept. The kept entities are also printed, one per line.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples in document order;
        repeated mentions are not de-duplicated.
    """
    parsed = nlp(text)

    # Keep only the (surface form, label) pairs for the labels we care about.
    found = [
        (span.text, span.label_)
        for span in parsed.ents
        if span.label_ in entity_types
    ]

    print("Extracted entities from the text:")
    for pair in found:
        print(pair)

    return found

# Function to evaluate entity recognition performance
def evaluate_entity_recognition(text, manually_labelled_entities):
    """Score the extracted entities for ``text`` against manual labels.

    Gold and predicted entities are compared as sets of
    ``(entity_text, entity_label)`` pairs, so repeated mentions count once
    and a pair only matches when both text and label are identical.

    The TP/FP/FN counts and the three metrics are always printed, including
    when a denominator is zero.

    Args:
        text: Passage to run entity extraction on.
        manually_labelled_entities: Iterable of hashable
            ``(entity_text, entity_label)`` gold pairs.

    Returns:
        A ``(precision, recall, f1_score)`` tuple of floats. A metric whose
        denominator is zero is reported as ``0.0``.
    """
    # Build each set once instead of re-creating it for every count.
    gold = set(manually_labelled_entities)
    predicted = set(extract_entities(text))

    # Calculate true positives (TP), false positives (FP), and false negatives (FN)
    TP = len(gold & predicted)
    FP = len(predicted - gold)
    FN = len(gold - predicted)

    print("True positives (TP):", TP)
    print("False positives (FP):", FP)
    print("False negatives (FN):", FN)

    # Guard every division explicitly so the zero case still reaches the
    # prints below and the return type stays float.
    precision = TP / (TP + FP) if TP + FP else 0.0
    recall = TP / (TP + FN) if TP + FN else 0.0
    if precision + recall:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 score:", f1_score)
    print()

    return precision, recall, f1_score

# Load the book text
with open('nlp2a.txt', 'r', encoding='utf-8') as f:
    book_text = f.read()

# Split the book into passages on blank lines. Whitespace-only chunks are
# dropped so they are not scored as F1 = 0 and do not skew the average.
passages = [chunk for chunk in book_text.split('\n\n') if chunk.strip()]

# Manually labelled (gold) entities, defined once because the list does not
# change between passages. Person entries use spaCy's "PERSON" label (not
# "PER") so they can actually match the predictions kept by entity_types.
# NOTE(review): the same gold list is compared against every passage; if the
# labels only describe one passage, per-passage gold lists are needed.
manually_labelled_entities = [
    ("Silas", "PERSON"),
    ("The Teacher", "PERSON"),
    ("Opus Dei", "PERSON"),
    ("His mother", "PERSON"),
    ("His father", "PERSON"),
    ("The boy", "PERSON"),
    ("Church of Saint-Sulpice", "ORG"),
    ("Brotherhood", "ORG"),
    ("Opus Dei", "ORG"),
    ("The authorities", "ORG"),
    ("Marseilles", "ORG"),
    ("Toulon", "ORG"),
    ("Church of Saint-Sulpice", "LOC"),
    ("Place Saint-Sulpice", "LOC"),
    ("Pyrenees", "LOC"),
    ("Andorra", "LOC"),
    ("Marseilles", "LOC"),
    ("Toulon", "LOC"),
    ("The coast", "LOC"),
    ("Spain", "GPE"),
    ("France", "GPE"),
    ("Andorra", "GPE"),
    ("Stone cell", "FAC"),
    ("Basement of a dilapidated factory", "FAC"),
    ("Audi", "PRODUCT"),
]

# Evaluate entity recognition performance on every passage
f1_scores = []
for passage in passages:
    print("Manually labeled entities in the passage:", manually_labelled_entities)

    # Calculate F1 score
    _, _, f1_score = evaluate_entity_recognition(passage, manually_labelled_entities)
    f1_scores.append(f1_score)

# Calculate average F1 score; report 0.0 instead of dividing by zero when the
# file contained no non-empty passages.
average_f1_score = sum(f1_scores) / len(f1_scores) if f1_scores else 0.0

print("Average F1 score:", average_f1_score)

# Visualize entity recognition results for the first passage.
# jupyter=False forces displacy to return the HTML markup as a string; with
# auto-detection inside a notebook it displays inline and returns None, which
# would make file.write() fail. The 'compact' option is omitted because it is
# a dependency-view setting and has no effect on style='ent'.
if passages:
    html_output = displacy.render(nlp(passages[0]), style='ent', jupyter=False)
    with open("entity_visualization.html", "w", encoding="utf-8") as file:
        file.write(html_output)


Manually labeled entities in the passage: [('Silas', 'PER'), ('The Teacher', 'PER'), ('Opus Dei', 'PER'), ('His mother', 'PER'), ('His father', 'PER'), ('The boy', 'PER'), ('Church of Saint-Sulpice', 'ORG'), ('Brotherhood', 'ORG'), ('Opus Dei', 'ORG'), ('The authorities', 'ORG'), ('Marseilles', 'ORG'), ('Toulon', 'ORG'), ('Church of Saint-Sulpice', 'LOC'), ('Place Saint-Sulpice', 'LOC'), ('Pyrenees', 'LOC'), ('Andorra', 'LOC'), ('Marseilles', 'LOC'), ('Toulon', 'LOC'), ('The coast', 'LOC'), ('Spain', 'GPE'), ('France', 'GPE'), ('Andorra', 'GPE'), ('Stone cell', 'FAC'), ('Basement of a dilapidated factory', 'FAC'), ('Audi', 'PRODUCT')]
Extracted entities from the text:
('Audi', 'ORG')
('Church of Saint-Sulpice', 'ORG')
('Opus Dei', 'ORG')
('Audi', 'ORG')
('Place Saint-Sulpice', 'PERSON')
('Opus Dei', 'ORG')
True positives (TP): 2
False positives (FP): 2
False negatives (FN): 23
Precision: 0.5
Recall: 0.08
F1 score: 0.13793103448275865

Manually labeled entities in the passage: [('Silas'