In [6]:
import spacy

# Load the "en_core_web_md" English pipeline once. The later cells run each
# text through this `nlp` object and compare the resulting docs with
# `.similarity()`.
nlp = spacy.load("en_core_web_md")

In [7]:
# Load the three input texts from disk


def _read_text(path):
    """Return the whole content of `path` as a string.

    The file is decoded as UTF-8; bytes that cannot be decoded are replaced
    rather than raising, so a stray bad byte does not abort the notebook.
    """
    with open(path, "r", encoding="utf-8", errors="replace") as handle:
        return handle.read()


# "Moby Dick" extract
moby_dick_text = _read_text("mobydick.txt")

# First AI forecast text
ai_forecast1_text = _read_text("ai_forecast1.txt")

# Second AI forecast text
ai_forecast2_text = _read_text("ai_forecast2.txt")

In [8]:
# Process the text with SpaCy

# spaCy refuses to process a text longer than `nlp.max_length` characters.
# Make sure the limit fits the longest of the three texts, and never lower it
# below its current value: sizing it from the Moby Dick text alone could
# shrink the limit and make a longer forecast text fail.
nlp.max_length = max(
    nlp.max_length,
    len(moby_dick_text),
    len(ai_forecast1_text),
    len(ai_forecast2_text),
)

# Run the full pipeline over each text; the resulting docs are compared in
# the following cells.
doc_moby_dick = nlp(moby_dick_text)
doc_ai_forecast1 = nlp(ai_forecast1_text)
doc_ai_forecast2 = nlp(ai_forecast2_text)

In [9]:
# Score every pair of full documents against each other.
# Suffixes: 1 = Moby Dick, 2 = ai_forecast1, 3 = ai_forecast2.
similarity_1_2, similarity_1_3, similarity_2_3 = (
    first.similarity(second)
    for first, second in (
        (doc_moby_dick, doc_ai_forecast1),
        (doc_moby_dick, doc_ai_forecast2),
        (doc_ai_forecast1, doc_ai_forecast2),
    )
)

# Report the three pairwise scores.
print(f"Similarity between Moby Dick and ai_forecast1: {similarity_1_2}")
print(f"Similarity between Moby Dick and ai_forecast2: {similarity_1_3}")
print(f"Similarity between ai_forecast1 and ai_forecast2: {similarity_2_3}")


Similarity between Moby Dick and ai_forecast1: 0.8483013234148108
Similarity between Moby Dick and ai_forecast2: 0.894156146743148
Similarity between ai_forecast1 and ai_forecast2: 0.9841067849106802


In [10]:
# Repeat the comparison on only the first 100 tokens of each document.
doc_moby_dick_100, doc_ai_forecast1_100, doc_ai_forecast2_100 = (
    full_doc[:100]
    for full_doc in (doc_moby_dick, doc_ai_forecast1, doc_ai_forecast2)
)

# Same pairing as for the full documents:
# 1 = Moby Dick, 2 = ai_forecast1, 3 = ai_forecast2.
similarity_100_1_2, similarity_100_1_3, similarity_100_2_3 = (
    first.similarity(second)
    for first, second in (
        (doc_moby_dick_100, doc_ai_forecast1_100),
        (doc_moby_dick_100, doc_ai_forecast2_100),
        (doc_ai_forecast1_100, doc_ai_forecast2_100),
    )
)

# Report the three pairwise scores for the truncated documents.
print(f"Similarity between first 100 tokens of Moby Dick and ai_forecast1: {similarity_100_1_2}")
print(f"Similarity between first 100 tokens of Moby Dick and ai_forecast2: {similarity_100_1_3}")
print(f"Similarity between first 100 tokens of ai_forecast1 and ai_forecast2: {similarity_100_2_3}")

Similarity between first 100 tokens of Moby Dick and ai_forecast1: 0.45211589336395264
Similarity between first 100 tokens of Moby Dick and ai_forecast2: 0.43093761801719666
Similarity between first 100 tokens of ai_forecast1 and ai_forecast2: 0.9562583565711975


In [13]:
# Manually tag a person's name as an entity on a blank English pipeline
from spacy.tokens import Span

# Tokenize the sample sentence with a blank English model.
text_with_name = "Manuel Schönberger is working on a project."
nlp_blank = spacy.blank("en")
doc = nlp_blank(text_with_name)

# Tokens 0 and 1 (half-open range [0, 2)) form the name; label them PERSON
# and make that span the document's only entity.
span = Span(doc, 0, 2, label="PERSON")
doc.ents = [span]

# Show each annotated entity with its label.
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")


Entity: Manuel Schönberger, Label: PERSON
