In [1]:
import spacy

# Step 1: Load SpaCy's large English language model
!python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")



Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:

# Step 2: Load the introduction and content from .txt files
intro_file_path = "intro.txt"  # Replace with your actual file path
content_file_path = "text.txt"  # Replace with your actual file path

with open(intro_file_path, "r", encoding="utf-8") as f:
    introduction = f.read()

with open(content_file_path, "r", encoding="utf-8") as f:
    content = f.read()


In [3]:

# Step 3a: Tokenize introduction and print each token and its lemma
print("\n--- Tokens and Lemmas in Introduction ---")
intro_doc = nlp(introduction)
for token in intro_doc:
    print(f"Token: {token.text}, Lemma: {token.lemma_}")

# Step 3b: Print each token and its Part-of-Speech (POS)
print("\n--- Tokens and POS in Introduction ---")
for token in intro_doc:
    print(f"Token: {token.text}, POS: {token.pos_}")



--- Tokens and Lemmas in Introduction ---
Token: Hyraxes, Lemma: Hyraxes
Token: (, Lemma: (
Token: from, Lemma: from
Token: Ancient, Lemma: ancient
Token: Greek, Lemma: greek
Token: ὕραξ, Lemma: ὕραξ
Token: hýrax, Lemma: hýrax
Token: ', Lemma: '
Token: shrew, Lemma: shrew
Token: -, Lemma: -
Token: mouse, Lemma: mouse
Token: ', Lemma: '
Token: ), Lemma: )
Token: ,, Lemma: ,
Token: also, Lemma: also
Token: called, Lemma: call
Token: dassies,[1][2, Lemma: dassies,[1][2
Token: ], Lemma: ]
Token: are, Lemma: be
Token: small, Lemma: small
Token: ,, Lemma: ,
Token: stout, Lemma: stout
Token: ,, Lemma: ,
Token: thickset, Lemma: thickset
Token: ,, Lemma: ,
Token: herbivorous, Lemma: herbivorous
Token: mammals, Lemma: mammal
Token: in, Lemma: in
Token: the, Lemma: the
Token: family, Lemma: family
Token: Procaviidae, Lemma: Procaviidae
Token: within, Lemma: within
Token: the, Lemma: the
Token: order, Lemma: order
Token: Hyracoidea, Lemma: Hyracoidea
Token: ., Lemma: .
Token: Hyraxes, Lemma: Hyra

In [4]:

# Step 4: Perform NER on the content and display people, places, and organizations
print("\n--- Named Entities in Content ---")
content_doc = nlp(content)
entities = {"PERSON": [], "GPE": [], "ORG": []}

for ent in content_doc.ents:
    if ent.label_ in entities:
        entities[ent.label_].append(ent.text)

# Remove duplicates
entities = {key: list(set(value)) for key, value in entities.items()}

print("\nPeople:", entities["PERSON"])
print("\nPlaces (GPE):", entities["GPE"])
print("\nOrganizations:", entities["ORG"])


--- Named Entities in Content ---

People: ['Tree hyrax', 'Pachyhyrax', 'Procaviidae \u2003\t\n\u2003\t\n\u2003 Dendrohyrax \u2003\t\n\u2003\t\nSouthern', '†Antilohyrax\n†Rupestrohyrax', 'Strabo', '†Meroehyrax\n', 'Archaeohyracidae', 'Heterohyrax', 'Procaviidae\nDendrohyrax', 'D. arboreus', 'Embrithopoda', '†Gigantohyrax\nHeterohyrax', 'H. brucei', 'Titanohyrax', '†Saghatherium\u2009\n\n\u2009†Titanohyrax', 'zone.[41', 'Gitori', 'Proboscidea', '†Hengduanshanhyrax', 'Leviticus', '†Antilohyrax\u2009\n\n†Megalohyrax\n\n  Geniohyiinae  \t\n', 'Bush', 'P. capensis', 'Dendrohyrax interfluvialis', 'Hadrian']

Places (GPE): ['tapirs', 'Polyphyletic', 'Kenya', 'Tigrinya', 'Mount Kenya', 'Dimaitherium', 'Hyraxes', 'Spain', 'Paenungulata', 'Niger', 'Egypt', 'Rusinga']

Organizations: ['†Kvabebihyrax', 'cud', '†Seggeurius', '†Geniohyus', '†Titanohyracidae', 'hyraceum', '†Thyrohyrax', 'מַעֲלֵה', '†Parapliohyrax', 'dugongs.[22', 'D. dorsalis\n \n\nBenin', 'Hyraxes', 'females.[18', '†Titanohyrax', '