In [3]:
import spacy

# Q.1: Create a Doc object from peterrabbit.txt
# Load the spaCy English pipeline used by every later cell.
nlp = spacy.load("en_core_web_sm")

# Read the story with an explicit encoding: without one, open() falls back to
# the platform's locale encoding, so the same notebook could decode the text
# differently (or fail) on another machine.
# NOTE(review): assumes peterrabbit.txt is UTF-8 (plain ASCII also works) - confirm.
with open("peterrabbit.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Run the pipeline once over the whole text; the resulting Doc is the answer to Q.1.
doc = nlp(text)



In [5]:
# Q 2: Token details in the third sentence
# Materialise the sentence spans so the third one (index 2) can be picked by position.
sentences = list(doc.sents)
third_sentence = sentences[2]

# One line per token: text, coarse POS, fine-grained tag, and spaCy's
# human-readable description of that tag.
for token in third_sentence:
    print(f"token: {token.text}, {token.pos_}, {token.tag_}, {spacy.explain(token.tag_)}")



token: They, PRON, PRP, pronoun, personal
token: lived, VERB, VBD, verb, past tense
token: with, ADP, IN, conjunction, subordinating or preposition
token: their, PRON, PRP$, pronoun, possessive
token: Mother, NOUN, NN, noun, singular or mass
token: in, ADP, IN, conjunction, subordinating or preposition
token: a, DET, DT, determiner
token: sand, NOUN, NN, noun, singular or mass
token: -, PUNCT, HYPH, punctuation mark, hyphen
token: bank, NOUN, NN, noun, singular or mass
token: ,, PUNCT, ,, punctuation mark, comma
token: underneath, ADP, IN, conjunction, subordinating or preposition
token: the, DET, DT, determiner
token: root, NOUN, NN, noun, singular or mass
token: of, ADP, IN, conjunction, subordinating or preposition
token: a, DET, DT, determiner
token: 
, SPACE, _SP, whitespace
token: very, ADV, RB, adverb
token: big, ADJ, JJ, adjective (English), other noun-modifier (Chinese)
token: fir, NOUN, NN, noun, singular or mass
token: -, PUNCT, HYPH, punctuation mark, hyphen
token: tree, NO

In [8]:
# Q 3: Frequency list of POS tags from the entire document
# Counter is imported here because no visible earlier cell brings it into
# scope; without this line the cell raises NameError on a fresh kernel.
from collections import Counter

# Tally the coarse-grained POS label (token.pos_) of every token in doc.
pos_counts = Counter(token.pos_ for token in doc)
print(pos_counts)



Counter({'NOUN': 172, 'PUNCT': 171, 'VERB': 135, 'ADP': 125, 'PRON': 110, 'SPACE': 99, 'DET': 90, 'PROPN': 74, 'ADV': 63, 'CCONJ': 61, 'ADJ': 53, 'AUX': 49, 'PART': 28, 'SCONJ': 19, 'NUM': 9})


In [10]:
# Q 4: Percentage of tokens that are nouns
# Count tokens whose coarse POS label is exactly "NOUN".
noun_count = len([tok for tok in doc if tok.pos_ == "NOUN"])

# The denominator is every token in the Doc (len(doc)), not only word tokens.
noun_percentage = noun_count / len(doc) * 100
print(f"Percentage of nouns: {noun_percentage:.2f}%")



Percentage of nouns: 13.67%


In [11]:
# Q 5: Dependency parse for the third sentence
# For each token show: its text, its dependency label, and the text of its head.
for word in third_sentence:
    fields = (word.text, word.dep_, word.head.text)
    print(*fields)



They nsubj lived
lived ROOT lived
with prep lived
their poss Mother
Mother pobj with
in prep lived
a det bank
sand compound bank
- punct bank
bank pobj in
, punct lived
underneath prep lived
the det root
root pobj underneath
of prep root
a det tree

 dep a
very advmod big
big amod tree
fir compound tree
- punct tree
tree pobj of
. punct lived


 dep .


In [12]:
# Q 6: First two named entities
# doc.ents is sliced to its first two entries; print each entity's text and label.
leading_entities = doc.ents[:2]
for entity in leading_entities:
    print(entity.text, entity.label_)



The Tale of Peter Rabbit WORK_OF_ART
Beatrix Potter PERSON


In [13]:
# Q 7: Total number of sentences
# Count the sentence spans by iterating doc.sents, without building a list.
total_sentences = sum(1 for _ in doc.sents)
print(f"Total sentences: {total_sentences}")


Total sentences: 55


In [14]:

# Q 8: Count sentences with named entities
# A sentence counts when at least one of its tokens has a non-empty entity type;
# each any(...) yields a bool, and summing the bools gives the integer count.
sentences_with_ents = sum(
    any(tok.ent_type_ for tok in sentence) for sentence in doc.sents
)
print(f"Sentences with named entities: {sentences_with_ents}")



Sentences with named entities: 35


In [15]:
# Q 9: Named entity visualization for the first sentence
from spacy import displacy

# Pick the first sentence span (index 0) and render its entities with displaCy.
opening_sentence = list(doc.sents)[0]
displacy.render(opening_sentence, style="ent")
