In [2]:
# Installing the spaCy library
!pip install spacy

# Installing the small English language model used by spaCy
!python -m spacy download en_core_web_sm 

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 3.9 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
import spacy  # NLP library used by every task in this notebook

# Name of the small English pipeline; it must already be installed
# (see the `spacy download en_core_web_sm` step at the top of the notebook).
SPACY_MODEL = "en_core_web_sm"

# Load the pipeline once and reuse `nlp` in the cells below: tokenization,
# POS tagging, named entity recognition and the experimentation section.
nlp = spacy.load(SPACY_MODEL)

In [None]:
# Task 1: Tokenization
text_1 = (
    "The quick brown fox doesn't jump over the lazy dog. "
    "Natural Language Processing is fascinating!"
)
# Run the loaded pipeline over the text; the resulting Doc is reused in Task 2.
doc1 = nlp(text_1)

print("Task 1: Tokenization")
for tok in doc1:
    # Syntactic head, base form and morphological features of the current token.
    head_text, lemma, morph = tok.head.text, tok.lemma_, tok.morph
    print(f"Token: {tok.text}, Head: {head_text}, Lemma: {lemma}, Morph: {morph}")

# Answers:
# 1. spaCy turns every word or symbol into its own token and annotates it;
#    `token.head` comes from the dependency parse, while the lemma and the
#    morphological features are further per-token annotations.
# 2. Punctuation (here the period and the exclamation mark) becomes a separate
#    token with its own POS tag (e.g., 'PUNCT').
# 3. Contractions are split: "doesn't" becomes the two tokens "does" and "n't".

Task 1: Tokenization
Token: The, Head: fox, Lemma: the, Morph: Definite=Def|PronType=Art
Token: quick, Head: fox, Lemma: quick, Morph: Degree=Pos
Token: brown, Head: fox, Lemma: brown, Morph: Degree=Pos
Token: fox, Head: jump, Lemma: fox, Morph: Number=Sing
Token: does, Head: jump, Lemma: do, Morph: Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
Token: n't, Head: jump, Lemma: not, Morph: Polarity=Neg
Token: jump, Head: jump, Lemma: jump, Morph: VerbForm=Inf
Token: over, Head: jump, Lemma: over, Morph: 
Token: the, Head: dog, Lemma: the, Morph: Definite=Def|PronType=Art
Token: lazy, Head: dog, Lemma: lazy, Morph: Degree=Pos
Token: dog, Head: over, Lemma: dog, Morph: Number=Sing
Token: ., Head: jump, Lemma: ., Morph: PunctType=Peri
Token: Natural, Head: Language, Lemma: Natural, Morph: Number=Sing
Token: Language, Head: Processing, Lemma: Language, Morph: Number=Sing
Token: Processing, Head: is, Lemma: processing, Morph: Number=Sing
Token: is, Head: is, Lemma: be, Morph: Mood=Ind|

In [None]:
# Task 2: Part-of-Speech Tagging
print("\nTask 2: POS Tagging")

# `doc1` is the Doc produced in Task 1; walk over its tokens (words and punctuation).
for tok in doc1:
    # `pos_` is the coarse part-of-speech label, `tag_` the fine-grained tag.
    coarse_pos, fine_tag = tok.pos_, tok.tag_
    print(f"Token: {tok.text}, POS: {coarse_pos}, Tag: {fine_tag}")

# Answers:
# 1. "quick" is tagged ADJ (adjective), "jump" is tagged VERB, and "is" is
#    tagged AUX (auxiliary verb).
# 2. POS tags expose the grammatical structure of a sentence. Grammar checkers
#    can use them to spot errors (e.g., an incorrect verb tense), and machine
#    translation can use them to structure sentences properly across languages.


Task 2: POS Tagging
Token: The, POS: DET, Tag: DT
Token: quick, POS: ADJ, Tag: JJ
Token: brown, POS: ADJ, Tag: JJ
Token: fox, POS: NOUN, Tag: NN
Token: does, POS: AUX, Tag: VBZ
Token: n't, POS: PART, Tag: RB
Token: jump, POS: VERB, Tag: VB
Token: over, POS: ADP, Tag: IN
Token: the, POS: DET, Tag: DT
Token: lazy, POS: ADJ, Tag: JJ
Token: dog, POS: NOUN, Tag: NN
Token: ., POS: PUNCT, Tag: .
Token: Natural, POS: PROPN, Tag: NNP
Token: Language, POS: PROPN, Tag: NNP
Token: Processing, POS: NOUN, Tag: NN
Token: is, POS: AUX, Tag: VBZ
Token: fascinating, POS: ADJ, Tag: JJ
Token: !, POS: PUNCT, Tag: .


In [None]:
# Task 3: Named Entity Recognition
text_2 = (
    "Barack Obama was the 44th President of the United States. "
    "He was born in Hawaii."
)
# Run the loaded pipeline over the text to obtain its named entities.
doc2 = nlp(text_2)

print("\nTask 3: Named Entity Recognition")
# `doc2.ents` holds the recognized entity spans; `label_` is the entity type.
for entity in doc2.ents:
    print(f"Entity: {entity.text}, Label: {entity.label_}")

# Answers:
# 1. In the recorded run spaCy recognizes four entities: "Barack Obama" (PERSON),
#    "44th" (ORDINAL), "the United States" (GPE) and "Hawaii" (GPE).
#    "President" is not returned as an entity.
# 2. Entities assigned: "Barack Obama" → PERSON and "Hawaii" → GPE
#    (Geo-Political Entity).


Task 3: Named Entity Recognition
Entity: Barack Obama, Label: PERSON
Entity: 44th, Label: ORDINAL
Entity: the United States, Label: GPE
Entity: Hawaii, Label: GPE


In [None]:
# Task 4: Experimentation
def print_entities(doc):
    """Print one "Entity: <text>, Label: <label>" line per named entity in `doc`.

    Args:
        doc: a processed spaCy document; only its `ents` sequence is read.
    """
    for ent in doc.ents:
        print(f"Entity: {ent.text}, Label: {ent.label_}")


# Original sentence, containing a deliberate typo ("Notr" instead of "Notre").
text_3 = "Rachel is studying NLP at the University of Notr Dame!"
doc3 = nlp(text_3)

print("\nTask 4: Experimentation")
print_entities(doc3)

# Answers (based on the recorded run with en_core_web_sm 3.8.0):
# - The typo in "Notr Dame" (missing 'e') did NOT stop the NER here:
#   "the University of Notr Dame" is still labelled ORG, presumably because the
#   surrounding "the University of ..." context is a strong cue.
# - "Rachel" is labelled PERSON, and "NLP" is labelled ORG even though it names
#   a field of study rather than an organization.
# - More generally, spaCy's predictions are statistical and depend on context:
#   typos, different word choices or punctuation changes can alter the tags and
#   entities it returns, because the model relies on patterns seen in training.

# Modified sentence: typo fixed, "NLP" spelled out, verb and final punctuation changed.
text4 = "Rachel studies Natural Language Processing at the University of Notre Dame."
doc4 = nlp(text4)

print("\nModified Text:")
print_entities(doc4)

# With the corrected spelling, "the University of Notre Dame" is again labelled
# ORG. In the recorded run, however, "Rachel" is no longer returned as a PERSON
# and "Natural Language Processing" receives no entity label, so rephrasing the
# sentence changed the result more than the typo did.


Task 4: Experimentation
Entity: Rachel, Label: PERSON
Entity: NLP, Label: ORG
Entity: the University of Notr Dame, Label: ORG

Modified Text:
Entity: the University of Notre Dame, Label: ORG
