# SPACY
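spaCy is a fast, robust, industrial-strength NLP (Natural Language Processing) library written in Python and Cython. It is widely used for tasks such as tokenization, lemmatization, part-of-speech (POS) tagging, named-entity recognition (NER), and dependency parsing. Unlike NLTK, which is geared more toward teaching and research, spaCy is designed for production use. The first code cell below performs noun-phrase chunking with NLTK as a baseline; the remaining cells do the same task, and more, with spaCy.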

In [1]:
# NLTK IMPLEMENTATION
# Baseline for comparison with spaCy: tokenization, POS tagging, and
# noun-phrase chunking of one sentence using NLTK.

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser

# Tokenizer and tagger data. Recent NLTK releases (3.9+) look for "punkt_tab"
# and "averaged_perceptron_tagger_eng"; the older resource names are kept so
# the cell also runs on earlier releases. quiet=True keeps the download log
# out of the cell output.
NLTK_RESOURCES = (
    "punkt",
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
)
for resource in NLTK_RESOURCES:
    nltk.download(resource, quiet=True)

sentence = "The quick brown fox jumps over the lazy dog"

# Tokenize sentence into words
words = word_tokenize(sentence)

# Perform Part-of-Speech (POS) tagging; the result is a list of (word, tag) pairs
pos_tags = pos_tag(words)
print("POS Tags:", pos_tags)

# Penn Treebank tags that appear for this sentence:
#   DT  → Determiner
#   JJ  → Adjective
#   NN  → Noun (singular or mass)
#   VBZ → Verb (3rd-person singular present)
#   IN  → Preposition / subordinating conjunction

# Define chunking pattern for Noun Phrases (NP).
# NOTE: <NN> matches exactly one singular noun, so two adjacent NN tokens
# (e.g. "brown/NN fox/NN") end up in separate NP chunks. Use <NN.*>+ instead
# if runs of nouns should be grouped into a single phrase.
chunk_grammar = r"""
    NP: {<DT>?<JJ>*<NN>}  # Noun Phrase: Optional Determiner (DT) + Adjective(s) (JJ) + Noun (NN)
"""

# Create a chunk parser
chunk_parser = RegexpParser(chunk_grammar)

# Apply chunking to the tagged tokens
chunk_tree = chunk_parser.parse(pos_tags)

# Print chunk structure in bracketed form
print(chunk_tree)

# Render the tree inline as ASCII art. Tree.draw() is avoided here because it
# opens a Tk window that blocks the kernel until closed and fails on machines
# without a display; call chunk_tree.draw() manually in a desktop session if
# the graphical view is wanted.
chunk_tree.pretty_print()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chand\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\chand\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


POS Tags: [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]
(S
  (NP The/DT quick/JJ brown/NN)
  (NP fox/NN)
  jumps/VBZ
  over/IN
  (NP the/DT lazy/JJ dog/NN))


In [1]:
%pip install spacy



Note: you may need to restart the kernel to use updated packages.


In [3]:
#implementation using spacy

import spacy

# Download the spaCy small English model if not already installed
import sys
!{sys.executable} -m spacy download en_core_web_sm

# Load spaCy's small English model
nlp = spacy.load("en_core_web_sm")

# Sample text
text = "The quick brown fox jumps over the lazy dog"

# Process text
doc = nlp(text)

# Extract noun chunks
for chunk in doc.noun_chunks:
    print(chunk.text)

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 6.6 MB/s eta 0:00:02
     ---- ----------------------------------- 1.6/12.8 MB 5.2 MB/s eta 0:00:03
     ------- -------------------------------- 2.4/12.8 MB 4.8 MB/s eta 0:00:03
     ---------- ----------------------------- 3.4/12.8 MB 4.3 MB/s eta 0:00:03
     ------------- -------------------------- 4.2/12.8 MB 4.2 MB/s eta 0:00:03
     --------------- ------------------------ 5.0/12.8 MB 4.2 MB/s eta 0:00:02
     ------------------ --------------------- 5.8/12.8 MB 4.1 MB/s eta 0:00:02
     -------------------- ------------------- 6.6/12.8 MB 4.1 MB/s eta 0:00:02
     ---------------------- ----------------- 7.3/12.8 MB 4.1 MB/s eta 0:00:02
     -------------------------- ---------

In [4]:
import spacy

# Load the small English pipeline and run it over one example sentence.
# The resulting `doc` is what the following cells inspect.
nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Apple is looking at buying U.K. startup for $1 billion"
)


In [5]:
# Tokenization: show the text of every token in `doc`, one per line.
for surface_form in (tok.text for tok in doc):
    print(surface_form)


Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


In [6]:
# POS tagging: for each token show its text, coarse-grained tag (pos_)
# and fine-grained tag (tag_).
tag_rows = [f"{token.text} - {token.pos_} - {token.tag_}" for token in doc]
for row in tag_rows:
    print(row)


Apple - PROPN - NNP
is - AUX - VBZ
looking - VERB - VBG
at - ADP - IN
buying - VERB - VBG
U.K. - PROPN - NNP
startup - VERB - VBD
for - ADP - IN
$ - SYM - $
1 - NUM - CD
billion - NUM - CD


In [7]:
# Named-entity recognition: print each entity span followed by its label.
entity_pairs = [(entity.text, entity.label_) for entity in doc.ents]
for entity_text, entity_label in entity_pairs:
    print(entity_text, entity_label)


Apple ORG
U.K. GPE
$1 billion MONEY


In [8]:
# Lemmatization: map every token to its base (dictionary) form.
for lemma_line in map(lambda token: f"{token.text} → {token.lemma_}", doc):
    print(lemma_line)


Apple → Apple
is → be
looking → look
at → at
buying → buy
U.K. → U.K.
startup → startup
for → for
$ → $
1 → 1
billion → billion


In [9]:
# Dependency parse: each line reads "token --> relation --> head token".
dependency_lines = [
    f"{token.text} --> {token.dep_} --> {token.head.text}"
    for token in doc
]
for dependency_line in dependency_lines:
    print(dependency_line)


Apple --> nsubj --> looking
is --> aux --> looking
looking --> ROOT --> looking
at --> prep --> looking
buying --> pcomp --> at
U.K. --> nsubj --> startup
startup --> ccomp --> buying
for --> prep --> startup
$ --> quantmod --> billion
1 --> compound --> billion
billion --> pobj --> for


In [11]:
import sys
!{sys.executable} -m spacy download en_core_web_md

nlp = spacy.load("en_core_web_md")
doc1 = nlp("cat")
doc2 = nlp("dog")
print(doc1.similarity(doc2))


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
     ---------------------------------------- 0.0/33.5 MB ? eta -:--:--
     - -------------------------------------- 1.0/33.5 MB 7.2 MB/s eta 0:00:05
     -- ------------------------------------- 1.8/33.5 MB 5.0 MB/s eta 0:00:07
     -- ------------------------------------- 2.4/33.5 MB 4.8 MB/s eta 0:00:07
     ---- ----------------------------------- 3.7/33.5 MB 4.5 MB/s eta 0:00:07
     ----- ---------------------------------- 4.5/33.5 MB 4.3 MB/s eta 0:00:07
     ------ --------------------------------- 5.2/33.5 MB 4.2 MB/s eta 0:00:07
     ------- -------------------------------- 6.0/33.5 MB 4.2 MB/s eta 0:00:07
     -------- ------------------------------- 6.8/33.5 MB 4.2 MB/s eta 0:00:07
     --------- ------------------------------ 7.6/33.5 MB 4.1 MB/s eta 0:00:07
     ---------- -------------------------

In [12]:
# Sentence segmentation: parse a two-sentence text and print each sentence
# yielded by doc.sents on its own line.
doc = nlp("This is the first sentence. This is the second.")
sentence_texts = [sentence.text for sentence in doc.sents]
for sentence_text in sentence_texts:
    print(sentence_text)


This is the first sentence.
This is the second.


In [13]:
# Stop words: print only the tokens whose is_stop flag is set.
for token in doc:
    if not token.is_stop:
        continue
    print(token.text)


This
is
the
first
This
is
the


In [14]:
# Lexical flags: for every token show, in order, its text and whether it is
# alphabetic, a digit, or punctuation.
for tok in doc:
    flags = (tok.is_alpha, tok.is_digit, tok.is_punct)
    print(tok.text, *flags)


This True False False
is True False False
the True False False
first True False False
sentence True False False
. False False True
This True False False
is True False False
the True False False
second True False False
. False False True
