### **Traditional (Rule-Based) NLP using NLTK and spaCy**

In [5]:
# One-time setup: uncomment and run on a fresh environment.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install spacy
# NOTE(review): the "en_core_web_sm" model loaded further down is presumably
# installed separately (e.g. `python -m spacy download en_core_web_sm`) — confirm.

Collecting spacy
  Downloading spacy-3.7.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (25 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp311-cp311-macosx_11_0_arm64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,>=8.1.8 (from spacy)
  Downloading thinc-8.2.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.2-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsl

In [6]:
import nltk
import spacy

In [8]:
# Load spaCy's small English pipeline; the cells below call `nlp(...)` on raw text.
nlp = spacy.load("en_core_web_sm")

**1. Tokenization: The process of breaking a text down into tokens is called tokenization.**

In [10]:
# Tokenize with spaCy. Calling `nlp` on a string returns a Doc; iterating the
# Doc yields Token objects, so each one is converted to `str` before printing.
text = "Mary, don’t slap the green witch"
print(list(map(str, nlp(text.lower()))))

['mary', ',', 'do', 'n’t', 'slap', 'the', 'green', 'witch']


In [23]:
# Dependency label (`dep_`) of every token in the lowercased sentence;
# `text` is the sentence defined in the tokenization cells.
dep_labels = [tok.dep_ for tok in nlp(text.lower())]
print(dep_labels)

['nsubj', 'punct', 'aux', 'neg', 'ROOT', 'det', 'amod', 'dobj']


In [21]:
# Tokenize the same sentence (original casing) with NLTK's word tokenizer.
from nltk.tokenize import word_tokenize

text = "Mary, don’t slap the green witch"
nltk_tokens = word_tokenize(text)
nltk_tokens  # last expression, so it is rendered as the cell output

['Mary', ',', 'don', '’', 't', 'slap', 'the', 'green', 'witch']

In [25]:
# Tokenize a tweet with NLTK's TweetTokenizer; the printed result shows the
# "#hashtag" and "@mention" kept as single tokens.
from nltk.tokenize import TweetTokenizer

tweet = "Snow White and the Seven Degrees #MakeAMovieCold@midnight: )"
tokenizer = TweetTokenizer()
tweet_tokens = tokenizer.tokenize(tweet.lower())
print(tweet_tokens)

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':', ')']


 **2. Stopword Removal: Stopwords such as articles and
prepositions serve mostly a grammatical purpose, acting as filler that holds the content words together.**

In [28]:
# Stopword removal using the default stopword collection of spaCy's English pipeline.
nlp = spacy.load('en_core_web_sm')  # the stopwords come from the loaded 'en' pipeline's defaults
stopwords = nlp.Defaults.stop_words
text = " we will show how to remove stopwords using spacy library"

lst = []
for word in text.split():
    print(word)  # echo every whitespace-separated word as it is examined
    if word.lower() in stopwords:
        continue  # stopword: leave it out of the result
    lst.append(word)
print(lst)

we
will
show
how
to
remove
stopwords
using
spacy
library
['remove', 'stopwords', 'spacy', 'library']


In [29]:
# Stopword removal using NLTK's English stopword list.
from nltk.corpus import stopwords

# Keep the corpus reader (`stopwords`) and the word collection under separate
# names: rebinding `stopwords` to the word list hides the reader from any code
# that runs afterwards. A set also makes each membership test constant-time.
nltk_stopwords = set(stopwords.words('english'))
text = " we will show how to remove stopwords using spacy library"

# Words are compared in lowercase; `text.split()` splits on whitespace only.
lst = [token for token in text.split() if token.lower() not in nltk_stopwords]
print(lst)

['show', 'remove', 'stopwords', 'using', 'spacy', 'library']


**3. Unigrams, Bigrams, Trigrams, …, N-grams**
N-grams are fixed-length (n) sequences of consecutive tokens occurring in the text. A bigram has two
tokens, a unigram one. Generating n-grams from a text is straightforward enough, but packages like spaCy and NLTK provide convenient methods.

In [31]:
def n_grams(text, n):
  """Return every run of ``n`` consecutive items of ``text``.

  Args:
    text: A sequence supporting ``len`` and slicing. A list of tokens gives
      token n-grams; a string gives character n-grams.
    n: Window length; must be at least 1.

  Returns:
    A list of length-``n`` slices of ``text`` in order of appearance. The
    list is empty when ``text`` has fewer than ``n`` items.

  Raises:
    ValueError: If ``n`` is smaller than 1.
  """
  if n < 1:
    # Without this guard n == 0 yields len(text) + 1 empty slices and a
    # negative n yields slices of differing lengths, neither of which is an n-gram.
    raise ValueError("n must be a positive integer, got {!r}".format(n))
  return [text[i:i+n] for i in range(len(text)-n+1)]
# Demo: 4-grams over an already tokenized and cleaned sentence.
cleaned = ['mary', ',', "n't", 'slap', 'green', 'witch', '.']
four_grams = n_grams(cleaned, 4)
print(four_grams)

[['mary', ',', "n't", 'slap'], [',', "n't", 'slap', 'green'], ["n't", 'slap', 'green', 'witch'], ['slap', 'green', 'witch', '.']]


In [41]:
# N-grams with spaCy through the third-party `spacy_ngram` pipeline component.
import spacy
# NOTE(review): `NgramComponent` is not referenced below; presumably the import
# is what makes the 'spacy-ngram' factory known to spaCy — confirm before removing.
from spacy_ngram import NgramComponent

nlp = spacy.load('en_core_web_sm')  # or whichever model is installed
# `add_pipe` appends a component to spaCy's processing pipeline; here the n-gram
# component is added with its defaults (document-level n-grams, stopwords removed).
nlp.add_pipe('spacy-ngram')
text = 'Quark soup is an interacting localized assembly of quarks and gluons.'

parsed = nlp(text)  # run the pipeline once and read both attributes from the same Doc
print(parsed._.ngram_1)
print(parsed._.ngram_2)
# `parsed._.ngram_3` is not available with this setup; accessing it raised:
#   Can't retrieve unregistered extension attribute 'ngram_3'. Did you forget to call the `set_extension` method?
# NOTE(review): presumably only the n-gram sizes the component is configured for
# get registered as extensions — check the spacy-ngram documentation.

['quark', 'soup', 'interact', 'localize', 'assembly', 'quark', 'gluon']
['quark_soup', 'soup_interact', 'interact_localize', 'localize_assembly', 'assembly_quark', 'quark_gluon']


In [46]:
# N-grams with NLTK.
from nltk.util import ngrams

text = 'Quark soup is an interacting localized assembly of quarks and gluons.'
# `ngrams` is given a list of tokens here (not the raw string, unlike the spaCy
# cell), so the text is split on whitespace first.
words = text.split()
unigrams = ngrams(words, 1)
bigrams = ngrams(words, 2)
quadgrams = ngrams(words, 4)

# Print the 1-grams, then the 2-grams, then the 4-grams, one tuple per line.
for gram_iter in (unigrams, bigrams, quadgrams):
    for item in gram_iter:
        print(item)

('Quark',)
('soup',)
('is',)
('an',)
('interacting',)
('localized',)
('assembly',)
('of',)
('quarks',)
('and',)
('gluons.',)
('Quark', 'soup')
('soup', 'is')
('is', 'an')
('an', 'interacting')
('interacting', 'localized')
('localized', 'assembly')
('assembly', 'of')
('of', 'quarks')
('quarks', 'and')
('and', 'gluons.')
('Quark', 'soup', 'is', 'an')
('soup', 'is', 'an', 'interacting')
('is', 'an', 'interacting', 'localized')
('an', 'interacting', 'localized', 'assembly')
('interacting', 'localized', 'assembly', 'of')
('localized', 'assembly', 'of', 'quarks')
('assembly', 'of', 'quarks', 'and')
('of', 'quarks', 'and', 'gluons.')


**4. Lemmas and Stems**

In [48]:
# Lemmatization with spaCy.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("he was running late")
# Every token already exposes its lemma as `token.lemma_`, so no separate
# lemmatizer instance has to be created. spaCy offers no stemming method.
for token in doc:
  print(f'{token} > {token.lemma_}')

he > he
was > be
running > run
late > late


In [50]:
# Lemmatization with NLTK's WordNet lemmatizer.
from nltk.stem import WordNetLemmatizer

wn = WordNetLemmatizer()
doc = "he was running late"
# `lemmatize` is called without a part-of-speech argument, so every word is
# treated with the default POS (noun). That is why the output below shows
# 'was > wa' and leaves 'running' unchanged.
[f'{word} > {wn.lemmatize(word)}' for word in doc.split()]


['he > he', 'was > wa', 'running > running', 'late > late']

In [51]:
# Stemming (not lemmatization) with NLTK's Porter stemmer.
from nltk.stem import PorterStemmer

ps = PorterStemmer()
doc = "he was running late"
# Stem each whitespace-separated word; the resulting list is the cell output.
list(map(ps.stem, doc.split()))


['he', 'wa', 'run', 'late']

**5. Categorizing words: POS Tagging**

In [55]:
# POS tagging with spaCy: print each token next to its `pos_` tag.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Mary slapped the green witch.")
for token in doc:
  print(f'{token} {token.pos_}')

Mary PROPN
slapped VERB
the DET
green ADJ
witch NOUN
. PUNCT


In [56]:
# POS tagging with NLTK: tokenize first, then tag the token list.
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

doc = "Mary slapped the green witch."
doc_tokens = word_tokenize(doc)
pos_tag(doc_tokens)  # (token, tag) pairs, rendered as the cell output

[('Mary', 'NNP'),
 ('slapped', 'VBD'),
 ('the', 'DT'),
 ('green', 'JJ'),
 ('witch', 'NN'),
 ('.', '.')]

**6. Categorizing Spans: Chunking and Named Entity Recognition**

**Chunking:** Often, we need to label a span of text; that is, a contiguous multitoken boundary. For example,
consider the sentence, “Mary slapped the green witch.” We might want to identify the noun phrases
(NP) and verb phrases (VP) in it. The cell below extracts the noun phrases.

In [57]:
# Noun-phrase chunking with spaCy: print each noun chunk with its label.
nlp = spacy.load('en_core_web_sm')
doc = nlp("Mary slapped the green witch.")
for chunk in doc.noun_chunks:
  print(f'{chunk} {chunk.label_}')

Mary NP
the green witch NP


**Named Entity**:  A named entity is a string mention of a real-world
concept like a person, location, organization, drug name, and so on.

In [58]:
# Named-entity recognition with spaCy (`nlp` is the pipeline loaded earlier).
doc = nlp("Larry Page founded Google")
# Collect a (text, label) pair for every named-entity span in the Doc.
entities = []
for ent in doc.ents:
    entities.append((ent.text, ent.label_))
entities


[('Larry Page', 'PERSON'), ('Google', 'ORG')]

In [59]:
# Named-entity recognition with NLTK, step 1: tokenize and POS-tag the sentence.
import nltk
from nltk import ne_chunk, pos_tag, word_tokenize

text = "NASA awarded Elon Musk’s SpaceX a $2.9 billion contract to build the lunar lander."
tokens = word_tokenize(text)
tag = pos_tag(tokens)  # (token, POS tag) pairs; the NE-chunking cell further down reads `tag`
print(tag)


[('NASA', 'NNP'), ('awarded', 'VBD'), ('Elon', 'NNP'), ('Musk', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('SpaceX', 'NNP'), ('a', 'DT'), ('$', '$'), ('2.9', 'CD'), ('billion', 'CD'), ('contract', 'NN'), ('to', 'TO'), ('build', 'VB'), ('the', 'DT'), ('lunar', 'NN'), ('lander', 'NN'), ('.', '.')]


In [60]:
# `ne_chunk` is imported here, although the chunking cell calls it as `nltk.ne_chunk`.
from nltk.chunk import ne_chunk
# NOTE(review): the chunker-model download is commented out — presumably the
# model was already installed locally; uncomment it on a fresh environment.
# nltk.download('maxent_ne_chunker')
# Fetch the 'words' corpus; the call's return value is shown as the cell output.
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     /Users/paridhimaheshwari/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [62]:

# Group the POS-tagged tokens (`tag`, built in the tagging cell) into
# named-entity chunks and print the resulting tree.
ne_tree = nltk.ne_chunk(tag)
print(ne_tree)

(S
  (ORGANIZATION NASA/NNP)
  awarded/VBD
  (PERSON Elon/NNP Musk/NNP)
  ’/NNP
  s/VBD
  (ORGANIZATION SpaceX/NNP)
  a/DT
  $/$
  2.9/CD
  billion/CD
  contract/NN
  to/TO
  build/VB
  the/DT
  lunar/NN
  lander/NN
  ./.)
