In [4]:
import spacy

In [5]:
# Sample review text that the cells below analyse.
text = "I absolutely loved the new Batman movie! It's a must-watch. #Batman #MovieReview"

# Load the "en_core_web_sm" spaCy pipeline; `nlp` is reused by every later cell.
nlp = spacy.load("en_core_web_sm")


In [7]:
# Process the sample text with the loaded pipeline; the following cells read `doc`.
doc = nlp(text)

# Tokenization: collect the verbatim text of every token in the document.
spacy_tokens = [tok.text for tok in doc]
print("Token: ", spacy_tokens)

Token:  ['I', 'absolutely', 'loved', 'the', 'new', 'Batman', 'movie', '!', 'It', "'s", 'a', 'must', '-', 'watch', '.', '#', 'Batman', '#', 'MovieReview']


In [9]:
# Lowercasing: take spaCy's lowercase form (`lower_`) of each token.
spacy_lower = [tok.lower_ for tok in doc]
print("Lowercasing: ", spacy_lower)

Lowercasing:  ['i', 'absolutely', 'loved', 'the', 'new', 'batman', 'movie', '!', 'it', "'s", 'a', 'must', '-', 'watch', '.', '#', 'batman', '#', 'moviereview']


In [11]:
# Stopword removal: keep only tokens that spaCy does not flag as stop words.
spacy_no_stop = [
    tok.text
    for tok in doc
    if not tok.is_stop
]
print("Stopword Removal: ", spacy_no_stop)

Stopword Removal:  ['absolutely', 'loved', 'new', 'Batman', 'movie', '!', '-', 'watch', '.', '#', 'Batman', '#', 'MovieReview']


In [12]:
# Lemmatization: read the lemma string (`lemma_`) assigned to each token.
spacy_lemmas = [tok.lemma_ for tok in doc]
print("Lematization: ", spacy_lemmas)

Lematization:  ['I', 'absolutely', 'love', 'the', 'new', 'Batman', 'movie', '!', 'it', 'be', 'a', 'must', '-', 'watch', '.', '#', 'Batman', '#', 'MovieReview']


In [19]:
# Customize tokenization: register a tokenizer special case so that the
# string "MovieReview" is split into the two tokens "Movie" and "Review".
from spacy.symbols import ORTH

nlp.tokenizer.add_special_case(
    "MovieReview",
    [{ORTH: "Movie"}, {ORTH: "Review"}],
)

# Re-process the text so `doc` reflects the new tokenizer rule, then show the tokens.
doc = nlp(text)
tokens = [tok.text for tok in doc]
tokens

['I',
 'absolutely',
 'loved',
 'the',
 'new',
 'Batman',
 'movie',
 '!',
 'It',
 "'s",
 'a',
 'must',
 '-',
 'watch',
 '.',
 '#',
 'Batman',
 '#',
 'Movie',
 'Review']

In [22]:
# Pipeline components: evaluates to a list of (name, component) tuples
# for the loaded model.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x20a516c2b10>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x20a516c1c10>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x20a51e67290>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x20a52e04790>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x20a52cb2f90>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x20a51e67060>)]

In [32]:
# Part-of-speech tagging: print each token next to its coarse POS tag (`pos_`).
for token in doc:
    print(token, token.pos_, sep='  |  ')

I  |  PRON
absolutely  |  ADV
loved  |  VERB
the  |  DET
new  |  ADJ
Batman  |  PROPN
movie  |  NOUN
!  |  PUNCT
It  |  PRON
's  |  AUX
a  |  DET
must  |  AUX
-  |  PUNCT
watch  |  VERB
.  |  PUNCT
#  |  SYM
Batman  |  PROPN
#  |  PROPN
Movie  |  PROPN
Review  |  PROPN


In [33]:
# Named Entity Recognition: for every entity span found in `doc`, print its
# text, its label, and spaCy's description of that label.
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_), sep="  |  ")

Batman  |  PRODUCT  |  Objects, vehicles, foods, etc. (not services)
#  |  CARDINAL  |  Numerals that do not fall under another type


In [8]:
# Customize lemmas: use the pipeline's attribute ruler to assign the lemma
# "Brother" to tokens whose exact text is "Bro" or "Brah".
arr = nlp.get_pipe('attribute_ruler')
arr.add(
    [[{"TEXT": "Bro"}], [{"TEXT": "Brah"}]],
    {'LEMMA': "Brother"},
)

# Process a sentence containing both slang forms (this rebinds `doc`).
doc = nlp("Bro, you wanna go ? Brah, dont say no! i am exhausted")

# Show every token alongside the lemma it ended up with.
for token in doc:
    print(token.text, token.lemma_, sep=" | ")

Bro | Brother
, | ,
you | you
wanna | wanna
go | go
? | ?
Brah | Brother
, | ,
do | do
nt | not
say | say
no | no
! | !
i | I
am | be
exhausted | exhaust
