### Spacy Installation

In [None]:
# ! pip install spacy

#### Install Spacy pakage (sm -recommended)

In [None]:
#! python -m spacy download en_core_web_sm
#! python -m spacy download en_core_web_lg
#! python -m spacy download en_core_web_md

## Linguistic Features Extraction in NLP

### Tokenization

In [1]:
import spacy

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
doc = nlp("Apple isn't looking for to buy U.K. startup for $1 billion")

In [4]:
for token in doc:
    print(token.text)

Apple
is
n't
looking
for
to
buy
U.K.
startup
for
$
1
billion


In [5]:
doc =nlp("We are presenting a model that generates natural language descriptions of images and their regions. Our approach leverages datasets of images and their sentence descriptions to learn about the inter-modal correspondences between language and visual data. Our alignment model is based on a novel combination of Convolutional Neural Networks over image regions, bidirectional Recurrent Neural Networks over sentences, and a structured objective that aligns the two modalities through a multimodal embedding. We then describe a Multimodal Recurrent Neural Network architecture that uses the inferred alignments to learn to generate novel descriptions of image regions. We demonstrate that our alignment model produces state of the art results in retrieval experiments on Flickr8K, Flickr30K and MSCOCO datasets. We then show that the generated descriptions significantly outperform retrieval baselines on both full images and on a new dataset of region-level annotations.")

In [6]:
for token in doc:
    print(token.text)

We
are
presenting
a
model
that
generates
natural
language
descriptions
of
images
and
their
regions
.
Our
approach
leverages
datasets
of
images
and
their
sentence
descriptions
to
learn
about
the
inter
-
modal
correspondences
between
language
and
visual
data
.
Our
alignment
model
is
based
on
a
novel
combination
of
Convolutional
Neural
Networks
over
image
regions
,
bidirectional
Recurrent
Neural
Networks
over
sentences
,
and
a
structured
objective
that
aligns
the
two
modalities
through
a
multimodal
embedding
.
We
then
describe
a
Multimodal
Recurrent
Neural
Network
architecture
that
uses
the
inferred
alignments
to
learn
to
generate
novel
descriptions
of
image
regions
.
We
demonstrate
that
our
alignment
model
produces
state
of
the
art
results
in
retrieval
experiments
on
Flickr8
K
,
Flickr30
K
and
MSCOCO
datasets
.
We
then
show
that
the
generated
descriptions
significantly
outperform
retrieval
baselines
on
both
full
images
and
on
a
new
dataset
of
region
-
level
annotations
.


In [7]:
for lemma in doc:
    print(lemma.text + "----", lemma.lemma_)

We---- -PRON-
are---- be
presenting---- present
a---- a
model---- model
that---- that
generates---- generate
natural---- natural
language---- language
descriptions---- description
of---- of
images---- image
and---- and
their---- -PRON-
regions---- region
.---- .
Our---- -PRON-
approach---- approach
leverages---- leverage
datasets---- dataset
of---- of
images---- image
and---- and
their---- -PRON-
sentence---- sentence
descriptions---- description
to---- to
learn---- learn
about---- about
the---- the
inter---- inter
----- -
modal---- modal
correspondences---- correspondence
between---- between
language---- language
and---- and
visual---- visual
data---- datum
.---- .
Our---- -PRON-
alignment---- alignment
model---- model
is---- be
based---- base
on---- on
a---- a
novel---- novel
combination---- combination
of---- of
Convolutional---- Convolutional
Neural---- Neural
Networks---- Networks
over---- over
image---- image
regions---- region
,---- ,
bidirectional---- bidirectional
Recurrent---

In [8]:
for pos in doc:
    print(pos.text + "---", pos.pos_ +"---", pos.tag_)

We--- PRON--- PRP
are--- AUX--- VBP
presenting--- VERB--- VBG
a--- DET--- DT
model--- NOUN--- NN
that--- DET--- WDT
generates--- VERB--- VBZ
natural--- ADJ--- JJ
language--- NOUN--- NN
descriptions--- NOUN--- NNS
of--- ADP--- IN
images--- NOUN--- NNS
and--- CCONJ--- CC
their--- DET--- PRP$
regions--- NOUN--- NNS
.--- PUNCT--- .
Our--- DET--- PRP$
approach--- NOUN--- NN
leverages--- VERB--- VBZ
datasets--- NOUN--- NNS
of--- ADP--- IN
images--- NOUN--- NNS
and--- CCONJ--- CC
their--- DET--- PRP$
sentence--- NOUN--- NN
descriptions--- NOUN--- NNS
to--- PART--- TO
learn--- VERB--- VB
about--- ADP--- IN
the--- DET--- DT
inter--- ADJ--- JJ
---- ADJ--- JJ
modal--- ADJ--- JJ
correspondences--- NOUN--- NNS
between--- ADP--- IN
language--- NOUN--- NN
and--- CCONJ--- CC
visual--- ADJ--- JJ
data--- NOUN--- NNS
.--- PUNCT--- .
Our--- DET--- PRP$
alignment--- NOUN--- NN
model--- NOUN--- NN
is--- AUX--- VBZ
based--- VERB--- VBN
on--- ADP--- IN
a--- DET--- DT
novel--- ADJ--- JJ
combination--- NOUN--- 

### POS Tagging

In [5]:
doc

Apple isn't looking for to buy U.K. startup for $1 billion

In [6]:
for token in doc:
    print(f'{token.text:{10}}{token.lemma_:{10}}{token.pos_:{10}}{token.is_stop:{10}}') # 0 - False, 1- True

Apple     Apple     PROPN              0
is        be        AUX                1
n't       not       PART               1
looking   look      VERB               0
for       for       ADP                1
to        to        PART               1
buy       buy       VERB               0
U.K.      U.K.      PROPN              0
startup   startup   NOUN               0
for       for       ADP                1
$         $         SYM                0
1         1         NUM                0
billion   billion   NUM                0


### Dependency Parasing

In [7]:
for chunk in doc.noun_chunks:
    print(f'{chunk.text:{20}}{chunk.root.text:{20}}{chunk.root.dep_:{20}}') 

Apple               Apple               nsubj               
U.K. startup        startup             dobj                


### Named entity recognition

In [8]:
for ent in doc.ents:
    print(f'{ent.text:{20}}{ent.label_:{20}}')

Apple               ORG                 
U.K.                GPE                 
$1 billion          MONEY               


### Sentence segmantaion

In [9]:
doc1 = nlp("The Internet of things is a system of interrelated computing devices. mechanical and digital machines provided with unique identifiers and the ability to transfer data. it is human-to-human or human-to-computer interaction")

In [10]:
for sent in doc1.sents:
    print(sent)

The Internet of things is a system of interrelated computing devices.
mechanical and digital machines provided with unique identifiers and the ability to transfer data.
it is human-to-human or human-to-computer interaction


In [11]:
doc2 = nlp("hai...how are you...what are you doing?")

In [12]:
for sent in doc2.sents:
    print(sent)

hai...
how are you...
what are you doing?


### Visualization

In [13]:
from spacy import displacy

In [14]:
doc

Apple isn't looking for to buy U.K. startup for $1 billion

In [15]:
displacy.render(doc, style= "dep")

In [16]:
displacy.render(doc, style= "dep", options={"compact":True, 'distance':100})

In [17]:
displacy.render(doc, style= "ent")

## Rule Based Text Phrase Extraction and Matching using SpaCy

In [18]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import span
from spacy import displacy

In [19]:
doc=nlp("Hello World!")

In [20]:
for token in doc:
    print(token)

Hello
World
!


In [21]:
pattern =[{"LOWER":"hello", "OP":"?"},{"IS_PUNCT":True,  "OP":"?"},{"LOWER":"world"}]

In [22]:
matcher = Matcher(nlp.vocab)
matcher.add("Helloworld", None, pattern)

In [23]:
doc =nlp("Hello, World!")

In [24]:
matches =matcher(doc)

matches

[(7909505024684541438, 0, 3),
 (7909505024684541438, 1, 3),
 (7909505024684541438, 2, 3)]

In [25]:
for token in doc:
    print(token)

Hello
,
World
!


In [26]:
for match_id, start, end in matches:
    string_id =nlp.vocab.strings[match_id]
    span =doc[start:end]
    print(match_id, string_id, start, end, span.text)

7909505024684541438 Helloworld 0 3 Hello, World
7909505024684541438 Helloworld 1 3 , World
7909505024684541438 Helloworld 2 3 World


### Regular Expression

In [27]:
text = "my mobile is not 89686.  it is 9789567878"

In [28]:
import re

In [29]:
re.search(r'\d{10}', text)

<re.Match object; span=(31, 41), match='9789567878'>

In [30]:
re.search(r'\w{2}', text)

<re.Match object; span=(0, 2), match='my'>

In [31]:
re.findall(r'\d{3,10}', text)

['89686', '9789567878']

In [32]:
re.findall(r'\w{3,10}', text)

['mobile', 'not', '89686', '9789567878']

#### wildcard text

In [33]:
re.findall(r'm.....', text)

['my mob']

In [34]:
re.findall(r'\d$', text)

['8']

In [35]:
text1 = "3 years mobile"

In [36]:
re.findall(r'^\d', text1)

['3']

In [37]:
re.findall(r'[^\d]+', text1) # remove digit

[' years mobile']

In [38]:
re.findall(r'[^\D]+', text1) # only digit

['3']

In [39]:
text3 = "free hands-on videos total-free"

In [40]:
re.findall(r'[\w]+-[\w]+', text3)

['hands-on', 'total-free']

In [41]:
text = "Google is announced new pixel at Google I/O ,Google I/O is great place to work and upadtes availble in Google"

In [42]:
pattern = [{"TEXT":"Google"},{"TEXT":"I", "OP":"?"},{"TEXT":"/" , "OP":"?"},{"TEXT":"O" , "OP":"?"}]

In [43]:
def callback_method(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    entity =doc[start:end]
    print(entity.text)

In [44]:
matcher = Matcher(nlp.vocab)
matcher.add("Google", callback_method, pattern )

In [45]:
doc= nlp(text)

In [46]:
matcher(doc)

Google
Google
Google I
Google I/
Google I/O
Google
Google I
Google I/
Google I/O
Google


[(11578853341595296054, 0, 1),
 (11578853341595296054, 6, 7),
 (11578853341595296054, 6, 8),
 (11578853341595296054, 6, 9),
 (11578853341595296054, 6, 10),
 (11578853341595296054, 11, 12),
 (11578853341595296054, 11, 13),
 (11578853341595296054, 11, 14),
 (11578853341595296054, 11, 15),
 (11578853341595296054, 24, 25)]

In [47]:
doc

Google is announced new pixel at Google I/O ,Google I/O is great place to work and upadtes availble in Google

In [48]:
matches = Matcher(nlp.vocab)

In [49]:
matched_sents= []

In [50]:
doc =nlp("('i'd say that facebook is evil. Facebook is pretty cool, right?')")

In [51]:
doc

('i'd say that facebook is evil. Facebook is pretty cool, right?')

In [52]:
pattern = [{"LOWER":"facebook"},{"LEMMA":"be"},{"POS":"ADV", "OP":"*"},{"POS":"ADJ"}]

In [53]:
def callback_method_fb (matcher, doc, i, matches):
    matches_id, start, end = matches [i]
    span = doc[start:end]
    
    match_ents = [{"start": span.start_char -sent.start_char,
                   "end": span.end_char- sent.start_char,
                   "label": "MATCH"}]
    
    matched_sents.append({"text": sent.text, "ents":match_ents})

In [54]:
matcher.add("fb", callback_method_fb, pattern)

In [55]:
matches = matcher(doc)

In [56]:
matches

[(8017838677478259815, 6, 9), (8017838677478259815, 10, 14)]

In [57]:
matched_sents

[{'text': 'what are you doing?',
  'ents': [{'start': -5, 'end': 11, 'label': 'MATCH'}]},
 {'text': 'what are you doing?',
  'ents': [{'start': 13, 'end': 36, 'label': 'MATCH'}]}]

In [58]:
displacy.render(matched_sents, style='ent', manual=True)

### Phone Numbers

In [68]:
pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "dddd"},
           {"ORTH": "-", "OP": "?"}, {"SHAPE": "dddd"}]
matcher.add("PHONE_NUMBER", None, pattern)

doc = nlp("Call me at (123) 4566 7890 ")
print([t.text for t in doc])

['Call', 'me', 'at', '(', '123', ')', '4566', '7890']


In [69]:
matches = matcher(doc)
matches

[(10788718092470551940, 3, 8)]

In [70]:
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

(123) 4566 7890


### Email Address Matching

In [71]:
pattern =[{"TEXT":{"REGEX":"[a-zA-Z0-9-_.]+0[a-zA-Z0-9-_.]+"}}]

In [72]:
matcher.add("EMAIL_ADDRESS", None, pattern)

doc = nlp("Email me at mugeshraja06@gmail.com ")
print([t.text for t in doc])

['Email', 'me', 'at', 'mugeshraja06@gmail.com']


In [73]:
matches = matcher(doc)
matches

[(16571425990740197027, 3, 4)]

In [74]:
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

mugeshraja06@gmail.com


### Emoji Matching

In [75]:
pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]  # Positive emoji
neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒"]  # Negative emoji

# Add patterns to match one or more emoji tokens
pos_patterns = [[{"ORTH": emoji}] for emoji in pos_emoji]
neg_patterns = [[{"ORTH": emoji}] for emoji in neg_emoji]

In [77]:
pos_patterns

[[{'ORTH': '😀'}],
 [{'ORTH': '😃'}],
 [{'ORTH': '😂'}],
 [{'ORTH': '🤣'}],
 [{'ORTH': '😊'}],
 [{'ORTH': '😍'}]]

In [78]:
neg_patterns

[[{'ORTH': '😞'}],
 [{'ORTH': '😠'}],
 [{'ORTH': '😩'}],
 [{'ORTH': '😢'}],
 [{'ORTH': '😭'}],
 [{'ORTH': '😒'}]]

In [81]:
def label_sentiment(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    if doc.vocab.strings[match_id] == "HAPPY":  # Don't forget to get string!
        doc.sentiment += 0.1  # Add 0.1 for positive sentiment
    elif doc.vocab.strings[match_id] == "SAD":
        doc.sentiment -= 0.1  # Subtract 0.1 for negative sentiment

In [82]:
matcher.add("HAPPY", label_sentiment, *pos_patterns)  # Add positive pattern
matcher.add("SAD", label_sentiment, *neg_patterns)  # Add negative pattern

In [83]:
# Add pattern for valid hashtag, i.e. '#' plus any ASCII token
matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])

In [89]:
doc = nlp("Truth is 😀 #im_Batman")
matches = matcher(doc)
for match_id, start, end in matches:
    string_id = doc.vocab.strings[match_id]  # Look up string ID
    span = doc[start:end]
    print(string_id, span.text)

HAPPY 😀
HASHTAG #im_Batman


### Efficient Pharse matching

In [90]:
from spacy.matcher import PhraseMatcher

In [91]:
matcher = PhraseMatcher(nlp.vocab)
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]

In [92]:
# Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", None, *patterns)

In [93]:
doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
          "converse in the Oval Office inside the White House in Washington, D.C.")

In [94]:
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

Angela Merkel
Barack Obama
Washington, D.C.


In [95]:
matches

[(3766102292120407359, 2, 4),
 (3766102292120407359, 7, 9),
 (3766102292120407359, 19, 22)]

### Custom Rule based entity recognition

In [96]:
from spacy.pipeline import EntityRuler

In [97]:
ruler = EntityRuler(nlp)

In [98]:
patterns = [{"label": "ORG", "pattern": "Apple"},
            {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)

In [99]:
doc = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_) for ent in doc.ents])

[('Apple', 'ORG'), ('first', 'ORDINAL'), ('San Francisco', 'GPE')]
