# Finding words, phrases, names and concepts

In [6]:
from spacy.lang.en import English
# Instantiate the English language class; the resulting `nlp` object is
# called on raw text in the cells below to produce a Doc.
nlp = English()

## Doc, Token and Span objects, and lexical attributes

In [7]:
# Doc object: calling nlp on a string returns a Doc, which can be iterated
# token by token.

doc = nlp('Hello World!')

# Show the text of every token, one per line.
for tok in doc:
    print(tok.text)

Hello
World
!


In [8]:
# Token object: indexing a Doc with an integer gives a single token.
# `doc` is the 'Hello World!' Doc from the previous cell, so index 1 is 'World'.

token = doc[1]
print(token.text)

World


In [9]:
# Span object: slicing a Doc gives a Span covering one or more tokens.

doc = nlp('Hello World!')

# Tokens 1 and 2 (the end index 3 is exclusive), i.e. 'World' and '!'.
span = doc[1:3]
print(span.text)

World!


In [11]:
# Lexical attributes: per-token properties read directly off each token.

doc = nlp('It costs $5.')

# (printed label, token attribute name), listed in output order.
attribute_rows = (
    ('Index:', 'i'),
    ('Text:', 'text'),
    ('is_alpha:', 'is_alpha'),
    ('is_punct:', 'is_punct'),
    ('like_num:', 'like_num'),
)

# One output line per attribute: the label followed by the value for every token.
for row_label, attr_name in attribute_rows:
    print(row_label, [getattr(tok, attr_name) for tok in doc])

Index: [0, 1, 2, 3, 4]
Text: ['It', 'costs', '$', '5', '.']
is_alpha: [True, True, False, False, False]
is_punct: [False, False, False, False, True]
like_num: [False, False, False, True, False]


## Model packages

In [15]:
import spacy

# Load the 'en_core_web_sm' model package and run it over a sample sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp('She ate the pizza')

# One line per token: its text and its part-of-speech tag (pos_).
for tok in doc:
    print(f'{tok.text} {tok.pos_}')

She PRON
ate VERB
the DET
pizza NOUN


In [16]:
# For each token of the Doc from the previous cell, print its text, POS tag,
# dependency label, and the text of its syntactic head.
for tok in doc:
    columns = (tok.text, tok.pos_, tok.dep_, tok.head.text)
    print(*columns)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


## Predicting named entities

In [17]:
doc = nlp('Apple is looking at buying UK startup for $1 billion')

# doc.ents yields the entity spans found in the text; label_ is the entity type.
for entity in doc.ents:
    print(f'{entity.text} {entity.label_}')

Apple ORG
UK GPE
$1 billion MONEY


In [20]:
# Look up the human-readable description of an entity label ('GPE'),
# a tag ('NNP') and a dependency label ('dobj').
for tag_name in ('GPE', 'NNP', 'dobj'):
    print(spacy.explain(tag_name))

Countries, cities, states
noun, proper singular
direct object


## Rule-based matching (Matcher)

In [60]:
# eg 1: match the exact token sequence "iphone" "X"

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

# One dict per token; 'TEXT' compares against the token's exact text.
pattern = [{'TEXT': 'iphone'}, {'TEXT': 'X'}]

# add() takes the rule name and a list of patterns. (The older call form
# matcher.add(name, None, pattern) is the spaCy v2 signature.)
matcher.add('IPhone', [pattern])

doc = nlp('Upcoming iphone X release date leak')
matches = matcher(doc)

# Each match is a (match_id, start, end) triple; start/end slice the Doc.
for _match_id, begin, stop in matches:
    print(doc[begin:stop].text)

iphone X


In [75]:
# eg 2: a digit token, then "fifa world cup" in any casing, then punctuation

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

# 'LOWER' compares against the lowercased token text, so "FIFA" matches 'fifa'.
pattern = [
    {'IS_DIGIT': True},
    *({'LOWER': word} for word in ('fifa', 'world', 'cup')),
    {'IS_PUNCT': True},
]
matcher.add('fifa', [pattern])

doc = nlp('2018 FIFA world cup: France won!')
matches = matcher(doc)
print(matches)

# Each match is a (match_id, start, end) triple; start/end slice the Doc.
for _match_id, begin, stop in matches:
    print(doc[begin:stop].text)


[(7778788780550260102, 0, 5)]
2018 FIFA world cup:


In [76]:
# eg 3: any form of the verb "love" followed by a noun

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

# Matching on the lemma lets one rule cover both "loved" and "love".
verb_love = {'LEMMA': 'love', 'POS': 'VERB'}
any_noun = {'POS': 'NOUN'}
pattern = [verb_love, any_noun]
matcher.add('love', [pattern])

doc = nlp('I loved cats but now I love dogs more.')
matches = matcher(doc)
print(matches)

# Each match is a (match_id, start, end) triple; start/end slice the Doc.
for _match_id, begin, stop in matches:
    print(doc[begin:stop].text)

[(3702023516439754181, 1, 3), (3702023516439754181, 6, 8)]
loved cats
love dogs


In [77]:
# eg 4: several alternative patterns registered under one rule name

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

# Three spellings of the same concept, all compared case-insensitively:
solar_power_patterns = [
    [{'LOWER': 'solarpower'}],                                     # one token
    [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}],  # joined by punctuation
    [{'LOWER': 'solar'}, {'LOWER': 'power'}],                      # two adjacent tokens
]
matcher.add('SolarPower', solar_power_patterns)

doc = nlp("The Solar Power industry continues to grow a solarpower increases. Solar-power is good")
found_matches = matcher(doc)
print(found_matches)


# The match id is not needed here; print the matched Span itself.
for _key, begin, stop in found_matches:
    print(doc[begin:stop])

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]
Solar Power
solarpower
Solar-power


## Using operators and quantifiers

In [80]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

# Any form of "buy", then an optional determiner ('OP': '?'), then a noun.
buy_verb = {'LEMMA': 'buy'}
optional_det = {'POS': 'DET', 'OP': '?'}
any_noun = {'POS': 'NOUN'}
pattern = [buy_verb, optional_det, any_noun]

# NOTE(review): the rule is registered under the name 'love', apparently carried
# over from the earlier example, although the pattern matches forms of "buy".
# Renaming it would change the match_id values printed by this cell.
matcher.add('love', [pattern])

doc = nlp("I bought a smartphone. Now I'm buying apps")
matches = matcher(doc)
print(matches)

# Each match is a (match_id, start, end) triple; start/end slice the Doc.
for _match_id, begin, stop in matches:
    print(doc[begin:stop].text)

[(3702023516439754181, 1, 4), (3702023516439754181, 8, 10)]
bought a smartphone
buying apps


## Stop words in spaCy

In [79]:
from spacy.lang.en.stop_words import STOP_WORDS
# Print the entire English STOP_WORDS collection; the output is long.
print(STOP_WORDS)

{'therein', 'much', 'bottom', 'been', 'everyone', 'although', 'eight', 'yet', 'than', 'herein', '‘d', 'we', '’re', 'the', 'whom', 'elsewhere', 'both', 'mostly', 'top', 'down', 'four', 'too', 'two', 'somewhere', 'take', 'one', 'whereas', 'well', 'fifty', '‘m', 'where', 'so', 'may', 'front', 'nevertheless', 'hereafter', 'however', 'such', 'through', 'then', 'using', 'several', 'next', "'m", 'otherwise', 'becoming', 'him', 'thereafter', 'out', 'call', 'against', 'still', 'was', 'anyway', 'either', 'own', 'us', 'always', 'less', 'for', 'ten', 'afterwards', 'his', 'twenty', 'perhaps', 'each', 'became', "'d", 'noone', 'just', 'hers', 'is', '‘s', 'n‘t', 'now', 'around', 'nothing', 'ever', 'unless', 'whatever', 'she', 'yourself', 'besides', 'together', 'never', 'there', '‘re', 'often', 'keep', 'fifteen', 'former', 'same', 'even', 'about', 'already', 'beforehand', 'very', 'after', 'its', 'whereupon', 'neither', "n't", 'nor', 'show', 'hereupon', 'along', 'to', 'throughout', 'sometime', 'more', '