# **Chapter 1**

# Finding words, phrases, names and concepts

In [1]:
from spacy.lang.en import English

# Build the processing object from the English language class alone; no model
# package is loaded here (that happens later in the notebook via spacy.load).
nlp = English()

## Doc, Token and Span objects, and lexical attributes

In [2]:
# Doc object: calling nlp on a string returns a Doc that can be iterated token by token

doc = nlp('Hello World!')

for token in doc:
    print(token.text)

Hello
World
!


In [3]:
# Token object: indexing a Doc with a single position returns one token.
# `doc` is the 'Hello World!' Doc created in the previous cell.

token = doc[1]  # position 1 is the second token
print(token.text)

World


In [4]:
# Span object: slicing a Doc returns a Span (the end index is exclusive)

doc = nlp('Hello World!')

span = doc[1:3]  # tokens at positions 1 and 2
print(span.text)

World!


In [5]:
# The whole processed text is a Doc.
# A slice of a Doc (doc[start:end]) is a Span; it may cover one or more tokens.
# A single position of a Doc (doc[i]) is a Token.

# Parse the sentence once and derive the Span and the Token from that same Doc,
# so all three objects refer to one parse instead of three separate ones.
doc = nlp('Manoj is a good boy')
span = doc[0:2]
token = doc[0]

print(type(doc))
print(type(span))
print(type(token))

<class 'spacy.tokens.doc.Doc'>
<class 'spacy.tokens.span.Span'>
<class 'spacy.tokens.token.Token'>


In [6]:
# Lexical attributes: per-token properties read directly off each Token

doc = nlp('It costs $5.')

# Position and surface text of every token.
print('Index:', [token.i for token in doc])
print('Text:', [token.text for token in doc])

# Boolean flags, printed one attribute per line in the order listed here.
for attr in ('is_alpha', 'is_punct', 'like_num'):
    print(f'{attr}:', [getattr(token, attr) for token in doc])

Index: [0, 1, 2, 3, 4]
Text: ['It', 'costs', '$', '5', '.']
is_alpha: [True, True, False, False, False]
is_punct: [False, False, False, False, True]
like_num: [False, False, False, True, False]


## Model packages

In [7]:
import spacy

# Replace the earlier English() object with the 'en_core_web_sm' model package,
# whose tokens expose the pos_ attribute printed below.
nlp = spacy.load('en_core_web_sm')
doc = nlp('She ate the pizza')

# One line per token: text followed by its part-of-speech tag.
for token in doc:
    print(f'{token.text} {token.pos_}')

She PRON
ate VERB
the DET
pizza NOUN


In [8]:
# For each token of the Doc from the previous cell: text, part-of-speech tag,
# dependency label, and the text of the token's head.
for token in doc:
    fields = (token.text, token.pos_, token.dep_, token.head.text)
    print(' '.join(fields))

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


## Predicting named entities

In [9]:
doc = nlp('Apple is looking at buying UK startup for $1 billion')

# doc.ents yields the entity spans found in the text; label_ is the entity type.
for ent in doc.ents:
    print(f'{ent.text} {ent.label_}')

Apple ORG
UK GPE
$1 billion MONEY


In [10]:
# spacy.explain turns a tag or label into a short human-readable description.
for label in ('GPE', 'NNP', 'dobj'):
    print(spacy.explain(label))

Countries, cities, states
noun, proper singular
direct object


## Rule-based matching (Matcher)

In [11]:
# eg 1: match a fixed two-token sequence by token text

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
doc = nlp('Upcoming iphone X release date leak')

# One dict per token; 'TEXT' compares against the token's text.
pattern = [
    {'TEXT': 'iphone'},
    {'TEXT': 'X'},
]

# Matcher.add is called with a rule key and a list of patterns.
matcher = Matcher(nlp.vocab)
matcher.add('IPhone', [pattern])

# Each match is a (match_id, start, end) triple; start/end are token offsets into doc.
matches = matcher(doc)
print(matches)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

[(6139850947917451393, 1, 3)]
iphone X


In [12]:
# eg 2: combine lexical flags with lowercase comparisons

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
doc = nlp('2018 FIFA world cup: France won!')

# A digit token, then 'fifa' 'world' 'cup' compared on lowercased text
# (so 'FIFA' qualifies), then one punctuation token.
pattern = [
    {'IS_DIGIT': True},
    {'LOWER': 'fifa'},
    {'LOWER': 'world'},
    {'LOWER': 'cup'},
    {'IS_PUNCT': True},
]

matcher = Matcher(nlp.vocab)
matcher.add('fifa', [pattern])

# Each match is a (match_id, start, end) triple; start/end are token offsets into doc.
matches = matcher(doc)
print(matches)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)


[(7778788780550260102, 0, 5)]
2018 FIFA world cup:


In [13]:
# eg 3: match on token annotations (lemma and part of speech) instead of literal text

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
doc = nlp('I loved cats but now I love dogs more.')

# A verb whose lemma is 'love' (both 'loved' and 'love' qualify), followed by a noun.
pattern = [
    {'LEMMA': 'love', 'POS': 'VERB'},
    {'POS': 'NOUN'},
]

matcher = Matcher(nlp.vocab)
matcher.add('love', [pattern])

# Each match is a (match_id, start, end) triple; start/end are token offsets into doc.
matches = matcher(doc)
print(matches)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

[(3702023516439754181, 1, 3), (3702023516439754181, 6, 8)]
loved cats
love dogs


In [14]:
# eg 4: several alternative patterns registered under a single rule key

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

# Three spellings of the same concept:
pattern1 = [{'LOWER': 'solarpower'}]                                        # one token
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]     # joined by punctuation
pattern3 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]                         # two adjacent tokens

# All three patterns share the key 'SolarPower', so every match carries the same match_id.
matcher.add('SolarPower', [pattern1, pattern2, pattern3])
doc = nlp("The Solar Power industry continues to grow a solarpower increases. Solar-power is good")
found_matches = matcher(doc)
print(found_matches)


# The id is bound to a real name rather than `_`: at notebook top level, `_` is
# the variable IPython uses for the last cell output, and rebinding it shadows that.
for match_id, start, end in found_matches:
    span = doc[start:end]
    print(span.text)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]
Solar Power
solarpower
Solar-power


## Using operators and quantifiers

In [15]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
doc = nlp("I bought a smartphone. Now I'm buying apps")

# Any form of 'buy', an optional determiner, then a noun.
pattern = [
    {'LEMMA': 'buy'},
    {'POS': 'DET', 'OP': '?'},  # '?' makes this token optional (present or absent)
    {'POS': 'NOUN'},
]

# NOTE(review): the key 'love' looks like a leftover from the previous example;
# this rule is about 'buy'. Renaming it changes the match_id printed below, so
# re-run the cell if the key is changed.
matcher = Matcher(nlp.vocab)
matcher.add('love', [pattern])

# Each match is a (match_id, start, end) triple; start/end are token offsets into doc.
matches = matcher(doc)
print(matches)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

[(3702023516439754181, 1, 4), (3702023516439754181, 8, 10)]
bought a smartphone
buying apps


## Stop words in spaCy

In [16]:
from spacy.lang.en.stop_words import STOP_WORDS

# STOP_WORDS prints with set braces, so the order of the words shown is arbitrary.
# NOTE(review): this dumps the whole collection into the notebook output; consider
# showing len(STOP_WORDS) or a short sorted sample instead.
print(STOP_WORDS)

{'nor', 'anywhere', 'except', 'itself', 'among', 'all', 'him', 'really', 'they', 'beside', 'an', 'almost', 'both', 'during', 'we', 'towards', '‘re', 'another', '‘m', 'such', 'always', 'empty', 'because', 'nevertheless', 'he', 'become', 'was', 'latter', '’ve', 'ever', 'top', 'did', 'whereafter', '’d', 'to', 'she', "'ve", 'forty', 'last', 'what', 'as', 'me', 'indeed', 'so', 'or', 'else', 'into', '’s', 'could', 'once', 'many', 'ours', 'back', 'has', 'would', 'before', 'doing', "'s", 'part', 'keep', 'throughout', 'onto', 'wherever', 'us', 'above', 'but', 'thereafter', '‘d', 'amongst', 'in', 'go', 'never', 'still', 'am', 'cannot', 'does', 'on', 'when', 'around', 'sixty', 'behind', 'any', 'thereby', 'this', 'now', 'became', 'should', 'bottom', 'being', 'four', 'often', 'i', 'with', 'hereupon', 'wherein', 'move', 'why', 'less', 'regarding', 'there', 'ourselves', 'seeming', 'something', 'who', 'whole', 'three', 'her', 'whom', 'other', 'about', 'enough', 'hence', 'show', 'more', 'will', 'a', 'e