In [21]:
import spacy
nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe('sentencizer'))


In [20]:
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [None]:
# CHAP 1 
# doc, spans and tokens
# Import spaCy and create the English nlp object
import spacy

nlp = spacy.blank("en")

# Process the text
doc = nlp("I like tree kangaroos and narwhals.")

# A slice of the Doc for "tree kangaroos"
tree_kangaroos = doc[2:4]
print(tree_kangaroos.text)

# A slice of the Doc for "tree kangaroos and narwhals" (without the ".")
tree_kangaroos_and_narwhals = doc[2:-1]
print(tree_kangaroos_and_narwhals.text)

tree kangaroos
tree kangaroos and narwhals


In [None]:
# lexical atributes
import spacy
nlp = spacy.blank("en")

doc = nlp (
    "in 1999, more than 60% of people in East Asia were in extreme poverty."
    "Now less then 4% are."
)

for token in doc:
    if token.like_num:
        next_token = doc[token.i + 1]
        if next_token.text == "%":
            print("Percentage found:", token.text)
        
  

,
%
Percentage found: 60
%
Percentage found: 4


In [None]:
# Loading pipelines

import spacy

# print(spacy.explain("ROOT"))

nlp = spacy.load("en_core_web_sm")

text = "It's official: Apple is the first U.S. public company to readh a $1 trillion market value"

doc = nlp(text)
print(doc.text)

It's official: Apple is the first U.S. public company to readh a $1 trillion market value


In [None]:
# predicting linguistic annotations # 1

import spacy

nlp = spacy.load("en_core_web_sm")

text = "It's official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # This is for formatting only
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")

It          PRON      nsubj     
's          AUX       ccomp     
official    ADJ       acomp     
:           PUNCT     punct     
Apple       PROPN     nsubj     
is          AUX       ROOT      
the         DET       det       
first       ADJ       amod      
U.S.        PROPN     nmod      
public      ADJ       amod      
company     NOUN      attr      
to          PART      aux       
reach       VERB      relcl     
a           DET       det       
$           SYM       quantmod  
1           NUM       compound  
trillion    NUM       nummod    
market      NOUN      compound  
value       NOUN      dobj      


In [40]:
# predicting linguistic annotations # 2  

import spacy 

nlp = spacy.load("en_core_web_sm")

text = "It's official: Apple is the first U.S. public company to reach a $1 trillion market value"

doc = nlp(text)

for ent in doc.ents:
    print(ent.text,ent.label_)


Apple ORG
first ORDINAL
U.S. GPE
$1 trillion MONEY


In [87]:
# predicting named entities in context 

import spacy

nlp = spacy.load("en_core_web_sm")
text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"
doc = nlp(text)


print(ent.text, ent.label_)
iphone_x = doc[1:3]
print("Missing entitys: ",iphone_x)


Apple ORG
Missing entitys:  iPhone X


In [None]:
# add iphone X with label "PRODUCT" by adding entityRuler

import spacy

nlp = spacy.load("en_core_web_sm")
text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

print(nlp.pipe_names)

# create entityRuler pipeline
entity_ruler = nlp.create_pipe("entity_ruler")
# add entityRuler before ner pipeline
ruler = nlp.add_pipe(entity_ruler, before="ner")
print(nlp.pipe_names)
patterns = [
    {"label": "PRODUCT" , "pattern": "iPhone X"}
]

entity_ruler.add_patterns(patterns)

doc = nlp(text)

for ent in doc.ents:
    print(ent.text, ent.label_)

['tagger', 'parser', 'ner']
['tagger', 'parser', 'entity_ruler', 'ner']
iPhone X PRODUCT
Apple ORG


In [None]:
# Rule-based matching
# => match on Doc objects, not just strings
# => Match on tokens and token attributes
# => Use am model's predictions

# Match exact the token \Concatenated\

import spacy 
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [{"TEXT":"iPhone"}, {"TEXT":"X"}]
matcher.add("IPHONE_PATTERN",[pattern])
doc = nlp("Upcoming iPhone X release date leaked")
matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)



iPhone X
IPHONE_PATTERN


In [10]:
# Match lexical attributes

import spacy 
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [
    {"IS_DIGIT": True},
    {"LOWER": "fifa"},
    {"LOWER": "world"},
    {"LOWER": "cup"},
    {"IS_PUNCT": True}
]
matcher.add("IPHONE_PATTERN",[pattern])
doc = nlp("2018 FIFA World Cup: France won!")
matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

print("Matches: ", [doc[start:end].text for match_id, start, end in matches])    

2018 FIFA World Cup:
Matches:  ['2018 FIFA World Cup:']


In [None]:
# Matching other token attributes

import spacy 
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [
    {"LEMMA": "hate", "POS": "VERB"},
    {"POS": "NOUN"}
]
matcher.add("IPHONE_PATTERN",[pattern])
doc = nlp("I loved dogs but now I hate cats more.")
matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

hate cats


In [None]:
# Example	            Description
# {"OP": "!"}	        Negation: match 0 times
# {"OP": "?"}	        Optional: match 0 or 1 times
# {"OP": "+"}	        Match 1 or more times
# {"OP": "*"}	        Match 0 or more times

In [9]:
# using the matcher

import spacy 
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp("Upcoming iPhone X release date leaked as Apple reveals pre-orders")

matcher = Matcher(nlp.vocab)
pattern = [{"TEXT": "iPhone"}, {"TEXT":"X"}]

matcher.add("IPHONE",[pattern])
matches = matcher(doc)

print("Matches: ", [doc[start:end].text for match_id, start,end in matches])


Matches:  ['iPhone X']


In [None]:
# writing match patterns
# part1

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp ("After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper.")

pattern = [{"TEXT":"iOS"},{"IS_DIGIT": True}]
matcher.add("IPHONE_OS",[pattern])
matches = matcher(doc)
print("Matches: ",[doc[start:end].text for match_id,start,end in matches])


Matches:  ['iOS 7', 'iOS 11', 'iOS 10']


In [17]:
# writing match patterns
# part 2

import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "i downloaded Fortnite on my laptop and can't open the game at all. Help? "
    "so when I was downloading Minecraft, I got the Windows version where it "
    "is the '.zip' folder and I used the default program to unpack it... do "
    "I also need to download Winzip?"
)

matcher = Matcher(nlp.vocab)
pattern = [{"LEMMA":"download"},{"POS":"PROPN"}]
matcher.add("download_pattern",[pattern])
matches = matcher(doc)
print("Result: ",[doc[start:end].text for match_id, start,end in matches])

Result:  ['downloaded Fortnite', 'downloading Minecraft']


In [23]:
# writing match patterns
# part 3
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)
# Write a pattern for adjective plus one or two nouns
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}, {"POS": "NOUN", "OP": "+"}]

# Add the pattern to the matcher and apply the matcher to the doc
matcher.add("ADJ_NOUN_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)

Total matches found: 1
Match found: optional voice responses
