# Advanced NLP with spaCy
## Chapter 1
> Introduction to spaCy

In [17]:
# Import spaCy's language class for English.
# Other languages ship their own classes, for example German and Spanish:

# Import the German language class
# from spacy.lang.de import German

# Import the Spanish language class
# from spacy.lang.es import Spanish

# Create the nlp object by instantiating the English language class
from spacy.lang.en import English
nlp = English()
# Leaving nlp as the last expression makes the notebook display its repr
nlp

<spacy.lang.en.English at 0x7f7ebbfe4d10>

In [18]:
# Calling the nlp object on a string returns a Doc object (short for "document").
# Iterating over a Doc yields its tokens one at a time.
doc = nlp("hello world!")
# Show the raw text of every token, one per line
for tok in doc:
    print(tok.text)

hello
world
!


In [19]:
# Index the Doc to get a single token: doc[0] is the first token, doc[1] the second
token1 = doc[0]
token1.text

'hello'

In [20]:
# A span is a slice of the Doc covering one or more consecutive tokens.
# It is created with slice notation doc[start:end]; the end index is exclusive,
# so doc[1:3] covers the tokens at index 1 and 2.
span = doc[1:3]
span.text

'world!'

In [21]:
## Lexical Attributes
doc = nlp("It cost $5 dollers")

# Each row pairs the label to print with the per-token values of one attribute.
attribute_rows = [
    ("index : ", [tok.i for tok in doc]),
    ("token text : ", [tok.text for tok in doc]),
    ("is_alpha : ", [tok.is_alpha for tok in doc]),
    ("is_punctuation : ", [tok.is_punct for tok in doc]),
    # The "is_num" row is backed by the like_num attribute
    ("is_num : ", [tok.like_num for tok in doc]),
]

# Print one line per attribute, in the order listed above
for label, values in attribute_rows:
    print(label, values)


index :  [0, 1, 2, 3, 4]
token text :  ['It', 'cost', '$', '5', 'dollers']
is_alpha :  [True, True, False, False, True]
is_punctuation :  [False, False, False, False, False]
is_num :  [False, False, False, True, False]


In [22]:
# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Iterate over the tokens in the doc and report numbers directly followed by "%"
for token in doc:
    # A number-like token can only be a percentage if another token follows it.
    # Checking the bound first avoids an IndexError when the text ends with a number.
    if token.like_num and token.i + 1 < len(doc):
        # Get the next token in the document
        next_token = doc[token.i + 1]
        # Check if the next token's text equals "%"
        if next_token.text == "%":
            print("Percentage found:", token.text)

Percentage found: 60
Percentage found: 4


In [23]:
# Statistical models
# A trained model enables spaCy to predict part-of-speech tags, named entities
# and syntactic dependencies.

# en_core_web_sm -> small English package, trained on web text, that provides all of these capabilities
# This pre-trained model package can be downloaded with the spacy download command:
# python -m spacy download en_core_web_sm 

import spacy
nlp = spacy.load('en_core_web_sm')
# The package contains the binary weights spaCy uses to make predictions, the vocabulary
# and meta information (language and pipeline)
nlp

<spacy.lang.en.English at 0x7f7eba3fad10>

In [25]:
doc = nlp("she ate the pizza.")
# Attributes whose name ends with an underscore (pos_) return strings;
# the same attribute without the underscore (pos) returns an integer ID.
for tok in doc:
    print(tok, tok.pos_)

she PRON
ate VERB
the DET
pizza NOUN
. PUNCT


In [26]:
# Dependency label scheme
# For every token print the token, its part-of-speech tag, its dependency label
# and the text of its head (the parent token the current token is attached to).
for tok in doc:
    parent = tok.head
    print(tok, tok.pos_, tok.dep_, parent.text)

she PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate
. PUNCT punct ate


In [27]:
# If you don't know what a tag or label in spaCy means, look it up with spacy.explain
spacy.explain("det")

'determiner'

In [28]:
doc = nlp("Apple is looking at buying UK startup for $1 billion .")
# doc.ents holds the named entities found in the text;
# print each entity's text next to its label
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
UK GPE
$1 billion MONEY


In [29]:
# Look up the description of the "GPE" entity label
spacy.explain("GPE")

'Countries, cities, states'

In [39]:
## Rule Based Matching
# The spaCy Matcher works on Doc objects and their tokens, not only on raw strings.
# A pattern is a list of dictionaries, one dictionary per token.

from spacy.matcher import Matcher

# The matcher is initialised with the vocabulary of the current nlp object
matcher = Matcher(nlp.vocab)

# Example patterns (list of dicts, one dict per token):
# Match exact token texts
# pattern = [{"TEXT":"iphone"}]

# Match lexical attributes
# pattern = [{"LOWER":"iphone"}]

# Match any token attributes
# pattern = [{"LEMMA":"buy"},{"POS":'NOUN'}]

# Operators ("OP") make a token pattern optional or repeatable
# pattern = [{"LEMMA":"buy","OP":"?"},{"POS":'NOUN'}]
# ! negate (match 0 times)
# ? optional (match 0 or 1 times)
# + match 1 or more times
# * match 0 or more times
pattern = [{"TEXT":"iphone"},{"TEXT":"X"}]

# Patterns are passed as a list in the second argument. The older
# matcher.add(name, None, pattern) signature is rejected by spaCy v3.
matcher.add("IPHONE_Pattern", [pattern])

doc = nlp("Upcoming iphone X release date is leaked")

# Each match is a (match_id, start, end) tuple; start and end are token indices into doc
matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

# Display the raw match tuples as the cell output
matches

iphone X


[(14405892316545644887, 1, 3)]

In [None]:
doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10"):
# the exact text "iOS" followed by a token that consists of digits
pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]

# Add the pattern to the matcher and apply the matcher to the doc.
# NOTE: this reuses the existing `matcher` object, so any patterns added to it
# earlier are applied as well.
# Patterns are passed as a list in the second argument. The older
# matcher.add(name, None, pattern) signature is rejected by spaCy v3.
matcher.add("IOS_VERSION_PATTERN", [pattern])
matches = matcher(doc)
print("Total matches found:", len(matches))

# Iterate over the matches and print the span text
for match_id, start, end in matches:
    print("Match found:", doc[start:end].text)