# Import spaCy and create a document

In [1]:
import spacy

In [31]:
# !python -m spacy download en_core_web_sm

# sm = small English model

In [6]:
nlp = spacy.load('en_core_web_sm')

In [7]:
doc = nlp("Tea is healthy and calming, don't you think?")

# Tokenizing

In [9]:
for token in doc:
    print(token)

Tea
is
healthy
and
calming
,
do
n't
you
think
?


In [10]:
token.lemma_

'?'

In [11]:
token.is_stop

False

# Lemmas and stopwords

In [17]:
print("{} \t\t{} \t\t{}".format('Token', 'Lemma', 'Stopword'))
print("-"*40)
for token in doc:
    print("{}\t\t{}\t\t{}".format(str(token), token.lemma_, token.is_stop))

Token 		Lemma 		Stopword
----------------------------------------
Tea		tea		False
is		be		True
healthy		healthy		False
and		and		True
calming		calm		False
,		,		False
do		do		True
n't		not		True
you		-PRON-		True
think		think		False
?		?		False


# The same for French

In [30]:
# !python -m spacy download fr_core_news_sm

In [21]:
nlp_fr = spacy.load('fr_core_news_sm')

In [34]:
phrase = nlp_fr("La conscience de n'être sur la terre qu'en sursis, d'une mort qui,\
                quoi qu'il arrive, arrivera, sans espoir de salut. ")

In [35]:
for token in phrase:
    print(token)

La
conscience
de
n'
être
sur
la
terre
qu'
en
sursis
,
d'
une
mort
qui
,
               
quoi
qu'
il
arrive
,
arrivera
,
sans
espoir
de
salut
.


##  NB: lemmatizing strange?  (qu' ,   n' ,  arrive  ,  pronoun is not captured)

In [36]:
print("{} \t\t\t{} \t\t\t{}".format('Token', 'Lemma', 'Stopword'))
print("-"*70)
for token in phrase:
    print("{}\t\t\t{}\t\t\t{}".format(str(token), token.lemma_, token.is_stop))

Token 			Lemma 			Stopword
----------------------------------------------------------------------
La			le			True
conscience			conscience			False
de			de			True
n'			n'			True
être			être			True
sur			sur			True
la			le			True
terre			terre			False
qu'			que			True
en			en			True
sursis			sursis			False
,			,			False
d'			de			True
une			un			True
mort			mort			False
qui			qui			True
,			,			False
               			               			False
quoi			quoi			True
qu'			qu'			True
il			il			True
arrive			arrive			False
,			,			False
arrivera			arriver			False
,			,			False
sans			sans			True
espoir			espoir			False
de			de			True
salut			salut			False
.			.			False


# Pattern matching

Another common NLP task is matching tokens or phrases within chunks of text or whole documents. You can do pattern matching with regular expressions, but spaCy's matching capabilities tend to be easier to use.

To match individual tokens, you create a Matcher. When you want to match a list of terms, it's easier and more efficient to use PhraseMatcher. For example, if you want to find where different smartphone models show up in some text, you can create patterns for the model names of interest. First you create the PhraseMatcher itself.

In [38]:
from spacy.matcher import PhraseMatcher

# The matcher is created using the vocabulary of the 'small English' model.
# Attribute LOWER for case-insensitive matching
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

In [41]:
terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel']

# create a list of terms to match in the text.
patterns = [nlp(text) for text in terms]

In [46]:
patterns

[Galaxy Note, iPhone 11, iPhone XS, Google Pixel]

In [47]:
# Add these patterns to the matcher as "Terminology List"

# Add a match-rule to the phrase-matcher. 
# A match-rule consists of: an ID key, an on_match callback, and one or more patterns.
matcher.add("TerminologyList", None, *patterns)

In [57]:
# Then you create a document from the text to search and use the phrase matcher 
# to find where the terms occur in the text.

# Borrowed from https://daringfireball.net/linked/2019/09/21/patel-11-pro
text_doc = nlp("Glowing review overall, and some really interesting side-by-side "
               "photography tests pitting the iPhone 11 Pro against the "
               "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3.") 
matches = matcher(text_doc)

# The matches here are a tuple of the match id and the positions of the start and end of the phrase.
# (vocabulary includes commas and dashes, and count starts at zero)

print(matches)

[(3766102292120407359, 17, 19), (3766102292120407359, 22, 24), (3766102292120407359, 30, 32), (3766102292120407359, 33, 35)]


In [59]:
match_id, start, end = matches[2]
print(nlp.vocab.strings[match_id], text_doc[start:end])

TerminologyList iPhone XS
