# Rule-based Matching in Linguistic Features

Reference: [spaCy usage guide – rule-based matching](https://spacy.io/usage/linguistic-features#rule-based-matching)

In [1]:
# Adding Patterns https://spacy.io/usage/linguistic-features#adding-patterns

# Importing spaCy
import spacy

In [2]:
# Load the language model and bind the pipeline to `nlp`.
# NOTE(review): 'en' looks like a shortcut link rather than a model package name;
# spaCy v3 dropped shortcut links, so the full package name (e.g. 'en_core_web_sm')
# is presumably required there — confirm against the installed spaCy version.
nlp = spacy.load('en')

In [3]:
# Importing Matcher from spacy
from spacy.matcher import Matcher

# The matcher must always share the same vocab with the documents it will operate on.
matcher = Matcher(nlp.vocab)

# One pattern made of three consecutive tokens: "hello" (case-insensitive),
# any punctuation token, then "world" (case-insensitive).
pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}]

# Register the pattern under the match ID "HelloWorld".
# The second argument is the optional "on_match" callback invoked on a successful
# match; None means no callback.
matcher.add('HelloWorld', None, pattern)

# Getting the document from the sentence
doc = nlp(u'Hello, world! Hello world!')

# The matcher only returns the matches and does nothing else (no entity merging,
# no label assignment). Custom logic can be attached by passing a callback as the
# on_match argument of add().
matches = matcher(doc)

# TODO: the spacy.io documentation shows [('HelloWorld', 0, 2)] for this example,
# while the actual result is [(<integer match id>, 0, 3)] — the docs need updating.
print('first sentence match -', matches)

# Exactly one match is expected, not two: the pattern requires a punctuation token
# between "hello" and "world", so only "Hello, world" qualifies; the second
# "Hello world" has no punctuation between the two words.
assert len(matches) == 1, 'expected exactly one HelloWorld match'

# Each match is a (match_id, start, end) tuple; start/end are token offsets, so the
# match spans the three tokens "Hello", "," and "world".
assert matches[0][1:] == (0, 3), 'expected the match to span tokens 0..3'

# The match_id is an integer, not the string passed to add(); looking it up in the
# vocab's StringStore recovers the original key.
assert nlp.vocab.strings[matches[0][0]] == 'HelloWorld', 'match id should resolve to its key'

# New sentence which doesn't match the pattern
doc = nlp(u'another arbitary sentence')
matches = matcher(doc)
assert len(matches) == 0, 'expected no match for an unrelated sentence'

first sentence match - [(15578876784678163569, 0, 3)]


In [4]:
# Adding Phrase Patterns

# If you need to match large terminology lists, prefer to use PhraseMatcher

# Importing PhraseMatcher from spacy
from spacy.matcher import PhraseMatcher

# Creating a new PhraseMatcher on the shared vocab.
# Note: this rebinds `matcher`, replacing any matcher bound to that name earlier.
matcher = PhraseMatcher(nlp.vocab)

# Preparing terminology list
terminology_list = ['Barack Obama', 'Angela Merkel', 'Washington, D.C.']

# Preparing the patterns: each term is run through the pipeline to obtain a Doc
patterns = [nlp(text) for text in terminology_list]
print('Patterns - ', [patrn.text for patrn in patterns])

# Adding to matcher: every phrase is registered under the single key
# 'TerminologyList' (None = no on_match callback).
matcher.add('TerminologyList', None, *patterns)

doc = nlp(u"German Chancellor Angela Merkel and US President Barack Obama "
          u"converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)
print('Matches ', matches)

# Each term occurs once in the sentence, so one match per term is expected.
assert len(matches) == len(terminology_list), 'expected one match per terminology entry'

# All matches carry the same id because all phrases were added under the same key;
# the integer id resolves back to that key through the vocab's StringStore.
# To tell the terms apart by id, add each phrase under its own key instead.
assert all(
    nlp.vocab.strings[match_id] == 'TerminologyList'
    for match_id, _start, _end in matches
), 'every match id should resolve to the shared key'

Patterns -  ['Barack Obama', 'Angela Merkel', 'Washington, D.C.']
Matches  [(3766102292120407359, 2, 4), (3766102292120407359, 7, 9), (3766102292120407359, 19, 22)]


In [5]:
# Adding on_match rules
# https://spacy.io/usage/linguistic-features#on_match