## Pattern matching

- 버그로 보이는데, doc1의 일부만 패턴 매칭이 되는 현상이 종종 발생함

In [1]:
import spacy
from spacy.matcher import Matcher, PhraseMatcher
from spacy import displacy
# Load the pipeline registered under the name 'en'; every later cell shares this
# `nlp` object (and its vocab) for parsing text and building matchers.
nlp = spacy.load('en')

In [2]:
# Parse the example sentence once; all matchers below are run against `doc1`.
doc1 = nlp(u"Typically, warfarin (Coumadin, Jantoven), used to prevent blood clots, usually works well and isn't bothersome, but serious internal bleeding can happen in the wrong situation.")
# Bare last expression so the notebook displays the parsed document.
doc1

Typically, warfarin (Coumadin, Jantoven), used to prevent blood clots, usually works well and isn't bothersome, but serious internal bleeding can happen in the wrong situation.

In [3]:
# Token-pattern matcher bound to the pipeline's vocab; rules are added in the next cell.
matcher = Matcher(nlp.vocab)

### 패턴 추가

In [4]:
# Single-token pattern keyed on the LOWER attribute with the value 'warfarin'.
pattern =[{'LOWER':'warfarin'}]
# Register the pattern under the rule name 'drug'; None means no on-match callback.
matcher.add('drug', None, pattern)
# Run the matcher over the parsed sentence and display the resulting
# (match_id, start, end) tuples.
matches = matcher(doc1)
matches

[(475376273668575235, 2, 3)]

### 구절 패턴 추가 가능

In [12]:
# Phrase matcher: each phrase is parsed with `nlp` and all of them are registered
# under the single rule name 'PHENOTYPE' (no callback).
p_matcher = PhraseMatcher(nlp.vocab)
p_patterns = list(map(nlp, (u'blood clots', u'serious internal bleeding')))
p_matcher.add('PHENOTYPE', None, *p_patterns)

In [13]:
# Apply the phrase matcher to the example sentence and display the
# (match_id, start, end) tuples it returns.
p_matches = p_matcher(doc1)
p_matches

[(478917403606056401, 12, 14), (478917403606056401, 24, 27)]

In [14]:
# Show each phrase match as "<rule name> : <matched text>".
for rule_id, tok_start, tok_end in p_matches:
    rule_name = nlp.vocab.strings[rule_id]   # map the rule ID back to its string
    matched_span = doc1[tok_start:tok_end]   # slice of the document covered by the match
    print(rule_name, ":", matched_span)

PHENOTYPE : blood clots
PHENOTYPE : serious internal bleeding


### 패턴 매칭 이벤트를 활용해 entity 추가

In [18]:
#  %%pixie_debugger
    
# Look up the string-store entry for 'PHENOTYPE'; `on_match` below uses it as the
# label element of every entity tuple it appends to `doc.ents`.
PHENOTYPE = nlp.vocab.strings['PHENOTYPE']

def on_match(matcher, doc, i, matches):
    """Matcher callback: record the match that fired as a PHENOTYPE entity.

    Args:
        matcher: The matcher invoking the callback (unused).
        doc: The document being matched; its ``ents`` tuple is extended.
        i: Index into ``matches`` of the match that triggered this call.
        matches: All ``(match_id, start, end)`` tuples found in ``doc``.
    """
    print(matches)
    # The matcher invokes this callback once per match, so only the match at
    # index `i` is added here. Iterating over the whole `matches` list on every
    # call would append each match to `doc.ents` once per callback invocation.
    match_id, start, end = matches[i]
    # Extend rather than overwrite, so entities already on the doc are kept.
    doc.ents += ((PHENOTYPE, start, end),)
#     print(doc.ents)
    
# Phrase matcher whose 'PHENOTYPE' rule has `on_match` registered as its callback.
e_matcher = PhraseMatcher(nlp.vocab)
phenotype_docs = [nlp(phrase) for phrase in (u'blood clots', u'serious internal bleeding')]
e_matcher.add('PHENOTYPE', on_match, *phenotype_docs)

# Running the matcher over doc1 fires the callback; afterwards list each returned match.
for hit in e_matcher(doc1):
    print(hit)

[(478917403606056401, 12, 14), (478917403606056401, 24, 27)]
[(478917403606056401, 12, 14), (478917403606056401, 24, 27)]
(478917403606056401, 12, 14)
(478917403606056401, 24, 27)


In [19]:
# Entity recognition: render doc1's entities inline in the notebook.
# NOTE(review): 'distance' looks like a dependency-visualizer option and is
# presumably ignored when style='ent' — confirm against the displaCy docs.
displacy.render(doc1, style='ent', jupyter=True, options={'distance':90})