## SpaCy

Reference
- https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da
- http://yujuwon.tistory.com/entry/spaCy-%EC%82%AC%EC%9A%A9%ED%95%98%EA%B8%B0-Rule-based-Matching?category=540768

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [2]:
example = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [3]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [4]:
sent = preprocess(example)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [5]:
# NP noun phrase
# DT optional dterminer
# JJ any number of adjectives
# NN noun

pattern = 'NP: {<DT>?<JJ>*<NN>}'

### Chunking

In [6]:
# chunking
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

In [7]:
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [8]:
# IOB = standard way to represent chunk structures in files

In [9]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

In [10]:
iob_tagged = tree2conlltags(cs)
iob_tagged

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]

In [11]:
# conlltags2tree to convert the tag sequences into a chunk tree
from nltk import ne_chunk
ne_tree = ne_chunk(iob_tagged)
print(ne_tree)

(S
  (GPE European/JJ/O)
  authorities/NNS/O
  fined/VBD/O
  (PERSON Google/NNP/O)
  a/DT/B-NP
  record/NN/I-NP
  $/$/O
  5.1/CD/O
  billion/CD/O
  on/IN/O
  Wednesday/NNP/O
  for/IN/O
  abusing/VBG/O
  its/PRP$/O
  power/NN/B-NP
  in/IN/O
  the/DT/B-NP
  mobile/JJ/I-NP
  phone/NN/I-NP
  market/NN/B-NP
  and/CC/O
  ordered/VBD/O
  the/DT/B-NP
  company/NN/I-NP
  to/TO/O
  alter/VB/O
  its/PRP$/O
  practices/NNS/O)


### Spacy

In [12]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
import pandas as pd
nlp = en_core_web_sm.load()

In [13]:
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')

In [14]:
pprint([(X.text, X.label_) for X in doc.ents])
#NORD : nationalities or religious or political groups

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [15]:
# BILUO tagging scheme to describe the entity boundaries
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

# B : first token of a multi-token entity
# I : inner token of a multi-token entity
# L : final token of a multi-token entity
# U : single-token entity
# O : non-entity token

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [16]:
list(filter(lambda x: x.ent_iob_!='O', [X for X in doc]))

[European, Google, $, 5.1, billion, Wednesday]

In [17]:
# display
displacy.render(doc, jupyter=True, style='ent')

In [18]:
doc = nlp('Apple is looking at buying U.K. startup for $1 billion')
doc

Apple is looking at buying U.K. startup for $1 billion

In [19]:
doc1 = nlp("Typically, warfarin (Coumadin, Jantoven), used to prevent blood clots, usually works well and isn't bothersome, but serious internal bleeding can happen in the wrong situation.")
doc1

Typically, warfarin (Coumadin, Jantoven), used to prevent blood clots, usually works well and isn't bothersome, but serious internal bleeding can happen in the wrong situation.

### Chunking

In [20]:
for chn in doc1.noun_chunks:
    print(chn.text, chn.label_, chn.root.text)

warfarin NP warfarin
(Coumadin, Jantoven NP Jantoven
blood clots NP clots
serious internal bleeding NP bleeding
the wrong situation NP situation


## Token & attributes

In [21]:
lemmas = [] #원형
poss = [] #품사
tags = [] #태크
deps = [] #의존성
shapes = [] #모양
alphas = [] #알파벳 여부
stopwords = [] 
entities = []

for token in doc:
    lemmas.append(token.lemma_)
    poss.append(token.pos_)
    tags.append(token.tag_)
    deps.append(token.dep_)
    shapes.append(token.shape_)
    alphas.append(token.is_alpha)
    stopwords.append(token.is_stop)
    entities.append(token.ent_type)

In [22]:
compare_df = pd.DataFrame({
                'origin':doc,
                'lemma':lemmas,
                'pos':poss,
                'tag':tags,
                'deps':deps,
                'shape':shapes,
                'is_alphabet':alphas,
                'is_stopword':stopwords,
                'entity':entities,
            }, columns=['origin', 'lemma', 'pos', 'tag', 'deps', 'entity','is_alphabet', 'is_stopword'])
compare_df

Unnamed: 0,origin,lemma,pos,tag,deps,entity,is_alphabet,is_stopword
0,Apple,apple,PROPN,NNP,nsubj,381,True,False
1,is,be,VERB,VBZ,aux,0,True,True
2,looking,look,VERB,VBG,ROOT,0,True,False
3,at,at,ADP,IN,prep,0,True,True
4,buying,buy,VERB,VBG,pcomp,0,True,False
5,U.K.,u.k.,PROPN,NNP,compound,382,False,False
6,startup,startup,NOUN,NN,dobj,0,True,False
7,for,for,ADP,IN,prep,0,True,True
8,$,$,SYM,$,quantmod,391,False,False
9,1,1,NUM,CD,compound,391,False,False


In [23]:
spacy.explain('ADP')

'adposition'

In [24]:
# 고유명사
spacy.explain('PROPN')

'proper noun'

In [25]:
spacy.explain('NOUN')

'noun'

In [26]:
spacy.explain('NNP')

'noun, proper singular'

## attrbute 확장

In [27]:
from spacy.tokens import Doc, Span, Token

drugs = ['valproic acid', 'efavirenz', 'glycerol', 'dexbrompheniramine']
is_drug = lambda x: x.text in drugs
has_drug = lambda obj: any([t.text in drugs for t in obj])

In [28]:
Token.set_extension('is_drug', getter=is_drug, force=True)
Doc.set_extension('has_drug', getter=has_drug, force=True)
Span.set_extension('has_drug', getter=has_drug, force=True)

In [29]:
doc2 = nlp(u"Typically, warfarin and efavirenz, used to prevent blood clots, usually works well with dexbrompheniramine and isn't bothersome, but serious internal bleeding can happen in the wrong situation.")
doc2

Typically, warfarin and efavirenz, used to prevent blood clots, usually works well with dexbrompheniramine and isn't bothersome, but serious internal bleeding can happen in the wrong situation.

In [30]:
doc2._.get('has_drug')

True

In [31]:
doc._.get('has_drug')

False

## display

In [32]:
#dependency
from spacy import displacy
displacy.render(doc, style='dep', jupyter=True, options={'distance':80})

In [33]:
# nouns
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

Apple Apple nsubj looking
U.K. startup startup dobj buying


#### nsubj = 주격명사구
#### dobj = 직접 목적어

In [34]:
# tree내 각 항목 확인
for token in doc:
    if len([c for c in token.children])>0:
        print(token.text, token.dep_, [c for c in token.children])

looking ROOT [Apple, is, at]
at prep [buying]
buying pcomp [startup, for]
startup dobj [U.K.]
for prep [billion]
billion pobj [$, 1]


In [35]:
# 의존관계를 하나의 스트링으로 표시하는게 가능한가
for i in range(len(doc)):
    token = doc[i]
    if len([c for c in token.children])>0:
        lefts = token.n_lefts
        right = token.n_rights
        print(token.text, ':', doc[i-lefts:i+right+1])

looking : Apple is looking at
at : at buying
buying : buying U.K. startup
startup : U.K. startup
for : for $
billion : $1 billion


## 약물, 부작용 명이 들어간 경우만 구절로 꺼내고 싶은 경우에는...

In [36]:
for i in range(len(doc1)):
    token = doc1[i]
    if len([c for c in token.children])>0:
        lefts = token.n_lefts
        right = token.n_rights
        if any(x in str(doc1[i-lefts:i+right+1]) for x in ['blood', 'bleeding', 'warfarin']):
            print(token.text, ':' , token.tag_ , ':', doc1[i-lefts:i+right+1])

warfarin : NN : warfarin (Coumadin,
prevent : VB : to prevent blood
clots : NNS : blood clots
works : VBZ : prevent blood clots, usually works well and isn't bothersome
bleeding : NN : serious internal bleeding
happen : VB : bleeding can happen in the


In [37]:
spacy.explain('NN')

'noun, singular or mass'

In [38]:
spacy.explain('NNS')

'noun, plural'

In [39]:
# head단어들만 찾는다면
root = [token for token in doc if token.head ==token][0]
root

looking

In [40]:
root1 = [token for token in doc1 if token.head ==token][0]
root1

works

In [41]:
subject = list(root.rights)[0]
subject

at

In [42]:
subject1_r = list(root1.rights)
subject1_l = list(root1.lefts)
print('right: ', subject1_r)
print('left: ', subject1_l)

right:  [well, and, is, but, happen]
left:  [Typically, ,, warfarin, ,, usually]


In [43]:
#tree 조상찾기
for desc in subject.subtree:
    print(desc)
    assert subject is desc or subject.is_ancestor(desc)
    print([anc.text for anc in desc.ancestors])

at
['looking']
buying
['at', 'looking']
U.K.
['startup', 'buying', 'at', 'looking']
startup
['buying', 'at', 'looking']
for
['buying', 'at', 'looking']
$
['billion', 'for', 'buying', 'at', 'looking']
1
['billion', 'for', 'buying', 'at', 'looking']
billion
['for', 'buying', 'at', 'looking']
