### Named Entity Recognition (NER)

In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
# doc -> ents: 속성 / ents(): 함수
# entity = 엔티티

def show_ents(doc):
    if doc.ents:
        for ent in doc.ents:
            print(ent.text + ' - ' + ent.label_ + ' - ' + str(spacy.explain(ent.label_)))
    else:
        print('Named Entities 없음') # doc.text

In [3]:
doc = nlp(u"May I go to Washington, DC next May to see the Washington Monument?")

show_ents(doc)

Washington - GPE - Countries, cities, states
DC - GPE - Countries, cities, states
next May - DATE - Absolute or relative dates or periods
the Washington Monument - ORG - Companies, agencies, institutions, etc.


In [4]:
doc = nlp(u"Can I please borrow 500 dollars from you to buy some Microsoft stock?")

for ent in doc.ents:
    print(ent.text, ent.start, ent.end, ent.start_char, ent.end_char, ent.label_)

500 dollars 4 6 20 31 MONEY
Microsoft 11 12 53 62 ORG


In [5]:
# ent.start -> 단어 수
# ent.start_char -> 한 글자씩 셈.
for ent in doc.ents:
    print(ent.start_char)

20
53


In [6]:
doc.text[20] # 텍스트 형태로 -> 한 글자씩 셈.

'5'

In [7]:
doc.text[4]

'I'

In [8]:
doc.text

'Can I please borrow 500 dollars from you to buy some Microsoft stock?'

In [13]:
doc = nlp(u'Tesla to build a U.K. factory for $6 million')

show_ents(doc)

U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [14]:
# entity 추가
from spacy.tokens import Span

ORG = doc.vocab.strings[u'ORG']

new_ent = Span(doc, 0, 1, label = ORG)

In [15]:
doc.ents

(U.K., $6 million)

In [17]:
doc.ents = list(doc.ents) + [new_ent]

In [18]:
show_ents(doc)

Tesla - ORG - Companies, agencies, institutions, etc.
U.K. - GPE - Countries, cities, states
$6 million - MONEY - Monetary values, including unit


In [19]:
doc = nlp(u"Our company plans to introduce a new vacuum cleaner."
         u"If successful, the vacuum cleaner will be our first product.")

In [20]:
show_ents(doc) # entity 출력

first - ORDINAL - "first", "second", etc.


In [25]:
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)
phrase_list = ['vacuum cleaner', 'vacuum-cleaner']
phrase_pattern = [nlp(text) for text in phrase_list]

In [26]:
phrase_pattern # pattern 만들어짐.

[vacuum cleaner, vacuum-cleaner]

In [27]:
matcher.add('newproduct', None, *phrase_pattern)

In [28]:
found_matcher = matcher(doc)
print(found_matcher)

[(2689272359382549672, 7, 9), (2689272359382549672, 14, 16)]


In [35]:
doc[7:9]

vacuum cleaner

In [44]:
for match in found_matcher:
    print(match[1])

7
14


In [30]:
from spacy.tokens import Span

# entity 추가
PRODUCT = doc.vocab.strings[u'PRODUCT']

new_ent = [Span(doc, match[1], match[2], label = PRODUCT) for match in found_matcher]

In [31]:
doc.ents = list(doc.ents) + new_ent

In [32]:
show_ents(doc)

vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
vacuum cleaner - PRODUCT - Objects, vehicles, foods, etc. (not services)
first - ORDINAL - "first", "second", etc.


In [33]:
len([ent for ent in doc.ents if ent.label_ == 'PRODUCT'])

2

In [36]:
# 명사
doc = nlp(u"Autonomous cars shift insurance liability toward manufacturers.")

for chunk in doc.noun_chunks:
    print(chunk.text + ' - ' + chunk.root.text + ' - ' + chunk.root.dep_ + ' - ' + chunk.root.head.text)

Autonomous cars - cars - nsubj - shift
insurance liability - liability - dobj - shift
manufacturers - manufacturers - pobj - toward


### 시각화

In [37]:
from spacy import displacy

doc = nlp(u"Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 millions."
         u"By contrast, Sony sold only 7 thousand Walkman music players.")

displacy.render(doc, style = 'ent', jupyter = True) # entity

In [46]:
for sent in doc.sents:
    print(nlp(sent.text))

Over the last quarter Apple sold nearly 20 thousand iPods for a profit of $6 millions.
By contrast, Sony sold only 7 thousand Walkman music players.


In [39]:
for sent in doc.sents:
    displacy.render(nlp(sent.text), style = 'ent', jupyter = True)

In [40]:
# 제어
options = {'ents': {'ORG', 'PRODUCT'}}

displacy.render(doc, style = 'ent', jupyter = True, options = options)

In [42]:
colors = {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"ents": ["ORG", "PRODUCT"], "colors": colors}
displacy.render(doc, style="ent", jupyter = True, options = options)

In [43]:
displacy.serve(doc, style = 'ent', options = options)




Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


In [48]:
from pathlib import Path
svg = displacy.render(doc, style = "ent", options = options)

In [49]:
svg

In [50]:
output_path = Path("displacy_sample.svg")
output_path.open("w", encoding = "utf-8").write(svg)

TypeError: write() argument must be str, not None

In [None]:
# 감정 분석 -> 문장을 분리