In [1]:
import random

import pandas as pd
import spacy
from spacy import displacy

# Load the English pipeline by its package name instead of the 'en' shortcut
# link: the shortcut only resolves on machines where the link was created, and
# shortcut links are not supported by newer spaCy releases.
# NOTE(review): assumes 'en' was linked to en_core_web_sm (the default target
# of `python -m spacy download en`) — confirm for this environment.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Parse a sample sentence with the loaded pipeline; the bare name on the last
# line makes the notebook display the result as the cell output.
doc = nlp('Apple is looking at buying U.K. startup for $1 billion')
doc

Apple is looking at buying U.K. startup for $1 billion

In [3]:
# A medical-domain sentence containing drug names and symptoms; it is reused
# further down for the custom-entity examples.
doc1 = nlp("Typically, warfarin (Coumadin, Jantoven), used to prevent blood clots, usually works well and isn't bothersome, but serious internal bleeding can happen in the wrong situation.")
doc1

Typically, warfarin (Coumadin, Jantoven), used to prevent blood clots, usually works well and isn't bothersome, but serious internal bleeding can happen in the wrong situation.

## Named Entity Recognition

- entity types: 
    - PERSON
    - NORP (nationalities or religious or political groups)
    - FAC (building, airports, highways...)
    - ORG
    - GPE (countries, cities, states)
    - LOC (location)
    - PRODUCT
    - EVENT
    - WORK_OF_ART
    - LAW
    - LANGUAGE
    - DATE
    - TIME
    - PERCENT
    - MONEY
    - QUANTITY
    - ORDINAL
    - CARDINAL

In [4]:
# Entity recognition: highlight the entities found in `doc` inline.
# No `options` are passed: 'distance' is a setting of the dependency ('dep')
# visualizer and is not used by the 'ent' style.
displacy.render(doc, style='ent', jupyter=True)

In [5]:
# One line per entity in `doc`: text, character offsets into the original
# string, and the entity label.
for span in doc.ents:
    fields = (span.text, span.start_char, span.end_char, span.label_)
    print(*fields)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [6]:
# Additional entity types need to be recognized for this sentence.
# Same listing as above, this time for `doc1`.
for span in doc1.ents:
    fields = (span.text, span.start_char, span.end_char, span.label_)
    print(*fields)

Coumadin 21 29 ORG
Jantoven 31 39 GPE


In [7]:
# BIO tagging: B = token begins an entity, I = token is inside an entity,
# O = token is outside any entity.
for tok in doc:
    iob_tag, entity_type = tok.ent_iob_, tok.ent_type_
    print(tok.text, iob_tag, entity_type)

Apple B ORG
is O 
looking O 
at O 
buying O 
U.K. B GPE
startup O 
for O 
$ B MONEY
1 I MONEY
billion I MONEY


## add additional entity type

In [8]:
# Display the medical sentence again before custom entities are attached to it.
doc1

Typically, warfarin (Coumadin, Jantoven), used to prevent blood clots, usually works well and isn't bothersome, but serious internal bleeding can happen in the wrong situation.

In [9]:
# Entities currently set on `doc1`, as text/label pairs.
for span in doc1.ents:
    text, label = span.text, span.label_
    print(text, label)

Coumadin ORG
Jantoven GPE


In [10]:
# To add the drug name and symptoms from the sentence above as example
# entities, spans are built by hand with spaCy's Span class.
from spacy.tokens import Span

In [11]:
# Token index of the first token whose text is exactly "warfarin";
# it is used as the start of the DRUG span.
drug_st_idx = [token.i for token in doc1 if token.text == 'warfarin'][0]
print(drug_st_idx)

2


In [12]:
# Token indices of the first "blood" and the first "serious" token; each one
# marks the start of a PHENOTYPE span.
phenotype_st_idx1 = [token.i for token in doc1 if token.text == 'blood'][0]
phenotype_st_idx2 = [token.i for token in doc1 if token.text == 'serious'][0]
print(phenotype_st_idx1, phenotype_st_idx2)

12 24


In [13]:
# Register the custom label in the vocab's string store; the returned value
# is what gets passed as the span label.
DRUG = doc1.vocab.strings.add('DRUG')
# One-token span starting at the drug mention.
drug_ent = Span(doc1, drug_st_idx, drug_st_idx + 1, label=DRUG)
# New span first, followed by the entities already set on the doc.
ents = [drug_ent, *doc1.ents]

In [14]:
# Register the second custom label in the vocab's string store.
PHENOTYPE = doc1.vocab.strings.add('PHENOTYPE')
# Two-token span starting at "blood" and three-token span starting at "serious".
ents.extend([
    Span(doc1, phenotype_st_idx1, phenotype_st_idx1 + 2, label=PHENOTYPE),
    Span(doc1, phenotype_st_idx2, phenotype_st_idx2 + 3, label=PHENOTYPE),
])
# Keep a single span per token range before assigning. When these cells are
# re-run, `ents` already contains the custom spans picked up from `doc1.ents`,
# and assigning duplicate (overlapping) spans to `doc.ents` raises an error.
doc1.ents = list({(span.start, span.end): span for span in ents}.values())

In [15]:
# Render `doc1` with its entities, now including the custom spans.
# No `options` are passed: 'distance' is a setting of the dependency ('dep')
# visualizer and is not used by the 'ent' style.
displacy.render(doc1, style='ent', jupyter=True)

## Custom Training

### Custom training requires GoldParse
- statistical models

Example adapted from:
- https://spacy.io/usage/training#train-entity

### entity 학습은 BILUO scheme

In [16]:
# Training examples for the entity recognizer.
# Each item is (text, {'entities': [(start_char, end_char, label), ...]}),
# where text[start_char:end_char] must be exactly the entity text: the end
# offset is exclusive and must not include surrounding whitespace or run past
# the end of the string.
train_data = [
    ("Uber blew through $1 million a week", {'entities': [(0, 4, 'ORG')]}),
    ("Test C purchased a machine learning start up last week", {'entities': [(0, 6, 'ORG')]}),
    # "Canada" occupies characters 23-28; the text is only 29 characters long.
    ("Android Pay expands to Canada", {'entities': [(0, 11, 'PRODUCT'), (23, 29, 'GPE')]}),
    # "Spotify" is 7 characters; an end offset of 8 would include the space.
    ("Spotify steps up Asia expansion", {'entities': [(0, 7, "ORG"), (17, 21, "LOC")]}),
    ("Google Maps launches location sharing", {'entities': [(0, 11, "PRODUCT")]}),
    ("Google rebrands its business apps", {'entities': [(0, 6, "ORG")]}),
    ("look what i found on google! 😂", {'entities': [(21, 27, "PRODUCT")]})]

In [17]:
# Seed the shuffle so the run is repeatable under Restart & Run All.
random.seed(0)

# The training examples only carry entity annotations, so every pipeline
# component except the entity recognizer is disabled while updating.
other_pipes = [name for name in nlp.pipe_names if name != 'ner']
with nlp.disable_pipes(*other_pipes):
    # begin_training() is meant for new models and can re-initialize the
    # weights of a loaded pipeline; use resume_training() when this spaCy
    # version provides it, and fall back to the original call otherwise.
    if hasattr(nlp, 'resume_training'):
        optimizer = nlp.resume_training()
    else:
        optimizer = nlp.begin_training()
    for epoch in range(20):
        # Present the examples in a different order on every pass.
        random.shuffle(train_data)
        for text, annotations in train_data:
            nlp.update([text], [annotations], drop=0.5, sgd=optimizer)
# Saved outside the `with` block so the restored components are written too.
nlp.to_disk('./model')



In [18]:
# Run the updated pipeline on a new sentence that reuses entity strings from
# the training examples ("Test C", "Android Pay").
doc3 = nlp('Test C has acquired an Android Pay company last week')
doc3

Test C has acquired an Android Pay company last week

In [19]:
# Entities predicted for `doc3`, printed as span and label.
for span in doc3.ents:
    label = span.label_
    print(span, label)

Test C ORG
Android Pay PRODUCT
