# 命名實體標注(Named Entity Recognition, NER)

## 載入相關套件

In [1]:
from transformers import pipeline

## 載入模型

In [2]:
nlp = pipeline("ner")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english)


## 測試

In [3]:
# 測試資料
sequence = "Hugging Face Inc. is a company based in New York City. " \
           "Its headquarters are in DUMBO, therefore very" \
           "close to the Manhattan Bridge."

# 推測答案
import pandas as pd
df = pd.DataFrame(nlp(sequence))
df

Unnamed: 0,entity,score,index,word,start,end
0,I-ORG,0.999511,1,Hu,0,2
1,I-ORG,0.989597,2,##gging,2,7
2,I-ORG,0.99797,3,Face,8,12
3,I-ORG,0.999376,4,Inc,13,16
4,I-LOC,0.999341,11,New,40,43
5,I-LOC,0.999193,12,York,44,48
6,I-LOC,0.999341,13,City,49,53
7,I-LOC,0.986336,19,D,79,80
8,I-LOC,0.939624,20,##UM,80,82
9,I-LOC,0.912138,21,##BO,82,84


## 結合Tokenizer

In [4]:
# 載入相關套件
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

# 結合分詞器(Tokenizer)
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [8]:
# NER 類別 
label_list = [
    "O",       # 非實體
    "B-MISC",  # 雜項實體的開頭，接在另一雜項實體的後面
    "I-MISC",  # 雜項實體
    "B-PER",   # 人名的開頭，接在另一人名的後面
    "I-PER",   # 人名
    "B-ORG",   # 組織的開頭，接在另一組織的後面
    "I-ORG",   # 組織
    "B-LOC",   # 地名的開頭，接在另一地名的後面
    "I-LOC"    # 地名
]

# 測試資料
sequence = "Hugging Face Inc. is a company based in New York City. " \
           "Its headquarters are in DUMBO, therefore very" \
           "close to the Manhattan Bridge."

# 推測答案
inputs = tokenizer(sequence, return_tensors="pt")
tokens = inputs.tokens()

outputs = model(**inputs).logits
predictions = torch.argmax(outputs, dim=2)

for token, prediction in zip(tokens, predictions[0].numpy()):
    print((token, model.config.id2label[prediction]))

('[CLS]', 'O')
('Hu', 'I-ORG')
('##gging', 'I-ORG')
('Face', 'I-ORG')
('Inc', 'I-ORG')
('.', 'O')
('is', 'O')
('a', 'O')
('company', 'O')
('based', 'O')
('in', 'O')
('New', 'I-LOC')
('York', 'I-LOC')
('City', 'I-LOC')
('.', 'O')
('Its', 'O')
('headquarters', 'O')
('are', 'O')
('in', 'O')
('D', 'I-LOC')
('##UM', 'I-LOC')
('##BO', 'I-LOC')
(',', 'O')
('therefore', 'O')
('very', 'O')
('##c', 'O')
('##lose', 'O')
('to', 'O')
('the', 'O')
('Manhattan', 'I-LOC')
('Bridge', 'I-LOC')
('.', 'O')
('[SEP]', 'O')
