# 다중 언어 개체명 인식

# Data set

In [1]:
import pandas as pd

# Toy example: one sentence with a B-/I-/O entity tag per token, shown as a
# two-row table (tokens on top, tags underneath).
toks = "Jeff Dean is a computer scientist at Google in California".split()
lbls = ["B-PER", "I-PER"] + ["O"] * 5 + ["B-ORG", "O", "B-LOC"]
df = pd.DataFrame([toks, lbls], index=["Tokens", "Tags"])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,Jeff,Dean,is,a,computer,scientist,at,Google,in,California
Tags,B-PER,I-PER,O,O,O,O,O,B-ORG,O,B-LOC


In [2]:
from datasets import get_dataset_config_names
xtreme_subsets=get_dataset_config_names('xtreme')
print(f"XTREME 서브셋 개수 : {len(xtreme_subsets)}")

XTREME 서브셋 개수 : 183


In [3]:
panx_subsets = [s for s in xtreme_subsets if s.startswith("PAN")]
panx_subsets[:3]

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']

In [4]:
from datasets import load_dataset
load_dataset("xtreme", name ='PAN-X.de')

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 20000
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'langs'],
        num_rows: 10000
    })
})

In [5]:
from collections import defaultdict
from datasets import DatasetDict

# Languages to load and, for each, the fraction of every split to keep.
langs = ['de', 'fr', 'it', 'en']
fracs = [0.629, 0.229, 0.084, 0.059]
# defaultdict creates an empty DatasetDict the first time a language key is used.
panx_ch = defaultdict(DatasetDict)

for lang, frac in zip(langs, fracs):
    # Load the PAN-X subset for this language.
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    # Shuffle each split with a fixed seed, then keep only the leading
    # `frac` share of its rows (truncated to an integer row count).
    for split in ds:
        shuffled = ds[split].shuffle(seed=0)
        keep = int(frac * ds[split].num_rows)
        panx_ch[lang][split] = shuffled.select(range(keep))

In [6]:
import pandas as pd

pd.DataFrame({lang: [panx_ch[lang]['train'].num_rows] for lang in langs},
            index=['Number of training example'])

Unnamed: 0,de,fr,it,en
Number of training example,12580,4580,1680,1180


In [7]:
element = panx_ch['de']["train"][0]
for key, value in element.items():
    print(f"{key}: {value}")

tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']


In [8]:
for key, value in panx_ch['de']["train"].features.items():
    print(f"{key}: {value}")

tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


In [9]:
panx_ch['de']['train'].features['ner_tags']

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)

In [10]:
tags = panx_ch['de']['train'].features['ner_tags'].feature
print(tags)

ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


In [11]:
def create_tag_name(batch):
    """Return a new ``ner_tags_str`` column holding tag names for ``ner_tags``.

    Each id in ``batch['ner_tags']`` is converted with ``int2str`` of the
    module-level ``tags`` object, looked up at call time. The result is a
    dict so it can be used as a ``Dataset.map`` callback.
    """
    tag_names = list(map(tags.int2str, batch['ner_tags']))
    return {'ner_tags_str': tag_names}

panx_de = panx_ch['de'].map(create_tag_name)

In [12]:
de_example=panx_de['train'][0]
pd.DataFrame([de_example['tokens'], de_example['ner_tags_str']],
            ['Tokens','tags'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
tags,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


# 한국어 테스트 하여 확인하기


In [13]:
ko_dataset=load_dataset("xtreme",'PAN-X.ko')

In [14]:
tags=ko_dataset['train'].features['ner_tags'].feature

def create_ner_str_ko(batch):
    """Build the ``ner_tags_str`` column for the Korean dataset.

    Converts every id in ``batch['ner_tags']`` to its tag name through
    ``int2str`` of the module-level ``tags`` object and returns the names
    under the ``'ner_tags_str'`` key.
    """
    tag_names = [tags.int2str(tag_id) for tag_id in batch['ner_tags']]
    return {'ner_tags_str': tag_names}
panx_ko=ko_dataset.map(create_ner_str_ko)

In [15]:
panx_ko_example = panx_ko['train'][0]
pd.DataFrame(
    data = [panx_ko_example['tokens'],panx_ko_example['ner_tags_str']],
    index = ['tokens', 'ner_tags_str']
)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
tokens,현재,대한민국,K리그,챌린지의,서울,이랜드,FC에서,활약하고,있다,.
ner_tags_str,O,B-LOC,B-ORG,I-ORG,B-ORG,I-ORG,I-ORG,O,O,O


In [16]:
from collections import Counter

# Count entities per split and per entity type. Only "B-" tags are counted,
# so every entity contributes exactly once regardless of its length.
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    for row in dataset['ner_tags_str']:
        # "B-LOC" -> "LOC": the entity type is the part after the dash.
        entity_types = [tag.split("-")[1] for tag in row if tag.startswith("B")]
        if entity_types:
            # Guarded so that a split without any entity gets no entry at all.
            split2freqs[split].update(entity_types)
pd.DataFrame.from_dict(split2freqs, orient='index')
    

Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071


# XLM-R 토큰화

In [17]:
import huggingface_hub
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
from transformers import AutoTokenizer
bert_model_name = "google-bert/bert-base-cased"
xlmr_model_name = "xlm-roberta-base"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)


In [19]:
text = "Jack Sparrow loves New York!"
bert_tokens = bert_tokenizer(text).tokens()
xlmr_tokens = xlmr_tokenizer(text).tokens()

In [20]:
import pandas as pd
pd.DataFrame([bert_tokens,xlmr_tokens],
            ['BERT','XLM-R'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
BERT,[CLS],Jack,Spa,##rrow,loves,New,York,!,[SEP],
XLM-R,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>


In [21]:
"".join(xlmr_tokens).replace(u"\u2581", " ")

'<s> Jack Sparrow loves New York!</s>'

In [22]:
'''
바디만 로드하고 헤드는 직접 만들어 토큰 분류를 실습해본다.


'''

'\n바디만 로드하고 헤드는 직접 만들어 토큰 분류를 실습해본다.\n\n\n'

In [23]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

# 클래스 상속 및 초기화
class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa encoder body with a hand-written token-classification head.

    The head is dropout followed by a linear layer that maps every token's
    hidden state to ``config.num_labels`` logits.
    """

    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Build the encoder body and the classification head from ``config``."""
        super().__init__(config)
        self.num_labels = config.num_labels
        # Encoder body, created without the pooling layer.
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Token-classification head.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Initialise the weights.
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                labels=None, **kwargs):
        """Run the encoder and the head; compute the loss when labels are given.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
            token_type_ids: Optional token type ids passed to the encoder.
            labels: Optional label ids; flattened and compared with the
                flattened logits using cross-entropy.
            **kwargs: Forwarded unchanged to the encoder.

        Returns:
            TokenClassifierOutput with ``loss`` (``None`` when ``labels`` is
            not given), ``logits`` and the encoder's ``hidden_states`` and
            ``attentions``.
        """
        encoder_out = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs,
        )
        # The first element of the encoder output feeds the head.
        hidden = encoder_out[0]
        logits = self.classifier(self.dropout(hidden))

        if labels is None:
            loss = None
        else:
            # Flatten so that every token position is one classification sample.
            flat_logits = logits.view(-1, self.num_labels)
            loss = nn.CrossEntropyLoss()(flat_logits, labels.view(-1))

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=encoder_out.hidden_states,
            attentions=encoder_out.attentions,
        )

In [24]:
# Lookup tables between label ids and tag names, in both directions; they are
# handed to the model config as id2label / label2id.
index2tag = dict(enumerate(tags.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}

In [25]:
from transformers import AutoConfig
# AutoConfig의 from_pretrained를 통해 모델 이름, 클래스 개수, 룩업 테이블을 설정한다.
xlmr_config = AutoConfig.from_pretrained(xlmr_model_name,
                                        num_labels = tags.num_classes,
                                        id2label= index2tag, label2id=tag2index)

In [26]:
import torch
#디바이스 설정 및 모델 로드 , 로드시 from_pretrained를 통해 적어놓은 config를 넣어 초기화 한다.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
xlmr_model = (XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config = xlmr_config)
             .to(device))

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
print(text)
xlmr_tokenizer.encode(text, return_tensors="pt")


Jack Sparrow loves New York!


tensor([[    0, 21763, 37456, 15555,  5161,     7,  2356,  5753,    38,     2]])

In [28]:
#텍스트를 인코딩 해서 토큰과 inputids에 대해서 확인 해본다.
input_ids=xlmr_tokenizer.encode(text, return_tensors='pt')
pd.DataFrame([xlmr_tokens, input_ids[0].numpy()], 
            index = ['Tokens','Input IDs'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Input IDs,0,21763,37456,15555,5161,7,2356,5753,38,2


In [29]:
# 테스트 text의 input_ids를 텐서로 바꾸고, 학습되지 않은 모델에 넣고 로짓값을 출력한다.
outputs=xlmr_model(input_ids.to(device)).logits
# 출력한 로짓값은 (batch, seq, label_size)이므로  label shape에서 최대값을 구한다. -> (batch, seq)
predictions = torch.argmax(outputs,dim=-1)
print(f"시퀀스에 있는 토큰 갯수 : {len(xlmr_tokens)}")
print(f"출력크기 : {predictions.shape}")

시퀀스에 있는 토큰 갯수 : 10
출력크기 : torch.Size([1, 10])


In [30]:
# 헬퍼 함수 생성한다.
def tag_text(text, tags, model, tokenizer):
    """Tag ``text`` with ``model`` and show tokens next to predicted tags.

    The text is encoded once with the ``tokenizer`` argument, so the tokens
    shown and the ids fed to the model always come from the same tokenizer.

    Args:
        text: Raw input string.
        tags: Object whose ``names`` sequence maps a predicted class id to
            its tag name.
        model: Callable taking the input-id tensor; element ``[0]`` of its
            output must be logits laid out as (batch, sequence, label).
        tokenizer: Tokenizer used for both the displayed tokens and the
            model input.

    Returns:
        pandas.DataFrame with a ``'Tokens'`` row and a ``'Tags'`` row.

    Note:
        The input ids are moved to the module-level ``device``.
    """
    # Encode once and reuse the result for both the tokens and the ids.
    encoding = tokenizer(text, return_tensors='pt')
    tokens = encoding.tokens()
    input_ids = encoding.input_ids.to(device)
    # Pure inference: no gradients are needed for an argmax over the logits.
    with torch.no_grad():
        outputs = model(input_ids)[0]
    # (batch, seq, label) -> most likely label id per token.
    predictions = torch.argmax(outputs, dim=2)
    # Map the label ids of the single sequence in the batch to tag names.
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index = ['Tokens', 'Tags'])

In [31]:
words, labels = de_example["tokens"], de_example["ner_tags"]

In [32]:
# 단어 단위로 이미 분리된 토큰을 토크나이저에 넣어(is_split_into_words=True) input ids를 얻는다.
tokenized_input = xlmr_tokenizer(de_example["tokens"], is_split_into_words=True)
# 토큰화 된 input을 다시 토큰으로 바꿔 어떻게 토큰화 되는지 확인해 본다.
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
pd.DataFrame([tokens], index=["Tokens"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>


In [33]:
# 토크나이저로 변환된 토큰에 word_ids()를 통해 원래 단어의 인덱스를 붙여 확인한다.
word_ids = tokenized_input.word_ids()
pd.DataFrame([tokens, word_ids], index=["Tokens", "Word IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,


In [34]:
# Tokenize the word-level German example and recover, for every sub-token,
# the index of the word it came from.
token_ids = xlmr_tokenizer(de_example['tokens'], is_split_into_words=True)
tokenized_labels = xlmr_tokenizer.convert_ids_to_tokens(token_ids['input_ids'])
word_ids = token_ids.word_ids()

previous_word_idx = None
label_ids = []

for word_idx in word_ids:
    # Special tokens (word id None) and every sub-token after the first one of
    # a word get -100; only the first sub-token carries the word's label.
    if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100)
    else:
        # Read the integer tags from the example itself rather than from the
        # global `labels`, which is rebound to tag strings further down in
        # this file; that keeps this cell safe to re-run.
        label_ids.append(de_example['ner_tags'][word_idx])
    previous_word_idx = word_idx

# Human-readable view of the aligned labels; -100 is displayed as "IGN".
processed_labels = [index2tag[label_id] if label_id != -100 else "IGN"
                    for label_id in label_ids]
pd.DataFrame([tokenized_labels, word_ids, label_ids, processed_labels],
            ["Token", "Word_ids","Label ids",'processed_labels'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Token,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word_ids,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,
Label ids,-100,0,0,-100,0,0,5,-100,-100,6,...,5,-100,-100,-100,6,-100,-100,0,-100,-100
processed_labels,IGN,O,O,IGN,O,O,B-LOC,IGN,IGN,I-LOC,...,B-LOC,IGN,IGN,IGN,I-LOC,IGN,IGN,O,IGN,IGN


In [35]:
# A word that was split into several sub-tokens must contribute to training
# only once, so only its first sub-token keeps the label.
# e.g. "Danziger" -> word ids 4,4,4 -> label ids 5,-100,-100

# Word index seen on the previous iteration.
previous_word_idx = None
# Final per-sub-token label ids.
label_ids = []

for word_idx in word_ids:
    # Special tokens (word id None) and repeated word ids get -100, the default
    # `ignore_index` of nn.CrossEntropyLoss, so they are left out of the loss.
    if word_idx is None or word_idx == previous_word_idx:
        label_ids.append(-100)
    else:
        # Read the integer tags from the example itself: `labels` is rebound
        # to strings just below, so reading it here would break a second run.
        label_ids.append(de_example["ner_tags"][word_idx])
    previous_word_idx = word_idx
# Human-readable labels; -100 is displayed as "IGN".
labels = [index2tag[l] if l != -100 else "IGN" for l in label_ids]
index = ["Tokens", "Word IDs", "Label IDs", "Labels"]


pd.DataFrame([tokens, word_ids, label_ids, labels], index = index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,
Label IDs,-100,0,0,-100,0,0,5,-100,-100,6,...,5,-100,-100,-100,6,-100,-100,0,-100,-100
Labels,IGN,O,O,IGN,O,O,B-LOC,IGN,IGN,I-LOC,...,B-LOC,IGN,IGN,IGN,I-LOC,IGN,IGN,O,IGN,IGN


In [36]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of word-split examples and align NER labels to sub-tokens.

    Args:
        examples: Batch with ``'tokens'`` (one list of words per example) and
            ``'ner_tags'`` (one list of integer tags per example, indexed by
            word position).

    Returns:
        The tokenizer output for the batch with an added ``'labels'`` entry:
        one list per example in which the first sub-token of every word
        carries that word's tag and all other positions (special tokens and
        later sub-tokens of the same word) are -100.

    Note:
        Uses the module-level ``xlmr_tokenizer``.
    """
    tokenized_inputs = xlmr_tokenizer(examples['tokens'], truncation=True,
                                      is_split_into_words=True)
    labels = []

    for idx, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        # Reset for every example so that no state leaks from the previous one
        # and the first comparison never reads an unassigned variable.
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None or word_idx == previous_word_idx:
                # Special token or continuation of the previous word.
                label_ids.append(-100)
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs