## char 단위 BIO 태그를 tokenizer를 사용하여 token 단위 BIO 태그로 바꾸기

## klue data loading 

In [1]:
# Read the whole KLUE NER training file into memory as a single string.
# The encoding is given explicitly so the Korean text is decoded the same way
# on every platform instead of depending on the locale's default codec.
with open('../../file/klue-ner-v1_train.tsv', 'r', encoding='utf-8') as f:
    corpus = f.read()

In [2]:
# Split the raw text on blank lines, giving one string per blank-line-separated record.
corpus = corpus.split('\n\n')

In [3]:
# Peek at the second record to check the raw layout: a '##' header line followed by
# one "character<TAB>tag" line per character.
corpus[1]

'## klue-ner-v1_train_00001_nsmc\t<한군데:QT>서 필름을 너무 낭비한 작품입니다.\n한\tB-QT\n군\tI-QT\n데\tI-QT\n서\tO\n \tO\n필\tO\n름\tO\n을\tO\n \tO\n너\tO\n무\tO\n \tO\n낭\tO\n비\tO\n한\tO\n \tO\n작\tO\n품\tO\n입\tO\n니\tO\n다\tO\n.\tO'

In [4]:
# For every record that contains at least one tab, keep only its tab-separated
# lines and split each of them into its fields. A line without a tab would split
# into a single field, so testing for the tab up front drops exactly those lines.
splitted_corpus = [
    [line.split('\t') for line in record.split('\n') if '\t' in line]
    for record in corpus
    if '\t' in record
]

In [5]:
# Keep only the [character, tag] rows whose second field looks like a BIO tag
# ('B-…', 'I-…' or exactly 'O'); this filters out rows such as the record header.
corpus_pair = [
    [
        [char, bio]
        for char, bio in rows
        if bio == 'O' or bio.startswith(('B-', 'I-'))
    ]
    for rows in splitted_corpus
]

In [6]:
# Rebuild each sentence as a plain string from its characters ...
corpus_char = [''.join(char for char, _ in pairs) for pairs in corpus_pair]
# ... and keep the matching character-level tags as a parallel list per sentence.
corpus_bio = [[tag for _, tag in pairs] for pairs in corpus_pair]

## tokenizer loading 

In [7]:
from tokenizers import Tokenizer

In [8]:
import os

# Load the serialized tokenizer from disk.
# '~' is shell shorthand and is not resolved when a file is opened by path,
# so expand it to the user's home directory before handing the path over.
tokenizer_path = os.path.expanduser('~/torch_study/paper/file/bert/vocab.json')
tokenizer = Tokenizer.from_file(tokenizer_path)

In [9]:
# Token strings for every sentence, in the same order as corpus_char.
tokenized_char = [encoding.tokens for encoding in map(tokenizer.encode, corpus_char)]

In [10]:
# Show the per-token offsets: one (start, end) character span into the input string per token.
tokenizer.encode('안녕하십니까').offsets

[(0, 2), (2, 3), (3, 6)]

In [11]:
# Token strings regarded as special.
# NOTE(review): nothing in the code visible here reads this list — confirm it is still needed.
special_token = ['[UNK]', '[SEP]', '[CLS]']

In [12]:
import re

In [13]:
def get_token_labels(text, label, tokenizer, special_label='O'):
    """Convert character-level BIO tags into one tag per tokenizer token.

    Each token receives the tag of the first non-whitespace character inside
    the span reported by the tokenizer's offsets. The tag is looked up
    directly by character position rather than through a running counter, so
    a character the tokenizer drops cannot shift the tags of later tokens.

    Args:
        text: The raw sentence that is passed to ``tokenizer.encode``.
        label: Sequence of tags aligned with ``text``; ``label[i]`` is the tag
            of ``text[i]`` (whitespace characters included).
        tokenizer: Object whose ``encode(text)`` result exposes ``offsets``, a
            sequence of ``(start, end)`` character spans into ``text``.
        special_label: Tag given to tokens whose span is empty or covers only
            whitespace (for example special tokens reported with a ``(0, 0)``
            offset). Defaults to ``'O'``.

    Returns:
        A list with one tag per token, in token order.

    Raises:
        IndexError: If ``label`` is shorter than the character position a
            token points at.
    """
    encoding = tokenizer.encode(text)
    token_labels = []
    for start, end in encoding.offsets:
        # Skip leading whitespace inside the span so the tag always comes
        # from a real character of the token.
        pos = start
        while pos < end and not text[pos].strip():
            pos += 1
        if pos < end:
            # The first covered character decides the tag of the whole token.
            token_labels.append(label[pos])
        else:
            # Zero-width or whitespace-only span: no character to take a tag from.
            token_labels.append(special_label)
    return token_labels

In [14]:
# Index of the example sentence inspected in the following cells.
idx = 10

In [15]:
# Raw text of the example sentence.
corpus_char[idx]

'중국 후난(湖南)성 창샤(長沙)시 우자링(五家岭)가 한 시장에서 14일 오전 10시 15분께 칼부림 사건이 일어나 5명이 숨지고 1명이 부상했다고 중신넷이 14일 보도했다.'

In [16]:
# Token strings the tokenizer produces for the example sentence.
tokenizer.encode(corpus_char[idx]).tokens

['중국',
 '후',
 '##난',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '성',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '시',
 '우',
 '##자',
 '##링',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '[UNK]',
 '가',
 '한',
 '시장',
 '##에',
 '##서',
 '14',
 '##일',
 '오전',
 '10',
 '##시',
 '15',
 '##분',
 '##께',
 '칼',
 '##부',
 '##림',
 '사건',
 '##이',
 '일어나',
 '5',
 '##명',
 '##이',
 '숨',
 '##지',
 '##고',
 '1',
 '##명',
 '##이',
 '부상',
 '##했',
 '##다고',
 '중',
 '##신',
 '##넷',
 '##이',
 '14',
 '##일',
 '보도',
 '##했',
 '##다',
 '.']

In [17]:
# Project the example sentence's character-level tags onto its tokens.
labels = get_token_labels(corpus_char[idx], corpus_bio[idx], tokenizer)

In [18]:
# First ten character-level tags of the first sentence (index 0, not idx).
corpus_bio[0][:10]

['O', 'O', 'O', 'B-LC', 'I-LC', 'I-LC', 'I-LC', 'I-LC', 'I-LC', 'O']

In [19]:
from itertools import zip_longest

In [20]:
# Pair every character of the example sentence with its character-level tag.
list(zip(corpus_char[idx], corpus_bio[idx]))

[('중', 'B-LC'),
 ('국', 'I-LC'),
 (' ', 'I-LC'),
 ('후', 'I-LC'),
 ('난', 'I-LC'),
 ('(', 'I-LC'),
 ('湖', 'I-LC'),
 ('南', 'I-LC'),
 (')', 'I-LC'),
 ('성', 'I-LC'),
 (' ', 'I-LC'),
 ('창', 'I-LC'),
 ('샤', 'I-LC'),
 ('(', 'I-LC'),
 ('長', 'I-LC'),
 ('沙', 'I-LC'),
 (')', 'I-LC'),
 ('시', 'I-LC'),
 (' ', 'I-LC'),
 ('우', 'I-LC'),
 ('자', 'I-LC'),
 ('링', 'I-LC'),
 ('(', 'I-LC'),
 ('五', 'I-LC'),
 ('家', 'I-LC'),
 ('岭', 'I-LC'),
 (')', 'I-LC'),
 ('가', 'I-LC'),
 (' ', 'O'),
 ('한', 'O'),
 (' ', 'O'),
 ('시', 'O'),
 ('장', 'O'),
 ('에', 'O'),
 ('서', 'O'),
 (' ', 'O'),
 ('1', 'B-DT'),
 ('4', 'I-DT'),
 ('일', 'I-DT'),
 (' ', 'O'),
 ('오', 'B-TI'),
 ('전', 'I-TI'),
 (' ', 'I-TI'),
 ('1', 'I-TI'),
 ('0', 'I-TI'),
 ('시', 'I-TI'),
 (' ', 'I-TI'),
 ('1', 'I-TI'),
 ('5', 'I-TI'),
 ('분', 'I-TI'),
 ('께', 'O'),
 (' ', 'O'),
 ('칼', 'O'),
 ('부', 'O'),
 ('림', 'O'),
 (' ', 'O'),
 ('사', 'O'),
 ('건', 'O'),
 ('이', 'O'),
 (' ', 'O'),
 ('일', 'O'),
 ('어', 'O'),
 ('나', 'O'),
 (' ', 'O'),
 ('5', 'B-QT'),
 ('명', 'I-QT'),
 ('이', 'O'),


In [21]:
# Pair each token with its projected tag. zip_longest pads the shorter side with
# None, so a length mismatch between tokens and labels shows up in the output.
list(zip_longest(tokenized_char[idx], labels))

[('중국', 'B-LC'),
 ('후', 'I-LC'),
 ('##난', 'I-LC'),
 ('[UNK]', 'I-LC'),
 ('[UNK]', 'I-LC'),
 ('[UNK]', 'I-LC'),
 ('[UNK]', 'I-LC'),
 ('성', 'I-LC'),
 ('[UNK]', 'I-LC'),
 ('[UNK]', 'I-LC'),
 ('[UNK]', 'I-LC'),
 ('[UNK]', 'I-LC'),
 ('[UNK]', 'I-LC'),
 ('시', 'I-LC'),
 ('우', 'I-LC'),
 ('##자', 'I-LC'),
 ('##링', 'I-LC'),
 ('[UNK]', 'I-LC'),
 ('[UNK]', 'I-LC'),
 ('[UNK]', 'I-LC'),
 ('[UNK]', 'I-LC'),
 ('[UNK]', 'I-LC'),
 ('가', 'I-LC'),
 ('한', 'O'),
 ('시장', 'O'),
 ('##에', 'O'),
 ('##서', 'O'),
 ('14', 'B-DT'),
 ('##일', 'I-DT'),
 ('오전', 'B-TI'),
 ('10', 'I-TI'),
 ('##시', 'I-TI'),
 ('15', 'I-TI'),
 ('##분', 'I-TI'),
 ('##께', 'O'),
 ('칼', 'O'),
 ('##부', 'O'),
 ('##림', 'O'),
 ('사건', 'O'),
 ('##이', 'O'),
 ('일어나', 'O'),
 ('5', 'B-QT'),
 ('##명', 'I-QT'),
 ('##이', 'O'),
 ('숨', 'O'),
 ('##지', 'O'),
 ('##고', 'O'),
 ('1', 'B-QT'),
 ('##명', 'I-QT'),
 ('##이', 'O'),
 ('부상', 'O'),
 ('##했', 'O'),
 ('##다고', 'O'),
 ('중', 'B-OG'),
 ('##신', 'I-OG'),
 ('##넷', 'I-OG'),
 ('##이', 'O'),
 ('14', 'B-DT'),
 ('##일', 'I-DT'),
