## char 단위 BIO 태그를 tokenizer를 사용하여 token 단위 BIO 태그로 바꾸기

## klue data loading 

In [1]:
# Read the whole KLUE-NER training file into memory as one string.
# The file holds Korean text, so the encoding is pinned explicitly; without it
# open() falls back to the platform's locale encoding and the read can fail or
# produce mojibake on non-UTF-8 systems.
with open('../../file/klue-ner-v1_train.tsv', 'r', encoding='utf-8') as f:
    corpus = f.read()

In [2]:
# Records are separated by a blank line, so split the raw string into one
# string per record (rebinding `corpus` from str to list of str).
corpus = corpus.split('\n\n')

In [3]:
# Inspect one raw record: a '## <id>\t<annotated sentence>' header line
# followed by one '<char>\t<tag>' line per character.
corpus[1]

'## klue-ner-v1_train_00001_nsmc\t<한군데:QT>서 필름을 너무 낭비한 작품입니다.\n한\tB-QT\n군\tI-QT\n데\tI-QT\n서\tO\n \tO\n필\tO\n름\tO\n을\tO\n \tO\n너\tO\n무\tO\n \tO\n낭\tO\n비\tO\n한\tO\n \tO\n작\tO\n품\tO\n입\tO\n니\tO\n다\tO\n.\tO'

In [4]:
# For every record that contains at least one tab, break it into lines, split
# each line on tabs, and keep only the lines that produced two or more fields.
splitted_corpus = [
    [
        fields
        for fields in (line.split('\t') for line in record.split('\n'))
        if len(fields) > 1
    ]
    for record in corpus
    if '\t' in record
]

In [5]:
# Keep only the real (character, tag) rows of each sentence.
# A row is kept when its tag is exactly 'O' or starts with 'B-' / 'I-', and its
# first field is not the '## <id>' record header. Checking only the first
# letter of the second field is not enough: the header row's second field is
# the raw sentence, which may itself begin with 'B', 'I' or 'O', and an empty
# tag field would raise IndexError on bio[0].
corpus_pair = [
    [
        [char, bio]
        for char, bio in sentence
        if not char.startswith('##')
        and (bio == 'O' or bio.startswith(('B-', 'I-')))
    ]
    for sentence in splitted_corpus
]

In [6]:
# Split the (character, tag) pairs into two parallel per-sentence views:
# the sentence text rebuilt from its characters, and the list of its tags.
corpus_char = [''.join(pair[0] for pair in sentence) for sentence in corpus_pair]
corpus_bio = [[pair[1] for pair in sentence] for sentence in corpus_pair]

## tokenizer loading 

In [7]:
from tokenizers import Tokenizer

In [8]:
# Load a serialized tokenizer from disk.
# NOTE(review): the path is absolute and specific to one machine; adjust it
# before running elsewhere.
tokenizer_path =  '/home/long8v/torch_study/paper/file/bert/vocab.json'
tokenizer = Tokenizer.from_file(tokenizer_path)

In [11]:
# Token strings that stand for something other than literal text
# (unknown-word placeholder, separator, classification marker).
# NOTE(review): other special tokens the vocabulary may define (e.g. padding
# or mask tokens) are not listed here — confirm whether that is intended.
special_token = ['[UNK]', '[SEP]', '[CLS]']

In [112]:
def get_token_labels(text, label, tokenizer):
    """Project character-level BIO tags onto tokenizer tokens.

    Each token receives the tag of the first non-whitespace character it
    covers in ``text``. The token-to-character alignment is taken from the
    per-token ``(start, end)`` character offsets of the encoding instead of
    being reconstructed from token strings, so tokens whose surface form
    differs from the text (such as ``[UNK]``) are aligned without any string
    searching.

    Args:
        text: The sentence as a plain string.
        label: One BIO tag per character of ``text`` (whitespace included),
            in the same order.
        tokenizer: Object whose ``encode(text)`` returns an encoding exposing
            ``offsets``: one ``(start, end)`` character span into ``text``
            per token, as Hugging Face ``tokenizers`` encodings do.

    Returns:
        A list with one tag per token, in token order. Tokens that cover no
        characters (zero-width spans, e.g. inserted special tokens) get 'O'.

    Raises:
        ValueError: If ``text`` and ``label`` differ in length, since the
            tags could then not be aligned to characters.
    """
    if len(text) != len(label):
        raise ValueError(
            f'text has {len(text)} characters but label has {len(label)} tags'
        )

    encoding = tokenizer.encode(text)
    token_labels = []
    for start, end in encoding.offsets:
        # Some tokenizers fold the preceding space into the token span; step
        # over it so the tag comes from a real character.
        while start < end and text[start].isspace():
            start += 1
        if start >= end:
            # Nothing of the text is covered by this token.
            token_labels.append('O')
        else:
            # Use the tag of the token's first character as the token's tag.
            token_labels.append(label[start])
    return token_labels

In [113]:
# Peek at the first ten tokens of the first sentence.
# NOTE(review): `tokenized_char` is not defined in the cells shown here —
# presumably a list of per-sentence token lists produced with `tokenizer`;
# confirm against the missing cells.
tokenized_char[0][:10]

['특히', '영동', '##고속도로', '강', '##릉', '방향', '문', '##막', '##휴', '##게']

In [114]:
# Show the first sentence as plain text.
corpus_char[0]

'특히 영동고속도로 강릉 방향 문막휴게소에서 만종분기점까지 5㎞ 구간에는 승용차 전용 임시 갓길차로제를 운영하기로 했다.'

In [115]:
# Compute token-level tags for the first sentence from its character-level tags.
labels = get_token_labels(corpus_char[0], corpus_bio[0], tokenizer)

In [116]:
# Show the first ten character-level tags of the first sentence.
corpus_bio[0][:10]

['O', 'O', 'O', 'B-LC', 'I-LC', 'I-LC', 'I-LC', 'I-LC', 'I-LC', 'O']

In [117]:
from itertools import zip_longest

In [118]:
# Show every character of the first sentence next to its character-level tag.
list(zip(corpus_char[0], corpus_bio[0]))

[('특', 'O'),
 ('히', 'O'),
 (' ', 'O'),
 ('영', 'B-LC'),
 ('동', 'I-LC'),
 ('고', 'I-LC'),
 ('속', 'I-LC'),
 ('도', 'I-LC'),
 ('로', 'I-LC'),
 (' ', 'O'),
 ('강', 'B-LC'),
 ('릉', 'I-LC'),
 (' ', 'O'),
 ('방', 'O'),
 ('향', 'O'),
 (' ', 'O'),
 ('문', 'B-LC'),
 ('막', 'I-LC'),
 ('휴', 'I-LC'),
 ('게', 'I-LC'),
 ('소', 'I-LC'),
 ('에', 'O'),
 ('서', 'O'),
 (' ', 'O'),
 ('만', 'B-LC'),
 ('종', 'I-LC'),
 ('분', 'I-LC'),
 ('기', 'I-LC'),
 ('점', 'I-LC'),
 ('까', 'O'),
 ('지', 'O'),
 (' ', 'O'),
 ('5', 'B-QT'),
 ('㎞', 'I-QT'),
 (' ', 'O'),
 ('구', 'O'),
 ('간', 'O'),
 ('에', 'O'),
 ('는', 'O'),
 (' ', 'O'),
 ('승', 'O'),
 ('용', 'O'),
 ('차', 'O'),
 (' ', 'O'),
 ('전', 'O'),
 ('용', 'O'),
 (' ', 'O'),
 ('임', 'O'),
 ('시', 'O'),
 (' ', 'O'),
 ('갓', 'O'),
 ('길', 'O'),
 ('차', 'O'),
 ('로', 'O'),
 ('제', 'O'),
 ('를', 'O'),
 (' ', 'O'),
 ('운', 'O'),
 ('영', 'O'),
 ('하', 'O'),
 ('기', 'O'),
 ('로', 'O'),
 (' ', 'O'),
 ('했', 'O'),
 ('다', 'O'),
 ('.', 'O')]

In [119]:
# Show every token of the first sentence next to its projected tag.
# zip_longest pads the shorter sequence with None, so a token/label count
# mismatch would be visible as a None entry instead of being truncated away.
list(zip_longest(tokenized_char[0], labels))

[('특히', 'O'),
 ('영동', 'B-LC'),
 ('##고속도로', 'I-LC'),
 ('강', 'B-LC'),
 ('##릉', 'I-LC'),
 ('방향', 'O'),
 ('문', 'B-LC'),
 ('##막', 'I-LC'),
 ('##휴', 'I-LC'),
 ('##게', 'I-LC'),
 ('##소', 'I-LC'),
 ('##에', 'O'),
 ('##서', 'O'),
 ('만', 'B-LC'),
 ('##종', 'I-LC'),
 ('##분', 'I-LC'),
 ('##기', 'I-LC'),
 ('##점', 'I-LC'),
 ('##까', 'O'),
 ('##지', 'O'),
 ('[UNK]', 'B-QT'),
 ('구', 'O'),
 ('##간', 'O'),
 ('##에', 'O'),
 ('##는', 'O'),
 ('승', 'O'),
 ('##용', 'O'),
 ('##차', 'O'),
 ('전용', 'O'),
 ('임시', 'O'),
 ('갓', 'O'),
 ('##길', 'O'),
 ('##차', 'O'),
 ('##로', 'O'),
 ('##제', 'O'),
 ('##를', 'O'),
 ('운영', 'O'),
 ('##하', 'O'),
 ('##기', 'O'),
 ('##로', 'O'),
 ('했', 'O'),
 ('##다', 'O'),
 ('.', 'O')]