In [1]:
import torch
from typing import List, Dict, Tuple, Any
from kobert_transformers import get_tokenizer
from gluonnlp.data import SentencepieceTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# tokenizer = get_tokenizer()
# tok = tokenizer('뽀로로는 남극에 사는 펭귄이 아니다.', padding=True, truncation=True) # Has Input_ids, token_type_ids, attention_mask
# tokenizer.convert_ids_to_tokens(tok['input_ids'])
# print(tokenizer.convert_)

In [3]:
label_list = ['PS', 'FD', 'TR', 'AF', 'OG', 'LC', 'CV', 'DT', 'TI', 'TI', 'QT', 'EV', 'AM', 'PT', 'MT', "TM"] 
label_fin = ['O']
label_fin += ['B-' + i for i in label_list]
label_fin += ['I-' + i for i in label_list]
label_to_idx = {label: idx for idx, label in enumerate(label_fin)}
idx_to_label = {idx: label for idx, label in enumerate(label_fin)}
print(label_to_idx, idx_to_label)

{'O': 0, 'B-PS': 1, 'B-FD': 2, 'B-TR': 3, 'B-AF': 4, 'B-OG': 5, 'B-LC': 6, 'B-CV': 7, 'B-DT': 8, 'B-TI': 10, 'B-QT': 11, 'B-EV': 12, 'B-AM': 13, 'B-PT': 14, 'B-MT': 15, 'B-TM': 16, 'I-PS': 17, 'I-FD': 18, 'I-TR': 19, 'I-AF': 20, 'I-OG': 21, 'I-LC': 22, 'I-CV': 23, 'I-DT': 24, 'I-TI': 26, 'I-QT': 27, 'I-EV': 28, 'I-AM': 29, 'I-PT': 30, 'I-MT': 31, 'I-TM': 32} {0: 'O', 1: 'B-PS', 2: 'B-FD', 3: 'B-TR', 4: 'B-AF', 5: 'B-OG', 6: 'B-LC', 7: 'B-CV', 8: 'B-DT', 9: 'B-TI', 10: 'B-TI', 11: 'B-QT', 12: 'B-EV', 13: 'B-AM', 14: 'B-PT', 15: 'B-MT', 16: 'B-TM', 17: 'I-PS', 18: 'I-FD', 19: 'I-TR', 20: 'I-AF', 21: 'I-OG', 22: 'I-LC', 23: 'I-CV', 24: 'I-DT', 25: 'I-TI', 26: 'I-TI', 27: 'I-QT', 28: 'I-EV', 29: 'I-AM', 30: 'I-PT', 31: 'I-MT', 32: 'I-TM'}


# Preprocess Part

In [4]:
'''
We will return the label of given words, using the ne_lists
We use BIO-tagging
'''
def tagging(words: List[str], ne_lists: List[Dict[str, Any]]) -> List[str] :
    results = [i if i in ['[CLS]', '[SEP]', '[PAD]'] else 'O' for i in words] # If token is not Special, initialize 'O' tag
    ps_words = [i.replace('##', '').replace('▁','') for i in words]
    ne_cnt = len(ne_lists)
    ne_idx = -1
    ne_label = 0

    for idx, word in enumerate(ps_words) :
        if results[idx] != 'O' or word == '' or word == '[UNK]':
            continue
        if word == '[UNK]' :
            continue
        # Now condition check
        if ne_idx >= 0 : 
            nw_word = ne_lists[ne_idx]['form'][ne_label:]
        else :
            nw_word = ''

        # I-tag condition
        if (len(nw_word) > 0) & (nw_word.startswith(word)) & (results[idx-1][0] == 'B' or results[idx-1][0] == 'I') :
            results[idx] = 'I-' + ne_lists[ne_idx]['label'][:2]
            ne_label += len(word)
        else : # B-tag condition
            back_idx = ne_idx
            back_label = ne_label
            while ne_idx + 1 < ne_cnt :
                ne_idx += 1
                ne_label = 0
                nw_word = ne_lists[ne_idx]['form']
                if (len(nw_word) > 0) & (nw_word.startswith(word)) :
                    results[idx] = 'B-' + ne_lists[ne_idx]['label'][:2]
                    ne_label += len(word)
                    break
            if ne_idx + 1 == ne_cnt and ne_label == 0:
                ne_idx = back_idx
                ne_label = back_label

    return results


In [5]:
sentence = "태안군의회, 2019년‘군민중심’의정성과 빛났다!"
ne = [
        {
            "id": 1,
            "form": "태안군의회",
            "label": "OGG_POLITICS",
            "begin": 0,
            "end": 5
        },
        {
            "id": 2,
            "form": "2019년",
            "label": "DT_YEAR",
            "begin": 7,
            "end": 12
        }
]

tokenizer = get_tokenizer()
tok = tokenizer(sentence, padding=True, truncation=True) # Has Input_ids, token_type_ids, attention_mask
tokens_word = tokenizer.convert_ids_to_tokens(tok['input_ids'])
print(tokens_word, tagging(tokens_word, ne), sep='\n')

['[CLS]', '▁태', '안', '군의', '회', ',', '▁20', '19', '년', '‘', '군', '민', '중심', '’', '의', '정', '성과', '▁빛', '났다', '!', '[SEP]']
['[CLS]', 'B-OG', 'I-OG', 'I-OG', 'I-OG', 'O', 'B-DT', 'I-DT', 'I-DT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '[SEP]']


# Json loads & dataframe preprocess

In [6]:
import json
import pandas as pd

with open('./dataset/NLNE2202211219.json', "r") as f :
    bef_data = json.load(f)

bef_data = bef_data['document']

df_tot = pd.DataFrame(columns=['form', 'NE'])

for r in bef_data :
    df_tot = df_tot.append(pd.DataFrame.from_records(r['sentence'], columns=['form', 'NE']))

df_tot.dropna(how='any', inplace=True)

In [8]:
# def convert_df(data: List[Any]) -> pd.DataFrame:
#     return pd.DataFrame.from_records(data['sentence'], columns=['form'])
# ex = bef_data[0]
# ex = ex['sentence']
# print(type(ex), ex[1], sep='\n')
# df = pd.DataFrame.from_records(ex, columns=['form', 'NE'])
# df