In [1]:
import torch
from typing import List, Dict, Tuple, Any
from kobert_transformers import get_tokenizer
from gluonnlp.data import SentencepieceTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Build the tokenizer provided by kobert_transformers and encode one sample
# sentence as a quick sanity check.
tokenizer = get_tokenizer()
tok = tokenizer('뽀로로는 남극에 사는 펭귄이 아니다.', padding=True, truncation=True) # Has input_ids, token_type_ids, attention_mask
# Map the encoded ids back to their token strings; `res` is kept only for inspection.
res = tokenizer.convert_ids_to_tokens(tok['input_ids'])
# Show which container type the attention mask comes back as.
print(type(tok['attention_mask']))

<class 'list'>


In [None]:
# Two-letter entity category codes. Each code must appear exactly once:
# a repeated code would give `label_fin` duplicate 'B-XX'/'I-XX' entries, so
# `label_to_idx` (which collapses duplicate keys) and `idx_to_label` (which
# keeps every position) would no longer be inverses of each other.
label_list = ['PS', 'FD', 'TR', 'AF', 'OG', 'LC', 'CV', 'DT', 'TI', 'QT', 'EV', 'AM', 'PT', 'MT', 'TM']

# BIO label vocabulary: 'O' first, then every 'B-XX', then every 'I-XX'.
label_fin = ['O']
label_fin += ['B-' + i for i in label_list]
label_fin += ['I-' + i for i in label_list]

# Forward and reverse lookups between label strings and class indices.
label_to_idx = {label: idx for idx, label in enumerate(label_fin)}
idx_to_label = {idx: label for idx, label in enumerate(label_fin)}
print(label_to_idx, idx_to_label)

# Preprocess Part

In [None]:
def tagging(words: List[str], ne_lists: List[Dict[str, Any]]) -> List[str] :
    '''
    Return one BIO tag per token in `words`, using the entities in `ne_lists`.

    The special tokens '[CLS]', '[SEP]' and '[PAD]' are copied through
    unchanged. Every other token receives 'O', 'B-XX' or 'I-XX', where XX is
    the first two characters of the matched entity's 'label'.

    Matching is purely textual (no character offsets are used):
      * a token continues the current entity ('I-') when the previous token is
        tagged 'B-'/'I-' and the not-yet-consumed part of the entity's 'form'
        starts with the token;
      * otherwise the entities AFTER the current one are scanned in order and
        the first whose 'form' starts with the token opens a new entity ('B-');
      * if neither applies the token stays 'O' and the current entity position
        is left untouched.

    Args:
        words: Tokens to tag. The '##' and '▁' sub-word markers are stripped
            before matching; tokens that are empty after stripping, and
            '[UNK]', are always tagged 'O'.
        ne_lists: Entity dicts; only the 'form' and 'label' keys are read.

    Returns:
        A list with the same length as `words`.
    '''
    special_tokens = ('[CLS]', '[SEP]', '[PAD]')
    results = [tok if tok in special_tokens else 'O' for tok in words] # Non-special tokens start as 'O'
    ps_words = [tok.replace('##', '').replace('▁', '') for tok in words]
    ne_cnt = len(ne_lists)
    ne_idx = -1   # Index of the entity currently being consumed (-1: none yet)
    ne_label = 0  # Number of characters of that entity's form already matched

    for idx, word in enumerate(ps_words) :
        if results[idx] != 'O' or word == '' or word == '[UNK]' :
            continue

        # Part of the current entity's surface form that is still unmatched.
        remaining = ne_lists[ne_idx]['form'][ne_label:] if ne_idx >= 0 else ''
        # `idx > 0` keeps results[idx - 1] from wrapping around to the last token.
        prev_in_entity = idx > 0 and results[idx - 1][0] in ('B', 'I')

        # I-tag condition (an empty `remaining` never starts with a non-empty word)
        if prev_in_entity and remaining.startswith(word) :
            results[idx] = 'I-' + ne_lists[ne_idx]['label'][:2]
            ne_label += len(word)
            continue

        # B-tag condition: only commit to a later entity when one actually
        # matches, so a miss leaves (ne_idx, ne_label) as they were.
        for cand in range(ne_idx + 1, ne_cnt) :
            if ne_lists[cand]['form'].startswith(word) :
                results[idx] = 'B-' + ne_lists[cand]['label'][:2]
                ne_idx = cand
                ne_label = len(word)
                break

    return results


In [None]:
# Worked example: one raw sentence plus its two annotated entities.
sentence = "태안군의회, 2019년‘군민중심’의정성과 빛났다!"
ne = [
    {"id": 1, "form": "태안군의회", "label": "OGG_POLITICS", "begin": 0, "end": 5},
    {"id": 2, "form": "2019년", "label": "DT_YEAR", "begin": 7, "end": 12},
]

# Encode the sentence (input_ids, token_type_ids, attention_mask) and recover
# the token strings so they can be tagged.
tokenizer = get_tokenizer()
tok = tokenizer(sentence, truncation=True, padding=True)
tokens_word = tokenizer.convert_ids_to_tokens(tok['input_ids'])

# Print the tokens on one line and their BIO tags on the next.
print("\n".join(str(row) for row in (tokens_word, tagging(tokens_word, ne))))

# JSON loading & DataFrame preprocessing

In [None]:
import json
import pandas as pd

def load_files(path='./dataset/NLNE2202211219.json') :
    """Load the sentences of a corpus JSON file into a two-column DataFrame.

    The file must contain a top-level 'document' list whose items each carry a
    'sentence' list of records. From every record only the 'form' and 'NE'
    fields are kept; rows where either of them is missing/null are dropped.

    Args:
        path: Location of the JSON file. It is read as UTF-8.

    Returns:
        A DataFrame with columns ['form', 'NE'] and a fresh 0..n-1 index, so
        rows can be addressed by position or by label interchangeably. An
        empty DataFrame with the same columns is returned when the file holds
        no sentences.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path, "r", encoding="utf-8") as f :
        bef_data = json.load(f)

    columns = ['form', 'NE']

    # One frame per document, concatenated once. DataFrame.append no longer
    # exists in pandas 2.x, and growing a frame inside a loop is quadratic.
    frames = [
        pd.DataFrame.from_records(doc['sentence'], columns=columns)
        for doc in bef_data['document']
        if doc['sentence']
    ]
    if not frames :
        return pd.DataFrame(columns=columns)

    df_tot = pd.concat(frames, ignore_index=True)

    # Dropping rows leaves gaps in the index, so renumber afterwards.
    return df_tot.dropna(how='any').reset_index(drop=True)

In [None]:
# Load the corpus from the default path and display its first row as a quick check.
tt = load_files()
tt.iloc[0]

In [None]:
# def convert_df(data: List[Any]) -> pd.DataFrame:
#     return pd.DataFrame.from_records(data['sentence'], columns=['form'])
# ex = bef_data[0]
# ex = ex['sentence']
# print(type(ex), ex[1], sep='\n')
# df = pd.DataFrame.from_records(ex, columns=['form', 'NE'])
# df

# DataLoader

In [None]:
import os
import torch
from torch.utils.data import Dataset

# Prepare the inputs for the Dataset defined below, which uses the tokenizer.
# TODO: define a collate_fn to gather the attention mask and the other per-sample fields into batches.
# tok = tokenizer('뽀로로는 남극에 사는 펭귄이 아니다.', padding=True, truncation=True) # Has Input_ids, token_type_ids, attention_mask
# tokenizer.convert_ids_to_tokens(tok['input_ids'])
df = load_files()
texts = df['form']  # 'form' column of the frame returned by load_files
ne = df['NE']  # 'NE' column; note this rebinds the name `ne` used by the earlier example cell

class CustomDataset(Dataset) :
    """Dataset yielding one tokenized sentence together with its BIO tags.

    Args:
        texts: Sequence of raw sentences (list or pandas Series).
        labels: Sequence aligned with `texts`; item i is the entity list that
            is handed to `tagging` for sentence i.
        tokenizer: Callable that returns a mapping with 'input_ids',
            'token_type_ids' and 'attention_mask', and that also provides
            `convert_ids_to_tokens`. Assumed to follow the Hugging Face
            tokenizer call signature (`max_length`, `padding`, `truncation`).
        max_len: Maximum sequence length, forwarded as `max_length`.
    """
    def __init__(self, texts, labels, tokenizer, max_len) -> None:
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.max_len = max_len

    def __len__(self) :
        # Number of samples = number of sentences.
        return len(self.texts)

    @staticmethod
    def _item_at(seq, index) :
        """Return the element at POSITION `index` of a list or pandas object."""
        # A Series that went through dropna/concat can have gaps or duplicate
        # labels, so plain seq[index] (label lookup) is not reliable there.
        positional = getattr(seq, 'iloc', None)
        return seq[index] if positional is None else positional[index]

    def __getitem__(self, index) -> Any:
        """Tokenize sentence `index` and tag its tokens.

        Returns a dict with keys 'sentence', 'input_ids', 'token_type_id',
        'attention_mask' and 'labels' (one tag string per token).
        """
        text = self._item_at(self.texts, index)
        ne_lists = self._item_at(self.labels, index)

        # Encoding holds input_ids, token_type_ids, attention_mask.
        # NOTE(review): padding=True pads to the longest sequence in the call,
        # which is a no-op for a single sentence - batching still needs a
        # collate_fn (or padding='max_length'); confirm the intended behavior.
        sentence = self.tokenizer(text, max_length = self.max_len, padding = True, truncation = True)
        tags = tagging(self.tokenizer.convert_ids_to_tokens(sentence['input_ids']), ne_lists)
        return {
            'sentence' : text,
            'input_ids' : sentence['input_ids'],
            'token_type_id' : sentence['token_type_ids'],
            'attention_mask' : sentence['attention_mask'],
            'labels' : tags
        }