In [1]:
## import packages

import torch
from transformers import BertConfig, BertModel, BertForPreTraining, BertTokenizer
from unicodedata import normalize

In [2]:
## tokenizing preprocessing
# Build the sub-character and character tokenizers from their vocab files.
# Both are created with the same option: lower-casing is switched off.
_tokenizer_options = {'do_lower_case': False}
tokenizer_subchar = BertTokenizer.from_pretrained(
    'pretrained/vocab_snu_subchar12367.txt', **_tokenizer_options
)
tokenizer_char = BertTokenizer.from_pretrained(
    'pretrained/vocab_snu_char16424.txt', **_tokenizer_options
)

# convert a string into sub-char
def to_subchar(string):
    """Return ``string`` converted to Unicode normalization form NFKD.

    NFKD is a compatibility decomposition, so a precomposed Hangul syllable
    comes back as its sequence of conjoining jamo. This decomposed text is
    what the sub-character tokenizer is fed in this notebook.

    Args:
        string: The text to decompose.

    Returns:
        The NFKD-normalized copy of ``string``.
    """
    decomposed = normalize('NFKD', string)
    return decomposed



In [3]:
sentence = '데이터 처리를 위한 문자열 예시입니다. 뱃사람. 춥다.'

# The sub-character tokenizer receives the NFKD-decomposed sentence ...
subchar_tokens = tokenizer_subchar.tokenize(to_subchar(sentence))
print("subchar: ", subchar_tokens)

# ... while the character tokenizer receives the sentence unchanged.
char_tokens = tokenizer_char.tokenize(sentence)
print("   char: ", char_tokens)

subchar:  ['데이터', '처리', '##를', '위한', '문자', '##열', '예', '##시', '##입니다', '.', '배', '##ᆺ', '##사람', '.', '추', '##ᆸ다', '.']
   char:  ['데이터', '처리', '##를', '위한', '문자', '##열', '예', '##시', '##입니다', '.', '뱃', '##사람', '.', '춥', '##다', '.']


In [4]:
## Downstream tasks (Naver Sentiment Movie Corpus)
## Based on 'KR-BERT character Bidirectional WordPiece'

Inference

In [5]:
import argparse
import pickle
import json
import torch
import torch.optim as optim
from pathlib import Path
from torch.utils.data import DataLoader
from pretrained.tokenization_ranked import FullTokenizer as KBertRankedTokenizer
from transformers import BertTokenizer, BertConfig
from model.net import SentenceClassifier
from model.data import Corpus
from model.utils import PreProcessor, PadSequence
from model.metric import evaluate, acc
from utils import Config, CheckpointManager, SummaryManager
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter

ModuleNotFoundError: No module named 'tensorflow'

In [7]:
# set path

# Relative project directories, in order: pretrained assets, dataset
# files, and fine-tuned checkpoints.
ptr_dir, data_dir, model_dir = map(Path, ('pretrained', 'data', 'checkpoints'))

In [8]:
# load configs and vocab

ptr_config = Config(ptr_dir / 'config_char16424_ranked.json')
data_config = Config(data_dir / 'config.json')
model_config = Config('finetuning_config.json')

# Parse the BERT configuration JSON; the resulting dict is later expanded
# into BertConfig(**bert_config).
with open(ptr_config.config, mode="r") as config_file:
    bert_config = json.load(config_file)

# NOTE(security): pickle.load can execute arbitrary code while unpickling,
# so ptr_config.vocab must only ever point at a file from a trusted source.
# The context manager guarantees the handle is closed; passing a bare
# open(...) straight into pickle.load would leave it open.
with open(ptr_config.vocab, mode='rb') as vocab_file:
    vocab = pickle.load(vocab_file)

In [9]:
# load preprocessor

ptr_tokenizer = KBertRankedTokenizer(ptr_config.tokenizer, do_lower_case=False)

# PadSequence is configured with the target length from the fine-tuning
# config and the vocab index of the padding token.
pad_index = vocab.to_indices(vocab.padding_token)
pad_sequence = PadSequence(length=model_config.length, pad_val=pad_index)

# NOTE(review): subchar is passed as the *string* 'False', which is truthy in
# Python. Whether that is correct depends on how PreProcessor compares the
# value (not visible here) — confirm before changing it to the boolean False.
preprocessor = PreProcessor(
    vocab=vocab,
    split_fn=ptr_tokenizer.tokenize,
    pad_fn=pad_sequence,
    subchar='False',
)

[CONVERT A STRING INTO CHAR]


In [10]:
config = BertConfig(**bert_config)
model = SentenceClassifier(config, num_classes=model_config.num_classes, vocab=preprocessor.vocab)

# Resolve the checkpoint through model_dir (defined in the path-setup cell)
# instead of repeating the 'checkpoints' directory name as a literal.
checkpoint_path = model_dir / 'best_snu_char16424_ranked.tar'

# NOTE(security): torch.load unpickles the file, so only load checkpoints
# from a trusted source. map_location keeps every tensor on the CPU.
bert_pretrained = torch.load(checkpoint_path, map_location=torch.device('cpu'))

# NOTE(review): strict=False does not raise on missing/unexpected keys; the
# returned report (displayed as this cell's output) is the only signal that
# the weights actually matched — check it after every run.
model.load_state_dict(bert_pretrained['model_state_dict'], strict=False)

<All keys matched successfully>

In [16]:
txt_1 = '이런걸 왜 돈 주고 보냐.. 진짜 최악임'
txt_2 = '짱 좋아 최고야 올해의 영화상 드립니다'
txt_3 = '그냥 나쁘지 않은 킬링타임 영화.. 근데 굳이 영화관에서 볼 필요는 없는듯?'
txt_4 = '감독은 영화 접어라'


def _encode(text):
    """Preprocess ``text`` and return the result as a tensor with one row."""
    return torch.tensor(preprocessor.preprocess(text)).view(1, -1)


# Encode the sentences in order; each token_N is a single-row tensor.
token_1, token_2, token_3, token_4 = (
    _encode(text) for text in (txt_1, txt_2, txt_3, txt_4)
)

# Stack the single-row tensors along dim 0 into one batch.
tokens = torch.cat((token_1, token_2, token_3, token_4), 0)
print('test_tokens:', tokens, sep='\n')

test_tokens:
tensor([[    2,   693,  1598,  1499,   873,  4239,   192,   790,     5,     5,
          2964,  6386,   419,     3,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,   

In [17]:
model.eval()

# Inference only: run the forward pass without recording an autograd graph,
# so the result carries no grad_fn and no activations are kept for backward.
with torch.no_grad():
    output = model(tokens)

# Take the highest-scoring column per row; index 0 is reported as NEGATIVE
# and any other index as POSITIVE (same mapping as the former `o < 0.5`
# comparison on the integer class index).
predicted = output.argmax(dim=1).tolist()
print(['NEGATIVE' if idx == 0 else 'POSITIVE' for idx in predicted])

['NEGATIVE', 'POSITIVE', 'POSITIVE', 'NEGATIVE']


In [27]:
# Show the raw model outputs behind the predicted labels: one row per input
# sentence, one column per class (presumably unnormalized logits — confirm
# against SentenceClassifier).
print(output)

tensor([[ 2.8851, -2.8254],
        [-2.2497,  2.4242],
        [-1.7892,  1.9156],
        [ 1.3974, -1.4130]], grad_fn=<AddmmBackward0>)