## Entity Tagging

In [26]:
import pandas as pd
import numpy as np
from mxnet.gluon import nn, rnn
from mxnet import gluon, autograd
import gluonnlp as nlp
from mxnet import nd 
import mxnet as mx
import time
import itertools
from tqdm import tqdm
import multiprocessing as mp

In [27]:
# Both files are tab-separated with no header row, so the column names are supplied here.
COLUMNS = ['intent', 'entity', 'sentence']

train_raw = pd.read_csv("data/trainset.txt", names=COLUMNS, sep='\t')
validation_raw = pd.read_csv("data/test_hidden.txt", names=COLUMNS, sep='\t')
# Alternative validation source: "data/validation.txt" (same columns and separator).

In [28]:
train_raw.head(30)

Unnamed: 0,intent,entity,sentence
0,area,EECCCCCCCCCCCCCCCCCCC,자강의 면적은 얼마 정도되는지 알려줄래
1,birth_date,CCCCCCCCCCCCEEECCCCCCCCCCCC,WIKI PEDIA로 변재일 생년월일을 알고 싶어
2,age,EEEEEEEEEEECCCCCCCCCCCCCCCCC,남쪽 물고기자리 알파 나이가 위키백과사전으로 얼마야
3,length,EEEECCCCCCCCCCCCCCCCCC,삼양터널의 총 길이 위키백과사전에서 뭐야
4,birth_place,EEEEEECCCCCCCCCCC,코니 윌리스의 태어난 곳은 뭐지
5,weight,CCCCCCCCCCCCEEEECCCCCCCCCCCCC,WIKI백과사전 검색 AA12의 무게가 얼만지 찾아봐
6,definition,CCCCCCCCCCCCCEEECCCCCCCC,WIKIPEDIA백과로 라이프 찾아서 말해줘
7,height,EEEEEEEECCCCCCCCCCCCCCCCCCC,송파 헬리오시티 구조물 높이 위키 피디아에서 뭐야
8,birth_date,CCCEEEEEECCCCCCCCCCCCCCC,검색 HLKVAM 언제 출생했는지를 검색해라
9,height,CCCCCCCCEEEEEECCCCCCCC,위키 피디아에 푸조 508 전고가 몇이야


#### 데이터 전처리

In [29]:
# Each sample is a (sentence, entity-tag string) pair, in that order.
train_dataset = list(zip(train_raw['sentence'], train_raw['entity']))
valid_dataset = list(zip(validation_raw['sentence'], validation_raw['entity']))

In [30]:
# Fixed sequence length (in characters) that every sentence and tag string is brought to.
seq_len = 32

# Callable that brings a sequence to `seq_len` items, filling with the "<pad>" token.
length_clip = nlp.data.PadSequence(seq_len, pad_val="<pad>")

def preprocess(data):
    """Convert one (sentence, entity-tag string) pair into fixed-length character lists.

    Args:
        data: A ``(sentence, entity)`` pair. Both items are passed through
            ``str()`` before being split into characters.

    Returns:
        A 3-tuple ``(chars, valid_len, tags)`` where ``chars`` and ``tags`` are
        the outputs of ``length_clip`` for the sentence and the tag string, and
        ``valid_len`` is the number of real (non-padding) sentence characters,
        capped at ``seq_len``.
    """
    sent, entity = data
    char_sent = list(str(sent))
    char_entity = list(str(entity))
    # Measure the length on the character list rather than on the raw value:
    # the raw value may not be a string (len() would raise), and the length
    # must never exceed seq_len because it is later used to mask and to score
    # a sequence that has exactly seq_len positions.
    valid_len = min(len(char_sent), seq_len)
    return (length_clip(char_sent), valid_len, length_clip(char_entity))

def preprocess_dataset(dataset):
    """Apply ``preprocess`` to every sample using a process pool.

    Args:
        dataset: Iterable of samples accepted by ``preprocess``.

    Returns:
        A ``gluon.data.SimpleDataset`` holding the preprocessed samples.
        The elapsed time and the number of samples are printed.
    """
    started_at = time.time()
    with mp.Pool() as workers:
        processed = workers.map(preprocess, dataset)
        dataset = gluon.data.SimpleDataset(processed)
    elapsed = time.time() - started_at
    print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(elapsed, len(dataset)))
    return dataset


In [31]:
# Character-split and pad both datasets; each call prints its timing and sample count.
train_preprocessed  = preprocess_dataset(train_dataset)
valid_preprocessed  = preprocess_dataset(valid_dataset)

Done! Tokenizing Time=0.28s, #Sentences=9000
Done! Tokenizing Time=0.17s, #Sentences=1000


In [32]:
# Count character frequencies over the training set only:
# item 0 of each sample is the padded sentence, item 2 the padded tag sequence.
counter_sent = nlp.data.count_tokens(
    itertools.chain.from_iterable(sample[0] for sample in train_preprocessed)
)
counter_entity = nlp.data.count_tokens(
    itertools.chain.from_iterable(sample[2] for sample in train_preprocessed)
)

In [33]:
# Input vocabulary: no BOS/EOS tokens; min_freq=15 is the frequency threshold passed to nlp.Vocab.
vocab_sent = nlp.Vocab(counter_sent, bos_token=None, eos_token=None, min_freq=15)
# Tag vocabulary: additionally has no unknown token.
# NOTE(review): without an unknown token, looking up a tag character that is not in this
# vocabulary presumably fails instead of falling back -- confirm against the GluonNLP version in use.
vocab_entity = nlp.Vocab(counter_entity, bos_token=None, eos_token=None, unknown_token=None ,min_freq=15)

In [34]:
vocab_sent.idx_to_token[:10], vocab_entity.idx_to_token[:10], 

(['<unk>', '<pad>', ' ', 'I', '이', '색', '검', '의', '지', '아'],
 ['<pad>', 'C', 'E'])

In [35]:
# Map characters and tags to integer ids; the valid length is carried through unchanged.
train_preprocessed_encoded = [
    (vocab_sent[chars], n_chars, vocab_entity[tags])
    for chars, n_chars, tags in train_preprocessed
]
valid = [
    (vocab_sent[chars], n_chars, vocab_entity[tags])
    for chars, n_chars, tags in valid_preprocessed
]

In [36]:
# Hold out 10% of the encoded training data. Naming in this notebook: `test` is this held-out
# slice of trainset.txt, while `valid` is the separately loaded test_hidden.txt data.
# NOTE(review): no random seed is set anywhere above, so the split presumably differs
# between runs -- confirm and seed if reproducibility matters.
train, test = nlp.data.train_valid_split(train_preprocessed_encoded, valid_ratio=0.1)

In [37]:
nbatch = 30

# One stacker per sample field: token ids, valid length (stacked as float32), tag ids.
batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Stack(),
    nlp.data.batchify.Stack('float32'),
    nlp.data.batchify.Stack(),
)

# All three loaders share the same batch size, batchify function and shuffling.
loader_kwargs = dict(batch_size=nbatch, batchify_fn=batchify_fn, shuffle=True)
train_dataloader = gluon.data.DataLoader(train, **loader_kwargs)
test_dataloader = gluon.data.DataLoader(test, **loader_kwargs)
valid_dataloader = gluon.data.DataLoader(valid, **loader_kwargs)

#### 모델링 

In [38]:
class EntityTagger(gluon.HybridBlock):
    """Per-character tagger: embedding -> bidirectional GRU -> two per-step dense layers.

    Args:
        vocab_size: Number of rows in the input embedding table.
        vocab_out_size: Number of output units of the final dense layer.
        num_embed: Embedding dimension.
        hidden_size: Hidden size passed to the GRU.
        **kwargs: Forwarded to ``gluon.HybridBlock``.
    """

    def __init__(self, vocab_size, vocab_out_size, num_embed, hidden_size, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = hidden_size
        self.vocab_out_size = vocab_out_size
        with self.name_scope():
            self.embed = nn.Embedding(input_dim=vocab_size, output_dim=num_embed)
            self.bigru = rnn.GRU(hidden_size, dropout=0.2, bidirectional=True)
            # flatten=False keeps the leading axes and applies the layer to the last axis only.
            self.dense_prev = nn.Dense(10, flatten=False)
            self.dense = nn.Dense(vocab_out_size, flatten=False)

    def hybrid_forward(self, F, inputs, length):
        """Compute the per-position outputs of the final dense layer.

        Args:
            F: ``mxnet.nd`` or ``mxnet.sym``, supplied by Gluon.
            inputs: Token ids; callers in this notebook pass the transposed batch (``data.T``).
            length: Per-sample valid lengths used for sequence masking.

        Returns:
            Output of the final dense layer, after the first two axes of the
            masked GRU output have been swapped.
        """
        embedded = self.embed(inputs)
        encoded = self.bigru(embedded)
        # Mask GRU outputs at positions past each sample's valid length.
        masked = F.SequenceMask(encoded,
                                sequence_length=length,
                                use_sequence_length=True)
        # Swap the first two axes before the per-step dense layers.
        swapped = masked.transpose((1, 0, 2))
        hidden = self.dense_prev(swapped)
        return self.dense(hidden)

In [39]:
# Everything in this notebook runs on the CPU context.
ctx = mx.cpu()

model = EntityTagger(
    vocab_size=len(vocab_sent),
    vocab_out_size=len(vocab_entity),
    num_embed=50,
    hidden_size=30,
)

In [40]:
# Initialize all model parameters with the Xavier initializer on `ctx`.
model.initialize(mx.initializer.Xavier(), ctx=ctx)

In [41]:
# Adam optimizer over all model parameters; no optimizer_params are passed,
# so the optimizer's own default hyper-parameters apply.
trainer = gluon.Trainer(model.collect_params(),"Adam")
# Softmax cross-entropy loss; the training loop feeds it the model output and the tag ids.
loss = gluon.loss.SoftmaxCELoss() 

In [42]:
# Hybridize the block before training; the export cell later in the notebook uses this same model.
model.hybridize()

In [43]:
model

EntityTagger(
  (embed): Embedding(481 -> 50, float32)
  (bigru): GRU(None -> 30, TNC, dropout=0.2, bidirectional)
  (dense_prev): Dense(None -> 10, linear)
  (dense): Dense(None -> 3, linear)
)

In [50]:
def evaluate_accuracy(model, data_iter, ctx=ctx):
    """Compute sentence-level exact-match accuracy.

    A sample counts as correct only if every predicted tag within its valid
    length equals the label; positions past the valid length are ignored.

    Args:
        model: Callable invoked as ``model(tokens_transposed, length)``; the
            argmax over axis 2 of its output is taken as the prediction.
        data_iter: Iterable yielding ``(data, length, label)`` batches.
        ctx: Context the batch arrays are moved to.

    Returns:
        Fraction of fully correct samples, or ``nan`` if ``data_iter`` yields
        no samples.
    """
    corrected = 0
    n = 0
    for data, length, label in data_iter:
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        length = length.as_in_context(ctx)
        output = model(data.T, length)
        predictions = nd.argmax(output, axis=2)
        # Pull the whole batch to host memory once instead of syncing per sample.
        hits = (predictions.astype('int64') == label).asnumpy()
        valid_lens = length.asnumpy().astype('int64')
        width = hits.shape[1]
        for row, valid_len in zip(hits, valid_lens):
            # A recorded length larger than the padded width could never be
            # matched by the number of hits, so cap it at the width.
            valid_len = min(int(valid_len), width)
            corrected += int(row[:valid_len].sum() == valid_len)
            n += 1
    # Accuracy is undefined for an empty iterator; avoid ZeroDivisionError.
    return corrected / n if n else float('nan')

In [51]:
def calculate_loss(model, data_iter, loss_obj, ctx=ctx):
    """Return the mean of the per-batch mean losses over ``data_iter``.

    Args:
        model: Callable invoked as ``model(tokens_transposed, length)``.
        data_iter: Iterable yielding ``(data, length, label)`` batches.
        loss_obj: Loss callable invoked as ``loss_obj(output, label)``.
        ctx: Context the batch arrays are moved to.

    Returns:
        ``np.mean`` of the batch-level mean losses (every batch weighs the
        same, regardless of its size).
    """
    batch_means = []
    for te_data, te_length, te_label in data_iter:
        tokens = te_data.as_in_context(ctx)
        tags = te_label.as_in_context(ctx)
        lengths = te_length.as_in_context(ctx)
        scores = model(tokens.T, lengths)
        per_sample = loss_obj(scores, tags)
        batch_means.append(nd.mean(per_sample).asscalar())
    return np.mean(batch_means)

In [None]:
epochs = 100

# Metric histories, appended to once every 10 epochs.
tot_test_loss = []
tot_test_accu = []
tot_train_loss = []
tot_train_accu = []
tot_valid_accu = []

# NOTE(review): the recorded run of this cell printed nan losses and 0.0 accuracies
# at epoch 0; the cause is not visible in this cell -- investigate before trusting results.
for e in range(epochs):
    # Batch training.
    for data, length, label in tqdm(train_dataloader):
        data = data.as_in_context(ctx)
        label = label.as_in_context(ctx)
        length = length.as_in_context(ctx)
        # Record only the forward pass; backward() is called after leaving the scope.
        with autograd.record():
            output = model(data.T, length)
            loss_ = loss(output, label)
        loss_.backward()
        # Pass the batch size to the trainer step.
        trainer.step(data.shape[0])

    # Calculate losses and accuracies every 10 epochs.
    if e % 10 == 0:
        test_loss = calculate_loss(model, test_dataloader, loss_obj=loss, ctx=ctx)
        train_loss = calculate_loss(model, train_dataloader, loss_obj=loss, ctx=ctx)
        test_accu = evaluate_accuracy(model, test_dataloader, ctx=ctx)
        train_accu = evaluate_accuracy(model, train_dataloader, ctx=ctx)
        valid_accu = evaluate_accuracy(model, valid_dataloader, ctx=ctx)

        print("Epoch %s. Train Loss: %s, Test Loss : %s,"
              " Test Accuracy : %s,"
              " Train Accuracy : %s : Valid Accuracy : %s"
              % (e, train_loss, test_loss, test_accu, train_accu, valid_accu))
        tot_test_loss.append(test_loss)
        tot_train_loss.append(train_loss)
        tot_test_accu.append(test_accu)
        tot_train_accu.append(train_accu)
        tot_valid_accu.append(valid_accu)


  0%|          | 0/270 [00:00<?, ?it/s][A
  3%|▎         | 7/270 [00:00<00:04, 60.40it/s][A
  5%|▍         | 13/270 [00:00<00:04, 57.98it/s][A
  7%|▋         | 19/270 [00:00<00:04, 57.28it/s][A
  9%|▉         | 25/270 [00:00<00:04, 56.46it/s][A
 13%|█▎        | 35/270 [00:00<00:03, 63.84it/s][A
 16%|█▌        | 43/270 [00:00<00:03, 65.45it/s][A
 19%|█▊        | 50/270 [00:00<00:03, 63.26it/s][A
 21%|██        | 56/270 [00:00<00:03, 62.18it/s][A
 23%|██▎       | 62/270 [00:01<00:03, 61.32it/s][A
 25%|██▌       | 68/270 [00:01<00:03, 60.60it/s][A
 27%|██▋       | 74/270 [00:01<00:03, 58.36it/s][A
 30%|██▉       | 80/270 [00:01<00:03, 56.60it/s][A
 32%|███▏      | 86/270 [00:01<00:03, 56.53it/s][A
 34%|███▍      | 92/270 [00:01<00:03, 55.47it/s][A
 36%|███▌      | 97/270 [00:01<00:03, 54.13it/s][A
 38%|███▊      | 103/270 [00:01<00:03, 54.11it/s][A
 40%|████      | 109/270 [00:02<00:02, 54.12it/s][A
 43%|████▎     | 115/270 [00:02<00:02, 54.16it/s][A
 45%|████▌     | 1

Epoch 0. Train Loss: nan, Test Loss : nan, Test Accuracy : 0.0, Train Accuracy : 0.0 : Valid Accuracy : 0.0



  4%|▍         | 12/270 [00:00<00:04, 56.57it/s][A
  7%|▋         | 18/270 [00:00<00:04, 54.95it/s][A
  9%|▉         | 24/270 [00:00<00:04, 55.15it/s][A
 11%|█         | 30/270 [00:00<00:04, 55.46it/s][A
 15%|█▍        | 40/270 [00:00<00:03, 61.92it/s][A
 18%|█▊        | 48/270 [00:00<00:03, 63.24it/s][A
 20%|██        | 55/270 [00:00<00:03, 61.87it/s][A
 23%|██▎       | 62/270 [00:01<00:03, 60.99it/s][A
 25%|██▌       | 68/270 [00:01<00:03, 60.29it/s][A
 27%|██▋       | 74/270 [00:01<00:03, 58.90it/s][A
 30%|██▉       | 80/270 [00:01<00:03, 57.06it/s][A
 32%|███▏      | 86/270 [00:01<00:03, 56.75it/s][A
 34%|███▍      | 92/270 [00:01<00:03, 56.56it/s][A
 36%|███▋      | 98/270 [00:01<00:03, 56.29it/s][A
 39%|███▊      | 104/270 [00:01<00:02, 56.02it/s][A
 41%|████      | 110/270 [00:02<00:02, 54.85it/s][A
 43%|████▎     | 115/270 [00:02<00:02, 54.31it/s][A
 45%|████▍     | 121/270 [00:02<00:02, 54.39it/s][A
 48%|████▊     | 129/270 [00:02<00:02, 55.29it/s][A
 51%|█

KeyboardInterrupt: 

#### Model Export and Visualization

In [None]:
# Export the model under the prefix "model"; the import cell below reads
# "model-symbol.json" and "model-0000.params".
model.export("model")

Netron으로 네트워크 시각화 

- https://lutzroeder.github.io/netron/
- 저장된 `model-symbol.json`을 입력해 시각화 

In [None]:
# Rebuild the exported network from its symbol file and parameter file.
# NOTE(review): 'data0' and 'data1' are presumably the auto-generated names of the two
# forward inputs (token ids, lengths) -- confirm against model-symbol.json.
load_model = gluon.nn.SymbolBlock.imports("model-symbol.json", ['data0', 'data1'], "model-0000.params")

In [None]:
def get_entitytag(sent):
    """Predict the entity tag string for a single sentence with the re-imported model.

    The sentence is split into characters, padded/clipped with ``length_clip``
    and encoded with ``vocab_sent`` -- the same steps ``preprocess`` applies to
    the training data.

    Args:
        sent: The sentence to tag.

    Returns:
        The predicted tag tokens joined into one string, one tag per input
        character (at most ``seq_len``). Empty input returns an empty string.
    """
    chars = list(str(sent))
    if not chars:
        return ''
    # Cap the length so it agrees with the clipped sequence fed to the model.
    sent_len = min(len(chars), seq_len)
    # length_clip and vocab_sent are given a list of characters, as in preprocess,
    # rather than the raw string.
    coded_sent = vocab_sent[length_clip(chars)]
    # Add a second axis of size one so the input is a single-sample batch.
    co = nd.array(coded_sent).expand_dims(axis=1)
    ret_code = load_model(co, nd.array([sent_len]))
    tag_ids = ret_code.argmax(axis=2)[0].asnumpy().astype('int').tolist()
    # Keep only the tags for real characters; positions past sent_len are padding.
    ret_seq = vocab_entity.to_tokens(tag_ids[:sent_len])
    return ''.join(ret_seq)

In [None]:
get_entitytag("모두의 연구소에 대해서 찾아줘")

### TODO
- Test Accuracy 95% 이상 올리기
- test_hidden 셋의 성능 90% 이상 올리기 
- Entity Tagging과 Intent Classification을 MultiTask Learning으로 통합해보기(성능이 좋아지나? 나빠지나?)