In [1]:
import json
import read_data
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from optparse import OptionParser
import torch.autograd as autograd
from copy import deepcopy
import os
import sys
import pprint
import numpy as np
# Restrict this process to the first GPU (CUDA reads this variable when it is initialised).
os.environ["CUDA_VISIBLE_DEVICES"]= "0"
# Make CUDA kernel launches synchronous so errors surface at the offending call
# (a debugging aid; it slows execution).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import torch.backends.cudnn as cudnn
# Let cuDNN benchmark and pick its algorithms per input shape.
# NOTE(review): benchmark mode may select non-deterministic algorithms, which can
# work against the fixed seed below -- confirm this is intended.
cudnn.benchmark = True
# Seed PyTorch's RNG.
# NOTE(review): `random` and `numpy` are imported above but are not seeded in this cell.
torch.manual_seed(1)

from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

import eval_srl

loading TRAINING data...
# of sentences: 34856
# of arg_types
	total: 124873 (3.5825 arg-per-sent)
	unique: 12
	for each: Counter({'ARG1': 68451, 'ARG0': 18568, 'ARG3': 11060, 'ARGM-LOC': 6468, 'ARG2': 4935, 'ARGM-MNR': 4098, 'ARGM-TMP': 3423, 'ARGM-EXT': 2986, 'ARGM-CAU': 1819, 'ARGM-INS': 1426, 'ARGM-DIR': 1357, 'ARGM-PRP': 282})


# Option

In [28]:
# Directory the trained model (and its config) is written to.
model_dir = './result/model-bert'
# exist_ok=True creates the directory if needed without the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(model_dir, exist_ok=True)
model_path = model_dir+'/model.pt'

# Number of sentences taken from the end of the data as the development set.
dev_sent = 500

In [29]:
from datetime import datetime

# Wall-clock start of this run and its calendar date formatted as 'YYYY-MM-DD'.
start_time = datetime.now()
today = start_time.date().isoformat()

In [30]:
# load data
# `data` is consumed by get_input_data, which reads token[0], token[1] and token[2]
# of every row; `trn_conll` is indexed by column elsewhere in this notebook
# (e.g. token[2] and token[11] in prepare_idx).
# NOTE(review): both loaders come from the project-local `read_data` module; their
# exact row schema is not visible here -- confirm against that module.
data = read_data.load_trn_data()
trn_conll = read_data.load_trn_nlp()

In [31]:
# input data
# [
#     ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15'], 
#     ['인사동에', '들어서면', '다종다양의', '창호지,', '도자기', '등', '고미술품들이', '진열장에', '즐비하게', '널려져', '있는', '것을', '볼', '수', '있다.'], 
#     ['ARGM-LOC', '-', '-', '-', '-', '-', 'ARG1', 'ARG1', '-', '-', '-', 'ARG1', '-', '-', '-']
# ]

def get_input_data(data):
    """Turn row-wise sentences into column-wise ones.

    Args:
        data: iterable of sentences; each sentence is an iterable of token rows
            whose first three fields are read (index, token string, argument label).

    Returns:
        A list with one entry per sentence, each of the form
        ``[indices, token_strings, argument_labels]`` (three parallel lists).
    """
    def _to_columns(sentence):
        # Single pass over the sentence, filling the three parallel columns.
        indices, words, labels = [], [], []
        for row in sentence:
            indices.append(row[0])
            words.append(row[1])
            labels.append(row[2])
        return [indices, words, labels]

    return [_to_columns(sentence) for sentence in data]
        
# Column-wise view of every loaded sentence: [indices, token strings, argument labels].
input_data = get_input_data(data)

In [32]:
# Hold out the last `dev_sent` sentences as the dev set; everything before is training data.
div = len(input_data) - dev_sent
trn, dev = input_data[:div], input_data[div:]

print('')
print('### dev data:', len(dev), 'sents')

# Gold argument labels of the dev set as one flat list (all sentences concatenated).
gold_file = './dev.data'
with open(gold_file,'w') as f:
    dev_list = [label for sent in dev for label in sent[2]]
    json.dump(dev_list, f)

# The same labels kept as one list per sentence (presumably for manual inspection).
gold_to_see = './dev.tosee'
with open(gold_to_see,'w') as f:
    dev_list = [sent[2] for sent in dev]
    json.dump(dev_list, f)


### dev data: 500 sents


In [57]:
def prepare_idx(conll=None):
    """Build the symbol-to-index vocabularies used by the model.

    Args:
        conll: iterable of sentences, each an iterable of token rows. Column 11
            of a row is indexed into ``dp_to_ix`` and column 2 is split on '+'
            into morphemes for ``morp_to_ix``. Defaults to the module-level
            ``trn_conll`` so existing ``prepare_idx()`` calls behave as before.

    Returns:
        Tuple ``(dp_to_ix, arg_to_ix, morp_to_ix)`` of dicts mapping a symbol to
        a dense integer id. Id 0 of ``dp_to_ix`` and ``morp_to_ix`` is the
        reserved 'null' entry; ``arg_to_ix`` follows the fixed label order below.
    """
    if conll is None:
        conll = trn_conll

    dp_to_ix = {'null': 0}
    morp_to_ix = {'null': 0}
    for sent in conll:
        for token in sent:
            # setdefault evaluates len(...) before inserting, so a new symbol
            # receives the next free id and a known symbol keeps its id.
            dp_to_ix.setdefault(token[11], len(dp_to_ix))
            for morp in token[2].split('+'):
                morp_to_ix.setdefault(morp, len(morp_to_ix))

    # Label spellings must match the data exactly: prepare_sequence maps any
    # symbol missing from the vocabulary to index 0, which is 'ARG0' here, so a
    # misspelled entry would silently relabel those arguments.
    args = ['ARG0', 'ARG1', 'ARG2', 'ARG3',
            'ARGM-CAU', 'ARGM-CND', 'ARGM-DIR', 'ARGM-DIS', 'ARGM-INS',
            'ARGM-LOC', 'ARGM-MNR', 'ARGM-NEG', 'ARGM-PRD', 'ARGM-PRP',
            'ARGM-TMP', 'ARGM-ADV', 'ARGM-EXT', '-']
    arg_to_ix = {}
    for label in args:
        arg_to_ix.setdefault(label, len(arg_to_ix))
    return dp_to_ix, arg_to_ix, morp_to_ix
# Build the vocabularies once and publish their sizes as module-level constants.
dp_to_ix, arg_to_ix, morp_to_ix = prepare_idx()
DP_VOCAB_SIZE, ARG_VOCAB_SIZE, MORP_VOCAB_SIZE = (
    len(dp_to_ix), len(arg_to_ix), len(morp_to_ix))
print('DP_VOCAB_SIZE:', DP_VOCAB_SIZE)
print('ARG_VOCAB_SIZE:', ARG_VOCAB_SIZE)
print('MORP_VOCAB_SIZE:', MORP_VOCAB_SIZE)

DP_VOCAB_SIZE: 32
ARG_VOCAB_SIZE: 18
MORP_VOCAB_SIZE: 63376


In [58]:
# Hyper-parameters and output locations for this run, kept in one mapping so the
# whole set can be printed (below) and serialised.
configuration = dict(
    token_dim=60,
    feat_dim=1,
    dp_dim=4,
    arg_dim=4,
    lu_pos_dim=5,
    dp_label_dim=10,
    lstm_input_dim=100,
    lstm_dim=64,
    lstm_depth=2,
    hidden_dim=64,
    position_feature_dim=5,
    num_epochs=10,
    learning_rate=0.001,
    dropout_rate=0.01,
    pretrained_embedding_dim=300,
    model_dir=model_dir,
    model_path=model_path,
)

print('\n### CONFIGURATION ###\n')
pprint.pprint(configuration)
print('')

# Module-level aliases for a subset of the settings.
DPDIM, ARGDIM = configuration['dp_dim'], configuration['arg_dim']
LSTMINPDIM, LSTMDEPTH = configuration['lstm_input_dim'], configuration['lstm_depth']
FEATDIM, HIDDENDIM = configuration['feat_dim'], configuration['hidden_dim']
DROPOUT_RATE = configuration['dropout_rate']
learning_rate = configuration['learning_rate']
NUM_EPOCHS = configuration['num_epochs']

print('\n### YOUR MODEL WILL BE SAVED TO', model_path, '###\n')

# Persist the run configuration inside the model directory. ensure_ascii=False
# writes non-ASCII characters verbatim, so the file is opened as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open(model_dir+'/config.json', 'w', encoding='utf-8') as f:
    json.dump(configuration, f, ensure_ascii=False, indent=4)


### CONFIGURATION ###

{'arg_dim': 4,
 'dp_dim': 4,
 'dp_label_dim': 10,
 'dropout_rate': 0.01,
 'feat_dim': 1,
 'hidden_dim': 64,
 'learning_rate': 0.001,
 'lstm_depth': 2,
 'lstm_dim': 64,
 'lstm_input_dim': 100,
 'lu_pos_dim': 5,
 'model_dir': './result/model-bert',
 'model_path': './result/model-bert/model.pt',
 'num_epochs': 10,
 'position_feature_dim': 5,
 'pretrained_embedding_dim': 300,
 'token_dim': 60}


### YOUR MODEL WILL BE SAVED TO ./result/model-bert/model.pt ###



In [59]:
def get_pred_idxs(conll):
    """Return one one-hot position vector per token whose column 10 starts with 'V'.

    Args:
        conll: list of token rows for a single sentence.

    Returns:
        A list of lists; each inner list has ``len(conll)`` entries, all 0 except
        a 1 at the position of the matching token. Empty if no token matches.
    """
    n_tokens = len(conll)
    one_hots = []
    for position, row in enumerate(conll):
        if not row[10].startswith('V'):
            continue
        one_hots.append([1 if k == position else 0 for k in range(n_tokens)])
    return one_hots

In [61]:
def get_arg_idxs(pred_idx, conll):
    """Flag the tokens attached to ``pred_idx`` whose last column starts with 'NP'.

    Args:
        pred_idx: integer compared against ``int(row[8])`` of every token row.
            NOTE(review): column 8 looks like a head id -- confirm it uses the
            same numbering the caller passes in.
        conll: list of token rows for a single sentence.

    Returns:
        A list of 0/1 flags, one per token; 1 where ``int(row[8]) == pred_idx``
        and the first two characters of ``row[-1]`` are 'NP'.
    """
    flags = []
    for row in conll:
        is_np_dependent = int(row[8]) == pred_idx and row[-1][:2] == 'NP'
        flags.append(1 if is_np_dependent else 0)
    return flags

In [62]:
def get_feature(pred_idxs, conll):
    """Wrap every indicator value in its own single-element feature list.

    Args:
        pred_idxs: list of indicator vectors (one list of values per predicate).
        conll: unused; kept so existing call sites stay valid.

    Returns:
        A list parallel to ``pred_idxs`` in which every value ``v`` of a vector
        is replaced by ``[v]``.
    """
    # The original first ran a loop that only assigned an unused local; it had
    # no effect on the result and has been removed.
    return [[[flag] for flag in indicators] for indicators in pred_idxs]

In [63]:
def prepare_sequence(seq, to_ix, device=None):
    """Convert a sequence of symbols into a tensor of vocabulary indices.

    Args:
        seq: iterable of hashable symbols (e.g. strings).
        to_ix: dict mapping a symbol to its integer id.
        device: target device for the tensor. When omitted, CUDA is used if it
            is available (matching the previous unconditional ``.cuda()``),
            otherwise the tensor stays on the CPU.

    Returns:
        A 1-D tensor with one index per element of ``seq``; symbols missing from
        ``to_ix`` are mapped to index 0.
    """
    # dict.get is a constant-time lookup; the earlier list(to_ix.keys())
    # membership test rescanned the whole vocabulary for every symbol.
    idxs = [to_ix.get(w, 0) for w in seq]
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.tensor(idxs, device=device)

In [64]:
def get_dps(conll):
    """Collect column 10 of every token row of a sentence.

    NOTE(review): prepare_idx builds the dp vocabulary from column 11, whereas
    this reads column 10 -- confirm which column is intended.

    Args:
        conll: list of token rows for a single sentence.

    Returns:
        A list with the column-10 value of each row, in sentence order.
    """
    return [row[10] for row in conll]

In [65]:
def get_sentence_vec(tokens, conll):
    """Return the morpheme list of each token in a sentence.

    Args:
        tokens: the sentence's tokens; only its length is used, to decide how
            many rows of ``conll`` are read.
        conll: list of token rows; column 2 holds the '+'-joined morphemes.

    Returns:
        A list with one list of morpheme strings per token.
    """
    return [conll[pos][2].split('+') for pos in range(len(tokens))]

# BERT model

In [55]:
# For Korean text: load the multilingual *cased* checkpoint with do_lower_case=False.
# (The original note said "uncased", which does not match the checkpoint loaded here.)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')

12/12/2018 14:11:34 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt from cache at /home/hahmyg/.pytorch_pretrained_bert/96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729
12/12/2018 14:11:35 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz from cache at /home/hahmyg/.pytorch_pretrained_bert/731c19ddf94e294e00ec1ba9a930c69cc2a0fd489b25d3d691373fae4c0986bd.4e367b0d0155d801930846bb6ed98f8a7c23e0ded37888b29caa37009a40c7b9
12/12/2018 14:11:35 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /home/hahmyg/.pytorch_pretrained_bert/731c19ddf94e294e00ec1ba9a930c69cc2a0fd489b25d3d691373fae4c0986bd.4e367b0d0155d801930846bb6ed98f8a7c23e0ded37888b29caa37009a40c7b9 to temp dir /tmp/t

In [56]:
def bert_tokenizer(text):
    """Split ``text`` into word-pieces while remembering where each word starts.

    Uses the module-level ``tokenizer``.

    Args:
        text: sentence whose words are separated by single spaces.

    Returns:
        Tuple ``(orig_tokens, bert_tokens, orig_to_tok_map)``:
        ``orig_tokens`` is ``text`` split on ' '; ``bert_tokens`` is the
        word-piece sequence wrapped in "[CLS]" ... "[SEP]"; and
        ``orig_to_tok_map[k]`` is the position in ``bert_tokens`` at which the
        pieces of ``orig_tokens[k]`` begin.
    """
    orig_tokens = text.split(' ')
    bert_tokens = ["[CLS]"]
    orig_to_tok_map = []
    for word in orig_tokens:
        # Record the start offset before this word's pieces are added.
        orig_to_tok_map.append(len(bert_tokens))
        bert_tokens += tokenizer.tokenize(word)
    bert_tokens.append("[SEP]")

    return orig_tokens, bert_tokens, orig_to_tok_map

# text = '안녕하세요? 저는 한국인 입니다.'
# orig_tokens, bert_tokens, orig_to_tok_map = bert_tokenizer(text)

# print(orig_tokens)
# print(bert_tokens)
# print(orig_to_tok_map)

['안녕하세요?', '저는', '한국인', '입니다.']
['[CLS]', '안', '##녕', '##하', '##세', '##요', '?', '저', '##는', '한국', '##인', '입', '##니다', '.', '[SEP]']
[1, 7, 9, 11]


In [76]:
# Demo: tokenize one Korean sentence and show the word -> word-piece alignment.
text = '안녕하세요? 저는 한국인 입니다.'
orig_tokens, bert_tokens, orig_to_tok_map = bert_tokenizer(text)

print(orig_tokens)
print(bert_tokens)
print(orig_to_tok_map)

# Map the word-pieces to vocabulary ids and wrap them as a batch of one sequence.
indexed_tokens = tokenizer.convert_tokens_to_ids(bert_tokens)
tokens_tensor = torch.tensor([indexed_tokens])

print(indexed_tokens)
print(tokens_tensor)

['안녕하세요?', '저는', '한국인', '입니다.']
['[CLS]', '안', '##녕', '##하', '##세', '##요', '?', '저', '##는', '한국', '##인', '입', '##니다', '.', '[SEP]']
[1, 7, 9, 11]
[101, 9521, 118741, 35506, 24982, 48549, 136, 9663, 11018, 48556, 12030, 9645, 48345, 119, 102]
tensor([[   101,   9521, 118741,  35506,  24982,  48549,    136,   9663,  11018,
          48556,  12030,   9645,  48345,    119,    102]])


In [71]:
from pytorch_pretrained_bert import BertConfig
from pytorch_pretrained_bert import BertForTokenClassification

In [79]:
# Smoke test: run a randomly initialised BertForTokenClassification on a single
# three-token dummy sequence and keep the per-token logits.
input_ids = torch.LongTensor([[31, 51, 99]])
# The attention mask is binary: 1 marks a real token, 0 marks padding. All three
# positions are real tokens, so the mask is all ones (it was [[0, 1, 2]], which
# is not a valid mask).
input_mask = torch.LongTensor([[1, 1, 1]])
token_type_ids = torch.LongTensor([[0, 0, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 3
model = BertForTokenClassification(config, num_labels)
# Positional argument order is (input_ids, token_type_ids, attention_mask).
logits = model(input_ids, token_type_ids, input_mask)

In [80]:
# Show the raw scores; the recorded output has one row of three label scores per input token.
print(logits)

tensor([[[-0.0013, -0.0269, -0.0012],
         [ 0.0144, -0.0177, -0.0063],
         [ 0.0164, -0.0211, -0.0155]]], grad_fn=<ThAddBackward>)
