In [9]:
import json
import read_data
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from optparse import OptionParser
import torch.autograd as autograd
from copy import deepcopy
import os
import sys
import pprint
import numpy as np
os.environ["CUDA_VISIBLE_DEVICES"]= "0"
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import torch.backends.cudnn as cudnn
cudnn.benchmark = True
torch.manual_seed(1)

from seqeval.metrics import accuracy_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, modeling
# from pytorch_pretrained_bert.modeling import BertForTokenClassification

import eval_srl

import warnings
warnings.filterwarnings('ignore')

import random
import numpy as np
import mxnet as mx
from mxnet import gluon
import gluonnlp as nlp

In [12]:
# Output location for the model file (presumably the trained checkpoint,
# judging by the '.pt' suffix -- confirm against the training cell).
model_dir = './result/model-bert'
# exist_ok=True makes this cell idempotent and removes the check-then-create
# race of the former `if not os.path.exists(...)` guard.
os.makedirs(model_dir, exist_ok=True)
model_path = model_dir+'/model.pt'

# Number of sentences taken from the end of the data set as the DEV split.
dev_sent = 100

In [13]:
from datetime import datetime

# Moment this cell was executed, plus the same instant rendered as an
# ISO-style calendar date (YYYY-MM-DD).
start_time = datetime.now()
today = f'{start_time:%Y-%m-%d}'

In [14]:
# Load data through the project-local `read_data` module.
# `data` is later passed to get_input_data(), which reads fields 0, 1 and 2
# of every token; `trn_conll` is iterated by prepare_idx(), which reads
# fields 2 and 11 of every token.
# NOTE(review): the exact column schema is defined in read_data -- confirm there.
data = read_data.load_trn_data()
trn_conll = read_data.load_trn_nlp()

In [15]:
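# Example of one sentence in the format produced by get_input_data(): [token ids, token strings, argument labels]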
# [
#     ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15'], 
#     ['인사동에', '들어서면', '다종다양의', '창호지,', '도자기', '등', '고미술품들이', '진열장에', '즐비하게', '널려져', '있는', '것을', '볼', '수', '있다.'], 
#     ['ARGM-LOC', '-', '-', '-', '-', '-', 'ARG1', 'ARG1', '-', '-', '-', 'ARG1', '-', '-', '-']
# ]

def get_input_data(data):
    """Convert token-major sentences into column-major sentences.

    Args:
        data: iterable of sentences; each sentence is an iterable of tokens
            and each token is indexable with at least three fields.

    Returns:
        A list with one entry per sentence. Each entry is a list of three
        lists: the values of field 0, field 1 and field 2 of every token,
        in token order.
    """
    columns_per_sentence = []
    for sentence in data:
        tokens = list(sentence)
        # One list per column: field 0, field 1, field 2.
        columns = [[tok[col] for tok in tokens] for col in range(3)]
        columns_per_sentence.append(columns)
    return columns_per_sentence
        
# Column-major view of the loaded data: one [ids, strings, labels] triple per sentence.
input_data = get_input_data(data)

# Split the data into training (TRN) and development (DEV) sets

In [16]:
# Index of the first DEV sentence: the last `dev_sent` sentences are held out.
div = len(input_data) - dev_sent

trn, dev = input_data[:div], input_data[div:]
gold_file = './dev.data'
print('dev data:', len(dev), 'sents')

with open(gold_file,'w') as f:
    # The gold file is one flat JSON list holding the label column (index 2)
    # of every DEV sentence, concatenated in order.
    dev_list = [label for sent in dev for label in sent[2]]
    json.dump(dev_list, f)

dev data: 100 sents


In [17]:
def prepare_idx(conll=None):
    """Build the string-to-index vocabularies for the model's features.

    Args:
        conll: optional iterable of sentences, each an iterable of tokens.
            Field 11 of a token is used as its dependency label and field 2
            as a '+'-joined morpheme string. Defaults to the notebook-level
            `trn_conll`, which preserves the original zero-argument call.

    Returns:
        A tuple ``(dp_to_ix, arg_to_ix, morp_to_ix)``:
        * dp_to_ix   -- dependency label -> index, with 'null' at index 0.
        * arg_to_ix  -- argument label -> index, built from a fixed list.
        * morp_to_ix -- morpheme -> index, with 'null' at index 0.
        Indices are assigned in order of first appearance.
    """
    if conll is None:
        conll = trn_conll

    dp_to_ix = {'null': 0}
    morp_to_ix = {'null': 0}

    for sent in conll:
        for token in sent:
            dp = token[11]
            if dp not in dp_to_ix:
                dp_to_ix[dp] = len(dp_to_ix)

            for morp in token[2].split('+'):
                if morp not in morp_to_ix:
                    morp_to_ix[morp] = len(morp_to_ix)

    # Fixed label inventory. The order (and therefore every index) is the same
    # as before; only the misspelled keys ('ARM-DIS', 'ARCM-*') were corrected
    # to the 'ARGM-*' spelling so that labels written that way in the data no
    # longer miss the vocabulary.
    args = [
        'ARG0', 'ARG1', 'ARG2', 'ARG3',
        'ARGM-CAU', 'ARGM-CND', 'ARGM-DIR', 'ARGM-DIS', 'ARGM-INS',
        'ARGM-LOC', 'ARGM-MNR', 'ARGM-NEG', 'ARGM-PRD', 'ARGM-PRP',
        'ARGM-TMP', 'ARGM-ADV', 'ARGM-EXT',
        '-',
    ]
    arg_to_ix = {}
    for label in args:
        if label not in arg_to_ix:
            arg_to_ix[label] = len(arg_to_ix)

    return dp_to_ix, arg_to_ix, morp_to_ix
# Build the three vocabularies once and keep them as notebook-level globals.
dp_to_ix, arg_to_ix, morp_to_ix = prepare_idx()
# Vocabulary sizes (number of distinct keys, including the 'null' entries).
DP_VOCAB_SIZE = len(dp_to_ix)
ARG_VOCAB_SIZE = len(arg_to_ix)
MORP_VOCAB_SIZE = len(morp_to_ix)
print('DP_VOCAB_SIZE:',DP_VOCAB_SIZE)
print('ARG_VOCAB_SIZE:',ARG_VOCAB_SIZE)
print('MORP_VOCAB_SIZE:', MORP_VOCAB_SIZE)

DP_VOCAB_SIZE: 32
ARG_VOCAB_SIZE: 18
MORP_VOCAB_SIZE: 63376


In [10]:
# Load the BERT tokenizer and encoder from the same pretrained checkpoint name.
# `tokenizer` is used as a global by tokenization().
# NOTE(review): the accepted checkpoint names depend on the installed
# pytorch_pretrained_bert version -- confirm 'bert-base-multilingual' is still valid.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual')
bert_model = BertModel.from_pretrained('bert-base-multilingual')

12/10/2018 21:55:15 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-vocab.txt from cache at /home/hahmyg/.pytorch_pretrained_bert/3f396e8b6d1942457b908bd7f351fa991ead4c4adef76c76189a9ace12841860.535306b226c42cebebbc0dabc83b92ab11260e9919e21e2ab0beb301f267b4c7
12/10/2018 21:55:16 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual.tar.gz from cache at /home/hahmyg/.pytorch_pretrained_bert/e359baa6e6b29d9971ac7525c83e8cad6f15dce9d8ec9bfdeafa149a7a2191c9.5e2593d7d76d4df2b618714d71af902c02a5f51c1b2d050399e1cb36b7bb2eeb
12/10/2018 21:55:16 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /home/hahmyg/.pytorch_pretrained_bert/e359baa6e6b29d9971ac7525c83e8cad6f15dce9d8ec9bfdeafa149a7a2191c9.5e2593d7d76d4df2b618714d71af902c02a5f51c1b2d050399e1cb36b7bb2eeb to temp dir /tmp/tmpla1t_kn3
1

In [19]:
def tokenization(tokens):
    """Split original tokens into BERT sub-tokens and record their alignment.

    Uses the notebook-level `tokenizer`.

    Args:
        tokens: sequence of original token strings.

    Returns:
        A tuple ``(bert_tokens, orig_to_token_map)``:
        * bert_tokens -- "[CLS]" followed by the sub-tokens of every original
          token, in order. No "[SEP]" is appended here.
        * orig_to_token_map -- for each original token, the index in
          `bert_tokens` at which its sub-tokens start.
    """
    bert_tokens = ["[CLS]"]
    orig_to_token_map = []
    for word in tokens:
        # The next sub-token to be appended is the first piece of `word`.
        orig_to_token_map.append(len(bert_tokens))
        bert_tokens += tokenizer.tokenize(word)

    return bert_tokens, orig_to_token_map

In [20]:
def get_pred_idxs(conll):
    """Build one one-hot predicate indicator vector per predicate token.

    A token counts as a predicate when its field at index 10 starts with 'V'.

    Args:
        conll: sequence of tokens of one sentence; each token is indexable
            and field 10 is a string.

    Returns:
        A list of lists. Each inner list has ``len(conll)`` entries, all 0
        except a single 1 at the position of one predicate token. Vectors
        are ordered by predicate position; the list is empty when no token
        qualifies.
    """
    n_tokens = len(conll)
    predicate_positions = [
        pos for pos in range(n_tokens) if conll[pos][10].startswith('V')
    ]
    return [
        [1 if k == pos else 0 for k in range(n_tokens)]
        for pos in predicate_positions
    ]

In [21]:
def get_arg_idxs(pred_idx, conll):
    """Flag the tokens that are argument candidates of a given predicate.

    A token is flagged when ``int(token[8])`` equals `pred_idx` and the first
    two characters of its last field are 'NP'.

    Args:
        pred_idx: integer compared against ``int(token[8])``.
        conll: sequence of tokens of one sentence.

    Returns:
        A list of ``len(conll)`` ints: 1 for flagged tokens, 0 otherwise.
    """
    # NOTE(review): get_feature() passes a 0-based list position as `pred_idx`;
    # if field 8 holds 1-based head ids this comparison is off by one -- confirm.
    return [
        1 if int(tok[8]) == pred_idx and tok[-1][:2] == 'NP' else 0
        for tok in conll
    ]

In [22]:
def get_feature(pred_idxs, conll):
    """Pair every token's predicate flag with its argument-candidate flag.

    Args:
        pred_idxs: list of predicate indicator vectors (lists of 0/1), one
            per predicate, as produced by get_pred_idxs().
        conll: sequence of tokens of the same sentence, forwarded to
            get_arg_idxs().

    Returns:
        A list with one entry per indicator vector. Each entry is a list of
        ``[pred_flag, arg_flag]`` pairs, one pair per position of the vector.
        When a vector contains several 1s, the argument flags of the last
        one are used. When it contains none, all argument flags are 0.
    """
    result = []
    for pred_vec in pred_idxs:
        # Reset per vector: previously `arg_idxs` was only bound when a 1 was
        # found, so a vector without a predicate either raised NameError or
        # silently reused the flags computed for the previous predicate.
        arg_idxs = [0] * len(pred_vec)
        for pos, flag in enumerate(pred_vec):
            if flag == 1:
                arg_idxs = get_arg_idxs(pos, conll)

        features = [[pred_vec[k], arg_idxs[k]] for k in range(len(pred_vec))]
        result.append(features)

    return result

In [23]:
def prepare_sequence(seq, to_ix, device='cuda'):
    """Map a sequence of symbols to a tensor of vocabulary indices.

    Args:
        seq: iterable of hashable symbols (e.g. label or morpheme strings).
        to_ix: dict mapping symbol -> integer index.
        device: device the tensor is created on. Defaults to 'cuda', which
            matches the previous hard-coded ``.cuda()`` call.

    Returns:
        A 1-D ``torch.Tensor`` with one index per element of `seq`. Symbols
        missing from `to_ix` are mapped to index 0.
    """
    # dict.get is a constant-time lookup; the former `w in list(to_ix.keys())`
    # scanned the whole vocabulary for every symbol.
    idxs = [to_ix.get(w, 0) for w in seq]

    return torch.tensor(idxs, device=device)

In [24]:
def get_dps(conll):
    """Collect field 10 of every token of a sentence.

    Args:
        conll: iterable of tokens; each token is indexable up to index 10.

    Returns:
        A list holding ``token[10]`` for each token, in order.
    """
    # NOTE(review): prepare_idx() builds dp_to_ix from column 11 while this
    # reads column 10 -- confirm both refer to the same annotation layer.
    return [tok[10] for tok in conll]

# Model