In [1]:
import json
import sys
sys.path.append('../')

from KAIST_frame_parser.src import dataio, etri
from KAIST_frame_parser.src import targetid
import torch
from torch import nn
import os
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from pytorch_pretrained_bert import BertTokenizer, BertConfig, BertModel
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Number of CUDA devices reported by torch for this process.
n_gpu = torch.cuda.device_count()

from KAIST_frame_parser.koreanframenet.src import conll2textae
from KAIST_frame_parser.koreanframenet import koreanframenet

from pprint import pprint

Using TensorFlow backend.


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
### Korean FrameNet ###
	# contact: hahmyg@kaist, hahmyg@gmail.com #



In [2]:
# Which FrameNet to work with: 'en' (English FrameNet) or 'ko' (Korean FrameNet).
language = 'en'
version = 1.7
if language == 'en':
    framenet = 'fn'+str(version)
    # NOTE: machine-specific location of the English FrameNet release; adjust per host.
    fn_dir = '/disk_4/resource/fn'+str(version)
    trn, dev, tst = dataio.load_fn_data(fn_dir)
elif language == 'ko':
    framenet = 'kfn'+str(version)
    kfn = koreanframenet.interface(version=version)
    trn, dev, tst = kfn.load_data()
else:
    # Fail here with a clear message instead of a NameError on `framenet`/`trn` further down.
    raise ValueError("unsupported language %r (expected 'en' or 'ko')" % (language,))

try:
    target_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # `__file__` is not defined when the code runs in an interactive kernel;
    # fall back to the current working directory.
    target_dir = '.'

# save your model to
model_dir = target_dir+'/models/'+framenet+'/'
result_dir = target_dir+'/result/'

In [3]:
# Show the first training instance. As the printed output shows, it is four parallel
# per-token lists: the tokens, the lexical-unit markers ('_' except at the target),
# the frame markers ('_' except at the target), and BIO-style argument tags.
print(trn[0])

[['Paula_Zahn', ':', 'Questions', 'about', 'the', 'facts', 'or', 'what', 'were', 'presented', 'as', 'facts', 'that', 'led', 'the', 'United', 'States', 'into', 'the', 'war', 'in', 'Iraq', 'spilled', 'into', 'open', 'warfare', 'today', 'on', 'the', 'Senate', 'floor', '.'], ['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'spill.v', '_', '_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'Fluidic_motion', '_', '_', '_', '_', '_', '_', '_', '_', '_'], ['O', 'O', 'B-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'I-Fluid', 'O', 'B-Goal', 'I-Goal', 'I-Goal', 'B-Time', 'B-Place', 'I-Place', 'I-Place', 'I-Place', 'O']]


In [4]:
try:
    target_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    # `__file__` is not defined in an interactive kernel; use the working directory.
    target_dir = '.'

# Directory holding the label <-> index JSON resources.
data_path = target_dir+'/koreanframenet/resource/info/'

# JSON is read as UTF-8 explicitly so the result does not depend on the locale's
# default encoding (the lexical-unit resources may contain non-ASCII text).
# The LU resources depend on the selected FrameNet (`framenet`), while the frame and
# frame-element inventories are always taken from the fn1.7 files.
with open(data_path+framenet+'_lu2idx.json', 'r', encoding='utf-8') as f:
    lu2idx = json.load(f)
with open(data_path+'fn1.7_frame2idx.json', 'r', encoding='utf-8') as f:
    frame2idx = json.load(f)
with open(data_path+framenet+'_lufrmap.json', 'r', encoding='utf-8') as f:
    lufrmap = json.load(f)
with open(data_path+'fn1.7_fe2idx.json', 'r', encoding='utf-8') as f:
    arg2idx = json.load(f)

# Reverse lookups: index -> label.
idx2frame = {idx: frame for frame, idx in frame2idx.items()}
idx2lu = {idx: lu for lu, idx in lu2idx.items()}
idx2arg = {idx: arg for arg, idx in arg2idx.items()}

In [5]:
# Root directory used by the cells below to locate resource files.
try:
    dir_path = os.path.dirname( os.path.abspath( __file__ ))
except NameError:
    # `__file__` is not defined when running in an interactive kernel;
    # fall back to the current working directory.
    dir_path = '.'

In [18]:
class for_BERT():
    """Helper that turns frame-annotated sentences into BERT inputs.

    On construction it loads the label/index JSON resources found under
    ``dir_path + '/koreanframenet/resource/info/'`` and builds two tokenizers:
    the pretrained multilingual cased BERT tokenizer, and a second tokenizer
    built from a local vocabulary file that never splits ``<tgt>`` or the
    bracketed frame tokens (``[FrameName]``).

    Parameters
    ----------
    mode : str
        ``'training'`` makes ``convert_to_bert_input_frameid`` include gold
        frame labels in its output; any other value omits them.
    language : str
        ``'en'`` selects the ``fn<version>_`` LU resources, anything else the
        ``kfn<version>_`` ones.
    version : float or str
        Resource version, used (as a string) in the LU resource file names.
    """

    # Padded sequence length used when neither a `max_len` argument nor a
    # module-level MAX_LEN is available.
    # NOTE(review): 256 is a fallback chosen here; confirm it matches the
    # length used when the models were trained.
    DEFAULT_MAX_LEN = 256

    def __init__(self, mode='training', language='ko', version=1.0):
        version = str(version)
        self.mode = mode
        info_dir = dir_path+'/koreanframenet/resource/info/'
        if language == 'en':
            data_path = info_dir+'fn'+version+'_'
        else:
            data_path = info_dir+'kfn'+version+'_'

        # LU resources depend on language/version; the frame and frame-element
        # inventories are always read from the fn1.7 files.
        self.lu2idx = self._load_json(data_path+'lu2idx.json')
        self.frame2idx = self._load_json(info_dir+'fn1.7_frame2idx.json')
        self.lufrmap = self._load_json(data_path+'lufrmap.json')
        self.arg2idx = self._load_json(info_dir+'fn1.7_fe2idx.json')
        self.frargmap = self._load_json(info_dir+'fn1.7_frargmap.json')
        self.bio_arg2idx = self._load_json(info_dir+'fn1.7_bio_fe2idx.json')
        self.bio_frargmap = self._load_json(info_dir+'fn1.7_bio_frargmap.json')

        # Reverse lookups: index -> label.
        self.idx2frame = {idx: label for label, idx in self.frame2idx.items()}
        self.idx2lu = {idx: label for label, idx in self.lu2idx.items()}
        self.idx2arg = {idx: label for label, idx in self.arg2idx.items()}
        self.idx2bio_arg = {idx: label for label, idx in self.bio_arg2idx.items()}

        # load pretrained BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

        # load BERT tokenizer with untokenizing frames: the special tokens,
        # '<tgt>' and every '[Frame]' token are passed as never-split tokens.
        never_split_tuple = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]", '<tgt>')
        never_split_tuple += tuple('['+frame+']' for frame in self.frame2idx)
        vocab_file_path = dir_path+'/data/bert-multilingual-cased-dict-add-frames'
        self.tokenizer_with_frame = BertTokenizer(vocab_file_path, do_lower_case=False, max_len=512, never_split=never_split_tuple)

    @staticmethod
    def _load_json(path):
        """Read one JSON resource file, decoding it as UTF-8."""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def _tokenize_and_align(text, tokenizer, first_token, assign_to_last):
        """Word-piece tokenize `text` and map each original token to one piece.

        `text` is split on single spaces. The returned map holds, per original
        token, the index (in the returned piece list) of its first piece, or
        of its last piece when `assign_to_last` is true. The piece list starts
        with `first_token` and ends with "[SEP]".
        """
        orig_tokens = text.split(' ')
        bert_tokens = [first_token]
        orig_to_tok_map = []
        for orig_token in orig_tokens:
            if not assign_to_last:
                orig_to_tok_map.append(len(bert_tokens))
            bert_tokens.extend(tokenizer.tokenize(orig_token))
            if assign_to_last:
                orig_to_tok_map.append(len(bert_tokens)-1)
        bert_tokens.append("[SEP]")
        return orig_tokens, bert_tokens, orig_to_tok_map

    @staticmethod
    def _project_labels(labels, orig_to_tok_map, n_bert_tokens):
        """Spread per-word labels over word-piece positions.

        A position listed in `orig_to_tok_map` receives the label of the first
        word mapped to it; every other position receives '_'.
        """
        word_at = {}
        for word_idx, tok_idx in enumerate(orig_to_tok_map):
            # keep the first word for a position, like list.index() would
            word_at.setdefault(tok_idx, word_idx)
        return [labels[word_at[pos]] if pos in word_at else '_'
                for pos in range(n_bert_tokens)]

    def idx2tag(self, predictions, model='frameid'):
        """Flatten `predictions` (an iterable of index sequences) into labels.

        `model` selects the label inventory: 'frameid' (frames),
        'argclassification' (frame elements) or 'argid' (BIO frame elements).
        Raises ValueError for any other `model`.
        """
        if model == 'frameid':
            idx2label = self.idx2frame
        elif model == 'argclassification':
            idx2label = self.idx2arg
        elif model == 'argid':
            idx2label = self.idx2bio_arg
        else:
            raise ValueError("unknown model %r" % (model,))
        return [idx2label[p_i] for p in predictions for p_i in p]

    def get_masks(self, datas, model='frameid'):
        """Build one 0/1 candidate-label mask per item of `datas`.

        For each item, `item[0]` is converted to int and looked up in the map
        selected by `model` (string key first, then integer key); the listed
        candidate indices are set to 1. Returns the masks stacked into a
        single tensor. Raises ValueError for an unknown `model`.
        """
        if model == 'frameid':
            mapdata = self.lufrmap
            num_label = len(self.frame2idx)
        elif model == 'argclassification':
            mapdata = self.frargmap
            num_label = len(self.arg2idx)
        elif model == 'argid':
            mapdata = self.bio_frargmap
            num_label = len(self.bio_arg2idx)
        else:
            raise ValueError("unknown model %r" % (model,))
        masks = []
        for idx in datas:
            mask = torch.zeros(num_label)
            key = int(idx[0])
            try:
                # maps loaded from JSON have string keys
                candis = mapdata[str(key)]
            except (KeyError, TypeError):
                # fall back to integer keys / positional lookup
                candis = mapdata[key]
            for candi_idx in candis:
                mask[candi_idx] = 1
            masks.append(mask)
        masks = torch.stack(masks)
        return masks

    # bert tokenizer and assign to the first token
    def bert_tokenizer(self, text):
        """Tokenize with "[CLS]" first; map each word to its first piece."""
        return self._tokenize_and_align(text, self.tokenizer, "[CLS]", False)

    # bert tokenizer and assign to the last token
    def bert_tokenizer_assign_to_last_token(self, text):
        """Tokenize with "[CLS]" first; map each word to its last piece."""
        return self._tokenize_and_align(text, self.tokenizer, "[CLS]", True)

    def bert_tokenizer_with_frame(self, text, frame):
        """Tokenize with the frame-aware tokenizer.

        The sequence starts with "[<frame>]" instead of "[CLS]", and each word
        is mapped to its last piece.
        """
        return self._tokenize_and_align(text, self.tokenizer_with_frame, "["+frame+"]", True)

    def convert_to_bert_input_frameid(self, input_data, max_len=None):
        """Convert annotated sentences into a TensorDataset for frame identification.

        Each item of `input_data` is indexed as: [0] tokens, [1] per-token LU
        labels ('_' for non-targets) and, in training mode, [2] per-token
        frame labels. Only the first non-'_' LU (and frame) of a sentence is used.

        `max_len` is the padded/truncated sequence length. When omitted, a
        module-level ``MAX_LEN`` is used if one exists, otherwise
        ``DEFAULT_MAX_LEN`` (previously a missing ``MAX_LEN`` raised NameError).

        Returns a TensorDataset of (input ids, target position, LU index,
        [frame index, only in training mode], attention mask).
        """
        if max_len is None:
            max_len = globals().get('MAX_LEN', self.DEFAULT_MAX_LEN)
        training = self.mode == 'training'

        tokenized_texts, lus, frames = [], [], []
        for data in input_data:
            text = ' '.join(data[0])
            orig_tokens, bert_tokens, orig_to_tok_map = self.bert_tokenizer(text)
            tokenized_texts.append(bert_tokens)
            lus.append(self._project_labels(data[1], orig_to_tok_map, len(bert_tokens)))
            if training:
                frames.append(self._project_labels(data[2], orig_to_tok_map, len(bert_tokens)))

        input_ids = pad_sequences([self.tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=max_len, dtype="long", truncating="post", padding="post")

        tgt_seq, lu_seq, frame_seq = [], [], []
        for sent_idx, lu_items in enumerate(lus):
            tgt, lu = [], []
            for pos, item in enumerate(lu_items):
                if item != '_':
                    # only the first target of the sentence is kept
                    tgt.append(pos)
                    lu.append(self.lu2idx[item])
                    break
            tgt_seq.append(tgt)
            lu_seq.append(lu)

            if training:
                frame = []
                for item in frames[sent_idx]:
                    if item != '_':
                        frame.append(self.frame2idx[item])
                        break
                frame_seq.append(frame)

        # padding id is 0, so every non-zero id is a real token
        attention_masks = [[float(i>0) for i in ii] for ii in input_ids]
        data_inputs = torch.tensor(input_ids)
        data_tgt_idx = torch.tensor(tgt_seq)
        data_lus = torch.tensor(lu_seq)
        data_masks = torch.tensor(attention_masks)

        if training:
            data_frames = torch.tensor(frame_seq)
            bert_inputs = TensorDataset(data_inputs, data_tgt_idx, data_lus, data_frames, data_masks)
        else:
            bert_inputs = TensorDataset(data_inputs, data_tgt_idx, data_lus,data_masks)
        return bert_inputs

In [19]:
# Instantiate the notebook's for_BERT class in training mode, using the
# `language` and `version` values set in the configuration cell.
bert_io = for_BERT(mode='training', language=language, version=version)

In [21]:
# Demo sentence in which the target word is wrapped in '<tgt>' markers.
text = '나는 <tgt> 밥을 <tgt> 먹었다'
# Frame name that is placed, in brackets, at the start of the token sequence.
frame = 'Ingesting'

# The printed output shows '[Ingesting]' as the first token and '<tgt>' kept as a
# single token, while ordinary words are split into word pieces.
orig_tokens, bert_tokens, orig_to_tok_map = bert_io.bert_tokenizer_with_frame(text, frame)
print(orig_tokens)
print(bert_tokens)

['나는', '<tgt>', '밥을', '<tgt>', '먹었다']
['[Ingesting]', '나는', '<tgt>', '밥', '##을', '<tgt>', '먹', '##었다', '[SEP]']


# Generate BERT input representation

In [None]:
# NOTE: this whole cell is disabled (commented out). To re-enable it, `batch_size` must be
# defined first -- it is not set in the visible part of this notebook -- and
# `convert_to_bert_input_arg_classifier` must exist on `dataio.for_BERT` (TODO confirm; the
# for_BERT class defined in this notebook only provides `convert_to_bert_input_frameid`).
# print('generate BERT input representation ...')
# bert_io = dataio.for_BERT(mode='training', language=language, version=version)

# trn_data = bert_io.convert_to_bert_input_arg_classifier(trn)
# trn_sampler = RandomSampler(trn_data)
# trn_dataloader = DataLoader(trn_data, sampler=trn_sampler, batch_size=batch_size)

# dev_data = bert_io.convert_to_bert_input_arg_classifier(dev)
# dev_sampler = RandomSampler(dev_data)
# dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=batch_size)

# tst_data = bert_io.convert_to_bert_input_arg_classifier(tst)
# tst_sampler = RandomSampler(tst_data)
# tst_dataloader = DataLoader(tst_data, sampler=tst_sampler, batch_size=batch_size)
# print('... is done')