In [1]:
import json
from src import dataio, etri
import targetid
import torch
from torch import nn
import os
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of CUDA devices reported by PyTorch.
n_gpu = torch.cuda.device_count()

from koreanframenet.src import conll2textae

from konlpy.tag import Kkma
from pprint import pprint

Using TensorFlow backend.


Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
### Korean FrameNet ###
	# contact: hahmyg@kaist, hahmyg@gmail.com #

pos: [('헤밍웨이', 'UN'), ('는', 'JX')]
targets: []
pos: [('미국', 'NNP'), ('에서', 'JKM')]
targets: ['미국']
pos: [('태어나', 'VV'), ('었', 'EPT'), ('다', 'EFN'), ('.', 'SF')]
targets: ['태어나']
result
[[['헤밍웨이는', '미국에서', '태어났다.'], ['_', '미국.n', '_']], [['헤밍웨이는', '미국에서', '태어났다.'], ['_', '_', '태어나다.v']]]


# SETTINGS

In [2]:
# Resolve the resource root. `__file__` exists when this code runs as a
# script/module but not inside a notebook kernel; only that specific failure
# (NameError) should trigger the fallback to the fixed resource directory.
try:
    dir_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    dir_path = '/disk_4/resource'

# Release number used to build the model file name below.
version = 1.1
# Path of the serialized frame-identification model for this release.
frameid_model_path = dir_path+'/models/kfn/frameid-'+str(version)+'.pt'

In [3]:
# Common prefix of the version-specific index files.
data_path = './koreanframenet/resource/info/kfn'+str(version)+'_'

# JSON text is UTF-8; pass the encoding explicitly so that loading does not
# depend on the platform's locale default.
with open(data_path+'lu2idx.json', 'r', encoding='utf-8') as f:
    lu2idx = json.load(f)
with open('./koreanframenet/resource/info/fn1.7_frame2idx.json', 'r', encoding='utf-8') as f:
    sense2idx = json.load(f)
with open(data_path+'lufrmap.json', 'r', encoding='utf-8') as f:
    lusensemap = json.load(f)

# Inverse lookups: index -> name (on duplicate indices the last entry wins).
idx2sense = {idx: sense for sense, idx in sense2idx.items()}
idx2lu = {idx: lu for lu, idx in lu2idx.items()}

In [4]:
class models():
    """Inference wrapper around the serialized frame-identification model.

    Construction builds the BERT input converter (``dataio.for_BERT``) and
    loads the model stored at ``frameid_model_path``.
    """

    def __init__(self, mode='parser', version=1.0):
        """Create the input converter and load the model.

        Args:
            mode: forwarded to ``dataio.for_BERT``.
            version: forwarded to ``dataio.for_BERT``.
        """
        self.version = version
        self.mode = mode
        self.bert_io = dataio.for_BERT(mode=self.mode, version=self.version)
        # NOTE: torch.load unpickles the file, so only trusted model files
        # must be used here.
        # map_location places the model on the same device the input batches
        # are moved to, so a CUDA-saved model also loads on a CPU-only host.
        self.frameid_model = torch.load(frameid_model_path, map_location=device)
        # Inference only: torch.no_grad() does not disable dropout, eval() does.
        self.frameid_model.eval()

    def frame_identifier(self, tgt_data):
        """Predict one frame for every target-annotated instance.

        Args:
            tgt_data: list of instances; element 0 of an instance is its token
                list and element 1 the per-token target column, where '_'
                marks a token that is not a target.

        Returns:
            (conll, tuples). ``conll`` holds one new row per instance: the
            input row followed by a per-token frame column ('_' for
            non-target tokens). ``tuples`` holds one ``(frame, score)`` pair
            per instance, where ``score`` is the softmax probability of the
            chosen frame among the frames allowed by the mask. The rows of
            ``tgt_data`` are not modified.
        """
        bert_inputs = self.bert_io.convert_to_bert_input_frameid(tgt_data)
        dataloader = DataLoader(bert_inputs, sampler=None, batch_size=1)
        # The candidate logits form a 1-D tensor, so normalise over dim 0.
        softmax = nn.Softmax(dim=0)

        predictions, scores = [], []
        for batch in dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_tgt_idxs, b_lus, b_masks = batch
            with torch.no_grad():
                logits = self.frameid_model(b_input_ids, token_type_ids=None, tgt_idxs=b_tgt_idxs,
                                lus=b_lus, attention_mask=b_masks)
            logits = logits.detach().cpu().numpy()
            masks = self.bert_io.get_masks(b_lus, model='frameid').to(device)

            for b_idx in range(len(logits)):
                logit = logits[b_idx]
                mask = masks[b_idx]
                # Only frames with a positive mask entry compete.
                cand_idxs = [fr_idx for fr_idx in range(len(mask)) if mask[fr_idx] > 0]
                cand_logits = torch.tensor([logit[0][fr_idx].item() for fr_idx in cand_idxs])
                score, best = softmax(cand_logits).max(0)
                predictions.append([int(cand_idxs[int(best)])])
                scores.append(float(score))

        pred_tags = self.bert_io.idx2tag(predictions)
        conll, tuples = [], []
        for i, instance in enumerate(tgt_data):
            targets = instance[1]
            frames = [pred_tags[i] if target != '_' else '_' for target in targets]
            # Build a new row rather than appending to the caller's list, so
            # running this twice on the same input cannot stack frame columns.
            conll.append(list(instance) + [frames])
            tuples.append((pred_tags[i], scores[i]))

        return conll, tuples

In [5]:
# Build the model wrapper once for the whole notebook; constructing it loads
# the model from frameid_model_path.
fn_models = models(mode='parser', version=version)

In [6]:
# Kkma tagger instance from konlpy.tag (not referenced by the other cells shown here).
kkma = Kkma()
def doc2sents(text):
    """Split a document into sentences, each paired with a string URI.

    The text is cut at every '. ' (period followed by a space). Each piece is
    stripped of surrounding whitespace, empty pieces are dropped, and a
    closing '.' is restored only when the piece does not already end with
    sentence-final punctuation, so the last sentence of a document is not
    turned into '...다..'.

    Args:
        text: the document as one string.

    Returns:
        List of ``(sentence, uri)`` tuples, where ``uri`` is 'test_0_<n>' and
        ``n`` counts the kept sentences from 0.
    """
    result = []
    for piece in text.split('. '):
        sent = piece.strip()
        if not sent:
            continue
        if not sent.endswith(('.', '!', '?')):
            sent += '.'
        result.append((sent, 'test_0_' + str(len(result))))
    return result

In [59]:
def result2triples(text, conll, tuples, stringuri):
    """Convert annotated instances of one sentence into (s, p, o) triples.

    Args:
        text: the sentence text.
        conll: list of instances ``[tokens, targets, frames, args]``; the
            frame column uses '_' for tokens without a frame and the args
            column holds BIO tags ('B-<fe>', 'I-<fe>', anything else).
        tuples: per-instance pairs whose second element is the frame score;
            aligned with ``conll`` by position.
        stringuri: identifier of the sentence.

    Returns:
        List of triples. The first is ``(uri, 'nif:isString', text)``. Every
        instance that carries a frame adds provenance, lexical-unit and score
        triples, then one 'frdf:arg' triple per B-/I- span (tokens joined by
        a space). Instances without any frame are skipped.
    """
    uri = str(stringuri)
    triples = [(uri, 'nif:isString', text)]

    for i, instance in enumerate(conll):
        tokens, targets, frames, args = instance[0], instance[1], instance[2], instance[3]

        # Reset per instance so a frame-less instance can never inherit the
        # frame/lu of the previous one. If several tokens carry a frame, the
        # last one is used.
        frame, lu = None, None
        for tok in range(len(tokens)):
            if frames[tok] != '_':
                frame = 'frame:'+frames[tok]
                lu = targets[tok]
        if frame is None:
            continue

        # The predicate spelling 'frdf:provinence' is kept unchanged because
        # consumers of the triples may match on it.
        triples.append((frame, 'frdf:provinence', uri))
        triples.append((frame, 'frdf:lu', lu))
        triples.append((frame, 'frdf:score', str(tuples[i][1])))

        # Arguments: each 'B-<fe>' opens a span that extends over the
        # directly following 'I-<fe>' tags with the same label.
        for idx, arg_tag in enumerate(args):
            if not arg_tag.startswith('B-'):
                continue
            # Keep everything after 'B-' so labels containing '-' stay intact.
            fe_tag = arg_tag[2:]
            arg_tokens = [tokens[idx]]
            next_idx = idx + 1
            while next_idx < len(args) and args[next_idx] == 'I-'+fe_tag:
                arg_tokens.append(tokens[next_idx])
                next_idx += 1
            triples.append((frame, 'frdf:arg', ' '.join(arg_tokens)))

    return triples

In [8]:
def sent2pa(text):
    """Return the predicate-argument analysis of ``text`` from the ETRI helpers.

    The text is analysed with ``etri.getETRI``, the analysis is converted with
    ``etri.getETRI_CoNLL2009``, and both are handed to ``etri.phrase_parser``,
    whose result is returned unchanged.
    """
    etri_analysis = etri.getETRI(text)
    return etri.phrase_parser(etri.getETRI_CoNLL2009(etri_analysis), etri_analysis)

In [31]:
def arg_identifier(conll):
    """Append a BIO argument column to every frame-annotated instance.

    The sentence is rebuilt from the token list of the first instance and
    parsed once with ``sent2pa``. For each instance, the arguments of the
    predicate whose id equals the index of the instance's target token are
    tagged 'B-ARG' (first token of an argument) or 'I-ARG' (following
    tokens); all other positions stay 'O'.

    Args:
        conll: list of instances; element 0 is the token list and element 1
            the target column, where '_' marks a non-target token.

    Returns:
        A list with one new row per instance: the input row followed by the
        argument column. The input rows are not modified. An empty input
        yields an empty list without calling the parser.
    """
    if not conll:
        # No targets were found for this sentence: there is nothing to
        # annotate and no conll[0] to read the tokens from.
        return []

    text = ' '.join(conll[0][0])
    pas = sent2pa(text)

    result = []
    for anno in conll:
        targets = anno[1]
        args = ['O'] * len(targets)

        # Index of the target token; reset per row so a row without a target
        # cannot reuse the previous row's index. The last marked token wins.
        target_idx = None
        for idx, target in enumerate(targets):
            if target != '_':
                target_idx = idx

        if target_idx is not None:
            for pa in pas:
                if target_idx != pa['predicate']['id']:
                    continue
                for argument in pa['arguments']:
                    for position, token_id in enumerate(argument['tokens']):
                        bio = 'B-' if position == 0 else 'I-'
                        args[token_id] = bio + 'ARG'

        # New row instead of appending to the caller's list.
        result.append(list(anno) + [args])
    return result

In [50]:
def main(data, sentence_id):
    """Run the whole pipeline on one sentence.

    Steps, in order: preprocessing, target identification, frame
    identification, argument identification, then conversion of the result
    into triples and into the TextAE representation.

    Args:
        data: the sentence text.
        sentence_id: identifier used for the sentence in the triples.

    Returns:
        dict with the triples under 'graph' and the ``conll2textae.get_textae``
        output under 'textae'.
    """
    preprocessed = dataio.preprocessor(data)
    target_annotations = targetid.baseline(preprocessed)
    frame_annotations, frame_scores = fn_models.frame_identifier(target_annotations)
    arg_annotations = arg_identifier(frame_annotations)

    return {
        'graph': result2triples(data, arg_annotations, frame_scores, sentence_id),
        'textae': conll2textae.get_textae(arg_annotations),
    }

In [60]:
# Demo: parse a single sentence and print the resulting triples.
text = '헤밍웨이는 1899년 7월 21일 미국 일리노이에서 태어났고, 1961년 7월 2일 아이다호 주에서 사망했다.'
# Identifier passed to main() as sentence_id; it appears in the triples as the sentence URI.
stringuri = 'test:offset_0_53'
parsed = main(text, stringuri)
pprint(parsed['graph'])

[('test:offset_0_53',
  'nif:isString',
  '헤밍웨이는 1899년 7월 21일 미국 일리노이에서 태어났고, 1961년 7월 2일 아이다호 주에서 사망했다.'),
 ('frame:Origin', 'frdf:provinence', 'test:offset_0_53'),
 ('frame:Origin', 'frdf:lu', '미국.n'),
 ('frame:Origin', 'frdf:score', '1.0'),
 ('frame:Being_born', 'frdf:provinence', 'test:offset_0_53'),
 ('frame:Being_born', 'frdf:lu', '태어나다.v'),
 ('frame:Being_born', 'frdf:score', '1.0'),
 ('frame:Being_born', 'frdf:arg', '헤밍웨이는'),
 ('frame:Being_born', 'frdf:arg', '1899년 7월 21일'),
 ('frame:Being_born', 'frdf:arg', '미국 일리노이에서'),
 ('frame:Political_locales', 'frdf:provinence', 'test:offset_0_53'),
 ('frame:Political_locales', 'frdf:lu', '주.n'),
 ('frame:Political_locales', 'frdf:score', '0.9832083582878113'),
 ('frame:Death', 'frdf:provinence', 'test:offset_0_53'),
 ('frame:Death', 'frdf:lu', '사망.n'),
 ('frame:Death', 'frdf:score', '1.0'),
 ('frame:Death', 'frdf:arg', '헤밍웨이는'),
 ('frame:Death', 'frdf:arg', '1961년 7월 2일'),
 ('frame:Death', 'frdf:arg', '아이다호 주에서')]




In [61]:
# Demo: split a document into sentences, parse each one, print its triples,
# and collect the triples of all sentences in `framegraphs`.
text = '어니스트 헤밍웨이는 미국의 소설가이자 저널리스트이다. 1854년 노벨 문학상을 수상하였다. 헤밍웨이는 1899년 7월 21일 일리노이주에서 태어났다. 헤밍웨이는 풀린 파이퍼와 이혼한 뒤 마사 겔혼과 재혼하였다. 헤밍웨이는 1961년 아이다호 주에서 62세의 나이에 자살했다.'
sents = doc2sents(text)
framegraphs = []
for text, stringuri in sents:
    parsed = main(text, stringuri)
    pprint(parsed['graph'])
    print('')
    # main() returns the triples under 'graph' (there is no 'framegraph'
    # key); without this line the final pprint always shows an empty list.
    framegraphs.extend(parsed['graph'])
pprint(framegraphs)



[('test_0_0', 'nif:isString', '어니스트 헤밍웨이는 미국의 소설가이자 저널리스트이다.'),
 ('frame:Origin', 'frdf:provinence', 'test_0_0'),
 ('frame:Origin', 'frdf:lu', '미국.n'),
 ('frame:Origin', 'frdf:score', '1.0')]

[('test_0_1', 'nif:isString', '1854년 노벨 문학상을 수상하였다.'),
 ('frame:Leadership', 'frdf:provinence', 'test_0_1'),
 ('frame:Leadership', 'frdf:lu', '수상.n'),
 ('frame:Leadership', 'frdf:score', '1.0'),
 ('frame:Leadership', 'frdf:arg', '1854년'),
 ('frame:Leadership', 'frdf:arg', '노벨 문학상을')]

[('test_0_2', 'nif:isString', '헤밍웨이는 1899년 7월 21일 일리노이주에서 태어났다.'),
 ('frame:Being_born', 'frdf:provinence', 'test_0_2'),
 ('frame:Being_born', 'frdf:lu', '태어나다.v'),
 ('frame:Being_born', 'frdf:score', '1.0'),
 ('frame:Being_born', 'frdf:arg', '헤밍웨이는'),
 ('frame:Being_born', 'frdf:arg', '1899년 7월 21일'),
 ('frame:Being_born', 'frdf:arg', '일리노이주에서')]

[('test_0_3', 'nif:isString', '헤밍웨이는 풀린 파이퍼와 이혼한 뒤 마사 겔혼과 재혼하였다.'),
 ('frame:Change_of_phase', 'frdf:provinence', 'test_0_3'),
 ('frame:Change_of_phase', 'frdf:lu', '풀리다.