In [2]:
import json
from nltk.corpus import framenet as fn
import dataio

In [161]:
# Load the training / dev / test splits of Korean FrameNet 1.1 through the project-local dataio helper.
trn, dev, tst = dataio.load_framenet_data('1.1')


### loading Korean FrameNet 1.1 data...
	# of instances in training data: 17838
	# of instances in dev data: 2548
	# of instances in test data: 5097


In [162]:
# Concatenate the three splits so the checks that iterate over `kfn` see every instance.
kfn = trn+dev+tst

In [16]:
# Show one instance: four parallel lists (tokens, lexical units, frames, BIO argument tags).
print(kfn[0])

[['태풍', 'Hugo가', '남긴', '피해들과', '회사', '내', '몇몇', '주요', '부서들의', '저조한', '실적들을', '반영하여,', 'Aetna', 'Life', 'and', 'Casualty', 'Co.의', '3분기', '순이익이', '182.6', '백만', '달러', '또는', '주당', '1.63', '달러로', '22', '%', '하락하였다.'], ['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '이익.n', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_'], ['_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_', 'Earnings_and_losses', '_', '_', '_', '_', '_', '_', '_', '_', '_', '_'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Earner', 'I-Earner', 'I-Earner', 'I-Earner', 'I-Earner', 'B-Time', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]


In [23]:
# Frame name -> frame id; the ids are the values later passed to fn.frame().
with open('../resource/info/FN17_frame2id.json','r') as f:
    f2id = json.load(f)
# Frame name -> frame index; the indices are later used as keys of the frame -> FE map.
with open('../resource/info/fn1.7_frame2idx.json','r') as f:
    frame2idx = json.load(f)

In [148]:
def gen_overall_fe2idx():
    """Build and save a global frame-element (FE) name -> index table.

    Gathers the FE names of every frame listed in ``f2id`` (hyphens replaced
    by underscores), numbers the unique names in sorted order, writes the
    table to ``../resource/info/fn1.7_fe2idx.json`` and returns it.

    Returns:
        dict mapping FE name to its position in the sorted list of names.
    """
    unique_names = {
        fe_name.replace('-', '_')
        for frame_name in f2id
        for fe_name in fn.frame(f2id[frame_name]).FE
    }
    # Sorted order makes the numbering independent of set iteration order.
    fe2idx = {name: position for position, name in enumerate(sorted(unique_names))}

    with open('../resource/info/fn1.7_fe2idx.json', 'w') as out:
        json.dump(fe2idx, out, ensure_ascii=False, indent=4)
    return fe2idx
        
# Build the FE table once and report how many distinct FE names it holds.
fe2idx = gen_overall_fe2idx()
print('# of fes:', len(fe2idx))

def gen_overall_frargmap():
    """Build and save a frame index -> sorted list of FE indices map.

    For every frame in ``f2id`` the FE names of that frame (hyphens replaced
    by underscores) are translated through ``fe2idx``; the unique indices are
    stored, sorted, under the frame's index from ``frame2idx``.  The map is
    written to ``../resource/info/fn1.7_frargmap.json`` and returned.

    Returns:
        dict mapping frame index to the sorted FE indices of that frame.
    """
    frargmap = {}
    for frame_name, frame_id in f2id.items():
        frame_index = frame2idx[frame_name]
        fe_indices = {
            fe2idx[fe_name.replace('-', '_')]
            for fe_name in fn.frame(frame_id).FE
        }
        frargmap[frame_index] = sorted(fe_indices)

    with open('../resource/info/fn1.7_frargmap.json', 'w') as out:
        json.dump(frargmap, out, ensure_ascii=False, indent=4)

    return frargmap
        
# Build the frame -> FE map once and report how many frames it covers.
frargmap = gen_overall_frargmap()
print('# of frames:', len(frargmap))

# of fes: 1285
# of frames: 1221


In [32]:
# Reverse lookups: index -> frame name and index -> FE name.
idx2frame = {index: name for name, index in frame2idx.items()}
idx2fe = {index: name for name, index in fe2idx.items()}

In [163]:
def get_wrong_fe(data=None, known_fes=None, out_path='./error_190417.tsv'):
    """Find annotated frame elements (FEs) whose name is not a known FE.

    For every instance, each argument tag that opens a span (``B-<FE>``) is
    looked up in the FE vocabulary.  Unknown FEs are reported together with
    the frame of the instance, and a TSV report is written with one line per
    unknown (frame, FE) pair: the comma-joined span-initial tokens carrying
    that label, the frame, and the FE.

    Args:
        data: Instances shaped like ``[tokens, lus, frames, args]``; defaults
            to the module-level ``kfn``.
        known_fes: Container of valid FE names; defaults to the module-level
            ``fe2idx``.
        out_path: Path of the TSV report (written as UTF-8).

    Returns:
        List of ``'<frame>-<FE>'`` strings, one per unknown pair, ordered by
        (frame, FE).
    """
    if data is None:
        data = kfn
    if known_fes is None:
        known_fes = fe2idx

    # Keyed by (frame, fe) tuples rather than a '-'-joined string: frame names
    # such as 'Chemical-sense_description' contain hyphens themselves, so a
    # joined key cannot be split back into its two parts reliably.
    span_tokens = {}
    unknown = set()

    for item in data:
        tokens, frames, args = item[0], item[2], item[3]

        # Reset per instance so a frame never leaks in from the previous one.
        frame = None
        for candidate in frames:
            if candidate != '_':
                frame = candidate
        if frame is None:
            continue

        for token, arg in zip(tokens, args):
            if not arg.startswith('B-'):
                continue
            # Everything after the 'B-' prefix is the FE name, hyphens included.
            fe = arg[2:]
            span_tokens.setdefault((frame, fe), []).append(token)
            # NOTE: membership is checked against the global FE vocabulary,
            # not against the FEs licensed by this particular frame.
            if fe not in known_fes:
                unknown.add((frame, fe))

    ordered = sorted(unknown)

    with open(out_path, 'w', encoding='utf-8') as out:
        for frame, fe in ordered:
            text = ','.join(span_tokens[(frame, fe)])
            out.write(text + '\t' + frame + '\t' + fe + '\n')

    return [frame + '-' + fe for frame, fe in ordered]
# Run the check over all splits and list the offending frame-FE labels.
errors = get_wrong_fe()
print(errors)

['Chemical-sense_description-Perceptual', 'Chemical-sense_description-Sensory']


In [103]:
def get_sent(input_word, input_frame, input_fe, data=None):
    """Print the sentences in which a token opens a given FE span of a frame.

    An instance matches when its frame equals ``input_frame`` and one of its
    tokens equals ``input_word`` while carrying the tag ``B-<input_fe>``.  For
    every such token the lexical unit and the space-joined sentence are
    printed.

    Args:
        input_word: Token to look for (exact match).
        input_frame: Frame name the instance must be annotated with.
        input_fe: FE name; only span-initial (``B-``) tags are considered.
        data: Instances shaped like ``[tokens, lus, frames, args]``; defaults
            to the module-level ``kfn``.
    """
    if data is None:
        data = kfn

    wanted_tag = 'B-' + input_fe
    for item in data:
        tokens, lus, frames, args = item[0], item[1], item[2], item[3]

        # Reset per instance: without this, an instance lacking a frame would
        # be judged by the frame / LU of the previous instance.
        frame, lu = None, None
        for f, l in zip(frames, lus):
            if f != '_':
                frame, lu = f, l
        if frame is None or frame != input_frame:
            continue

        text = ' '.join(tokens)
        for token, arg in zip(tokens, args):
            if token == input_word and arg == wanted_tag:
                print('lu:', lu)
                print(text)


# Example query: sentences of frame Posture where the token '한' opens a Supporting_body_part span.
input_word = '한'
input_frame = 'Posture'
input_fe = 'Supporting_body_part'
get_sent(input_word, input_frame, input_fe)

lu: 자세.n
플라톤 신학인지 뭔지가 들어오면서, 그런 성서가 만들어졌다고 생각합니다만, 이 소설의 도입은, 그런 말과는 달리 궁지에 몰린 소녀가 한 발로 선 자세로 시작됩니다.


In [119]:
def lines2data(lines):
    """Group the lines of a CoNLL-style TSV file into one record per sentence.

    Lines starting with ``#`` are header comments: one containing ``text`` is
    stored under ``'second'``, any other under ``'first'``.  Every other
    non-blank line is split on tabs and collected under ``'conll'``.  A blank
    line ends the current record.

    A final record that is not followed by a blank line is still emitted, and
    runs of blank lines do not produce empty records.

    Args:
        lines: Iterable of raw text lines (surrounding whitespace is stripped).

    Returns:
        List of dicts holding ``'conll'`` (a list of column lists) plus
        ``'first'`` / ``'second'`` when the corresponding header was seen.
    """
    result = []
    record = {}
    rows = []
    for line in lines:
        line = line.strip()
        if line.startswith('#'):
            key = 'second' if 'text' in line else 'first'
            record[key] = line
        elif line:
            rows.append(line.split('\t'))
        elif record or rows:
            # Blank line closes the sentence; skipped when nothing was
            # collected so repeated blank lines yield no empty records.
            record['conll'] = rows
            result.append(record)
            record, rows = {}, []

    # Input without a trailing blank line: flush the sentence still pending.
    if record or rows:
        record['conll'] = rows
        result.append(record)
    return result

def load_ori_data(fname):
    """Read the file at ``fname`` and return its contents as parsed by ``lines2data``."""
    with open(fname, 'r') as handle:
        return lines2data(handle.readlines())

In [115]:
# Load the Korean FrameNet 1.0 splits as well.
# NOTE(review): a, b and c are not referenced by any other cell shown here - confirm whether this load is still needed.
a,b,c = dataio.load_framenet_data('1.0')


### loading Korean FrameNet 1.0 data...
	# of instances in training data: 12431
	# of instances in dev data: 624
	# of instances in test data: 4382


In [160]:
def error_correction(fname, mapping_file='./fe_910416mapping_table_.csv'):
    """Regenerate ``fname`` from its ``.bak`` copy with FE labels renamed per frame.

    The mapping table is a CSV whose rows are ``frame,original_fe,new_fe``.
    For each sentence of ``fname + '.bak'`` whose frame appears in the table,
    argument tags ``B-<original_fe>`` / ``I-<original_fe>`` become
    ``B-<new_fe>`` / ``I-<new_fe>``; a ``new_fe`` of ``O`` removes the label
    (the tag becomes plain ``O``).  The result is written to ``fname``,
    overwriting it.

    Every tag is rewritten at most once, so a label produced by one row of the
    table is never fed back into another row (no chained or swapped renames).

    Args:
        fname: Path of the TSV file to regenerate; ``fname + '.bak'`` is the
            input and is left untouched.
        mapping_file: Path of the CSV mapping table.
    """
    # frame -> {original FE: replacement FE}
    mapping_table = {}
    with open(mapping_file, 'r') as f:
        for row in f:
            row = row.strip()
            if not row:
                continue  # tolerate blank lines such as a trailing newline
            fields = row.split(',')
            frame, ori_fe, new_fe = fields[0], fields[1], fields[2]
            # If an FE is listed twice for the same frame, the first row wins.
            mapping_table.setdefault(frame, {}).setdefault(ori_fe, new_fe)

    ori_data = load_ori_data(fname + '.bak')

    for item in ori_data:
        # Reset per sentence so a frame never leaks in from the previous one.
        frame = None
        for tok in item['conll']:
            if tok[3] != '_':
                frame = tok[3]

        renames = mapping_table.get(frame)
        if not renames:
            continue

        for tok in item['conll']:
            prefix, sep, fe = tok[4].partition('-')
            if sep and prefix in ('B', 'I') and fe in renames:
                new_fe = renames[fe]
                tok[4] = 'O' if new_fe == 'O' else prefix + '-' + new_fe

    with open(fname, 'w') as f:
        for item in ori_data:
            f.write(item['first'] + '\n')
            f.write(item['second'] + '\n')
            for tok in item['conll']:
                f.write('\t'.join(tok) + '\n')
            f.write('\n')
        
# Regenerate every split of both dataset versions; error_correction reads '<file>.bak' and overwrites '<file>'.
files = ['../data/1.0/training.tsv', '../data/1.0/dev.tsv', '../data/1.0/test.tsv', '../data/1.1/training.tsv', '../data/1.1/dev.tsv', '../data/1.1/test.tsv']

for fname in files:
    error_correction(fname)