In [1]:
import json

In [2]:
# Release of the Korean FrameNet resource; it is embedded in the paths below.
version = 1.0

# Directory holding the KFN JSON files for this release.
kfn_path = '../resource/{}'.format(version)
# JSON file with the FrameNet 1.7 frame entries (read by load_fn17).
fn_path = '../resource/info/FN17_frame_id.json'
# Output directory of the web viewer data for this release.
web_path = '../../koreanframenet-web/data/kolu_{}/'.format(version)

In [3]:
def load_kfn():
    """Load the Korean FrameNet annotations from ``KFN_annotations.json``.

    The file is read from the directory named by the module-level ``kfn_path``.

    Returns:
        The parsed JSON content. The rest of this notebook indexes it by
        annotation-id strings, each mapping to one annotation dict.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Decode explicitly as UTF-8: the annotations contain Korean text, and the
    # platform default encoding is not UTF-8 everywhere (e.g. Windows locales).
    with open(kfn_path+'/KFN_annotations.json', 'r', encoding='utf-8') as f:
        kfn = json.load(f)
    return kfn
kfn = load_kfn()

In [4]:
def load_fn17():
    """Load the FrameNet frame entries from the JSON file at ``fn_path``.

    Returns:
        The parsed JSON content. ``get_fnid`` iterates over it and reads the
        ``'frame'`` and ``'id'`` keys of each entry.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # so the notebook reads the file the same way on every machine.
    with open(fn_path, 'r', encoding='utf-8') as f:
        fn = json.load(f)
    return fn
fn = load_fn17()
def get_fnid(frame):
    """Return the ``'id'`` of the first entry of ``fn`` whose ``'frame'`` equals *frame*.

    Entries are scanned in the order they appear in the module-level ``fn``.
    Returns ``None`` when no entry matches.
    """
    matching_ids = (entry['id'] for entry in fn if entry['frame'] == frame)
    return next(matching_ids, None)

In [5]:
def make_key(string):
    """Build the sort key for an LU string: hyphens removed, then upper-cased."""
    without_hyphens = ''.join(string.split('-'))
    return without_hyphens.upper()
def get_surface_form(target_span, text):
    """Return the slice of *text* between ``target_span['begin']`` and ``target_span['end']``.

    The slice follows normal Python semantics: ``begin`` is inclusive and
    ``end`` is exclusive.
    """
    start = target_span['begin']
    stop = target_span['end']
    return text[start:stop]
def get_fes(denos):
    """Collect the distinct ``'obj'`` values of the ARGUMENT denotations.

    Args:
        denos: iterable of denotation dicts; each must have a ``'role'`` key,
            and those whose role is ``'ARGUMENT'`` must also have ``'obj'``.

    Returns:
        A list without duplicates, in order of first appearance in *denos*.
        (Going through ``set`` would return the same elements, but in an order
        that changes between interpreter runs because of string hash
        randomisation.)
    """
    seen = set()
    fes = []
    for deno in denos:
        if deno['role'] != 'ARGUMENT':
            continue
        fe = deno['obj']
        if fe not in seen:
            seen.add(fe)
            fes.append(fe)
    return fes
    

def gen_lus():
    """Aggregate the loaded annotations into one record per lexical unit (LU).

    Reads the module-level ``kfn`` (annotation-id string -> annotation dict)
    and writes ``KFN_lus.json`` into the ``kfn_path`` directory. The output maps
    each LU id to a record with these keys:

    * ``lu``             -- the LU string as found in the annotation
    * ``pos``            -- second dot-separated field of the LU string
    * ``lexeme``         -- first dot-separated field of the LU string
    * ``frame``          -- frame name of the first annotation seen for the LU
    * ``fid``            -- ``int`` of what ``get_fnid`` returns for that frame
    * ``surface_forms``  -- sorted distinct target strings cut out of the texts
    * ``annotation_ids`` -- sorted ids of all annotations of the LU

    Prints the number of LUs saved; returns nothing.

    Raises:
        TypeError: if ``get_fnid`` finds no entry for a frame (it then returns
            ``None``, which ``int`` rejects).
    """
    # LU ids are the 1-based positions in the make_key-sorted list of distinct
    # LU strings. gen_web_files derives index.json the same way, so this
    # construction is kept as it is to keep both id assignments in step.
    lu_items = list(set(d['lu'] for d in kfn.values()))
    lu_items.sort(key=make_key)
    lu_index = {lu: n for n, lu in enumerate(lu_items, start=1)}

    lus = {}
    without_target = []
    for key, d in kfn.items():
        annotation_id = int(key)
        lu = d['lu']

        # Reset for every annotation: otherwise an annotation without a TARGET
        # denotation would silently reuse the span of the previous annotation.
        # When several TARGET denotations exist, the last one wins.
        target_span = None
        for deno in d['denotations']:
            if deno['role'] == 'TARGET':
                target_span = deno['span']

        luid = lu_index[lu]
        item = lus.get(luid)
        if item is None:
            # First annotation of this LU: create its record. The frame id is
            # looked up only here because it is stored once per LU.
            parts = lu.split('.')
            frame = d['frame']
            item = {
                'lu': lu,
                'pos': parts[1],
                'lexeme': parts[0],
                'frame': frame,
                'fid': int(get_fnid(frame)),
                'surface_forms': set(),
                'annotation_ids': set(),
            }
            lus[luid] = item

        if target_span is None:
            without_target.append(annotation_id)
        else:
            item['surface_forms'].add(get_surface_form(target_span, d['text']))
        item['annotation_ids'].add(annotation_id)

    # Sets cannot be serialised to JSON and iterate in arbitrary order, so turn
    # them into sorted lists to get the same file on every run.
    for item in lus.values():
        item['surface_forms'] = sorted(item['surface_forms'])
        item['annotation_ids'] = sorted(item['annotation_ids'])

    if without_target:
        print(len(without_target), 'annotations have no TARGET denotation:', without_target)

    # ensure_ascii=False emits the Korean text as-is, so the file encoding must
    # be UTF-8 regardless of the platform default.
    with open(kfn_path+'/KFN_lus.json', 'w', encoding='utf-8') as f:
        json.dump(lus, f, ensure_ascii=False, indent=4)
    print(len(lus), 'lus are saved')
gen_lus()

5661 lus are saved


In [7]:
def gen_web_files():
    """Export the LU data for the web viewer: one JSON file per LU plus an index.

    Reads ``KFN_lus.json`` from ``kfn_path`` (written by ``gen_lus``) and the
    module-level ``kfn`` annotations, then writes into ``web_path``:

    * ``<lu id>.json`` for every LU, holding the LU string, its part of speech,
      frame id and name, and one pattern per annotation. Each pattern carries a
      ``valenceText`` of the form ``'id: <sent_id>'`` and a single example.
    * ``index.json``, the list of all distinct LU strings in ``make_key`` order
      with their part of speech and 1-based id.

    Prints the number of per-LU files written; returns nothing.
    """
    # Read and write explicitly as UTF-8: the files contain Korean text that is
    # dumped with ensure_ascii=False, and the platform default may differ.
    with open(kfn_path+'/KFN_lus.json', 'r', encoding='utf-8') as f:
        lus = json.load(f)

    n = 0
    # luid is a str here because it is a key of the loaded JSON object.
    for luid, d in lus.items():
        fname = web_path+luid+'.json'
        lu = d['lu']

        patterns = []
        for aid in d['annotation_ids']:
            annotation = kfn[str(aid)]
            sent_id = annotation['sent_id']

            example = {}
            example['denotations'] = annotation['denotations']
            example['relations'] = annotation['relations']
            # Only sentence ids containing 'FrameNet' are exported with the
            # example, reduced to the part after the last hyphen.
            if 'FrameNet' in sent_id:
                example['sent_id'] = sent_id.split('-')[-1]
            example['lu'] = lu
            example['text'] = annotation['text']

            pattern = {}
            # valence pattern
            pattern['valenceText'] = 'id: '+sent_id
            pattern['examples'] = [example]
            patterns.append(pattern)

        item = {}
        # origin_lus
        item['en_lus'] = []
        item['ko_lu'] = lu
        item['ko_pos'] = d['pos']
        item['frameID'] = d['fid']
        item['frameName'] = d['frame']
        item['lu_id'] = luid
        item['patterns'] = patterns

        with open(fname, 'w', encoding='utf-8') as f:
            json.dump(item, f, ensure_ascii=False, indent=4)
        n += 1
    print(n, 'files are writed')

    # Rebuild the LU ordering exactly as gen_lus does, so the ids in index.json
    # match the ids used as keys of KFN_lus.json and as file names above.
    lu_items = list(set(d['lu'] for d in kfn.values()))
    lu_items.sort(key=make_key)

    luindex = []
    for position, lu in enumerate(lu_items, start=1):
        entry = {}
        entry['lu'] = lu
        entry['pos'] = lu.split('.')[1]
        entry['id'] = position
        luindex.append(entry)
    with open(web_path+'index.json', 'w', encoding='utf-8') as f:
        json.dump(luindex, f, ensure_ascii=False, indent=4)

gen_web_files()

5661 files are writed
['-ㄹ_것이다.a.Certainty', '가게.n.Businesses', '가게.n.Locale_by_use', '가격.n.Commerce_scenario', '가계.n.Kinship', '가공.n.Manufacturing', '가공.n.Processing_materials', '가공되다.v.Ground_up', '가공하다.v.Grinding', '가까워지다.v.Arriving', '가깝다.a.Locative_relation', '가끔.n.Frequency', '가난.n.Wealthiness', '가능성.n.Capability', '가능성.n.Likelihood', '가능성있다.a.Likelihood', '가능하다.a.Capability', '가능하다.a.Existence', '가능하다.a.Likelihood', '가다.v.Arriving', '가다.v.Causation', '가다.v.Locale', '가다.v.Motion', '가다.v.Motion_directional', '가다.v.Operate_vehicle', '가다.v.Self_motion', '가담하다.v.Participation', '가동.n.Using', '가동되다.v.Becoming', '가동되다.v.Using', '가동하다.v.Using', '가득하다.a.Abounding_with', '가라앉다.v.Motion_directional', '가로.n.Direction', '가로막다.v.Intercepting', '가로지르다.v.Motion', '가로채다.v.Intercepting', '가르다.v.Education_teaching', '가르치다.v.Education_teaching', '가리다.v.Evidence', '가만하다.v.Posture', '가문.n.Kinship', '가볍다.a.Position_on_a_scale', '가사.n.Text', '가스.n.Substance', '가슴.n.Body_parts', '가운데.n.Be_subset_of', '가