In [11]:
import json
import preprocessor
import re
from koreanframenet import kfn

# loading DATA
def load_data():
    """Return the training split produced by ``preprocessor.load_data()``.

    ``preprocessor.load_data()`` is unpacked into three values; only the
    first one (the training data) is returned. The other two (test data and
    the FE-identification training data) are discarded here.
    """
    splits = preprocessor.load_data()
    training, _test, _training_fe = splits
    # To include the test split as well, return ``training + _test`` instead.
    return training

# Module-level corpus: the training split returned by load_data().
# Read by dummy() and genData() below.
koreanFN = load_data()

### loading data now...
# training_data
 - number of full-sentences: 14811
 - number of sentences: 33311 

# test_data
 - number of full-sentences: 1863
 - number of sentences: 2793 

# training_fe (for FE identification)
 - number of full-sentences: 4248
 - number of sentences: 14462 



In [14]:
def dummy():
    """Print every token row of the first sentence in ``koreanFN``."""
    first_sentence = koreanFN[0]
    for token_row in first_sentence:
        print(token_row)


dummy()

['0', '이란은', '이란/NNG+은/JX', '이란은', 'NNG+JX', 'NNG+JX', '_', '_', '8', '8', 'NP_SBJ', 'NP_SBJ', '_', '_', 'S_speaker']
['1', '화학', '화학/NNG', '화학', 'NNG', 'NNG', '_', '_', '2', '2', 'NP', 'NP', '_', '_', 'B_topic']
['2', '시설과', '시설/NNG+과/JC', '시설과', 'NNG+JC', 'NNG+JC', '_', '_', '8', '8', 'NP_CNJ', 'NP_CNJ', '_', '_', 'I_topic']
['3', '과거', '과거/NNG', '과거', 'NNG', 'NNG', '_', '_', '8', '8', 'NP_AJT', 'NP_AJT', '_', '_', 'I_topic']
['4', '화학무기', '화학무기/NNG', '화학무기', 'NNG', 'NNG', '_', '_', '5', '5', 'NP', 'NP', '_', '_', 'I_topic']
['5', '비축에', '비축/NNG+에/JKB', '비축에', 'NNG+JKB', 'NNG+JKB', '_', '_', '6', '6', 'NP_AJT', 'NP_AJT', '_', '_', 'I_topic']
['6', '관한', '관하/VV+ㄴ/ETM', '관한', 'VV+ETM', 'VV+ETM', '_', '_', '7', '7', 'VP_MOD', 'VP_MOD', '_', '_', 'I_topic']
['7', '선언서를', '선언서/NNG+를/JKO', '선언서를', 'NNG+JKO', 'NNG+JKO', '_', '_', '8', '8', 'NP_OBJ', 'NP_OBJ', '선언서.n', 'Statement', 'O']
['8', '제출하였는데,', '제출하/VV+었/EP+는데/EC+,/SP', '제출하였는데,', 'VV+EP+EC+SP', 'VV+EP+EC+SP', '_', '_', '18', '18', 

In [15]:
# for a given sentence list, it identifies 'target' and 'its frame'
def get_target(sent_list):
    """Extract the target surface string and its frame from one sentence.

    Args:
        sent_list: list of token rows; column 12 is compared against ``'_'``
            to decide whether the row belongs to the target, column 1 is
            used as the surface form and column 13 as the frame.

    Returns:
        ``(target, frame)``. ``target`` is the space-joined surface forms of
        all rows whose column 12 is not ``'_'``; ``frame`` is column 13 of
        the last such row, or the string ``'None'`` when there is none.
        When the target is longer than one character and ends in one of
        ``, . ! ?``, every occurrence of those characters is removed.
    """
    marked_rows = [row for row in sent_list if row[12] != '_']
    frame = marked_rows[-1][13] if marked_rows else 'None'
    target = ' '.join(row[1] for row in marked_rows)
    # Punctuation is stripped only when it trails the target, but the
    # substitution then removes it everywhere in the string.
    if len(target) > 1 and target[-1] in ',.!?':
        target = re.sub('[,.?!]', '', target)
    return target, frame


In [16]:
# for a given sentence list, it identifies an ID of 'LU'
def get_lu_id(sent_list):
    """Look up the LU id for the target of one sentence.

    The target string and frame come from ``get_target(sent_list)``; both are
    printed and then passed to ``kfn.surface_to_lu_id``, whose result is
    returned unchanged.
    """
    surface, frame_name = get_target(sent_list)
    print('target:', surface, 'frame:', frame_name)
    return kfn.surface_to_lu_id(surface, frame_name)

In [17]:
# Build the valence patterns for a sentence given as a list of CoNLL-format token rows
def getValencePattern(sent_list):
    """Build one valence unit per frame element (FE) found in a sentence.

    Args:
        sent_list: list of token rows. Column 14 holds the FE tag: ``'O'``
            for "no FE", otherwise ``<prefix>_<FE name>`` (e.g. ``'B_topic'``).
            Columns 1, 2, 4 and 9 are read for each FE token.

    Returns:
        A list of dicts, one per distinct FE, in order of first appearance in
        the sentence. Each dict has the keys ``FE``, ``PT``, ``suffix``,
        ``suffix_pos``, ``pos_sequence`` and ``pt_sequence``. ``PT``,
        ``suffix`` and ``suffix_pos`` are taken from the last token of the FE.

    Raises:
        IndexError: if a non-``'O'`` tag contains no underscore.
    """
    # Collect the distinct FE names in order of first appearance so the
    # result is deterministic (a set would give an arbitrary order).
    fes = []
    for token in sent_list:
        tag = token[14]
        if tag != 'O':
            # Split on the first underscore only: FE names may themselves
            # contain underscores (e.g. 'B_Place_of_employment').
            fe = tag.split('_', 1)[1]
            if fe not in fes:
                fes.append(fe)

    result = []
    for fe in fes:
        pos_seq = []
        pt_seq = []
        for token in sent_list:
            tag = token[14]
            fe_in_sent = tag.split('_', 1)[1] if tag != 'O' else tag
            if fe != fe_in_sent:
                continue
            # Accumulate the pattern for this argument.
            pos = token[4]
            # NOTE(review): column 9 looks like the head index in the sample
            # rows rather than a phrase-type label -- confirm the intended column.
            pt = token[9]
            pos_seq.append(pos)
            pt_seq.append(pt)
            if 'J' in pos:
                # Last morpheme and its POS tag (a particle in the sample rows).
                suffix = token[2].split('+')[-1]
                suffix_pos = pos.split('+')[-1]
            elif 'NN' in pos:
                suffix = ''
                suffix_pos = ''
            else:
                suffix = token[1] + '/' + pos
                suffix_pos = pos
        # pt / suffix / suffix_pos now hold the values of the FE's last token;
        # every FE in ``fes`` matches at least one token, so they are always set.
        valenceUnit = {}
        valenceUnit['FE'] = fe
        valenceUnit['PT'] = pt
        valenceUnit['suffix'] = suffix
        valenceUnit['suffix_pos'] = suffix_pos
        valenceUnit['pos_sequence'] = pos_seq
        valenceUnit['pt_sequence'] = pt_seq
        result.append(valenceUnit)
    return result

In [18]:
def genData(out_path='./valencePattern_0702.json'):
    """Collect valence patterns per LU from ``koreanFN`` and dump them as JSON.

    For every sentence in ``koreanFN`` the LU id is resolved with
    ``get_lu_id``, the LU record is fetched with ``kfn.lu`` and the sentence's
    valence patterns are built with ``getValencePattern``. Sentences that
    resolve to an LU id already seen have their patterns appended to the
    existing entry, so the output holds one entry per LU id.

    Args:
        out_path: destination of the JSON file. Defaults to the path the
            notebook has always written to.

    Side effects:
        Prints the LU name and the current entry for every sentence, and
        writes ``out_path``.
    """
    result = []
    # Maps a stored entry's 'lu_id' to that entry, replacing a linear scan of
    # ``result`` for every sentence.
    entry_by_lu_id = {}
    # koreanFN = training data in CoNLL format
    for sent_list in koreanFN:
        lu_id = get_lu_id(sent_list)
        lu = kfn.lu(lu_id)
        vp = getValencePattern(sent_list)

        each_lu = entry_by_lu_id.get(lu_id)
        if each_lu is None:
            # First sentence for this LU: create its entry.
            each_lu = {
                'lu_id': lu['lu_id'],
                'lu': lu['lu'],
                'surface_forms': lu['surface_forms'],
                'lexeme': lu['lexeme'],
                'valencePatterns': vp,
            }
            result.append(each_lu)
            # setdefault keeps the earliest entry for an id, matching the
            # first-match semantics of scanning ``result`` front to back.
            entry_by_lu_id.setdefault(lu['lu_id'], each_lu)
        else:
            # LU already present: merge instead of adding a duplicate entry.
            each_lu['valencePatterns'] = each_lu['valencePatterns'] + vp

        print(lu['lu'])
        print(each_lu)

    # Save the valence patterns to file.
    # NOTE(review): the file is opened with the platform default encoding while
    # ensure_ascii=False emits raw non-ASCII text; consider passing
    # encoding='utf-8' here and in test_data() together.
    with open(out_path, 'w') as f:
        json.dump(result, f, ensure_ascii=False, indent=4)
#genData()

In [21]:
def test_data():
    """Print a quick summary of the saved valence-pattern file.

    Loads ``./valencePattern_0702.json`` and prints, in order: the first
    entry (if there is one), the number of entries, and the number of entries
    whose ``'valencePatterns'`` list is non-empty.
    """
    with open('./valencePattern_0702.json','r') as f:
        entries = json.load(f)

    # Show the first entry as a sample; print nothing when the file is empty.
    nothing = object()
    head = next(iter(entries), nothing)
    if head is not nothing:
        print(head)

    print(len(entries))
    with_patterns = sum(1 for entry in entries if len(entry['valencePatterns']) > 0)
    print(with_patterns)


test_data()

{'lexeme': '선언서', 'lu_id': 1, 'surface_forms': ['선언서를'], 'lu': '선언서.n.Statement', 'valencePatterns': [{'pt_sequence': ['8'], 'FE': 'speaker', 'pos_sequence': ['NNG+JX'], 'PT': '8', 'suffix': '은/JX', 'suffix_pos': 'JX'}, {'pt_sequence': ['2', '8', '8', '5', '6', '7'], 'FE': 'topic', 'pos_sequence': ['NNG', 'NNG+JC', 'NNG', 'NNG', 'NNG+JKB', 'VV+ETM'], 'PT': '7', 'suffix': '관한/VV+ETM', 'suffix_pos': 'VV+ETM'}]}
10359
3210
