In [26]:
import glob
import json
import os

from nltk.corpus import framenet as fn
from pytorch_pretrained_bert.file_utils import cached_path

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [24]:
# Maps each pretrained BERT model name to the URL of its vocabulary file.
# Every URL has the same shape, so the table is derived from the model names.
PRETRAINED_VOCAB_ARCHIVE_MAP = {
    model_name: "https://s3.amazonaws.com/models.huggingface.co/bert/{}-vocab.txt".format(model_name)
    for model_name in (
        'bert-base-uncased',
        'bert-large-uncased',
        'bert-base-cased',
        'bert-large-cased',
        'bert-base-multilingual-uncased',
        'bert-base-multilingual-cased',
        'bert-base-chinese',
    )
}

In [79]:
def _bios_line_to_bio(line):
    """Return ``line`` with a leading ``S-`` on its tag in column 15 changed to ``B-``.

    Lines of three characters or fewer, and lines with fewer than 15
    tab-separated columns, are returned unchanged. Every other line is stripped
    of surrounding whitespace and re-joined with a single trailing newline.
    """
    if len(line) <= 3:
        return line
    cols = line.strip().split('\t')
    if len(cols) <= 14:
        # Not a full token row, so there is no tag column to convert.
        return line
    if cols[14].startswith('S-'):
        # Swap only the prefix; an 'S-' inside the label itself must survive.
        cols[14] = 'B-' + cols[14][2:]
    return '\t'.join(cols) + '\n'


def bios2bio(language):
    """Convert the tag column of the original CoNLL files from BIOS to BIO.

    For every ``*.conll`` file in the language's ``original/`` directory, a
    converted copy with the same file name is written to the parent directory.
    The source files are left untouched.

    Args:
        language: Dataset language code. Only ``'en'`` is supported, which maps
            to ``../data/fn1.7/original/``.

    Raises:
        ValueError: If ``language`` is not a supported code.
    """
    if language == 'en':
        ori_data_path = '../data/fn1.7/original/'
    else:
        raise ValueError('unsupported language: {!r}'.format(language))

    # Derive the output directory from the path instead of substituting
    # '/original/' in each file name: a failed substitution would make the
    # output path equal the input path and overwrite the source file.
    out_dir = os.path.dirname(os.path.normpath(ori_data_path))

    files = glob.glob(ori_data_path+'*.conll')
    for file in files:
        new_file = os.path.join(out_dir, os.path.basename(file))
        print(file,'-->', new_file)
        with open(file, 'r', encoding='utf-8') as f:
            new_lines = [_bios_line_to_bio(line) for line in f]

        with open(new_file, 'w', encoding='utf-8') as f:
            f.writelines(new_lines)

# bios2bio('en')

In [63]:
def gen_idxdata(language):
    """Build the FrameNet 1.7 index vocabularies and write them as JSON files.

    For ``language == 'en'`` three files are written under ``../data/``:

    * ``fn1.7_frame2idx.json``: frame name -> index, in ``fn.frames()`` order.
    * ``fn1.7_lu2idx.json``: lexical-unit name -> index, in ``fn.lus()`` order
      (first occurrence of each name), followed by three manually added LUs.
    * ``fn1.7_fe2idx.json``: ``<pad>``, ``O`` and ``X`` (indices 0-2), followed
      by a ``B-``/``I-`` tag pair for every frame-element name, sorted by name.

    Any other ``language`` is a no-op.

    Args:
        language: Dataset language code; only ``'en'`` is handled.
    """
    if language != 'en':
        return

    frdata = fn.frames()
    ludata = fn.lus()

    # setdefault keeps the index of the first occurrence of a name;
    # len(...) is evaluated before the insertion, so indices stay dense.
    frame2idx = {}
    for fr in frdata:
        frame2idx.setdefault(fr.name, len(frame2idx))
    with open('../data/fn1.7_frame2idx.json', 'w', encoding='utf-8') as f:
        json.dump(frame2idx, f, ensure_ascii=False, indent=4)

    lu2idx = {}
    for l in ludata:
        lu2idx.setdefault(l.name, len(lu2idx))
    # manual lu not in FN but FN1.7 dataset
    # Added only when absent: reassigning an existing LU to len(lu2idx) would
    # not grow the dict, so the next added LU would receive the same index.
    for manual_lu in ('burgeon.v', 'but.c', 'however.adv'):
        lu2idx.setdefault(manual_lu, len(lu2idx))
    with open('../data/fn1.7_lu2idx.json', 'w', encoding='utf-8') as f:
        json.dump(lu2idx, f, ensure_ascii=False, indent=4)

    # Unique frame-element names across all frames, in sorted order.
    fes = sorted({fe_ori for fr in frdata for fe_ori in fr.FE})

    fe2idx = {}
    for special in ('<pad>', 'O', 'X'):
        fe2idx[special] = len(fe2idx)
    for fe in fes:
        for prefix in ('B-', 'I-'):
            fe2idx.setdefault(prefix + fe, len(fe2idx))
    with open('../data/fn1.7_fe2idx.json', 'w', encoding='utf-8') as f:
        json.dump(fe2idx, f, ensure_ascii=False, indent=4)
# Runs when the cell executes: (re)writes fn1.7_frame2idx.json, fn1.7_lu2idx.json
# and fn1.7_fe2idx.json under ../data/.
gen_idxdata('en')

In [64]:
def gen_map_data(language):
    """Build the LU->frame and frame->FE-tag candidate maps and write them as JSON.

    For ``language == 'en'`` the three ``fn1.7_*2idx.json`` files are read from
    ``../data/`` and two files are written there:

    * ``fn1.7_lufrmap.json``: LU index -> list of indices of the frames that LU
      occurs in, plus entries for three manually mapped LUs.
    * ``fn1.7_frargmap.json``: frame index -> ``[0, 1, 2]`` followed by the
      ``B-``/``I-`` tag indices of each of the frame's FEs.

    Any other ``language`` is a no-op.

    Args:
        language: Dataset language code; only ``'en'`` is handled.

    Raises:
        KeyError: If an LU, frame or FE tag is missing from the index files.
    """
    if language != 'en':
        return

    with open('../data/fn1.7_lu2idx.json', 'r', encoding='utf-8') as f:
        lu2idx = json.load(f)
    with open('../data/fn1.7_frame2idx.json', 'r', encoding='utf-8') as f:
        frame2idx = json.load(f)
    with open('../data/fn1.7_fe2idx.json', 'r', encoding='utf-8') as f:
        fe2idx = json.load(f)

    frdata = fn.frames()
    ludata = fn.lus()

    lufrmap = {}
    for l in ludata:
        lu_idx = int(lu2idx[l.name])
        frame_idx = int(frame2idx[l.frame.name])
        # The same LU name can occur in several frames; keep each frame once.
        frame_candis = lufrmap.get(lu_idx, [])
        frame_candis.append(frame_idx)
        lufrmap[lu_idx] = list(set(frame_candis))

    # Manually mapped LUs. Indices are looked up by name rather than hard-coded,
    # so the map stays correct if the index files are regenerated differently.
    manual_lu_frames = (
        ('burgeon.v', 'Progression'),
        ('but.c', 'Concessive'),
        ('however.adv', 'Concessive'),
    )
    for lu, frame in manual_lu_frames:
        lu_idx = int(lu2idx[lu])
        frame_idx = int(frame2idx[frame])
        frame_candis = lufrmap.setdefault(lu_idx, [])
        if frame_idx not in frame_candis:
            frame_candis.append(frame_idx)
    with open('../data/fn1.7_lufrmap.json', 'w', encoding='utf-8') as f:
        json.dump(lufrmap, f, ensure_ascii=False, indent=4)

    frargmap = {}
    for fr in frdata:
        frame_idx = int(frame2idx[fr.name])
        # Every frame's candidate list starts with tag indices 0, 1 and 2.
        fe_candis = frargmap.get(frame_idx, [0, 1, 2])
        for fe in fr.FE:
            fe_candis.append(fe2idx['B-'+fe])
            fe_candis.append(fe2idx['I-'+fe])
        frargmap[frame_idx] = fe_candis
    with open('../data/fn1.7_frargmap.json', 'w', encoding='utf-8') as f:
        json.dump(frargmap, f, ensure_ascii=False, indent=4)
        
# Runs when the cell executes: reads the fn1.7_*2idx.json files and (re)writes
# fn1.7_lufrmap.json and fn1.7_frargmap.json under ../data/.
gen_map_data('en')

In [65]:
def gen_lufr_token(language):
    """Create ``[lu.frame]`` tokens and append them to the multilingual BERT vocab.

    For ``language == 'en'``:

    1. Reads ``fn1.7_lu2idx.json``, ``fn1.7_frame2idx.json`` and
       ``fn1.7_lufrmap.json`` from ``../data/``.
    2. Builds one token per (LU, candidate frame) pair: ``lu + '.' + frame``,
       with spaces replaced by ``_`` and wrapped in square brackets. The list is
       written as JSON to ``../data/fn1.7_lufr_tokens``.
    3. Writes ``../data/fn1.7_lufr_vocab.txt``: the ``bert-base-multilingual-cased``
       vocabulary (resolved through ``cached_path``) followed by one new token
       per line.

    Any other ``language`` is a no-op.

    Args:
        language: Dataset language code; only ``'en'`` is handled.
    """
    if language != 'en':
        return

    with open('../data/fn1.7_lu2idx.json', 'r', encoding='utf-8') as f:
        lu2idx = json.load(f)
    with open('../data/fn1.7_frame2idx.json', 'r', encoding='utf-8') as f:
        frame2idx = json.load(f)
    with open('../data/fn1.7_lufrmap.json', 'r', encoding='utf-8') as f:
        lufrmap = json.load(f)
    idx2frame = {idx: frame for frame, idx in frame2idx.items()}
    idx2lu = {idx: lu for lu, idx in lu2idx.items()}

    lufr_tokens = []
    # JSON object keys are strings, so the LU index is converted back to int.
    for lu_key, fr_candis_idx in lufrmap.items():
        lu = idx2lu[int(lu_key)]
        for fr_idx in fr_candis_idx:
            lufr_token = (lu + '.' + idx2frame[fr_idx]).replace(' ', '_')
            lufr_tokens.append('[' + lufr_token + ']')
    with open('../data/fn1.7_lufr_tokens', 'w', encoding='utf-8') as f:
        json.dump(lufr_tokens, f, ensure_ascii=False, indent=4)

    ori_vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP['bert-base-multilingual-cased']
    ori_vocab_cache_file = cached_path(ori_vocab_file)
    # The multilingual vocab contains non-ASCII entries, so it is read and
    # written as UTF-8 explicitly instead of using the locale default.
    with open(ori_vocab_cache_file, 'r', encoding='utf-8') as f:
        new_vocab = f.readlines()
    if new_vocab and not new_vocab[-1].endswith('\n'):
        # Without this, the first new token would be glued onto the last entry.
        new_vocab[-1] += '\n'
    new_vocab.extend(token + '\n' for token in lufr_tokens)
    with open('../data/fn1.7_lufr_vocab.txt', 'w', encoding='utf-8') as f:
        f.writelines(new_vocab)
            
# Runs when the cell executes: (re)writes ../data/fn1.7_lufr_tokens and
# ../data/fn1.7_lufr_vocab.txt; the BERT vocab file is resolved through cached_path.
gen_lufr_token('en')