In [1]:
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet,stopwords
from nltk.stem import WordNetLemmatizer
import thulac
import pickle
import re
import jieba
from tqdm import tqdm

In [None]:
# File paths
babel_data_file = '../data/babel_data_full.txt'    # raw BabelNet data
babel_sememe_file = '../data/synset_sememes.txt'   # raw BabelSememe data
data = '../data/babel_data'                        # dictionary built from the raw BabelNet data
# NOTE: the name `clean_data` is rebound to the cleaned dictionary by the
# `clean_data = get_clean_data(babel_data)` cell further down, and the pickle
# cells below use literal paths rather than `data` / `clean_data`.
clean_data = '../data/data_clean'                  # cleaned BabelNet data

In [None]:
# Utils
# Shared lemmatizer instance used by clean_word_list.
wnl = WordNetLemmatizer()
# The (misspelt) name is kept because clean_word_list refers to it.
englist_stop_words = set(stopwords.words('english'))
# Each pattern matches the characters that must be DELETED from a token:
#   'zh' -> everything outside the range U+4E00..U+9FA5
#   'en' -> everything that is neither an ASCII letter nor whitespace
# The English pattern is a raw string with a single leading '^': inside a
# character class only the first '^' negates, any further '^' is a literal
# caret and would have been preserved by the cleaning step.
pattern = {
    'zh': re.compile(r'[^\u4e00-\u9fa5]'),
    'en': re.compile(r'[^a-zA-Z\s]'),
}
# One stop word per line. The file is closed deterministically, only the line
# terminator is stripped (so a final line without a newline is not truncated),
# and a set gives constant-time membership tests in clean_word_list.
# NOTE(review): the file is still opened with the platform default encoding;
# pass encoding=... explicitly if it is not stored in the locale encoding.
with open('../data/Chinese_stop_words') as stop_words_file:
    chinese_stop_words = {line.rstrip('\n') for line in stop_words_file}

In [None]:
# Read source file, get babel_data
def read_list(line):
    """Parse one count-prefixed, tab-separated line into a list of items.

    The expected layout is ``<count>\\t<item_1>\\t...\\t<item_count>`` followed
    by an optional newline.

    Args:
        line: A single line of text as returned by ``readline()``.

    Returns:
        The list of items that follow the leading count (possibly empty).

    Raises:
        ValueError: If the first field is not an integer.
        AssertionError: If the declared count does not match the number of
            items actually present on the line.
    """
    # Strip only the line terminator. Slicing off the last character
    # unconditionally would eat a real character when the final line of the
    # file has no trailing newline.
    fields = line.rstrip('\n').split('\t')
    declared = int(fields[0])
    items = fields[1:]
    assert declared == len(items), (
        'declared %d items but found %d' % (declared, len(items)))
    return items

def read_synset(f):
    """Read the next synset record (twelve consecutive lines) from ``f``.

    Line order: ``id``; the lists ``w_e``, ``w_c``, ``w_f``; the plain strings
    ``d_e_m``, ``d_c_m``, ``d_f_m``; the lists ``d_e``, ``d_c``, ``d_f``; the
    plain string ``i_m``; the list ``i``. List lines are decoded by
    ``read_list``; plain-string lines simply lose their last character (the
    newline).

    Args:
        f: A text stream positioned at the start of a record.

    Returns:
        A dict keyed by the field names above, or ``None`` when the stream is
        already exhausted.
    """
    header = f.readline()
    if not header:
        # readline() returns '' only at end of file.
        return None

    record = {'id': header[:-1]}
    # (field names, whether the line is a count-prefixed list) in file order.
    layout = (
        (('w_e', 'w_c', 'w_f'), True),
        (('d_e_m', 'd_c_m', 'd_f_m'), False),
        (('d_e', 'd_c', 'd_f'), True),
        (('i_m',), False),
        (('i',), True),
    )
    for field_names, is_list in layout:
        for field in field_names:
            raw_line = f.readline()
            record[field] = read_list(raw_line) if is_list else raw_line[:-1]
    return record

def read_babel_data(f):
    """Read every synset record from ``f`` into a dict keyed by synset id.

    Args:
        f: A text stream containing consecutive synset records.

    Returns:
        A dict mapping each record's ``'id'`` to the record returned by
        ``read_synset``. A later record with the same id replaces an earlier
        one.
    """
    synsets = {}
    record = read_synset(f)
    # read_synset yields a falsy value once the stream is exhausted.
    while record:
        synsets[record['id']] = record
        record = read_synset(f)
    return synsets
    
def read_babel_sememe(f):
    """Parse the sememe annotation file into ``{synset_id: [sememe, ...]}``.

    Every non-blank line holds whitespace-separated fields: the first is the
    synset id, the rest are its sememes.

    Args:
        f: An iterable of text lines (e.g. an open text file).

    Returns:
        A dict mapping each synset id to its list of sememes (possibly
        empty). A later line with the same id replaces an earlier one.
    """
    babel_sememe = {}
    for line in f:
        # str.split() with no argument already discards the trailing newline,
        # so no slicing is needed; slicing would truncate the last sememe of
        # a final line that has no line terminator.
        fields = line.split()
        if not fields:
            # Blank line (e.g. at the end of the file): nothing to record.
            continue
        babel_sememe[fields[0]] = fields[1:]
    return babel_sememe

def get_babel_data():
    """Load the BabelNet records and attach each synset's sememe list.

    Reads ``babel_data_file`` with ``read_babel_data`` and
    ``babel_sememe_file`` with ``read_babel_sememe``, then stores the sememe
    list of every synset under the key ``'s'`` of its record.

    Returns:
        The dict produced by ``read_babel_data``, with an extra ``'s'`` entry
        in every record.

    Raises:
        KeyError: If a synset id from the data file has no entry in the
            sememe file.
    """
    # Context managers close both input files as soon as they are parsed.
    # NOTE(review): both files are opened with the platform default encoding,
    # as before; pass encoding=... if they are not in the locale encoding.
    with open(babel_data_file) as data_stream:
        babel_data = read_babel_data(data_stream)
    with open(babel_sememe_file) as sememe_stream:
        babel_sememe = read_babel_sememe(sememe_stream)

    for k in tqdm(babel_data):
        babel_data[k]['s'] = babel_sememe[k]
    return babel_data


In [None]:
# Parse both source files; every record gets its sememe list under key 's'.
babel_data = get_babel_data()

In [None]:
# Display the first parsed record as a quick sanity check.
list(babel_data.values())[0]

In [None]:
def clean_word_list(word_list, lang='en'):
    """Normalise a list of tokens and drop stop words and empty tokens.

    English (``lang='en'``): lower-case the token, delete every character
    matched by ``pattern['en']``, trim surrounding whitespace, discard it if
    it is empty or a stop word, lemmatise it with ``wnl`` and discard it if
    the lemma is a stop word.

    Chinese (``lang='zh'``): delete every character matched by
    ``pattern['zh']`` and discard the token if it is empty or a stop word.

    Args:
        word_list: Iterable of string tokens.
        lang: ``'en'`` or ``'zh'``. Any other value yields an empty list.

    Returns:
        The cleaned tokens, in their original order.
    """
    res = []
    if lang == 'en':
        for w in word_list:
            # Strip unwanted characters BEFORE lemmatising: a token with
            # punctuation attached (e.g. "dogs,") is not recognised by the
            # lemmatiser and would otherwise keep its inflected form.
            w = re.sub(pattern['en'], '', w.lower()).strip()
            # Tokens made only of punctuation/digits collapse to ''. Keeping
            # them would make a cleaned list look non-empty to the
            # len(...) > 0 filters applied later in this notebook.
            # Stop words are checked on the surface form first because
            # lemmatisation can rewrite them (e.g. 'was') into strings that
            # are no longer in the stop list.
            if not w or w in englist_stop_words:
                continue
            w = wnl.lemmatize(w)
            if w and w not in englist_stop_words:
                res.append(w)
    elif lang == 'zh':
        for w in word_list:
            w = re.sub(pattern['zh'], '', w)
            # Drop tokens with no Chinese characters left, and stop words.
            if w and w not in chinese_stop_words:
                res.append(w)
    return res

def split_sentence(sentence, lang='zh'):
    """Split a sentence into tokens.

    Args:
        sentence: The text to tokenise.
        lang: ``'zh'`` segments with ``jieba.lcut``; any other value splits
            on single space characters.

    Returns:
        A list of tokens. For the space-split branch empty tokens are
        removed, so an empty sentence yields ``[]``.
    """
    if lang == 'zh':
        return jieba.lcut(sentence)
    # ''.split(' ') is [''] and consecutive spaces produce '' tokens. Those
    # would make an empty definition look non-empty to the len(...) > 0
    # checks applied to the split result later in this notebook.
    return [token for token in sentence.split(' ') if token]
            
def get_clean_data(babel_data):
    """Build the cleaned counterpart of every record in ``babel_data``.

    English and Chinese fields go through ``clean_word_list`` (definition
    strings are tokenised with ``split_sentence`` first). French word lists
    are copied unchanged and French definitions are only tokenised. The
    fields ``'i_m'``, ``'i'`` and ``'s'`` are copied as they are.

    Args:
        babel_data: Dict mapping synset id to a raw record.

    Returns:
        A new dict mapping the same ids to cleaned records.
    """
    def tokenize_and_clean(text, lang):
        # Tokenise one definition string, then normalise its tokens.
        return clean_word_list(split_sentence(text, lang=lang), lang=lang)

    cleaned = {}
    for synset_id in tqdm(babel_data.keys()):
        raw = babel_data[synset_id]
        entry = {
            'w_e': clean_word_list(raw['w_e'], lang='en'),
            'w_c': clean_word_list(raw['w_c'], lang='zh'),
            'w_f': raw['w_f'],
            'd_e_m': tokenize_and_clean(raw['d_e_m'], 'en'),
            'd_c_m': tokenize_and_clean(raw['d_c_m'], 'zh'),
            'd_f_m': split_sentence(raw['d_f_m'], lang='fr'),
            'd_e': [tokenize_and_clean(text, 'en') for text in raw['d_e']],
            'd_c': [tokenize_and_clean(text, 'zh') for text in raw['d_c']],
            'd_f': [split_sentence(text, lang='fr') for text in raw['d_f']],
        }
        # Fields carried over without any processing.
        for passthrough in ('i_m', 'i', 's'):
            entry[passthrough] = raw[passthrough]
        cleaned[synset_id] = entry
    return cleaned

In [None]:
# Clean every record. NOTE: this rebinds `clean_data`, which held a path
# string from the file-paths cell, to the cleaned dictionary.
clean_data = get_clean_data(babel_data)

In [None]:
# Display the first cleaned record as a quick sanity check.
list(clean_data.values())[0]

In [None]:
# Persist the cleaned dictionary. The context manager flushes and closes the
# file deterministically instead of leaving that to garbage collection.
with open('../data/clean_data', 'wb') as clean_data_file:
    pickle.dump(clean_data, clean_data_file)

In [None]:
# Reload the raw dictionary from its pickle, replacing the in-memory
# `babel_data`. The context manager closes the file once it has been read.
# NOTE(review): no cell shown in this notebook writes '../data/babel_data',
# so it must already exist on disk - confirm where it is produced.
# NOTE(review): pickle.load can execute arbitrary code; only load files from
# a trusted source.
with open('../data/babel_data', 'rb') as babel_data_pickle:
    babel_data = pickle.load(babel_data_pickle)
print(len(babel_data))

In [None]:
# Display the first cleaned record again (same expression as the earlier
# preview cell).
list(clean_data.values())[0]

In [None]:
# Keep the ids of synsets that have words in all three languages and, for
# each language, either a non-empty main definition or a non-empty list of
# definitions. Ids stay in the iteration order of `clean_data`.
triple_lang_data = [
    synset_id
    for synset_id, entry in clean_data.items()
    if all(len(entry[field]) > 0 for field in ('w_e', 'w_c', 'w_f'))
    and any(len(entry[field]) > 0 for field in ('d_e_m', 'd_e'))
    and any(len(entry[field]) > 0 for field in ('d_c_m', 'd_c'))
    and any(len(entry[field]) > 0 for field in ('d_f_m', 'd_f'))
]

In [None]:
# Number of synset ids that passed the tri-lingual filter.
len(triple_lang_data)

In [None]:
# Positional train/valid/test split: the ids are sliced in their existing
# order, with no shuffling.
# NOTE(review): the cut points are hard-coded, presumably chosen for one
# particular size of `triple_lang_data` - confirm they still match.
train_end, valid_end = 6809, 7660
triple_lang_dataset = {
    'train': triple_lang_data[:train_end],
    'valid': triple_lang_data[train_end:valid_end],
    'test': triple_lang_data[valid_end:],
}
for split_name in ('train', 'valid', 'test'):
    print(len(triple_lang_dataset[split_name]))

In [None]:
# Persist the split. The context manager flushes and closes the file
# deterministically instead of leaving that to garbage collection.
with open('../data_set/triple_lang_data', 'wb') as dataset_file:
    pickle.dump(triple_lang_dataset, dataset_file)