# Introduction

This is a simple tool (beta) to convert your own datasets to ConKADI-format datasets.

This tool includes two notebooks; this is the first one, which generates middle-format datasets.

In [2]:
# Your dataset folders
source_folder = 'dataset/base/'
target_folder = 'dataset/ccm/' 

# Knowlege Fact File, you can reuse the released 'naf_fact_vocab.txt'.
# All facts are directional; hence if you want use other KBs, please follow the format:
# Five items per line: 0:entity_in_post, 1:entity_in_response, 2 head_entity, 3 relation, 4 tail_entity,
# where 0=2&1=3 or 0=3&1=2, entity_in_post/response is only used to match the post/response.
# the first line should be '#N #N #NH #NF #NT'
fact_vocab_path = 'concept/fact_vocab.txt'

# Your entity/relation vocabs, 
# you can reuse the released 'entity/relation_vocab.txt' if you have removed all special tokens, i.e, #XXX.

entity_vocab_path = 'concept/entity.txt'
relation_vocab_path = 'concept/relation.txt'
file_map = {
      'train': 'trainset.txt',
    'test': 'testset.txt',
    'dev': 'validset.txt',
}
valid_vocab_start = 0

with open(target_folder+'entity.txt','w+',encoding='utf-8') as fout:
    with open(entity_vocab_path,'r+',encoding='utf-8') as fin:
        fout.write(''.join(fin.readlines()))
    

with open(target_folder+'relation.txt','w+',encoding='utf-8') as fout:
    with open(relation_vocab_path,'r+',encoding='utf-8') as fin:
        fout.write(''.join(fin.readlines()))
    
    
with open(target_folder+'fact_vocab.txt','w+',encoding='utf-8') as fout:
    with open(fact_vocab_path,'r+',encoding='utf-8') as fin:
        fout.write(''.join(fin.readlines()))

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/ccm/entity.txt'

In [2]:
# 构建索引
from collections import defaultdict

head_fact_set = defaultdict(set)
tail_fact_set = defaultdict(set)

fact_vocabs = [x.strip('\r\n').split() for x in open(fact_vocab_path).readlines()]

for fid, fact in enumerate(fact_vocabs):
    head_entity = fact[0]
    tail_entity = fact[1]
    head_fact_set[head_entity].add(fid)
    tail_fact_set[tail_entity].add(fid)

In [3]:

entity_list = []
entity_to_id = dict()
with open(entity_vocab_path,'r',encoding='utf-8') as fin:
    for line in fin.readlines():
        items = line.split()
        entity_list.append(items[0])
        entity_to_id[items[0]] = len(entity_to_id)

In [4]:

def match_to_kb(post, response):
    post_fid_set = set()
    response_fid_set = set()
    
    for word in post:
        for fid in head_fact_set[word]:
            post_fid_set.add(fid)
    
    for word in response:
        for fid in tail_fact_set[word]:
            response_fid_set.add(fid)
    
    return post_fid_set, post_fid_set & response_fid_set

def match_related(post):
    post_fid_set = set()
    
    for word in post:
        for fid in head_fact_set[word]:
            post_fid_set.add(fid)
    
    return post_fid_set


for dataset in ['train', 'dev', 'test']:
    posts = [x.strip('\r\n').split() for x in open('%s%s.%s' % (source_folder, dataset, 'src')).readlines()]
    responses = [x.strip('\r\n').split() for x in open('%s%s.%s' % (source_folder, dataset, 'tgt')).readlines()]
    with open(target_folder+dataset+'.matched','w+',encoding='utf-8') as fmatched:
        with open(target_folder+dataset+'.related','w+',encoding='utf-8') as frelated:
            for i in range(len(posts)):
                related_fact_ids, matched_fact_ids  = match_to_kb(posts[i], responses[i])
                fmatched.write('%s\n' % ' '.join([str(x) for x in list(matched_fact_ids)]))
                frelated.write('%s\n' % ' '.join([str(x) for x in list(related_fact_ids)]))
    

In [3]:
from multiprocessing.pool import Pool
import random
import json
from collections import defaultdict

MAX_FACT = 50
MAX_GOLDEN = 0

def match_job(bulk_id, posts, responses, matched, related):
    
    print('job%d has started, 1st word: %s' % (bulk_id, posts[0][0]))
    
    random.seed(MAX_FACT)

    cache = []
    for post, response, matched, related in zip(posts, responses, matched, related):
        data = dict()
        
        data['post'] = post
        data['response'] = response
        
        all_facts = matched + related
        selected_triplets = random.sample(all_facts, min(len(all_facts), MAX_FACT))
        
        matched_set = set(matched)
        related_set = set(related)
        
        matched_triplets = []
        related_triplets = []
        
        for triplet in selected_triplets:
            if triplet in matched_set:
                matched_triplets.append(triplet)
            else:
                related_triplets.append(triplet)
                
        if len(matched_triplets) == 0 and len(matched_set) > 0:
            related_triplets = related_triplets[1:]
            matched_triplets = random.sample(matched, 1)
                
        data['match_triples'] = matched_triplets
            
        used_triplets = matched_triplets + related_triplets
        assert len(used_triplets) <= MAX_FACT 
        
    
        head_index = defaultdict(list)
        tail_index = defaultdict(list)
        for fid in used_triplets:
            fact =  fact_vocabs[fid]
            head_entity = fact[0]
            tail_entity = fact[1]
            head_index[head_entity].append(fid)
            tail_index[tail_entity].append(fid)
        matched_triplets = set(matched_triplets)
        matched_triplets_dict = defaultdict(list)
        matched_index_map = defaultdict(list)
        
        post_triples = []
        all_triples = []
        all_entities = []
        
        all_golden_triples = []
        all_golden_entities = []
        count = 0
        for word in post:
            if word in head_index:
                count += 1
                post_triples.append(count)
                tmp_triplets = []
                tmp_entities = []
                golden_triplets = []
                golden_entities = []
                for fid in head_index[word]:
                    if fid in matched_triplets :
                        item  = fact_vocabs[fid]
                        source_entity = item[0]
                        target_entity = item[1]
                        assert word == source_entity
                        matched_index_map[fid].append((count, len(golden_triplets)))
                        matched_triplets_dict[target_entity].append(fid)
                        golden_triplets.append(fid)
                        golden_entities.append(entity_to_id[item[1]])
                    else:
                        item  = fact_vocabs[fid]
                        tmp_triplets.append(fid)
                        tmp_entities.append(entity_to_id[item[1]])
                
                        
                    
                all_triples.append(tmp_triplets)
                all_entities.append(tmp_entities)
        
                all_golden_triples.append(golden_triplets)
                all_golden_entities.append(golden_entities)
                assert len(tmp_triplets) == len(set(tmp_triplets))
            else:
                post_triples.append(0)
                
        for i in range(len(all_triples)):
            all_triples[i] = all_golden_triples[i] +  all_triples[i]
            all_entities[i] = all_golden_entities[i] +  all_entities[i]
        
        data['post_triples'] = post_triples
        data['all_triples'] =  all_triples
        data['all_entities'] =  all_entities
        
        # Response Triplets
        response_triplets = []
        match_index = []
        for word in response:
            if word in matched_triplets_dict:
                matched_triplet = random.sample(matched_triplets_dict[word],1)[0]
                response_triplets.append(matched_triplet)
                if len(matched_index_map[matched_triplet]) > 0: # 可能有被抛弃的Golden Entity
                    tmp = random.sample(matched_index_map[matched_triplet],1)[0]
                    match_index.append([tmp[0],tmp[1]])
                    assert entity_list[all_entities[tmp[0]-1][tmp[1]]] == word, '%s %s' % (entity_list[all_entities[tmp[0]-1][tmp[1]]], word)
                else:
                    response_triplets.append(-1)
                    match_index.append([-1,-1])
            else:
                response_triplets.append(-1)
                match_index.append([-1,-1])
        data['response_triples'] = response_triplets
        data['match_index'] = match_index
        cache.append(data)
        
        assert len(data['response_triples'])  == len(data['match_index'])
        assert len(data['response'])  == len(data['match_index'])
#     print(' Cache Count and Line Number %d %d' % (len(cache), len(posts)))
    assert len(posts) == len(cache) , '%d %d' % (len(cache), len(posts))
    print('job%d has done, 1st word: %s' % (bulk_id, posts[0][0]))
    return cache


for file in file_map.keys():
    with open(source_folder + file + '.src', 'r+', encoding='utf-8') as fin:
        posts = [x.strip('\r\n').split() for x in fin.readlines()]
#         print(posts[0:10])
    print('Loaded-1')
    with open(source_folder + file + '.tgt', 'r+', encoding='utf-8') as fin:
        responses = [x.strip('\r\n').split() for x in fin.readlines()]
#         print(responses[0:10])
    print('Loaded-2')
    with open(target_folder + file + '.matched', 'r+', encoding='utf-8') as fin:
        matched = [x.strip('\r\n').split() for x in fin.readlines()]
        matched = [[int(x) for x in y] for y in matched]
#          print(matched[0:10])
    print('Loaded-3')
    with open(target_folder + file + '.related', 'r+', encoding='utf-8') as fin:
        related = [x.strip('\r\n').split() for x in fin.readlines()]
        related = [[int(x) for x in y] for y in related]
#          print(related[0:10])
    print('Loaded-4')
    
    line_number = len(posts)
    num_thread = 3
    bulk_num = line_number // num_thread + 1
    pool = Pool(num_thread)
    jobs = []
    cache = []
    for i in range(0, line_number, bulk_num):
#         match_job(i, posts[i:i+bulk_num],responses[i:i+bulk_num],matched[i:i+bulk_num],related[i:i+bulk_num])
        job = pool.apply_async(match_job, (i, posts[i:i+bulk_num],responses[i:i+bulk_num],matched[i:i+bulk_num],related[i:i+bulk_num]))
        jobs.append(job)
    pool.close()
    pool.join()
    
    del posts
    del responses
    del matched
    del related
    
    with open(target_folder+file_map[file], 'w+', encoding='utf-8') as fout:
        for job in jobs:
            for data in job.get():
                line = json.dumps(data, ensure_ascii=False)
                fout.write(line+'\n')
    print('Done',file)


        

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/base/train.src'

In [4]:

csk_triples = []
csk_entities = []
raw_vocab = dict()
kb_dict = defaultdict(list)

with open(fact_vocab_path,'r',encoding='utf-8') as fin:
    for i,line in enumerate(fin.readlines()):
        items = line.strip('\r\n').split(' ')
        fact = '%s, %s, %s' % (items[2], items[3], items[4])
        csk_triples.append(fact)
        kb_dict[items[0]].append(fact)
        kb_dict[items[2]].append(fact)
        
print(csk_triples[0:10])

with open(entity_vocab_path,'r',encoding='utf-8') as fin:
        for line in fin.readlines():
            items = line.split()
            csk_entities.append(items[0])
print(csk_entities[0:10])

with open(source_folder+'vocab.txt', 'r+') as fin:
    vocabs = [x.strip('\r\n') for x in fin.readlines()]
    for item in vocabs[valid_vocab_start:]:
        raw_vocab[item] = 9999
    print(len(vocabs))
print(kb_dict['0分'])

data = {
    'csk_triples':csk_triples,
    'csk_entities':csk_entities,
    'vocab_dict':raw_vocab,
    'dict_csk':kb_dict,
}

json.dump(data, open(target_folder+'resource.txt', 'w+', encoding='utf-8'), ensure_ascii=False)

FileNotFoundError: [Errno 2] No such file or directory: 'concept/fact_vocab.txt'