In [None]:
import os
import numpy as np
import json
from scipy import sparse

In [None]:
# Run configuration. The three values below are combined into the name of
# the result directory that the following cells read from.
dataset = 'toy100'
dim = 100  # presumably the embedding/feature dimension of the run -- confirm
random_seed = 9999744

# Results are looked up under ../result/<config>.
config = 'data_{}-dim_{}-rand_{}'.format(dataset, dim, random_seed)
save_dir = '../result/' + config

In [None]:
# Load the saved artifacts of the configured run from save_dir.
term_doc_sparse = sparse.load_npz(os.path.join(save_dir, 'termDocSparse.npz'))
term_array = np.load(os.path.join(save_dir, 'termArray.npy'))
doc_array = np.load(os.path.join(save_dir, 'docArray.npy'))

# The JSON file carries the raw text side: a 'terms' entry and a 'docs' entry.
with open(os.path.join(save_dir, 'term-doc.json'), mode='r', encoding='utf8') as f:
    term_doc_text = json.load(f)
terms = term_doc_text['terms']
docs = term_doc_text['docs']

# The text file holds a single number, parsed as a float.
with open(os.path.join(save_dir, 'frobenius-norm-approx-raw.txt'), mode='r', encoding='utf8') as f:
    frob_norm = float(f.read())


In [None]:
# Sanity-check summary of everything that was just loaded.
print(f'term_doc_sparse.shape: {term_doc_sparse.shape}')
print(f'term_array.shape {term_array.shape}')
print(f'doc_array.shape {doc_array.shape}')
print(f'#term_text: {len(terms)}')
print(f'#doc_text: {len(docs)}')
# The second axis of doc_array is reported as the feature dimension.
print(f'feature_dim: {doc_array.shape[1]}')
print(f'frobenius norm: {frob_norm}')

In [None]:
# Probe vocabulary: a pair listed as synonyms, plus two other words.
synonyms = ['中华民族', '中国人']
other_words = ['依法', '音乐']
# Combined list, synonyms first.
words = [*synonyms, *other_words]

In [None]:
def get_doc(target_in_cnt, target_num, target_dict):
    """Collect documents that contain exactly ``target_in_cnt`` of the probe words.

    Reads the notebook-level globals ``docs``, ``words`` and ``doc_array``.
    Documents are scanned in order from the start of ``docs`` and the scan
    stops as soon as ``target_num`` matches have been collected, so the
    earliest matching documents are the ones returned.

    Args:
        target_in_cnt: required number of entries of ``words`` found in a
            document (each word counts at most once per document).
        target_num: number of matching documents to collect.
        target_dict: dict that receives the result under the key
            ``target_in_cnt``; it is modified in place.

    Returns:
        ``target_dict``, with ``target_dict[target_in_cnt]`` set to a list of
        dicts holding 'index', '#appear', 'doc_text' and 'doc_array'.
        A message is printed when the number of documents found differs from
        ``target_num``.
    """
    matches = []
    for doc_idx, doc_text in enumerate(docs):
        # Membership test per word: repeated occurrences are not counted twice.
        hits = sum(1 for word in words if word in doc_text)
        if hits != target_in_cnt:
            continue
        matches.append({
            'index': doc_idx,
            '#appear': target_in_cnt,
            'doc_text': doc_text,
            # assumes entry i of doc_array belongs to docs[i] -- confirm
            'doc_array': doc_array[doc_idx],
        })
        if len(matches) == target_num:
            break

    if len(matches) != target_num:
        print('***cannot find enough required docs containing {} words. found {}, want {}'.format(target_in_cnt, len(matches), target_num))

    target_dict[target_in_cnt] = matches
    return target_dict



In [None]:
# key: number of probe words appearing in a doc, value: list of doc info dicts
doc_dict = {}
# (number of probe words in the doc, how many such docs to collect)
for wanted_hits, wanted_docs in [(0, 2), (1, 2), (2, 2), (3, 1), (4, 1)]:
    doc_dict = get_doc(wanted_hits, wanted_docs, doc_dict)

# Flatten the per-count lists, in order of increasing word count.
doc_info_list = []
for wanted_hits in range(5):
    doc_info_list.extend(doc_dict[wanted_hits])
print('#found docs:', len(doc_info_list))


In [None]:
# Assumption (not checked here): the items of `terms` and of `docs` are each
# unique, so the first position returned by `terms.index` is the only one.
words_info_list = []
for word in words:
    # Raises ValueError if the word is not in `terms`.
    index = terms.index(word)
    words_info_list.append(
        dict(index=index, word_text=word, word_array=term_array[index])
    )
