In [None]:
import os
import numpy as np
import json
from scipy import sparse
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [None]:
# Experiment identity: these three values select which saved run is analysed.
dataset, dim, random_seed = 'toy100', 100, 9999744

# Run tag has the form data_<dataset>-dim_<dim>-rand_<seed>; save_dir is the
# matching folder under ../result/.
config = 'data_{}-dim_{}-rand_{}'.format(dataset, dim, random_seed)
save_dir = '../result/' + config

In [None]:
# Load the artifacts saved for this run from save_dir:
#   - termDocSparse.npz : SciPy sparse matrix
#   - termArray.npy / docArray.npy : NumPy arrays indexed by term id / doc id
#   - term-doc.json : JSON object holding the 'terms' and 'docs' entries
#   - frobenius-norm-approx-raw.txt : a single float stored as text
term_doc_sparse = sparse.load_npz(os.path.join(save_dir, 'termDocSparse.npz'))
term_array = np.load(os.path.join(save_dir, 'termArray.npy'))
doc_array = np.load(os.path.join(save_dir, 'docArray.npy'))

with open(os.path.join(save_dir, 'term-doc.json'), mode='r', encoding='utf8') as f:
    term_doc_text = json.load(f)
terms, docs = term_doc_text['terms'], term_doc_text['docs']

with open(os.path.join(save_dir, 'frobenius-norm-approx-raw.txt'), mode='r', encoding='utf8') as f:
    frob_norm = float(f.read())


In [None]:
# Sanity report: shapes and sizes of everything that was just loaded.
# Each (label, value) pair is printed on its own line as "<label> <value>".
summary_rows = (
    ('term_doc_sparse.shape:', term_doc_sparse.shape),
    ('term_array.shape', term_array.shape),
    ('doc_array.shape', doc_array.shape),
    ('#term_text:', len(terms)),
    ('#doc_text:', len(docs)),
    ('feature_dim:', doc_array.shape[1]),
    ('frobenius norm:', frob_norm),
)
for label, value in summary_rows:
    print(label, value)

In [None]:
# Spot check: dot product of row t of term_array with itself (for a 1-D real
# row this is its squared Euclidean length). t = 8 is an arbitrary probe index.
arr, t = term_array, 8
np.dot(arr[t], arr[t])

In [None]:
# Probe vocabulary: a pair of near-synonyms followed by two other words.
# `words` keeps the synonyms first, then the other words.
synonyms = ['中华民族', '中国人']
other_words = ['依法', '音乐']
words = [*synonyms, *other_words]

In [None]:
def get_doc(target_in_cnt, target_num, target_dict):
    """Collect the first docs that contain exactly ``target_in_cnt`` probe words.

    Walks the module-level ``docs`` from the start, and for every document
    counts how many entries of the module-level ``words`` satisfy
    ``w in doc``. Documents whose count equals ``target_in_cnt`` are recorded
    (with their id, text and the matching row of the module-level
    ``doc_array``) until ``target_num`` of them have been gathered.

    Args:
        target_in_cnt: required number of probe words present in a document.
        target_num: how many matching documents to collect before stopping.
        target_dict: dict updated in place; the collected list is stored under
            the key ``target_in_cnt``.

    Returns:
        The same ``target_dict`` object that was passed in.

    A notice is printed when the number of documents found differs from
    ``target_num``.
    """
    matches = []
    # Documents are scanned in order, so the earliest matches always win.
    for doc_id, doc_text in enumerate(docs):
        hits = sum(1 for w in words if w in doc_text)
        if hits != target_in_cnt:
            continue
        matches.append({
            'id': doc_id,
            '#appear': target_in_cnt,
            'text': doc_text,
            'array': doc_array[doc_id],
            'type': 'd',
        })
        if len(matches) == target_num:
            break

    if len(matches) != target_num:
        print('***cannot find enough required docs containing {} words. found {}, want {}'.format(target_in_cnt, len(matches), target_num))

    target_dict[target_in_cnt] = matches
    return target_dict



In [None]:
# key: number of probe words that appear in a doc, value: list of doc info dicts
doc_dict = {}

# (probe words present, number of docs wanted) for each bucket, 0 through 4.
for in_cnt, wanted in ((0, 2), (1, 2), (2, 2), (3, 1), (4, 1)):
    doc_dict = get_doc(in_cnt, wanted, doc_dict)

# Flatten the buckets in ascending order of in_cnt, then number the docs by
# their position in the flattened list.
doc_info_list = []
for in_cnt in range(5):
    doc_info_list.extend(doc_dict[in_cnt])
for position, doc_info in enumerate(doc_info_list):
    doc_info['index'] = position

print('#found docs:', len(doc_info_list))


In [None]:
# Build one info dict per probe word. `terms.index` returns the first match,
# so this assumes the entries of `terms` are unique; it raises ValueError for
# a word that is not in `terms`.
words_info_list = [
    {
        'id': term_id,
        'index': position,
        'text': word,
        'array': term_array[term_id],
        'type': 'w',
    }
    for position, word in enumerate(words)
    for term_id in (terms.index(word),)  # bind the vocabulary id once per word
]


In [None]:
# Words first, then docs; every ordered pair (including an item with itself)
# gets one cosine-similarity record.
info_list = words_info_list + doc_info_list

# Label ("w0", "d3", ...) and L2 norm of each item, computed once up front.
item_labels = [item['type'] + str(item['index']) for item in info_list]
item_norms = [np.linalg.norm(item['array'], ord=2) for item in info_list]

similarity_list = []
for row, x in enumerate(info_list):
    for col, y in enumerate(info_list):
        similarity_list.append({
            'item1': item_labels[row],
            'item2': item_labels[col],
            # cosine similarity: dot product divided by both norms in turn
            'similarity': np.dot(x['array'], y['array']) / item_norms[row] / item_norms[col],
        })


In [None]:
# Long-form records -> square matrix, with rows and columns both ordered
# words first (w0, w1, ...) and then docs (d0, d1, ...).
similarity_df = pd.DataFrame(similarity_list)
col_order = (
    [f'w{i}' for i in range(len(words_info_list))]
    + [f'd{i}' for i in range(len(doc_info_list))]
)
similarity_pivot_df = (
    similarity_df
    .pivot(index='item1', columns='item2', values='similarity')
    .reindex(index=col_order, columns=col_order)
)
similarity_pivot_df

In [None]:
# Persist the similarity matrix next to the other run artifacts.
# Note: the file is tab-separated even though it is named similarity.csv.
similarity_pivot_df.to_csv(
    path_or_buf=os.path.join(save_dir, 'similarity.csv'),
    sep='\t',
)

In [None]:
# Heatmap of the word/doc cosine-similarity matrix, saved as a PDF in save_dir.
heatmap_title = '\n'.join([
    'Cosine Similarity Heatmap Between Words and Docs',
    f'dataset={dataset}, dim={dim}, random_seed={random_seed}',
])

ax = sns.heatmap(similarity_pivot_df, cmap="YlGnBu", square=True)
ax.xaxis.tick_top()                 # column labels along the top edge
ax.set(xlabel=None, ylabel=None)    # drop the 'item1' / 'item2' axis captions
ax.set_title(heatmap_title)
plt.savefig(os.path.join(save_dir, 'cos-sim-heatmap.pdf'), bbox_inches='tight')

In [None]:
# Project the word and doc embeddings to 2-D with t-SNE.
#
# scikit-learn requires `perplexity` to be strictly smaller than the number of
# samples. This notebook only embeds a handful of items (4 probe words plus at
# most 8 docs), so the default perplexity of 30 is rejected with a ValueError
# on current releases. Cap it at n_samples - 1 instead.
tsne_inputs = np.asarray([x['array'] for x in info_list])
tsne = TSNE(
    n_components=2,
    perplexity=min(30.0, len(tsne_inputs) - 1),
    random_state=random_seed,
)
# Alternative input: L2-normalise every vector before fitting, i.e.
#   [x['array'] / np.linalg.norm(x['array'], ord=2) for x in info_list]
vec2d = tsne.fit_transform(tsne_inputs)


In [None]:
# Draw every 2-D embedded item as a line segment from the origin, with one
# legend entry per item, and save the figure as a PDF in save_dir.
# ref: https://stackoverflow.com/questions/42281966/how-to-plot-vectors-in-python-using-matplotlib
rows, cols = vec2d.T.shape  # cols == number of embedded items (rows of vec2d)

# Per-axis absolute maxima (plus a 10% margin) so the axis ranges are
# symmetric and the origin sits in the centre. This is optional.
maxes = 1.1 * np.amax(np.abs(vec2d), axis=0)

for i in range(cols):
    xs = [0, vec2d[i, 0]]
    ys = [0, vec2d[i, 1]]
    plt.plot(xs, ys)

# plt.plot(0, 0, 'ok')  # optional: mark the origin with a black point
plt.axis('equal')                    # same scale on both axes
plt.xlim([-maxes[0], maxes[0]])      # x axis limits
plt.ylim([-maxes[1], maxes[1]])      # y axis limits
# Legend labels follow the order in which the lines were plotted.
plt.legend([x['type'] + str(x['index']) for x in info_list])
# The on/off flag is passed positionally: its keyword was renamed from `b` to
# `visible` in Matplotlib 3.5 and `b=` is rejected by newer releases.
plt.grid(True, which='major')
plt.title('\n'.join(['Word and Doc Embedding Visualization with t-SNE', f'dataset={dataset}, dim={dim}, random_seed={random_seed}']))
plt.savefig(os.path.join(save_dir, 'word-doc-emb-visualization.pdf'), bbox_inches='tight')
plt.show()
