In [1]:
import json
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors

In [2]:
# Number of leading lines in the .dic file that are skipped before word
# entries are read (presumably the category legend -- confirm against the file).
LIWC_HEADER_LINES = 83

# Maps each dictionary word (first tab-separated field) to the list of the
# remaining tab-separated fields on its line.
vocab = {}
# An explicit encoding keeps decoding independent of the platform's locale
# default (assumes the dictionary is stored as UTF-8 -- confirm).
with open('Simplified_Chinese_LIWC2015_Dictionary.dic', encoding='utf-8') as f:
    for line_no, line in enumerate(f):
        if line_no < LIWC_HEADER_LINES:
            continue
        x = line.strip().split('\t')
        vocab[x[0]] = x[1:]
print(len(vocab))

9720


In [3]:
# Load the text-format word2vec file, then report the share of dictionary
# words that have a vector in it.
vectors_zhihu = KeyedVectors.load_word2vec_format(
    './word2vec/sgns.zhihu.word',
    binary=False,
)
coverage_zhihu = np.mean([entry in vectors_zhihu for entry in vocab])
print('Ratio of vocab in word vectors:', coverage_zhihu)

Ratio of vocab in word vectors: 0.9275720164609054


In [4]:
# Load the text-format word2vec file, then report the share of dictionary
# words that have a vector in it.
vectors_renmin = KeyedVectors.load_word2vec_format(
    './word2vec/sgns.renmin.word',
    binary=False,
)
coverage_renmin = np.mean([entry in vectors_renmin for entry in vocab])
print('Ratio of vocab in word vectors:', coverage_renmin)

Ratio of vocab in word vectors: 0.9125514403292181


In [5]:
# Load the text-format word2vec file, then report the share of dictionary
# words that have a vector in it.
vectors_weibo = KeyedVectors.load_word2vec_format(
    './word2vec/sgns.weibo.word',
    binary=False,
)
coverage_weibo = np.mean([entry in vectors_weibo for entry in vocab])
print('Ratio of vocab in word vectors:', coverage_weibo)

Ratio of vocab in word vectors: 0.9116255144032922


In [6]:
# Load the text-format word2vec file, then report the share of dictionary
# words that have a vector in it.
vectors_literature = KeyedVectors.load_word2vec_format(
    './word2vec/sgns.literature.word',
    binary=False,
)
coverage_literature = np.mean([entry in vectors_literature for entry in vocab])
print('Ratio of vocab in word vectors:', coverage_literature)

Ratio of vocab in word vectors: 0.9189300411522634


In [7]:
# Load the text-format word2vec file, then report the share of dictionary
# words that have a vector in it.
vectors_wiki = KeyedVectors.load_word2vec_format(
    './word2vec/sgns.wiki.word',
    binary=False,
)
coverage_wiki = np.mean([entry in vectors_wiki for entry in vocab])
print('Ratio of vocab in word vectors:', coverage_wiki)

Ratio of vocab in word vectors: 0.9096707818930041


In [8]:
# Load the text-format word2vec file, then report the share of dictionary
# words that have a vector in it.
vectors_sogou = KeyedVectors.load_word2vec_format(
    './word2vec/sgns.sogou.word',
    binary=False,
)
coverage_sogou = np.mean([entry in vectors_sogou for entry in vocab])
print('Ratio of vocab in word vectors:', coverage_sogou)

Ratio of vocab in word vectors: 0.9180041152263374


In [9]:
# Load the text-format word2vec file, then report the share of dictionary
# words that have a vector in it.
vectors_baidu = KeyedVectors.load_word2vec_format(
    './word2vec/sgns.baidu.word',
    binary=False,
)
coverage_baidu = np.mean([entry in vectors_baidu for entry in vocab])
print('Ratio of vocab in word vectors:', coverage_baidu)

Ratio of vocab in word vectors: 0.931275720164609


In [10]:
def get_enrich_words(word_vectors, vocab, topn):
    """Look up the nearest neighbours of every known vocabulary word.

    Args:
        word_vectors: Object supporting ``word in word_vectors`` and
            ``most_similar(word, topn=...)``.
        vocab: Iterable of words to look up.
        topn: Number of neighbours requested per word.

    Returns:
        dict mapping each word of ``vocab`` that is present in
        ``word_vectors`` to whatever ``most_similar`` returned for it.
        Words absent from ``word_vectors`` are left out.
    """
    return {
        entry: word_vectors.most_similar(entry, topn=topn)
        for entry in vocab
        if entry in word_vectors
    }

In [11]:
def get_extended_words(topn, vocab):
    """Write each embedding's nearest neighbours of the vocabulary to JSON.

    For every embedding loaded earlier in the notebook (the module-level
    ``vectors_*`` variables), calls ``get_enrich_words`` and writes the result
    to ``./extended_vocab/extended_<name>_<topn>.json``. It also prints the
    total number of neighbour entries found, summed over all words without
    de-duplication.

    Args:
        topn: Number of neighbours requested per vocabulary word.
        vocab: Iterable of words to look up.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    import os  # local import so this cell does not depend on the import cell

    vector_names = ['zhihu', 'renmin', 'weibo', 'literature', 'wiki', 'sogou', 'baidu']
    vector_list = [vectors_zhihu, vectors_renmin, vectors_weibo, vectors_literature,
                   vectors_wiki, vectors_sogou, vectors_baidu]
    output_dir = "./extended_vocab"
    # Create the output directory before any searching starts: otherwise a
    # missing directory only surfaces as FileNotFoundError after the first
    # (slow) similarity search has already finished.
    os.makedirs(output_dir, exist_ok=True)
    for name, vector in zip(vector_names, vector_list):
        print('Searching in {}'.format(name))
        extended = get_enrich_words(vector, vocab, topn)
        num_new_words = sum(len(neighbours) for neighbours in extended.values())
        print('New words: ', num_new_words)
        with open("{}/extended_{}_{}.json".format(output_dir, name, topn), "w") as f:
            json.dump(extended, f)

In [12]:
# Request 100 neighbours per dictionary word from every loaded embedding;
# one JSON file per embedding is written by get_extended_words.
get_extended_words(topn=100, vocab=vocab)

Searching in zhihu
New words:  901600
Searching in renmin
New words:  887000
Searching in weibo
New words:  886100
Searching in literature
New words:  893200
Searching in wiki
New words:  884200
Searching in sogou
New words:  892300
Searching in baidu
New words:  905200
