In [1]:
import numpy as np

def w2v_reader(path: str):
    """Load word vectors from a word2vec-style text file.

    The first line is parsed as two integers (declared word count and
    vector dimension). Every following non-blank line is split on its
    first space into a word and a whitespace-separated list of floats.

    Args:
        path: Location of the text file to read.

    Returns:
        A tuple ``(n, dim, w2v)`` where ``n`` and ``dim`` are the two header
        integers (returned as-is, not validated against the data) and
        ``w2v`` maps each word to a 1-D ``float32`` array.

    Raises:
        ValueError: If the header is not two integers, a data line has no
            space separating word from vector, or a component is not a
            number.
    """
    w2v: dict[str, np.ndarray] = {}
    # Explicit UTF-8 so non-ASCII tokens decode the same way regardless of
    # the platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        n, dim = map(int, f.readline().split())
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            word, vec = line.split(" ", 1)
            # split() with no argument ignores the trailing newline and any
            # repeated/trailing spaces that would otherwise yield '' tokens.
            w2v[word] = np.asarray(vec.split(), dtype=np.float32)
    return n, dim, w2v

# Read the embedding file once; n_words / n_dim are the header values.
n_words, n_dim, w2v = w2v_reader(
    "./w2v/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt"
)

# Unzip the mapping into two parallel tuples (row i of `matrix` belongs to
# words[i]), then materialise both as arrays for fancy indexing.
words, matrix = zip(*w2v.items())
matrix, wordsNdArr = np.array(matrix), np.array(words)

In [None]:
def findNNByQ(q: np.ndarray, matrix: np.ndarray, k: int = 10):
    """Return the row indices of the ``k`` rows of ``matrix`` closest to ``q``.

    Closeness is the Euclidean (L2) distance between ``q`` and each row.
    The result is ordered from nearest to farthest; if ``matrix`` has fewer
    than ``k`` rows, all row indices are returned.
    """
    offsets = matrix - q
    distances = np.linalg.norm(offsets, ord=2, axis=1)
    # Full ascending sort of the distances, truncated to the first k.
    return distances.argsort()[:k]

def getWordIdx(words: list[str], w: str):
    """Return the position of ``w`` in ``words``.

    Args:
        words: Sequence of vocabulary entries to search.
        w: The word to look up.

    Returns:
        The index of the first occurrence of ``w`` in ``words``.

    Raises:
        ValueError: If ``w`` is not present in ``words`` (message
            ``"Unknown <w>"``). ``ValueError`` is an ``Exception`` subclass,
            so existing ``except Exception`` handlers still catch it.
    """
    # Look the word up in the sequence that was actually passed in, rather
    # than in a notebook-global dict that may be out of sync with it.
    try:
        return words.index(w)
    except ValueError:
        raise ValueError(f"Unknown {w}") from None

def findNN(word: str, words: list[str], matrix: np.ndarray, k: int = 10):
    """Return indices of the ``k`` rows of ``matrix`` nearest to ``word``'s row.

    ``word`` is resolved to a row index via ``getWordIdx`` (which raises for
    unknown words), and that row is used as the query for ``findNNByQ``.
    """
    row = getWordIdx(words, word)
    return findNNByQ(matrix[row], matrix, k)


In [None]:
# Nearest neighbours of '小米' in the full-dimensional embedding; the bare
# last expression displays the matching words.
indices = findNN('小米', words, matrix, k=10)
wordsNdArr[indices]

array(['小米', '小米啊', '红米', '苹果', 'm9', '小米3', '魅族', '小米吧', '小米手机', '平板'],
      dtype='<U30')

In [None]:
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.manifold import Isomap

# decomposition = PCA(20)
# lowDimMatrix = decomposition.fit_transform(matrix)

# Isomap projection of the embedding matrix down to 50 components, using a
# 5-nearest-neighbour graph. Note: the UMAP cell below rebinds both
# `manifold` and `lowDimMatrix`.
manifold = Isomap(n_components=50, n_neighbors=5)
lowDimMatrix = manifold.fit_transform(X=matrix)

In [None]:
import umap

# UMAP projection of the embedding matrix down to 50 components. This
# rebinds `manifold` and `lowDimMatrix` from the Isomap cell above.
# UMAP's layout optimisation is stochastic, so the seed is fixed to make
# Restart & Run All reproduce the same low-dimensional neighbours.
# NOTE(review): the UMAP docs say a fixed random_state can limit
# parallelism — confirm the runtime is acceptable on the full matrix.
manifold = umap.UMAP(n_neighbors=15, n_components=50, random_state=42)
lowDimMatrix = manifold.fit_transform(matrix)

In [None]:
def findSentence(sentence: list[str], words: list[str], matrix: np.ndarray, k: int = 10):
    """Find the rows of ``matrix`` nearest to the mean vector of a sentence.

    Each token of ``sentence`` that occurs in ``words`` contributes its row
    of ``matrix``; tokens that do not occur are reported with a printed
    ``Unknown <token>`` line and skipped. The mean of the contributing rows
    is used as the query for ``findNNByQ``.

    Args:
        sentence: Tokens to average.
        words: Vocabulary; position ``i`` corresponds to ``matrix[i]``.
        matrix: 2-D array with one row per vocabulary entry.
        k: Maximum number of neighbour indices to return.

    Returns:
        Array of row indices ordered nearest first. If none of the tokens
        is in ``words`` an empty integer array is returned, because there is
        no query vector to search with.
    """
    idxs = []
    for w in sentence:
        # One scan of the vocabulary per token: index() both tests
        # membership and yields the position.
        try:
            idxs.append(words.index(w))
        except ValueError:
            print(f"Unknown {w}")
    if not idxs:
        # Averaging zero rows would produce a NaN query and arbitrary
        # "neighbours"; return no matches instead.
        return np.empty(0, dtype=np.intp)
    q = matrix[idxs]
    return findNNByQ(q.mean(axis=0), matrix, k)

# Compare neighbours found in the reduced embedding (first line of each
# pair) against the original embedding (second line) for a few sample
# sentences; groups are separated by "---".
demo_sentences = [
    ['苹果公司', '产品'],
    ['小米', '华为'],
    ['woman', 'king'],
    ['watch', 'apple'],
]
for pos, sentence in enumerate(demo_sentences):
    if pos > 0:
        print("---")
    for embedding in (lowDimMatrix, matrix):
        print(wordsNdArr[findSentence(sentence, words, embedding)])

['某些产品' '厂商会' '类似产品' '某个产品' '类似的产品' '三星产品' '所有产品' '任何产品' '此产品' '旧产品']
['产品' '苹果公司' '某些产品' '那些产品' '许多产品' '公司的产品' '苹果的产品' '其他产品' '苹果产品' '新产品']
---
['华为' '小米' '手机' '操作系统' 'zcwdz' '搬瓦工香港' '9taw5hbnq' 'nsa双模5g' '微软edge浏览器'
 '开发者预览版']
['小米' '华为' '手机' '操作系统' '移动2g' '联通2g' '联通的2g' 'balong' 'u880' '移动3g']
---
['kendrick' 'nicki' 'quavo' 'timberlake' 'colman' 'gambino' 'monsta'
 'secret' 'lannister' 'styles']
['king' 'woman' 'queen' 'lord' 'villain' 'clown' 'legendary' 'beast' 'big'
 'evil']
---
['apple' 'watch' 'smartwatch' 'tabs' 'touch' 'finder' 'snap' 'quest'
 'applewatch' 'amaz']
['watch' 'apple' 'applewatch' 'iwatch' '苹果智能手表' '苹果手表' '苹果apple' 'touch'
 'smartwatch' 'phone']
