In [1]:
import re
import jieba
import numpy as np
from gensim.models import Word2Vec
from scipy.stats import pearsonr

In [11]:
def load_model(fname: str):
    """
    Load a previously saved model from disk via ``Word2Vec.load``.

    :param fname: path of the saved model file
    :return: the object returned by ``Word2Vec.load(fname)``
    """
    loaded = Word2Vec.load(fname)
    return loaded

def compute_sentence_embedding(sentence, model, a):
    """
    Build a sentence embedding as a frequency-weighted average of word vectors.

    The sentence is tokenised with ``cut_words``. Every token that has a vector
    in ``model.wv`` contributes ``a / (a + p(w)) * vector(w)``, where ``p(w)`` is
    the token's training count divided by ``model.corpus_total_words``. The
    weighted vectors are summed and divided by the number of contributing tokens.

    Out-of-vocabulary tokens are ignored (this was the open question noted in
    the original docstring; ignoring them is the current handling).

    :param sentence: raw sentence text
    :param model: model exposing ``wv`` (word vectors) and ``corpus_total_words``
    :param a: smoothing constant of the weighting term
    :return: the averaged embedding as a ``numpy.ndarray``; if no token of the
        sentence has a vector, a NaN scalar (``numpy.float64``) is returned
        instead, so callers can detect the failure with
        ``isinstance(result, np.ndarray)``
    """
    keyed_vectors = model.wv
    total_words = model.corpus_total_words  # loop-invariant, read once

    if hasattr(keyed_vectors, 'get_vecattr'):
        # gensim >= 4.0 removed ``wv.vocab``; counts are read with get_vecattr.
        def word_count(word):
            return keyed_vectors.get_vecattr(word, 'count')
    else:
        # gensim < 4.0 keeps the count on the vocab entry.
        def word_count(word):
            return keyed_vectors.vocab[word].count

    # Weighted word vectors for the in-vocabulary tokens only.
    weighted_vectors = [
        a / (a + (word_count(word) / total_words)) * keyed_vectors[word]
        for word in cut_words(sentence)
        if word in keyed_vectors
    ]

    if not weighted_vectors:
        # Previously this case fell through to 0.0 / 0, producing the same NaN
        # scalar but with a RuntimeWarning; return the sentinel explicitly.
        return np.float64('nan')

    return np.sum(np.array(weighted_vectors), axis=0) / len(weighted_vectors)

def cut_words(content: str):
    """
    Tokenise ``content`` with jieba after passing it through ``clean_data``.

    Tokens that are exactly a single space (which ``clean_data`` substitutes for
    punctuation and special characters) are dropped.

    :param content: raw text
    :return: list of tokens in their original order
    """
    tokens = []
    for token in jieba.cut(clean_data(content)):
        if token == ' ':
            continue
        tokens.append(token)
    return tokens

def clean_data(content: str):
    """
    Replace every punctuation mark and special character with a single space.

    Three character sets are removed: full-width / CJK punctuation, ASCII
    punctuation, and ASCII special characters (including backslash, hyphen and
    newline). Each matched character is replaced by one space, so the length of
    the text is unchanged.

    :param content: raw text
    :return: ``content`` with every listed character replaced by ``' '``
    """
    chinese_punctuation = '＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·！？｡。'
    english_punctuation = ',.?;:\'"`~!'
    # Literal characters (not a regex fragment). The previous hand-escaped
    # fragment contained ``\)-\+``, which a character class reads as the range
    # ')'..'+', so the hyphen itself was never replaced. re.escape below makes
    # every character, including '-', a literal.
    special_char = '<>/\\|[]{}@#$%^&*()-+=_\n'
    char_class = '[{}]'.format(
        re.escape(chinese_punctuation + english_punctuation + special_char)
    )
    return re.sub(char_class, ' ', content)

def cos_dist(vec1, vec2):
    """
    Compute the cosine similarity of two vectors.

    :param vec1: first vector
    :param vec2: second vector
    :return: the cosine similarity of the two vectors, as a Python float
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    similarity = np.dot(vec1, vec2) / norm_product
    return float(similarity)

In [6]:
# Location of the saved Word2Vec model file.
# NOTE(review): this is an absolute, user-specific path; it has to be adjusted
# before the notebook can run on another machine.
path = '/Users/mengzeyu/Downloads/word2vec_normal_full.model'
# Load the model once; later cells pass it to compute_sentence_embedding.
model = load_model(path)

In [7]:
# Read the evaluation file into a list of raw lines (newline characters kept).
# The encoding is given explicitly so that decoding does not depend on the
# platform's default locale.
# NOTE(review): UTF-8 is assumed for this file; confirm against the data source.
with open('/Users/mengzeyu/Downloads/simtrain_to05sts.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

In [8]:
# Parse the tab-separated records into three parallel lists:
#   field 1 -> sentences_A, field 3 -> sentences_B, field 4 -> scores.
# Scores are divided by 5.0 (presumably to map a 0-5 rating onto 0-1; confirm
# against the data set description).
sentences_A = []
sentences_B = []
scores = []
for line in lines:
    # A blank line (e.g. at the end of the file) has no fields; skip it
    # instead of failing with IndexError.
    if not line.strip():
        continue
    parts = line.split('\t')
    # Extract and convert every field before appending anything, so the three
    # lists stay aligned even if a malformed record raises here.
    first_sentence = parts[1]
    second_sentence = parts[3]
    score = float(parts[4].strip()) / 5.0
    sentences_A.append(first_sentence)
    sentences_B.append(second_sentence)
    scores.append(score)

In [12]:
# Sweep the weighting constant `a`. For each value, embed every sentence pair,
# score the pair by cosine similarity, and print the Pearson correlation
# between those similarities and the reference scores.
for a in np.arange(1e-4, 1.1e-3, 1e-4):
    print(a)
    truth, predicts = [], []
    for i, text_a in enumerate(sentences_A):
        embedding_a = compute_sentence_embedding(text_a, model, a)
        embedding_b = compute_sentence_embedding(sentences_B[i], model, a)
        # Pairs for which either embedding is not an ndarray are left out of
        # the evaluation.
        if not (isinstance(embedding_a, np.ndarray) and isinstance(embedding_b, np.ndarray)):
            continue
        truth.append(scores[i])
        predicts.append(cos_dist(embedding_a, embedding_b))
    print(pearsonr(truth, predicts)[0])

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/k2/t_qs4tk92z73ct1c5lyhs_bm0000gq/T/jieba.cache


0.0001


Loading model cost 0.760 seconds.
Prefix dict has been built succesfully.
  # This is added back by InteractiveShellApp.init_path()


0.8405600351675138
0.0002
0.8244173574311611
0.00030000000000000003
0.8133818943603435
0.0004
0.8050785496203428
0.0005
0.798484895654453
0.0006000000000000001
0.7930559810505952
0.0007000000000000001
0.7884673029443493
0.0008
0.784510494249119
0.0009000000000000001
0.7810441495318403
0.001
0.7779681752448138
