In [None]:
import os
import re
from types import SimpleNamespace

import numpy as np
import pandas as pd
from sklearn.decomposition import TruncatedSVD

In [None]:
# Input / output locations, relative to the notebook's working directory.
tokenized_file = '../data/news_tokenized.csv'
word_embedding_file = '../data/no_symbol.word2vec'
weight_file = '../../main/data/total_vocab.txt'
similarity_file = '../data/similar_evaluation.txt'

# SIF hyper-parameters read as globals by later cells. They were referenced
# but never defined, so a fresh "Restart & Run All" raised NameError.
# NOTE(review): the values below are the defaults of the reference SIF
# implementation -- confirm they match the intended experiment.
weight_para = 1e-3  # `a` in the word weight a / (a + Pr(word))
rm_pc = 1           # number of leading principal components to remove

* tokenized_file - 预处理好的csv文件
* word_embedding_file - word_embedding_train生成好的词向量文件
* weight_file - word_embedding_train生成好的词频文件
* similarity_file - 生成的相似度文件，仅用于查看（view）中间结果，并非最终输出文件

In [None]:
def remove_symbols(text):
    r"""Strip punctuation and other symbols from ``text``, line by line.

    The text is split on '\n'. Each line is reduced to its runs of word and
    whitespace characters (regex ``[\w\s]+``), the runs are joined with a
    single space, and every output line is terminated with '\n'.

    :param text: the string to clean; may contain several lines
    :return: the cleaned text, one '\n'-terminated line per input line
    """
    # Uses the notebook-level ``import re``.
    cleaned_lines = (
        ' '.join(re.findall(r'[\w\s]+', line)) + '\n'
        for line in text.split('\n')
    )
    # One join instead of repeated ``+=`` avoids quadratic string building.
    return ''.join(cleaned_lines)

In [None]:
def getWordmap(textfile):
    """Load a whitespace-separated word-embedding text file into memory.

    Every data line holds a word followed by the components of its vector.
    If the first line has fewer than five fields it is treated as a header
    (presumably a word2vec-style "<vocab size> <dimension>" line) and skipped.
    Blank lines are ignored.

    :param textfile: path of the embedding file
    :return: (words, We); words[word] is the row index of that word and
        We[i, :] is the vector of word i
    """
    words = {}
    We = []
    # The context manager closes the handle even if parsing fails later.
    with open(textfile, 'r') as f:
        lines = f.readlines()
    # Check for emptiness before peeking at the first line.
    if lines and len(lines[0].split()) < 5:
        lines = lines[1:]
    for line in lines:
        fields = line.split()
        if not fields:
            # A blank line carries no word; skipping it avoids an IndexError.
            continue
        vector = [float(value) for value in fields[1:]]
        # Index by the number of stored vectors so that words[...] always
        # points at the matching row of We, even when lines were skipped.
        words[fields[0]] = len(We)
        We.append(vector)
    return (words, np.array(We))

*getWordmap*载入word embedding到内存

In [None]:
def getWordWeight(weightfile, a=1e-3):
    """Turn a word-frequency file into SIF word weights.

    Each usable line of the file has exactly two whitespace-separated fields,
    ``<word> <count>``. Lines containing the text "appearance" and empty lines
    are skipped; lines with any other number of fields are printed and
    ignored. The weight of a word is ``a / (a + count / total)`` where
    ``total`` is the sum of all accepted counts.

    :param weightfile: path of the frequency file
    :param a: smoothing parameter; a value <= 0 is replaced by 1.0
    :return: dict mapping word -> weight
    """
    if a <= 0:
        # A non-positive parameter makes no sense; fall back to 1.0.
        a = 1.0

    with open(weightfile) as handle:
        raw_lines = handle.readlines()

    counts = {}
    total = 0
    for raw in raw_lines:
        if "appearance" in raw:
            continue
        stripped = raw.strip()
        if not stripped:
            continue
        fields = stripped.split()
        if len(fields) != 2:
            # Malformed line: report it and move on.
            print(fields)
            continue
        counts[fields[0]] = float(fields[1])
        total += float(fields[1])

    return {word: a / (a + count / total) for word, count in counts.items()}

*getWordWeight*利用词频计算每一个词的权重 $\frac{a}{a+Pr(word)}$

In [None]:
def getWeight(words, word2weight):
    """Re-key word weights by word index.

    :param words: dict mapping word -> index
    :param word2weight: dict mapping word -> weight
    :return: dict mapping index -> weight; words absent from ``word2weight``
        get the weight 1.0
    """
    return {index: word2weight.get(word, 1.0) for word, index in words.items()}

In [None]:
def map_initialize(word_embedding_file, weight_file, weight_para):
    """Load the embeddings and word weights needed by the SIF helpers.

    :param word_embedding_file: path passed to ``getWordmap``
    :param weight_file: path passed to ``getWordWeight``
    :param weight_para: smoothing parameter ``a`` passed to ``getWordWeight``
    :return: (weight4ind, words2index, words_embedding) -- index -> weight,
        word -> index, and the embedding matrix returned by ``getWordmap``
    """
    vocab_index, embedding_matrix = getWordmap(word_embedding_file)
    index_weights = getWeight(vocab_index, getWordWeight(weight_file, weight_para))
    return index_weights, vocab_index, embedding_matrix

# Build the module-level lookup tables (index -> weight, word -> index and the
# embedding matrix) that get_sentences_embedding reads as globals.
# `weight_para` must already be defined when this line runs; it is forwarded
# to getWordWeight as the smoothing parameter `a`.
weight4ind, words2index, words_embedding = map_initialize(word_embedding_file, weight_file, weight_para)

In [None]:
def lookupIDX(words, w):
    """Return the vocabulary index of token ``w``.

    The token is lower-cased; if it is longer than one character and starts
    with '#', every '#' is removed. Unknown tokens map to the index of
    'UUUNKKK' when the vocabulary has that entry, otherwise to
    ``len(words) - 1``.

    :param words: dict mapping word -> index
    :param w: the token to look up
    :return: the index for ``w``
    """
    token = w.lower()
    if token.startswith('#') and len(token) > 1:
        token = token.replace("#", "")
    if token in words:
        return words[token]
    return words['UUUNKKK'] if 'UUUNKKK' in words else len(words) - 1

def getSeq(p1, words):
    """Convert a whitespace-tokenised sentence into a list of word indices.

    :param p1: sentence whose tokens are separated by whitespace
    :param words: dict mapping word -> index
    :return: list with one index (from ``lookupIDX``) per token
    """
    return [lookupIDX(words, token) for token in p1.split()]

def prepare_data(list_of_seqs):
    """Pad index sequences into a rectangular matrix plus a mask.

    :param list_of_seqs: list of index sequences of possibly different length
    :return: (x, x_mask); x is an int32 array holding each sequence padded
        with zeros on the right up to the longest length, x_mask is a float32
        array with 1.0 at real positions and 0.0 at padding
    """
    seq_lengths = [len(seq) for seq in list_of_seqs]
    shape = (len(list_of_seqs), np.max(seq_lengths))
    x = np.zeros(shape).astype('int32')
    x_mask = np.zeros(shape).astype('float32')
    for row, (seq, length) in enumerate(zip(list_of_seqs, seq_lengths)):
        x[row, :length] = seq
        x_mask[row, :length] = 1.
    return x, x_mask

def sentences2idx(sentences, words):
    """
    Turn a list of sentences into padded index and mask arrays.
    :param sentences: a list of whitespace-tokenised sentences
    :param words: a dictionary, words['str'] is the index of the word 'str'
    :return: x1, m1. x1[i, :] holds the word indices of sentence i, m1[i, :] is the mask for sentence i (0 means no word at the location)
    """
    index_sequences = [getSeq(sentence, words) for sentence in sentences]
    return prepare_data(index_sequences)

*sentences2idx*的作用是把每个句子中已分好的词替换为words2index中该词对应的index：x1保存每个词的index，m1是掩码（有词的位置为1，补齐的位置为0）。  
例如： “日 与 月” ：x1 = array([9, 8, 6])，m1 = array([1., 1., 1.])

In [None]:
def seq2weight(seq, mask, weight4ind):
    """Replace every real word index in ``seq`` by that word's weight.

    :param seq: 2-D array of word indices
    :param mask: array indexed like ``seq``; a value > 0 marks a real word
    :param weight4ind: dict mapping word index -> weight
    :return: float32 array shaped like ``seq``; positions that are masked out
        (or hold a negative index) stay 0
    """
    weight = np.zeros(seq.shape).astype('float32')
    # Visit positions in row-major order.
    for position in np.ndindex(*seq.shape):
        if mask[position] > 0 and seq[position] >= 0:
            weight[position] = weight4ind[seq[position]]
    return weight

*seq2weight*将上面index占好的位置用权重替换。

In [None]:
def compute_pc(X, npc=1):
    """
    Fit a truncated SVD on X and return its components. DO NOT MAKE THE DATA ZERO MEAN!
    :param X: X[i,:] is a data point
    :param npc: number of principal components to compute
    :return: component_[i,:] is the i-th pc
    """
    # Fixed random_state keeps the decomposition reproducible across runs.
    return TruncatedSVD(n_components=npc, n_iter=7, random_state=0).fit(X).components_

def remove_pc(X, npc=1):
    """
    Subtract from every data point its projection on the leading principal components.
    The components are estimated from X itself (see compute_pc).
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: XX[i, :] is the data point after removing its projection
    """
    pc = compute_pc(X, npc)
    coefficients = X.dot(pc.transpose())
    if npc == 1:
        # Single component: broadcast the (n, 1) coefficients against the (1, d) pc.
        return X - coefficients * pc
    return X - coefficients.dot(pc)

def get_weighted_average(We, x, w):
    """
    Compute the weighted average of the word vectors of each sentence.
    :param We: We[i,:] is the vector for word i
    :param x: x[i, :] are the indices of the words in the i-th sentence
    :param w: w[i, :] are the weights for the words in the i-th sentence
    :return: emb, emb[i, :] is the weighted sum of the word vectors of sentence i divided by the number of non-zero weights in that row
    """
    n_samples = x.shape[0]
    emb = np.zeros((n_samples, We.shape[1]))
    for i in range(n_samples):
        n_weighted = np.count_nonzero(w[i, :])
        if n_weighted == 0:
            # Sentence with no weighted word: keep the zero vector instead of
            # dividing by zero and pushing NaNs into the SVD downstream.
            continue
        emb[i, :] = w[i, :].dot(We[x[i, :], :]) / n_weighted
    return emb


def SIF_embedding(We, x, w, params):
    """
    Compute sentence embeddings as a weighted average of word vectors, optionally removing the projection on the leading principal components
    :param We: We[i,:] is the vector for word i
    :param x: x[i, :] are the indices of the words in the i-th sentence
    :param w: w[i, :] are the weights for the words in the i-th sentence
    :param params.rmpc: if >0, remove the projections of the sentence embeddings to their first ``rmpc`` principal components
    :return: emb, emb[i, :] is the embedding for sentence i
    """
    emb = get_weighted_average(We, x, w)
    if params.rmpc > 0:
        emb = remove_pc(emb, params.rmpc)
    return emb

*SIF_embedding*建立句子向量

In [None]:
def get_sentences_embedding(sentences):
    """
    Embed a batch of whitespace-tokenised sentences with SIF.

    Reads the module-level globals ``words2index``, ``weight4ind``,
    ``words_embedding`` and ``rm_pc``. When ``rm_pc`` > 0 the removed
    principal components are estimated from this batch alone.

    :param sentences: list of sentences (tokens separated by whitespace)
    :return: embedding: ndarray, shape (n_samples, vector_space_dim)
    """

    sequence_matrix, mask_matrix = sentences2idx(sentences, words2index)
    weight_matrix = seq2weight(sequence_matrix, mask_matrix, weight4ind)
    # SIF_embedding only reads ``params.rmpc`` (the number of leading principal
    # components to remove), so a plain namespace is sufficient and removes
    # the dependency on the never-imported ``sparams`` module.
    params = SimpleNamespace(rmpc=rm_pc)

    embedding = SIF_embedding(words_embedding, sequence_matrix, weight_matrix, params)
    return embedding

In [None]:
def get_vectors_from_content(title, content, sentences):
    """Embed an article's title, its sentences and its full content.

    All texts are embedded in ONE batch. The principal components removed by
    the SIF step are estimated from the batch itself, so embedding the title
    (or the content) as a batch of one would remove that vector's own
    direction and leave an almost-zero embedding.

    :param title: tokenised title
    :param content: tokenised article body; newlines are flattened to spaces
        and symbols removed before it is embedded as a single text
    :param sentences: list of tokenised sentences of the article
    :return: (title_embedding, sentences_embedding, contents_embedding) with
        one row for the title, one row per sentence and one row for the content
    """
    flat_content = remove_symbols(content.replace('\n', ' '))
    batch = [title] + list(sentences) + [flat_content]
    embeddings = get_sentences_embedding(batch)

    # Slices (not scalar indices) keep the title/content results 2-D, matching
    # what separate one-sentence batches used to return.
    title_embedding = embeddings[:1]
    sentences_embedding = embeddings[1:-1]
    contents_embedding = embeddings[-1:]

    return title_embedding, sentences_embedding, contents_embedding

In [None]:
def cal_vector_distance(vector1, vector2):
    """Return the cosine similarity of two vectors.

    Despite the name this is a similarity in [-1, 1], not a distance.

    :param vector1: 1-D numeric array
    :param vector2: 1-D numeric array of the same length
    :return: cosine of the angle between the vectors; 0.0 when either vector
        has zero norm (instead of the NaN produced by 0/0)
    """
    inner_prod = (vector1 * vector2).sum()
    vctr1_norm = np.sqrt((vector1 * vector1).sum())
    vctr2_norm = np.sqrt((vector2 * vector2).sum())
    norm_product = vctr1_norm * vctr2_norm
    if norm_product == 0:
        # A zero vector has no direction; report "no similarity".
        return 0.0
    return inner_prod / norm_product


def eval_vector_similar(vector1, vector2):
    """Rescale the cosine similarity of two vectors from [-1, 1] to [0, 1].

    :param vector1: 1-D numeric array
    :param vector2: 1-D numeric array of the same length
    :return: (cosine similarity + 1) / 2
    """
    return (cal_vector_distance(vector1, vector2) + 1) / 2

计算两个向量的余弦相似度（cal_vector_distance，取值范围 [-1, 1]）；eval_vector_similar 再把它线性映射到 [0, 1]。

In [None]:
def get_vectors_from_file():
    """Score every sentence of every article in ``tokenized_file``.

    For each row of the CSV (columns ``doc_id``, ``title`` and ``content``
    are read) the content is split into sentences on newlines, each sentence
    is compared with the whole content and with the title, and one row per
    sentence is appended through ``save``.

    Rows whose title or content is not a string (pandas reads missing cells
    as NaN) and sentences that are empty after symbol removal are skipped.
    """

    news_token_df = pd.read_csv(tokenized_file)

    for _, row in news_token_df.iterrows():
        title = row['title']
        content = row['content']
        if not isinstance(title, str) or not isinstance(content, str):
            # Missing cells come back as float NaN and cannot be tokenised.
            continue

        cleaned = (remove_symbols(sen) for sen in content.split('\n'))
        # Blank lines (e.g. a trailing newline) contain no words to embed.
        sentences = [sen for sen in cleaned if sen.strip()]
        if not sentences:
            continue

        title_embedding, sentences_embedding, contents_embedding = \
            get_vectors_from_content(title, content, sentences)

        for sentence, sentence_embedding in zip(sentences, sentences_embedding):
            # Similarity of the sentence to the whole content and to the title.
            content_sim = eval_vector_similar(sentence_embedding, contents_embedding[0])
            title_sim = eval_vector_similar(sentence_embedding, title_embedding[0])
            # TODO: decide how title and content should be weighted; for now
            # they contribute equally.
            total_sim = (content_sim + title_sim) / 2
            save(row['doc_id'], sentence, content_sim, title_sim, total_sim)

In [None]:
def save(doc_id, sentence, content_sim, title_sim, total_sim, path=None):
    """Append one similarity row to the result file, creating it on first use.

    :param doc_id: identifier of the article the sentence belongs to
    :param sentence: the sentence text; surrounding whitespace is stripped
    :param content_sim: similarity between the sentence and the content
    :param title_sim: similarity between the sentence and the title
    :param total_sim: combined similarity
    :param path: file to write to; defaults to the module-level
        ``similarity_file``
    """
    target = similarity_file if path is None else path
    is_new_file = not os.path.exists(target)
    # Explicit UTF-8 so non-ASCII sentences do not depend on the platform's
    # default encoding.
    with open(target, 'a', encoding='utf-8') as sewh:
        if is_new_file:
            # The header needs its own line terminator, otherwise the first
            # data row is glued onto the header line.
            sewh.write('doc_id,sentence,content_sim,title_sim,total_sim\n')
        sewh.write('{},{},{},{},{}\n'.format(doc_id, sentence.strip(), content_sim, title_sim, total_sim))

In [None]:
# Run the whole pipeline: read tokenized_file, score every sentence against
# its article's content and title, and append the rows to similarity_file.
get_vectors_from_file()