# Page Similarity Calculation

## 1. Use Word2Vec model to calculate the similarity of 2 pages

1. 提取关键词.将2篇文章的关键词提取出来
2. 计算文章的词向量: 直接用每个关键词的词向量之和作为该文章的词向量
3. 计算2篇文章词向量的相似度: (这里采用Cosine Similarity, 详见[this](https://en.wikipedia.org/wiki/Cosine_similarity))假设2篇文章的vector分别是$v1 = (\alpha_1, \alpha_2, ..., \alpha_n)$, $v2 = (\beta_1, \beta_2, ..., \beta_n)$, 那么相似度计算公式就是
$\text{sim} = \dfrac{\alpha_1 \beta_1 + \alpha_2 \beta_2 + ... + \alpha_n \beta_n}{\sqrt{\alpha_1^2 + \alpha_2^2 + ... + \alpha_n^2} \sqrt{\beta_1^2 + \beta_2^2 + ... + \beta_n^2}}$

In [1]:
# -*- coding: utf-8 -*-
import codecs
import numpy
import gensim
import numpy as np
import jieba.posseg as pseg
from jieba import analyse

def extract_keywords(data, file_name):
   tfidf = analyse.extract_tags
   keywords = tfidf(data)
   return keywords

def get_keywords(docpath, savepath):
   with open(docpath, 'r') as docf, open(savepath, 'w') as outf:
      for data in docf:
         data = data[:len(data)-1]
         keywords = extract_keywords(data, savepath)
         for word in keywords:
            outf.write(word + ' ')
         outf.write('\n')

wordvec_size = 192

def get_char_pos(string, char):
    chPos = []
    try:
        chPos = list(((pos) for pos,val in enumerate(string) if(val == char)))
    except:
        pass
    return chPos

def word2vec(keywords_filename, model):
    with codecs.open(keywords_filename, 'r') as f:
        word_vec_all = numpy.zeros(wordvec_size)
        for data in f:
            space_pos = get_char_pos(data, ' ')
            first_word = data[0:space_pos[0]]
            
            if model.__contains__(first_word):
                word_vec_all = word_vec_all + model[first_word]

            for i in range(len(space_pos) - 1):
                word = data[space_pos[i]:space_pos[i + 1]]
                if model.__contains__(word):
                    word_vec_all = word_vec_all + model[word]
        return word_vec_all

def calculate_similarity(v1, v2):
    try:
        simlarity = (v1.dot(v2)) / (np.sqrt(v1.dot(v1)) * np.sqrt(v2.dot(v2))) # cosine similarity
    except ZeroDivisionError:
        simlarity = 0
    return simlarity

def word2vec_page_sim():
    model = gensim.models.Word2Vec.load('data/zhiwiki_news.word2vec')
    p1 = './data/P1.txt'
    p2 = './data/P2.txt'
    p1_keywords = './data/P1_keywords.txt'
    p2_keywords = './data/P2_keywords.txt'
    get_keywords(p1, p1_keywords)
    get_keywords(p2, p2_keywords)
    p1_vec = word2vec(p1_keywords, model)
    p2_vec = word2vec(p2_keywords, model)
    
    print('p1_vec:', p1_vec)
    print('p2_vec:', p2_vec)
    
    print(calculate_similarity(p1_vec,p2_vec))

word2vec_page_sim()


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.870 seconds.
Prefix dict has been built succesfully.


p1_vec: [ 4.23841228e+00  3.71827054e+00  7.05909024e+00  6.10349774e-01
 -2.72767605e+00 -5.81446905e+00  1.02061155e+00  4.58275934e+00
 -4.35048804e-01  3.77729899e+00  2.61981661e+00  4.97605101e+00
  5.87876064e+00 -6.71862587e+00 -1.85858855e+00  8.41255665e-01
 -1.64739894e+00 -2.57261125e+00  4.65867245e+00 -1.41205895e+00
  1.01225559e+00  3.71518169e-01  2.14275423e+00 -6.55527748e-01
  2.87099004e+00 -5.12713595e+00 -8.88802856e-02 -6.96311814e+00
 -5.31244956e+00 -4.50435445e-01 -6.89822335e-01 -3.87607894e+00
  9.68066527e-01  4.40337323e-03  5.27700410e-01  3.43308275e+00
 -7.20372159e+00 -1.64023352e+00 -1.21408324e+00  2.99637560e+00
 -2.67214400e-01  2.60646333e+00 -1.09813169e+00  1.19340914e+00
  1.29825438e-01 -3.32067263e+00 -9.52173507e+00  5.61991916e+00
  9.57661591e-01 -4.82677294e+00 -7.57778227e+00  6.30030897e-01
 -1.44364119e+00  2.95114477e+00  1.78126559e+00 -8.07126786e+00
 -5.92282398e+00  6.69494358e+00  6.23454630e-01 -6.47798018e+00
 -3.54597037e+00 

## 2. Use Doc2Vec to calculate the similarity of 2 pages

详见[this](https://github.com/nlpinaction/learning-nlp/tree/master/chapter-7/doc2vec%E8%AE%AD%E7%BB%83%E4%B8%8E%E7%9B%B8%E4%BC%BC%E5%BA%A6%E8%AE%A1%E7%AE%97). 需要重新训练Doc2Vec模型,也是利用gensim的models内的函数来计算的,所以这里就不重复了