# 以TF-IDF实作问答配对

## 载入相关套件

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np

## 语料：最后一句为问题，其他为回答

In [9]:
# Toy corpus: the final sentence is the question, every sentence before it
# is a candidate answer.
corpus = [
    "This is the first document.",
    "This is the second second document.",
    "And the third one.",
    "Is this the first document?",
]

In [10]:
# Convert the corpus into a term-count (bag-of-words) matrix by counting how
# often each word occurs in each sentence.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Vocabulary learned from the corpus.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# convert it to a plain list to keep `word` a list of str and the printed
# form identical to before.
word = vectorizer.get_feature_names_out().tolist()
print("Vocabulary：", word)

Vocabulary： ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


## 查看四句话的 BOW

In [13]:
# Densify the sparse count matrix so all four rows print as a regular array.
print("BOW=\n", X.toarray())

BOW=
 [[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]


## TF-IDF 转换

In [15]:
# Re-weight the raw term counts in X into TF-IDF scores.
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)

# Rounding to 4 decimals is for display only; `tfidf` keeps full precision.
print("TF-IDF=\n", np.round(tfidf.toarray(), decimals=4))

TF-IDF=
 [[0.     0.4388 0.542  0.4388 0.     0.     0.3587 0.     0.4388]
 [0.     0.2723 0.     0.2723 0.     0.8532 0.2226 0.     0.2723]
 [0.5528 0.     0.     0.     0.5528 0.     0.2885 0.5528 0.    ]
 [0.     0.4388 0.542  0.4388 0.     0.     0.3587 0.     0.4388]]


## 最后一句与其他句的相似度比较

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# The last TF-IDF row is the question; all earlier rows are candidate answers.
question_vec = tfidf[-1]
answer_vecs = tfidf[:-1]

# dense_output=False keeps the result sparse, so it prints as
# (row, column) -> similarity entries.
print(cosine_similarity(question_vec, answer_vecs, dense_output=False))

  (0, 2)	0.1034849000930086
  (0, 1)	0.43830038447620107
  (0, 0)	1.0
