# 以TF-IDF實作問答配對

In [1]:
# 載入相關套件
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Corpus: the three example sentences that serve as the documents to vectorize.
corpus = [
    "The quick brown fox jumps over the lazy dog.",
    "Never jump over the lazy dog quickly.",
    "A completely different sentence unrelated to the dog or fox.",
]

In [3]:
# Fit a TF-IDF vectorizer on the corpus and transform the corpus into a
# TF-IDF weight matrix: one row per sentence, one column per vocabulary term.
# Note: the entries are TF-IDF weights, not raw term counts.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Vocabulary learned from the corpus; its order matches the matrix columns.
word = vectorizer.get_feature_names_out()
print("Vocabulary：", word)

Vocabulary： ['brown' 'completely' 'different' 'dog' 'fox' 'jump' 'jumps' 'lazy'
 'never' 'or' 'over' 'quick' 'quickly' 'sentence' 'the' 'to' 'unrelated']


In [4]:
# Show the TF-IDF vector of each of the three sentences as a dense array
# (one row per sentence, one column per vocabulary term).
print("TF-IDF vector=\n", tfidf_matrix.toarray())

TF-IDF vector=
 [[0.39285725 0.         0.         0.23202782 0.29877806 0.
  0.39285725 0.29877806 0.         0.         0.29877806 0.39285725
  0.         0.         0.46405564 0.         0.        ]
 [0.         0.         0.         0.26806191 0.         0.45386827
  0.         0.34517852 0.45386827 0.         0.34517852 0.
  0.45386827 0.         0.26806191 0.         0.        ]
 [0.         0.37072514 0.37072514 0.21895624 0.28194602 0.
  0.         0.         0.         0.37072514 0.         0.
  0.         0.37072514 0.21895624 0.37072514 0.37072514]]


In [6]:
# Pairwise cosine-similarity matrix between all sentences (this is a
# similarity matrix, not a confusion matrix): entry [i, j] is the similarity
# between sentence i and sentence j, so the diagonal is 1 and the matrix is
# symmetric.
cosine_similarity(tfidf_matrix)

array([[1.        , 0.392857  , 0.2366511 ],
       [0.392857  , 1.        , 0.11738766],
       [0.2366511 , 0.11738766, 1.        ]])

In [10]:
# Compare the first sentence against each of the remaining sentences.
# dense_output=False keeps the result sparse, so printing it lists
# "(row, column)  similarity" pairs; because the second argument starts at
# row 1, column j refers to corpus[j + 1].
print(
    cosine_similarity(
        tfidf_matrix[0],
        tfidf_matrix[1:],
        dense_output=False,
    )
)

  (0, 1)	0.23665110399591885
  (0, 0)	0.39285699958564646
