# Define Our Documents

In [1]:
# Toy corpus: nine short document titles. Every later cell (preprocessing,
# TF-IDF, LSI, and the similarity printouts) indexes into this list by position,
# so the order of the entries matters.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Preprocess

In [2]:
# defaultdict (from collections): a dict that, when a missing key is looked up,
# creates an initial value for that key via its factory function and returns it.
from collections import defaultdict

# Common stop words to discard.
stoplist = {'for', 'a', 'of', 'the', 'and', 'to', 'in'}

# Lowercase each document, split it on whitespace, and drop the stop words.
texts = []
for document in documents:
    words = document.lower().split()
    texts.append([word for word in words if word not in stoplist])

# Count how often every remaining token occurs across all documents.
frequency = defaultdict(int)
for token in (tok for text in texts for tok in text):
    frequency[token] += 1

# Discard tokens whose corpus-wide frequency is <= 1.
texts = [
    list(filter(lambda token: frequency[token] > 1, text))
    for text in texts
]

In [3]:
# Build a dictionary (word -> token id) from the preprocessed documents, then
# use it to convert each preprocessed document into its BoW vector.
from gensim import corpora
dictionary = corpora.Dictionary(documents=texts)
bow_corpus = list(map(dictionary.doc2bow, texts))



# Train with TF-IDF

In [6]:
# Train the TF-IDF model on the processed documents
# (the stop-word-filtered corpus in BoW vector form).
from gensim import models
tfidf = models.TfidfModel(corpus=bow_corpus)

In [14]:
# Similarity scores are computed with gensim's `similarities` module:
# express the corpus through the TF-IDF model and hand it to MatrixSimilarity.
from gensim import similarities
corpus_tfidf = tfidf[bow_corpus]
# Give the index its width explicitly (one column per dictionary entry) instead
# of letting it be inferred by scanning the corpus, so any query built from
# `dictionary` is guaranteed to fit the index.
index = similarities.MatrixSimilarity(corpus_tfidf, num_features=len(dictionary))

# Convert the document we want to compare into its TF-IDF representation.
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
vec_tfidf = tfidf[new_vec]

# Compute the similarity scores (works like array indexing: use the query
# vector as the index), then rank documents from most to least similar.
# The sort is stable, so tied scores keep their original document order.
sims = index[vec_tfidf]
sims = sorted(enumerate(sims), key=lambda item: item[1], reverse=True)
print(sims, '\n')

# Tabular view: score followed by the original document text.
print('{:8} \t {}'.format('Score', 'Document'))
print('{:8} \t {}'.format('---', '---'))
for doc_position, doc_score in sims:
    print('{:.6f} \t {}'.format(doc_score, documents[doc_position]))

[(0, 0.81649655), (3, 0.3477732), (1, 0.31412902), (2, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)] 

Score    	 Document
---      	 ---
0.816497 	 Human machine interface for lab abc computer applications
0.347773 	 System and human system engineering testing of EPS
0.314129 	 A survey of user opinion of computer system response time
0.000000 	 The EPS user interface management system
0.000000 	 Relation of user perceived response time to error measurement
0.000000 	 The generation of random binary unordered trees
0.000000 	 The intersection graph of paths in trees
0.000000 	 Graph minors IV Widths of trees and well quasi ordering
0.000000 	 Graph minors A survey


# Train with LSA (LSI)

原先TF-IDF認定「Human machine interface for lab abc computer applications」與我們的document(Human computer interaction)最為相似，但仍有改良空間

(例：The EPS user interface management system 也應被列入)

故試著使用別的模型處理

In [19]:
# Fit a 2-topic LSI model on the BoW corpus; the dictionary supplies the
# id -> word mapping.
from gensim import models
lsi = models.LsiModel(corpus=bow_corpus, num_topics=2, id2word=dictionary)

In [20]:
# Same procedure as the TF-IDF query; only the model is now LSI.
corpus_lsi = lsi[bow_corpus]
index = similarities.MatrixSimilarity(corpus_lsi)

# Express the query document through the LSI model.
new_doc = "Human computer interaction"
query_tokens = new_doc.lower().split()
new_vec = dictionary.doc2bow(query_tokens)
vec_lsi = lsi[new_vec]

# Rank (document position, score) pairs from highest to lowest score;
# the sort is stable, so ties keep their original document order.
sims = sorted(enumerate(index[vec_lsi]), key=lambda pair: pair[1], reverse=True)
print(sims, '\n')

# Tabular view: score followed by the original document text.
header_fmt = '{:8} \t {}'
print(header_fmt.format('Score', 'Document'))
print(header_fmt.format('---', '---'))
for doc_position, doc_score in sims:
    print('{:.6f} \t {}'.format(doc_score, documents[doc_position]))

[(2, 0.9984453), (0, 0.998093), (3, 0.9865886), (1, 0.93748635), (4, 0.90755945), (8, 0.050041765), (7, -0.09879464), (6, -0.10639259), (5, -0.12416792)] 

Score    	 Document
---      	 ---
0.998445 	 The EPS user interface management system
0.998093 	 Human machine interface for lab abc computer applications
0.986589 	 System and human system engineering testing of EPS
0.937486 	 A survey of user opinion of computer system response time
0.907559 	 Relation of user perceived response time to error measurement
0.050042 	 Graph minors A survey
-0.098795 	 Graph minors IV Widths of trees and well quasi ordering
-0.106393 	 The intersection graph of paths in trees
-0.124168 	 The generation of random binary unordered trees


# Optional：Modify hyperparameter

In [36]:
# Change the number of topics used for the corpus.
# num_topics determines the dimension of one of the matrices produced by the
# LSI decomposition.
lsi = models.LsiModel(corpus=bow_corpus, num_topics=5, id2word=dictionary)

In [37]:
# Re-run the query to see how the results differ from the earlier model.
corpus_lsi = lsi[bow_corpus]
index = similarities.MatrixSimilarity(corpus_lsi)

# Express the query document through the current LSI model.
new_doc = "Human computer interaction"
query_tokens = new_doc.lower().split()
new_vec = dictionary.doc2bow(query_tokens)
vec_lsi = lsi[new_vec]

# Rank (document position, score) pairs from highest to lowest score;
# the sort is stable, so ties keep their original document order.
sims = sorted(enumerate(index[vec_lsi]), key=lambda pair: pair[1], reverse=True)
print(sims, '\n')

# Tabular view: score followed by the original document text.
header_fmt = '{:8} \t {}'
print(header_fmt.format('Score', 'Document'))
print(header_fmt.format('---', '---'))
for doc_position, doc_score in sims:
    print('{:.6f} \t {}'.format(doc_score, documents[doc_position]))

[(0, 0.97272426), (1, 0.31977737), (2, 0.3044809), (3, 0.24447846), (8, 0.15309832), (7, -0.03274583), (4, -0.04838741), (6, -0.09127023), (5, -0.16497499)] 

Score    	 Document
---      	 ---
0.972724 	 Human machine interface for lab abc computer applications
0.319777 	 A survey of user opinion of computer system response time
0.304481 	 The EPS user interface management system
0.244478 	 System and human system engineering testing of EPS
0.153098 	 Graph minors A survey
-0.032746 	 Graph minors IV Widths of trees and well quasi ordering
-0.048387 	 Relation of user perceived response time to error measurement
-0.091270 	 The intersection graph of paths in trees
-0.164975 	 The generation of random binary unordered trees


因為維度增加了，在相似度辨識的程度上產生了更多變化(別於num_topics=2時趨近於二分法的分數，更多的topics能用更多的層級說明不同documents間的相似度)

# Optional：Merge TF-IDF & LSI

In [49]:
# Train LSI on the TF-IDF-weighted corpus instead of the raw BoW corpus.
# id2word is passed, as in the other LsiModel calls in this notebook, so the
# model uses the real vocabulary rather than an id mapping inferred from the
# corpus.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)

In [50]:
# Project the TF-IDF corpus through the LSI model and index it for similarity.
corpus_lsi = lsi[corpus_tfidf]
index = similarities.MatrixSimilarity(corpus_lsi)

# The query document is also converted to its tf-idf representation first,
# and only then passed through the LSI model.
new_doc = "Human computer interaction"
query_tokens = new_doc.lower().split()
new_vec = dictionary.doc2bow(query_tokens)
vec_tfidf = tfidf[new_vec]
vec_lsidf = lsi[vec_tfidf]

# Rank (document position, score) pairs from highest to lowest score;
# the sort is stable, so ties keep their original document order.
sims = sorted(enumerate(index[vec_lsidf]), key=lambda pair: pair[1], reverse=True)
print(sims, '\n')

# Tabular view: score followed by the original document text.
header_fmt = '{:8} \t {}'
print(header_fmt.format('Score', 'Document'))
print(header_fmt.format('---', '---'))
for doc_position, doc_score in sims:
    print('{:.6f} \t {}'.format(doc_score, documents[doc_position]))

[(0, 0.9999408), (2, 0.99990785), (3, 0.99984384), (4, 0.9992786), (1, 0.99330217), (8, 0.2224844), (7, -0.016480923), (6, -0.0515742), (5, -0.08804217)] 

Score    	 Document
---      	 ---
0.999941 	 Human machine interface for lab abc computer applications
0.999908 	 The EPS user interface management system
0.999844 	 System and human system engineering testing of EPS
0.999279 	 Relation of user perceived response time to error measurement
0.993302 	 A survey of user opinion of computer system response time
0.222484 	 Graph minors A survey
-0.016481 	 Graph minors IV Widths of trees and well quasi ordering
-0.051574 	 The intersection graph of paths in trees
-0.088042 	 The generation of random binary unordered trees


與前面的lsi x BoW相比，lsi x TF-IDF的分數整體變高，尤其是對於那些擁有關鍵字詞的documents