# 使用 Gensim 進行相似性比較

In [3]:
# 載入相關套件
import pprint  # 較美觀的列印函數
import gensim
from collections import defaultdict
from gensim import corpora

## 測試的語料庫(Corpus)

In [5]:
# Corpus: nine short English sentences, one string per document.
# The position of each string in this list is the document index used
# when the similarity results are printed later in the notebook.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

## 分詞，轉小寫

In [7]:
# Arbitrary, hand-picked stop words that are dropped from every document.
stoplist = set("for a of the and to in".split())

# Tokenize: lowercase each document, split it on whitespace, and keep only
# the words that are not in the stop list.
texts = [
    list(filter(lambda word: word not in stoplist, sentence.lower().split()))
    for sentence in documents
]
texts

[['human', 'machine', 'interface', 'lab', 'abc', 'computer', 'applications'],
 ['survey', 'user', 'opinion', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'management', 'system'],
 ['system', 'human', 'system', 'engineering', 'testing', 'eps'],
 ['relation', 'user', 'perceived', 'response', 'time', 'error', 'measurement'],
 ['generation', 'random', 'binary', 'unordered', 'trees'],
 ['intersection', 'graph', 'paths', 'trees'],
 ['graph', 'minors', 'iv', 'widths', 'trees', 'well', 'quasi', 'ordering'],
 ['graph', 'minors', 'survey']]

## 單字出現次數統計

In [8]:
# Count how many times each token occurs across the whole corpus.
# defaultdict(int) starts every unseen token at 0, so no membership check
# is needed before incrementing.
frequency = defaultdict(int)
for token in (tok for doc_tokens in texts for tok in doc_tokens):
    frequency[token] += 1
frequency

defaultdict(int,
            {'human': 2,
             'machine': 1,
             'interface': 2,
             'lab': 1,
             'abc': 1,
             'computer': 2,
             'applications': 1,
             'survey': 2,
             'user': 3,
             'opinion': 1,
             'system': 4,
             'response': 2,
             'time': 2,
             'eps': 2,
             'management': 1,
             'engineering': 1,
             'testing': 1,
             'relation': 1,
             'perceived': 1,
             'error': 1,
             'measurement': 1,
             'generation': 1,
             'random': 1,
             'binary': 1,
             'unordered': 1,
             'trees': 3,
             'intersection': 1,
             'graph': 3,
             'paths': 1,
             'minors': 2,
             'iv': 1,
             'widths': 1,
             'well': 1,
             'quasi': 1,
             'ordering': 1})

In [9]:
# Drop every token that occurs only once in the whole corpus, keeping the
# remaining tokens of each document in their original order.
texts = [
    [tok for tok in doc_tokens if frequency[tok] >= 2]
    for doc_tokens in texts
]
texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [10]:
# Build the gensim Dictionary (token -> integer id) from the filtered tokens.
dictionary = corpora.Dictionary(documents=texts)

# Convert every document to its bag-of-words form: a list of
# (token id, occurrence count) pairs.
corpus = list(map(dictionary.doc2bow, texts))
corpus

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (5, 1), (7, 1), (8, 1)],
 [(1, 1), (5, 2), (8, 1)],
 [(3, 1), (6, 1), (7, 1)],
 [(9, 1)],
 [(9, 1), (10, 1)],
 [(9, 1), (10, 1), (11, 1)],
 [(4, 1), (10, 1), (11, 1)]]

## 建立 LSI (Latent semantic indexing) 模型

In [26]:
# Fit an LSI (Latent Semantic Indexing) model on the bag-of-words corpus.
from gensim import models

# num_topics=2: project onto two latent dimensions, i.e. two topics.
lsi = models.LsiModel(corpus=corpus, num_topics=2, id2word=dictionary)

# Show the weighted-term formula that defines each of the two topics.
lsi.print_topics(num_topics=2)

[(0,
  '0.644*"system" + 0.404*"user" + 0.301*"eps" + 0.265*"time" + 0.265*"response" + 0.240*"computer" + 0.221*"human" + 0.206*"survey" + 0.198*"interface" + 0.036*"graph"'),
 (1,
  '0.623*"graph" + 0.490*"trees" + 0.451*"minors" + 0.274*"survey" + -0.167*"system" + -0.141*"eps" + -0.113*"human" + 0.107*"response" + 0.107*"time" + -0.072*"interface"')]

## 測試 LSI (Latent semantic indexing) 模型

In [27]:
# Query sentence to compare against the corpus.
doc = "Human computer interaction"

# Lowercase and whitespace-split the query, convert it to bag-of-words with
# the corpus dictionary, then project it into the LSI topic space.
query_tokens = doc.lower().split()
vec_bow = dictionary.doc2bow(query_tokens)
vec_lsi = lsi[vec_bow]
print(vec_lsi)

[(0, 0.4618210045327157), (1, -0.07002766527900067)]


## 比較例句與語料庫每一句的相似度（餘弦相似度）

In [19]:
# Score the query sentence against every document in the corpus.
from gensim import similarities

# Build the similarity index over the corpus documents after projecting
# them into the LSI space.
corpus_lsi = lsi[corpus]
index = similarities.MatrixSimilarity(corpus_lsi)

# Cosine similarity between the query and each indexed document. The scores
# can be negative, so they are similarities rather than probabilities.
sims = index[vec_lsi]

# Print (document position, similarity score) pairs in corpus order.
print([*enumerate(sims)])

[(0, 0.998093), (1, 0.93748635), (2, 0.9984453), (3, 0.98658866), (4, 0.90755945), (5, -0.12416792), (6, -0.1063926), (7, -0.09879464), (8, 0.05004177)]


## 依相似度降冪排序

In [20]:
# Rank the documents from most to least similar to the query.
# NOTE: this rebinds `sims` from the score sequence to a sorted list of
# (document position, score) pairs.
sims = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
for doc_position, doc_score in sims:
    print(doc_score, documents[doc_position])

0.9984453 The EPS user interface management system
0.998093 Human machine interface for lab abc computer applications
0.98658866 System and human system engineering testing of EPS
0.93748635 A survey of user opinion of computer system response time
0.90755945 Relation of user perceived response time to error measurement
0.05004177 Graph minors A survey
-0.09879464 Graph minors IV Widths of trees and well quasi ordering
-0.1063926 The intersection graph of paths in trees
-0.12416792 The generation of random binary unordered trees
