# HW: Gensim - Similarity Queries with an LSA Model

In [1]:
# Install Gensim

! pip install gensim



In [2]:
# Corpus data

# Nine short titles. The first five are about human-computer interaction,
# the last four about graphs and trees.
hci_documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
]
graph_documents = [
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Keep this order: results are mapped back to documents by list position.
documents = hci_documents + graph_documents

In [3]:
# Pre-process the corpus

from collections import defaultdict

# Words that are dropped during tokenisation.
stoplist = set('for a of the and to in'.split())


def _tokenize(document):
    """Lowercase *document*, split it on whitespace and drop stop words."""
    return [word for word in document.lower().split() if word not in stoplist]


# Step 1: one token list per document.
texts = [_tokenize(document) for document in documents]

# Step 2: count how often each token occurs across the whole corpus.
frequency = defaultdict(int)
for tokens in texts:
    for tok in tokens:
        frequency[tok] += 1

# Step 3: discard tokens that occur only once in the corpus.
texts = [
    [tok for tok in tokens if frequency[tok] > 1]
    for tokens in texts
]

In [4]:
# Build the dictionary

from gensim import corpora

# Built from the filtered token lists produced by the preprocessing step.
dictionary = corpora.Dictionary(documents=texts)

In [5]:
# Transform the corpus to bag-of-words vectors

# One doc2bow call per tokenised document, in corpus order.
bow_corpus = list(map(dictionary.doc2bow, texts))

In [7]:
# Train the LSI model

from gensim import models

lsi = models.LsiModel(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=2,
)

# Apply the trained model to the same corpus it was fitted on.
corpus_lsi = lsi[bow_corpus]

In [10]:
# Query document

new_doc = "Human computer interaction"

# Lowercase and whitespace-split the query, then convert it with the same
# dictionary and model that were used for the corpus.
query_tokens = new_doc.lower().split()
new_vec = dictionary.doc2bow(query_tokens)
vec_lsi = lsi[new_vec]

In [14]:
# Similarity queries

from gensim import similarities

index = similarities.MatrixSimilarity(corpus_lsi)

# Pair each score with its document position and order by descending score.
sims = sorted(enumerate(index[vec_lsi]), key=lambda pair: -pair[1])

# Two header rows, then one "score <tab> document" row per corpus document.
header_layout = '{:8} {:} {:}'
for left, right in (('Score', 'Document'), ('---', '---')):
    print(header_layout.format(left, '\t', right))

score_layout = '{:.6f} {:} {:}'
for doc_position, doc_score in sims:
    print(score_layout.format(doc_score, '\t', documents[doc_position]))

Score    	 Document
---      	 ---
0.998445 	 The EPS user interface management system
0.998093 	 Human machine interface for lab abc computer applications
0.986589 	 System and human system engineering testing of EPS
0.937486 	 A survey of user opinion of computer system response time
0.907559 	 Relation of user perceived response time to error measurement
0.050042 	 Graph minors A survey
-0.098795 	 Graph minors IV Widths of trees and well quasi ordering
-0.106393 	 The intersection graph of paths in trees
-0.124168 	 The generation of random binary unordered trees


# Another HW: Change `num_topics`

In [18]:
# Repeat the experiment with a different num_topics
from gensim import models
from gensim import similarities

# Retrain on the same bag-of-words corpus, this time with three topics.
lsi = models.LsiModel(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=3,
)
corpus_lsi = lsi[bow_corpus]

# Query document: lowercase, whitespace-split, then convert with the
# dictionary and the retrained model.
new_doc = "Human computer interaction"
query_tokens = new_doc.lower().split()
new_vec = dictionary.doc2bow(query_tokens)
vec_lsi = lsi[new_vec]

# Similarity queries: pair each score with its document position and order
# by descending score.
index = similarities.MatrixSimilarity(corpus_lsi)
sims = sorted(enumerate(index[vec_lsi]), key=lambda pair: -pair[1])

# Two header rows, then one "score <tab> document" row per corpus document.
header_layout = '{:8} {:} {:}'
for left, right in (('Score', 'Document'), ('---', '---')):
    print(header_layout.format(left, '\t', right))

score_layout = '{:.6f} {:} {:}'
for doc_position, doc_score in sims:
    print(score_layout.format(doc_score, '\t', documents[doc_position]))

Score    	 Document
---      	 ---
0.997788 	 The EPS user interface management system
0.992586 	 Human machine interface for lab abc computer applications
0.927698 	 System and human system engineering testing of EPS
0.661374 	 A survey of user opinion of computer system response time
0.355441 	 Relation of user perceived response time to error measurement
0.082568 	 Graph minors A survey
0.002334 	 Graph minors IV Widths of trees and well quasi ordering
0.002063 	 The intersection graph of paths in trees
0.001308 	 The generation of random binary unordered trees


# Another HW: BoW -> TF-IDF -> LSA

In [24]:
from gensim import models

# Stage 1: fit TF-IDF on the bag-of-words corpus and apply it to that corpus.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Stage 2: fit LSI (four topics) on the TF-IDF corpus instead of raw counts.
lsi = models.LsiModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=4,
)
corpus_lsi = lsi[corpus_tfidf]

In [25]:
new_doc = "Human computer interaction"

# The query passes through the same chain as the corpus: BoW -> TF-IDF -> LSI.
query_tokens = new_doc.lower().split()
new_vec = dictionary.doc2bow(query_tokens)
vec_tfidf = tfidf[new_vec]
vec_lsi = lsi[vec_tfidf]

In [26]:
from gensim import similarities

index = similarities.MatrixSimilarity(corpus_lsi)

# Pair each score with its document position and order by descending score.
sims = sorted(enumerate(index[vec_lsi]), key=lambda pair: -pair[1])

# Two header rows, then one "score <tab> document" row per corpus document.
header_layout = '{:8} {:} {:}'
for left, right in (('Score', 'Document'), ('---', '---')):
    print(header_layout.format(left, '\t', right))

score_layout = '{:.6f} {:} {:}'
for doc_position, doc_score in sims:
    print(score_layout.format(doc_score, '\t', documents[doc_position]))

Score    	 Document
---      	 ---
0.994644 	 Human machine interface for lab abc computer applications
0.983076 	 The EPS user interface management system
0.981626 	 System and human system engineering testing of EPS
0.437642 	 A survey of user opinion of computer system response time
0.116106 	 Relation of user perceived response time to error measurement
0.111599 	 Graph minors A survey
0.024265 	 Graph minors IV Widths of trees and well quasi ordering
-0.025438 	 The intersection graph of paths in trees
-0.063449 	 The generation of random binary unordered trees
