# Try gensim's similarity queries

https://radimrehurek.com/gensim/auto_examples/core/run_similarity_queries.html

In [8]:
from model import Page

documents = []
pages = Page.select().where(Page.assistant_id == 3)

for page in pages:
    documents.append(page.text_content)


In [9]:
print(len(documents))

22


In [10]:
from collections import defaultdict
from gensim import corpora


# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]

# remove words that appear only once
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [
    [token for token in text if frequency[token] > 1]
    for text in texts
]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]


In [11]:
from gensim import models
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=3)

In [12]:
doc = "queue"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]  # convert the query to LSI space
print(vec_lsi)

[(0, 0.11100203383226366), (1, -0.022087234880819824), (2, -0.011007239827805911)]


In [14]:
sims = lsi[vec_lsi]  # perform a similarity query against the corpus
print(list(enumerate(sims)))  # print (document_number, document_similarity) 2-tuples

[(0, (0, 0.01942976723234607)), (1, (1, -0.002353473654483718)), (2, (2, 0.00047003093472670215))]


In [17]:
sims = sorted(enumerate(sims), key=lambda item: item[1])
for doc_position, doc_score in sims:
    print('doc_score', doc_score) 
    print('doc_position', documents[doc_position])

doc_score (0, (0, 0.01942976723234607))
doc_position [![Fork me on GitHub](https://github.blog/wp-
content/uploads/2008/12/forkme_right_red_aa0000.png)](http://git.io/rq)

  * [Home](/)
  * [Docs](/docs/)
  * [Patterns](/patterns/)
  * [Contributing](/contrib/)
  * [Chat](/chat/)

RQ (_Redis Queue_) is a simple Python library for queueing jobs and processing
them in the background with workers. It is backed by Redis and it is designed
to have a low barrier to entry. It can be integrated in your web stack easily.

RQ requires Redis >= 3.0.0.

## Getting Started

First, run a Redis server. You can use an existing one. To put jobs on queues,
you don’t have to do anything special, just define your typically lengthy or
blocking function:

    
    
    import requests
    
    def count_words_at_url(url):
        resp = requests.get(url)
        return len(resp.text.split())
    

Then, create a RQ queue:

    
    
    from redis import Redis
    from rq import Queue
    
    q = Queue(con