Skip to content

Commit

Permalink
better handling of max_results/num_best/topsims settings
Browse files Browse the repository at this point in the history
* but still too complex, will rework and simplify the logic later
  • Loading branch information
piskvorky committed Nov 2, 2012
1 parent 3cce4c0 commit e7e59e8
Showing 1 changed file with 21 additions and 10 deletions.
31 changes: 21 additions & 10 deletions simserver/simserver.py
Expand Up @@ -46,15 +46,17 @@



def merge_sims(oldsims, newsims, clip=None):
    """
    Merge two precomputed similarity lists into one, sorted by decreasing
    similarity score.

    Each input is either `None` or a list of `(doc_id, score)` 2-tuples.
    If `clip` is given (an int), the merged result is truncated to at most
    `clip` most similar items; with the default `clip=None`, no truncation
    is performed.

    Returns a (possibly empty) list of `(doc_id, score)` 2-tuples. When only
    one input is non-None, it is returned as-is (assumed already sorted by
    the caller).
    """
    if oldsims is None:
        # `newsims or []` also maps newsims=None to an empty result
        result = newsims or []
    elif newsims is None:
        result = oldsims
    else:
        # both present: combine and re-sort by descending score
        result = sorted(oldsims + newsims, key=lambda item: -item[1])
    if clip is not None:
        result = result[:clip]
    return result



Expand Down Expand Up @@ -255,7 +257,7 @@ def merge(self, other):
# ignore masked entries (deleted, overwritten documents)
docid = self.pos2id[pos]
sims = self.sims2scores(sims)
self.id2sims[docid] = merge_sims(self.id2sims[docid], sims)
self.id2sims[docid] = merge_sims(self.id2sims[docid], sims, self.topsims)
pos += 1
if pos % 10000 == 0:
logger.info("PROGRESS: updated doc #%i/%i" % (pos, lenself))
Expand Down Expand Up @@ -615,7 +617,7 @@ def index(self, corpus=None, clear_buffer=True):
for docid in self.fresh_docs:
payload = self.fresh_docs[docid].get('payload', None)
if payload is None:
# TODO HACK: exit on first doc without a payload (=assume all docs have payload, or none does)
# HACK: exit on first doc without a payload (=assume all docs have payload, or none does)
break
self.payload[docid] = payload
self.flush(save_index=True, clear_buffer=clear_buffer)
Expand Down Expand Up @@ -711,22 +713,28 @@ def vec_by_id(self, docid):

def find_similar(self, doc, min_score=0.0, max_results=100):
"""
Find at most `max_results` most similar articles in the index,
each having similarity score of at least `min_score`.
Find `max_results` most similar articles in the index, each having similarity
score of at least `min_score`. The resulting list may be shorter than `max_results`,
in case there are not enough matching documents.
`doc` is either a string (document id, previously indexed) or a
`doc` is either a string (=document id, previously indexed) or a
dict containing a 'tokens' key. These tokens are processed to produce a
vector, which is then used as a query.
vector, which is then used as a query against the index.
The similar documents are returned in decreasing similarity order, as
(doc_id, doc_score) pairs.
`(doc_id, similarity_score, doc_payload)` 3-tuples. The payload returned
is identical to what was supplied for this document during indexing.
"""
logger.debug("received query call with %r" % doc)
if self.is_locked():
msg = "cannot query while the server is being updated"
logger.error(msg)
raise RuntimeError(msg)
sims_opt, sims_fresh = None, None
for index in [self.fresh_index, self.opt_index]:
if index is not None:
index.topsims = max_results
if isinstance(doc, basestring):
# query by direct document id
docid = doc
Expand Down Expand Up @@ -754,8 +762,11 @@ def find_similar(self, doc, min_score=0.0, max_results=100):
if self.fresh_index is not None:
sims_fresh = self.fresh_index.sims_by_vec(vec)

merged = merge_sims(sims_opt, sims_fresh)
logger.debug("got %s raw similars, pruning with max_results=%s, min_score=%s" %
(len(merged), max_results, min_score))
result = []
for docid, score in merge_sims(sims_opt, sims_fresh):
for docid, score in merged:
if score < min_score or 0 < max_results <= len(result):
break
result.append((docid, float(score), self.payload.get(docid, None)))
Expand Down

0 comments on commit e7e59e8

Please sign in to comment.