
Fix Issue #53 - Implement more like this.

Adds an MLT class for making _mlt API calls.
1 parent 3009e08 commit 591bed3c3e9bed0bd853d208105695111923b820 @rlr rlr committed Aug 30, 2012
Showing with 152 additions and 0 deletions.
  1. +68 −0 elasticutils/__init__.py
  2. +84 −0 elasticutils/tests/test_mlt.py
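For context, a minimal usage sketch of the new class (not part of the commit; the index and doctype names are illustrative, and min_term_freq/min_doc_freq are standard more-like-this query parameters passed through via **query_params):

    from elasticutils import MLT, S

    # Scope the call with an S; MLT reuses its query, index and doctype.
    s = S().indexes('myindex').doctypes('mytype').values_dict()

    # Nothing is sent to ElasticSearch yet -- the MLT call is lazy.
    mlt = MLT(s, 42, fields=['title'], min_term_freq=1, min_doc_freq=1)

    # Iterating (or calling len()) issues the _mlt request and caches
    # the results.
    for doc in mlt:
        print(doc)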
elasticutils/__init__.py
@@ -660,6 +660,74 @@ def facet_counts(self):
        return facets
+class MLT(object):
+    """
+    Represents a lazy ElasticSearch more like this API call.
+    """
+    def __init__(self, s, id, fields=None, index=None, doc_type=None,
+                 **query_params):
+        """
+        When the MLT is evaluated, it generates a list of dict results.
+
+        :arg s: An instance of an S. The query is passed in the body of
+            the more like this request.
+        :arg id: The id of the document we want to find more like.
+        :arg fields: A list of fields to use for more like this.
+        :arg index: The index to use. Falls back to the first index
+            listed in s.
+        :arg doc_type: The doctype to use. Falls back to the first
+            doctype listed in s.
+        :arg query_params: Any additional query parameters for the
+            more like this call.
+        """
+        self.s = s
+        # If an index or doctype isn't given, we use the first one
+        # in the S.
+        self.index = index or s.get_indexes()[0]
+        self.doc_type = doc_type or s.get_doctypes()[0]
+        self.id = id
+        self.fields = fields
+        self.query_params = query_params
+        self._results_cache = None
+        self.type = s.type
+
+    def __iter__(self):
+        return iter(self._do_search())
+
+    def __len__(self):
+        return len(self._do_search())
+
+    def raw(self):
+        """
+        Builds the query, passes it to ElasticSearch, and returns the
+        raw response.
+        """
+        qs = self.s._build_query()
+        es = self.s.get_es()
+        try:
+            path = es._make_path([self.index, self.doc_type, self.id, '_mlt'])
+            if self.fields:
+                self.query_params['mlt_fields'] = ','.join(self.fields)
+            hits = es._send_request(
+                'GET', path, body=qs, params=self.query_params)
+            log.debug(hits)
+        except Exception:
+            log.error(qs)
+            raise
+        log.debug('[%s] %s' % (hits['took'], qs))
+        return hits
+
+    def _do_search(self):
+        """
+        Perform the mlt call, then convert that raw format into a
+        SearchResults instance and return it.
+        """
+        if not self._results_cache:
+            hits = self.raw()
+            self._results_cache = DictSearchResults(self.type, hits, None)
+        return self._results_cache
+
+
class SearchResults(object):
    def __init__(self, type, results, fields):
        self.type = type
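A short sketch, with the same illustrative names as above, of two behaviors the new code implies: index and doc_type fall back to the first values listed on the S, and _do_search caches its results so the _mlt request is only made once per MLT instance:

    from elasticutils import MLT, S

    s = S().indexes('myindex').doctypes('mytype').values_dict()
    mlt = MLT(s, 1, fields=['title'], min_term_freq=1, min_doc_freq=1)

    assert mlt.index == 'myindex'      # falls back to s.get_indexes()[0]
    assert mlt.doc_type == 'mytype'    # falls back to s.get_doctypes()[0]

    count = len(mlt)    # performs the _mlt call and fills _results_cache
    docs = list(mlt)    # reuses the cached DictSearchResults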
elasticutils/tests/test_mlt.py
@@ -0,0 +1,84 @@
+from nose.tools import eq_
+
+from elasticutils import MLT, S
+from elasticutils.tests import FakeModel, ElasticTestCase, facet_counts_dict
+
+
+class HasDataTestCase(ElasticTestCase):
+    @classmethod
+    def setup_class(cls):
+        super(HasDataTestCase, cls).setup_class()
+        if cls.skip_tests:
+            return
+
+        es = cls.get_es()
+        es.delete_index_if_exists(cls.index_name)
+
+        data = []
+        data.append(FakeModel(id=1, foo='bar', tag='awesome'))
+        data.append(FakeModel(id=2, foo='bar', tag='boring'))
+        data.append(FakeModel(id=3, foo='bar', tag='awesome'))
+        data.append(FakeModel(id=4, foo='bar', tag='boring'))
+        data.append(FakeModel(id=5, foo='bar', tag='elite'))
+        data.append(FakeModel(id=6, foo='notbar', tag='gross'))
+        data.append(FakeModel(id=7, foo='notbar', tag='awesome'))
+
+        for datum in data:
+            es.index(datum.__dict__, cls.index_name, FakeModel._meta.db_table,
+                     bulk=True, id=datum.id)
+        es.refresh()
+
+    @classmethod
+    def teardown_class(cls):
+        super(HasDataTestCase, cls).teardown_class()
+        if cls.skip_tests:
+            return
+
+        es = cls.get_es()
+        es.delete_index(cls.index_name)
+
+    def get_s(self):
+        return S().indexes(
+            self.index_name).doctypes(FakeModel._meta.db_table).values_dict()
+
+
+class MoreLikeThisTest(HasDataTestCase):
+    def test_mlt_on_foo(self):
+        """Verify MLT with the foo field."""
+        # We need to pass min_term_freq and min_doc_freq, because the terms
+        # we are using appear only once in each document.
+        mlt = MLT(self.get_s(), 1, ['foo'], min_term_freq=1, min_doc_freq=1)
+        eq_(len(mlt), 4)
+
+    def test_mlt_on_tag(self):
+        """Verify MLT with the tag field."""
+        # We need to pass min_term_freq and min_doc_freq, because the terms
+        # we are using appear only once in each document.
+        mlt = MLT(self.get_s(), 1, ['tag'], min_term_freq=1, min_doc_freq=1)
+        eq_(len(mlt), 2)
+
+    def test_mlt_on_two_fields(self):
+        """Verify MLT on tag and foo fields."""
+        mlt = MLT(self.get_s(), 1, ['tag', 'foo'],
+                  min_term_freq=1, min_doc_freq=1)
+        eq_(len(mlt), 5)
+
+    def test_mlt_on_foo_with_filter(self):
+        """Verify MLT with the foo field while filtering on tag."""
+        # We need to pass min_term_freq and min_doc_freq, because the terms
+        # we are using appear only once in each document.
+        mlt = MLT(self.get_s().filter(tag='boring'), 1, ['foo'],
+                  min_term_freq=1, min_doc_freq=1)
+        eq_(len(mlt), 2)
+
+        mlt = MLT(self.get_s().filter(tag='elite'), 1, ['foo'],
+                  min_term_freq=1, min_doc_freq=1)
+        eq_(len(mlt), 1)
+
+        mlt = MLT(self.get_s().filter(tag='awesome'), 1, ['foo'],
+                  min_term_freq=1, min_doc_freq=1)
+        eq_(len(mlt), 1)
+
+        mlt = MLT(self.get_s().filter(tag='gross'), 1, ['foo'],
+                  min_term_freq=1, min_doc_freq=1)
+        eq_(len(mlt), 0)
