Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

[bug 781177] Implement ES->morelikethis for related documents.

* Drop wiki_relateddocument table
* Remove cron job for calculating related documents
* Use _mlt with mlt_fields=document_content,document_title,document_summary
  • Loading branch information...
commit 39841f51379da3aaa2d511f5b065dc7b98ae1d41 1 parent ca7a679
@rlr rlr authored
View
16 apps/search/models.py
@@ -8,9 +8,10 @@
from django.db.models.signals import pre_delete, post_save, m2m_changed
from django.dispatch import receiver
-from search.tasks import index_task, unindex_task
-from search import es_utils
+from elasticutils import MLT
+from search import es_utils
+from search.tasks import index_task, unindex_task
from sumo.models import ModelBase
log = logging.getLogger('k.search.es')
@@ -171,6 +172,17 @@ def unindex(cls, id_, es=None):
# not there.
pass
+ @classmethod
+ def get_s(cls):
+ """Get an S."""
+ return es_utils.Sphilastic(object).values_dict()
+
+ @classmethod
+ def morelikethis(cls, id_, s, fields):
+ """morelikethis query."""
+ return list(MLT(
+ id_, s=s, fields=fields, min_term_freq=1, min_doc_freq=1))
+
_identity = lambda s: s
View
59 apps/wiki/cron.py
@@ -1,14 +1,10 @@
import logging
import os
-import time
import urllib2
from django.conf import settings
-from django.db import connection, transaction
import cronjobs
-from multidb.pinning import use_master
-from statsd import statsd
import waffle
from search.tasks import index_task
@@ -20,61 +16,6 @@
@cronjobs.register
-def calculate_related_documents():
- """Calculates all related documents based on common tags."""
-
- cursor = connection.cursor()
-
- cursor.execute('DELETE FROM wiki_relateddocument')
- cursor.execute("""
- INSERT INTO
- wiki_relateddocument (document_id, related_id, in_common)
- SELECT
- t1.object_id,
- t2.object_id,
- COUNT(*) AS common_tags
- FROM
- wiki_document d1 JOIN
- taggit_taggeditem t1 JOIN
- taggit_taggeditem t2 JOIN
- wiki_document d2
- WHERE
- d1.id = t1.object_id AND
- t1.tag_id = t2.tag_id AND
- t1.object_id <> t2.object_id AND
- t1.content_type_id = (
- SELECT
- id
- FROM
- django_content_type
- WHERE
- app_label = 'wiki' AND
- model = 'document'
- ) AND
- t2.content_type_id = (
- SELECT
- id
- FROM
- django_content_type
- WHERE
- app_label = 'wiki' AND
- model = 'document'
- ) AND
- d2.id = t2.object_id AND
- d2.locale = d1.locale AND
- d2.category = d1.category AND
- d1.current_revision_id IS NOT NULL AND
- d2.current_revision_id IS NOT NULL AND
- d2.is_archived = 0
- GROUP BY
- t1.object_id,
- t2.object_id
- HAVING
- common_tags > 1""")
- transaction.commit_unless_managed()
-
-
-@cronjobs.register
def rebuild_kb():
# If rebuild on demand switch is on, do nothing.
if waffle.switch_is_active('wiki-rebuild-on-demand'):
View
43 apps/wiki/models.py
@@ -4,6 +4,7 @@
from django.conf import settings
from django.contrib.auth.models import User
+from django.core.cache import cache
from django.core.exceptions import ValidationError, ObjectDoesNotExist
from django.core.urlresolvers import resolve
from django.db import models
@@ -15,6 +16,7 @@
from tower import ugettext_lazy as _lazy, ugettext as _
from products.models import Product
+from search.es_utils import ESTimeoutError, ESMaxRetryError, ESException
from search.models import SearchMixin, register_for_indexing
from sumo import ProgrammingError
from sumo.models import ModelBase, LocaleField
@@ -72,13 +74,6 @@ class Document(NotificationsMixin, ModelBase, BigVocabTaggableMixin,
parent = models.ForeignKey('self', related_name='translations',
null=True, blank=True)
- # Related documents, based on tags in common.
- # The RelatedDocument table is populated by
- # wiki.cron.calculate_related_documents.
- related_documents = models.ManyToManyField('self',
- through='RelatedDocument',
- symmetrical=False)
-
# Cached HTML rendering of approved revision's wiki markup:
html = models.TextField(editable=False)
@@ -526,6 +521,31 @@ def recent_helpful_votes(self):
return HelpfulVote.objects.filter(
revision__document=self, created__gt=start, helpful=True).count()
+ @property
+ def related_documents(self):
+ """Return documents that are 'morelikethis' one."""
+ # First try to get the results from the cache
+ key = 'wiki_document:related_docs:%s' % self.id
+ documents = cache.get(key)
+ if documents:
+ return documents
+
+ try:
+ documents = self.morelikethis(
+ self.get_document_id(self.id),
+ s=self.get_s().filter(
+ model=self.get_model_name(),
+ document_locale=self.locale),
+ fields=[
+ 'document_title',
+ 'document_summary',
+ 'document_content'])
+ cache.add(key, documents)
+ except (ESTimeoutError, ESMaxRetryError, ESException):
+ documents = []
+
+ return documents
+
@classmethod
def get_query_fields(cls):
return ['document_title__text',
@@ -871,15 +891,6 @@ class ImportantDate(ModelBase):
date = models.DateField(db_index=True)
-class RelatedDocument(ModelBase):
- document = models.ForeignKey(Document, related_name='related_from')
- related = models.ForeignKey(Document, related_name='related_to')
- in_common = models.IntegerField()
-
- class Meta(object):
- ordering = ['-in_common']
-
-
def _doc_components_from_url(url, required_locale=None, check_host=True):
"""Return (locale, path, slug) if URL is a Document, False otherwise.
View
4 apps/wiki/templates/wiki/includes/document_macros.html
@@ -7,7 +7,7 @@
<section>
<ul>
{% for art in related %}
- <li><a href="{{ art.get_absolute_url() }}">{{ art.title }}</a></li>
+ <li><a href="{{ art['url'] }}">{{ art['document_title'] }}</a></li>
{% endfor %}
</ul>
</section>
@@ -19,7 +19,7 @@
<h1>{{ _('Related Articles') }}</h1>
<ul>
{% for art in related %}
- <li><a href="{{ art.get_absolute_url() }}">{{ art.title }}</a></li>
+ <li><a href="{{ art['url'] }}">{{ art['document_title'] }}</a></li>
{% endfor %}
</ul>
</section>
View
48 apps/wiki/tests/test_models.py
@@ -9,14 +9,12 @@
from sumo import ProgrammingError
from sumo.tests import TestCase
from topics.tests import topic
-from wiki.cron import calculate_related_documents
-from wiki.models import Document, RelatedDocument
+from wiki.models import Document
from wiki.config import (REDIRECT_SLUG, REDIRECT_TITLE, REDIRECT_HTML,
MAJOR_SIGNIFICANCE, CATEGORIES, TYPO_SIGNIFICANCE,
REDIRECT_CONTENT)
from wiki.parser import wiki_to_html
from wiki.tests import document, revision, doc_rev, translated_revision
-from wiki.utils import find_related_documents
def _objects_eq(manager, list_):
@@ -625,47 +623,3 @@ def test_delete_rendering(self):
# Now delete the final revision. It still shouldn't crash.
unapproved.delete()
eq_('', d.content_parsed)
-
-
-class RelatedDocumentTests(TestCase):
- fixtures = ['users.json', 'wiki/documents.json']
-
- def test_related_documents_calculated(self):
- d = Document.uncached.get(pk=1)
- eq_(0, d.related_documents.count())
-
- calculate_related_documents()
-
- d = Document.uncached.get(pk=1)
- eq_(1, d.related_documents.count())
-
- def test_related_only_locale(self):
- calculate_related_documents()
- d = Document.uncached.get(pk=1)
- for rd in d.related_documents.all():
- eq_('en-US', rd.locale)
-
- def test_find_related_documents(self):
- trans1 = translated_revision(is_approved=True)
- trans1.save()
- d1 = trans1.document
- trans2 = translated_revision(is_approved=True)
- trans2.save()
- d2 = trans2.document
- RelatedDocument.objects.create(document=d1.parent,
- related=d2.parent, in_common=2)
- # Assert the English versions still match
- assert list(find_related_documents(d1.parent)) == [d2.parent]
- # Assert that the translation matches
- assert list(find_related_documents(d1)) == [d2]
-
- def test_only_approved_revisions(self):
- calculate_related_documents()
- d = Document.uncached.get(pk=1)
- for rd in d.related_documents.all():
- assert rd.current_revision
-
- def test_only_approved_have_related(self):
- calculate_related_documents()
- d = Document.uncached.get(pk=3)
- eq_(0, d.related_documents.count())
View
42 apps/wiki/tests/test_templates.py
@@ -17,12 +17,12 @@
from wikimarkup.parser import ALLOWED_TAGS, ALLOWED_ATTRIBUTES
from products.tests import product
+from search.tests.test_es import ElasticTestCase
from sumo.helpers import urlparams
from sumo.tests import post, get, attrs_eq, MobileTestCase
from sumo.urlresolvers import reverse
from topics.tests import topic
from users.tests import user, add_permission
-from wiki.cron import calculate_related_documents
from wiki.events import (EditDocumentEvent, ReadyRevisionEvent,
ReviewableRevisionInLocaleEvent,
ApproveRevisionInLocaleEvent)
@@ -2045,24 +2045,44 @@ def test_page_renders_locales(self):
len(doc('#select-locale ul.locales li')))
-class RelatedDocumentTestCase(TestCaseBase):
- fixtures = ['users.json', 'wiki/documents.json']
-
+class RelatedDocumentTestCase(ElasticTestCase):
def setUp(self):
super(RelatedDocumentTestCase, self).setUp()
product(save=True)
- def test_related_order(self):
- calculate_related_documents()
- d = Document.uncached.get(pk=1)
- response = self.client.get(d.get_absolute_url())
+ def test_related_documents(self):
+ # The document
+ d1 = document(title='lorem ipsum', save=True)
+ r1 = revision(document=d1, summary='lorem',
+ content='lorem ipsum dolor',
+ is_approved=True, save=True)
+ d1.current_revision = r1
+ d1.save()
+
+ # A document that is similar.
+ d2 = document(title='lorem ipsum sit', save=True)
+ r2 = revision(document=d2, summary='lorem',
+ content='lorem ipsum dolor sit amet',
+ is_approved=True, save=True)
+ d2.current_revision = r2
+ d2.save()
+
+ # A document that is similar but a different locale.
+ d3 = document(title='lorem ipsum sit', locale='es', save=True)
+ r3 = revision(document=d3, summary='lorem',
+ content='lorem ipsum dolor sit amet',
+ is_approved=True, save=True)
+ d3.current_revision = r3
+ d3.save()
+
+ self.refresh()
+
+ response = self.client.get(d1.get_absolute_url())
doc = pq(response.content)
related = doc('section#related-articles li a')
eq_(1, len(related))
-
- # If 'an article title 2' is first, the other must be second.
- eq_('an article title 2', related[0].text)
+ eq_('lorem ipsum sit', related.text())
class RevisionDeleteTestCase(TestCaseBase):
View
59 apps/wiki/utils.py
@@ -1,59 +0,0 @@
-import logging
-
-from django.conf import settings
-
-from statsd import statsd
-
-from sumo.redis_utils import redis_client, RedisError
-from wiki.models import Document
-
-
-log = logging.getLogger('k.wiki')
-
-
-def related_translated_documents(doc):
- if doc.parent:
- # Use a list because this version of
- # MySQL doesnt like LIMIT (the [0:5]) in a subquery.
- parent_related = doc.parent.related_documents
- parent_related = parent_related.order_by('-related_to__in_common')
- ids = list(parent_related.values_list('id', flat=True)[0:5])
- return Document.objects.filter(locale=doc.locale, parent__in=ids)
- return Document.objects.get_empty_query_set()
-
-
-def find_related_documents(doc):
- """
- Returns a QuerySet of related_docuemnts or of the
- parent's related_documents in the case of translations
- """
- if doc.locale == settings.WIKI_DEFAULT_LANGUAGE:
- return doc.related_documents.order_by('-related_to__in_common')[0:5]
-
- # Not English, so may need related docs which are
- # stored on the English version.
- try:
- redis = redis_client('default')
- except RedisError as e:
- # Problem with Redis. Log and return the related docs.
- statsd.incr('redis.errror')
- log.error('Redis error: %s' % e)
- return related_translated_documents(doc)
-
- doc_key = 'translated_doc_id:%s' % doc.id
- related_ids = redis.lrange(doc_key, 0, -1)
- if related_ids == ['0']:
- return Document.objects.get_empty_query_set()
- if related_ids:
- return Document.objects.filter(id__in=related_ids)
-
- related = related_translated_documents(doc)
- if not related:
- # Add '0' to prevent recalulation on a known empty set.
- redis.lpush(doc_key, 0)
- else:
- for r in related:
- redis.lpush(doc_key, r.id)
- # Cache expires in 2 hours.
- redis.expire(doc_key, 60 * 60 * 2)
- return related
View
3  apps/wiki/views.py
@@ -41,7 +41,6 @@
from wiki.parser import wiki_to_html
from wiki.tasks import (send_reviewed_notification, schedule_rebuild_kb,
send_contributor_notification)
-from wiki.utils import find_related_documents
log = logging.getLogger('k.wiki')
@@ -155,7 +154,7 @@ def document(request, document_slug, template=None):
except Document.DoesNotExist:
pass
- related = find_related_documents(doc)
+ related = doc.related_documents[:5]
contributors = doc.contributors.all()
View
1  migrations/171-drop-related-documents.sql
@@ -0,0 +1 @@
+DROP TABLE IF EXISTS `wiki_relateddocument`;
View
3  scripts/crontab/crontab.tpl
@@ -12,9 +12,6 @@ HOME = /tmp
# Every hour.
42 * * * * {{ django }} cleanup
-# Every 2 hours.
-1 */2 * * * {{ cron }} calculate_related_documents
-
# Every 6 hours.
0 */6 * * * {{ django }} update_product_details -q > /dev/null
10 */6 * * * {{ cron }} rebuild_kb
2  vendor/src/elasticutils
@@ -1 +1 @@
-Subproject commit 34ee6f4c8e3ee8bf517c4bc38553fb23592df206
+Subproject commit cc3e17cd186be05d76fc4c9044f5526bc2f854c0
Please sign in to comment.
Something went wrong with that request. Please try again.